In [None]:
# % % markdown
# ## Index
# % % markdown
# [1. Correlation Matrix](#corr) <br>
# [2. Stratified Shuffle Split](#split) <br>
# [3. Plotting the Data](#plot) <br>
# [4. Pipeline Implementation](#pipeline) <br>
# [5. Column Transformer](#column_transformer) <br>
# [6. Random Forest Regressor](#random_forest) <br>
# $\;\;\;\;\;\;$[6.1 Train Set Prediction](#train_predict) <br>
# $\;\;\;\;\;\;$[6.2 Test Set Prediction](#test_predict) <br> <br>
  # % % markdown
# <blockquote> <b>Yasin İnal</b>
# These are the notes that I've taken from the book
# 'Hands-On Machine Learning with Scikit-Learn, Keras & TensorFlow' by Aurélien Géron </blockquote>

# % % codecell
import os
import tarfile
import urllib

# Remote base URL, full archive URL, and the local directory used for the housing data.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_URL = f"{DOWNLOAD_ROOT}datasets/housing/housing.tgz"

HOUSING_PATH = os.path.join("datasets", "housing")

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Creates ``housing_path`` if it does not exist, saves the downloaded
    archive there as ``housing.tgz`` and extracts it into the same directory.

    Args:
        housing_url: URL of the ``.tgz`` archive to download.
        housing_path: Local directory receiving the archive and its contents.
    """
    # A bare ``import urllib`` does not guarantee the ``request`` submodule is
    # loaded, so import it explicitly where it is used.
    import urllib.request

    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction raises.
    # NOTE(review): extractall() trusts member paths inside the archive; only
    # use this with a source you trust.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)

# % % codecell
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` into a pandas DataFrame.

    Args:
        housing_path: Directory that contains ``housing.csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for that file.
    """
    csv_path = os.path.join(housing_path, "housing.csv")
    # The return must live inside the function body; at module level it is a
    # SyntaxError.
    return pd.read_csv(csv_path)

# % % codecell
# Download and unpack the archive, then read housing.csv from the default directory.
fetch_housing_data()
housing = load_housing_data()
# Last expression of the cell: displays the first rows of the frame.
housing.head()
# % % markdown
# <a id="corr"></a>
# #### Correlation Matrix


# % % codecell
# Correlate numeric columns only: `ocean_proximity` is not numeric, and
# `DataFrame.corr()` raises on such columns in pandas >= 2.0.
corr_matrix = housing.select_dtypes(include="number").corr()
# Rank every numeric attribute by its correlation with the target column.
corr_matrix["median_house_value"].sort_values(ascending=False)
# % % markdown
# <a id="plot"></a>
# ### Plotting the Data

# % % codecell
# %matplotlib inline  (IPython magic, kept as a comment so the file stays valid Python)
import matplotlib.pyplot as plt

# Scatter plot by coordinates: marker size follows population, colour follows house value.
housing.plot(
    kind="scatter",
    x="longitude",
    y="latitude",
    alpha=0.4,
    s=housing["population"] / 100,
    label="population",
    figsize=(10, 7),
    c="median_house_value",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
)

plt.legend()
# % % markdown
# <a id="split"></a>
# ### Stratified Shuffle Split

# % % codecell
import numpy as np

# Bucket median_income into five labelled bins; the last bin is open-ended.
housing["income_cat"] = pd.cut(
    housing["median_income"],
    bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
    labels=[1, 2, 3, 4, 5],
)

# % % codecell
from sklearn.model_selection import StratifiedShuffleSplit

# A single 80/20 split stratified on the income category, seeded for repeatability.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    # Both assignments belong inside the loop body so each split yields a
    # matching train/test pair.
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

# % % codecell
# Fraction of rows per income category in each split.
strat_test_distribute = strat_test_set["income_cat"].value_counts().div(len(strat_test_set))
strat_train_distribute = strat_train_set["income_cat"].value_counts().div(len(strat_train_set))
# Displayed as a (test, train) pair for side-by-side comparison.
(strat_test_distribute, strat_train_distribute)
# % % markdown
# <code>The income categories are distributed almost identically across the train and test sets.</code>

# % % codecell
# Remove the income_cat column from both splits in place.
for set_ in (strat_train_set, strat_test_set):
    set_.drop(columns="income_cat", inplace=True)

# % % codecell
# Feature frames: every column except the target.
train_feed = strat_train_set.drop(columns="median_house_value")
test_feed = strat_test_set.drop(columns="median_house_value")

# % % codecell
# Target series, copied so they are independent of the split frames.
train_labels = strat_train_set["median_house_value"].copy()
test_labels = strat_test_set["median_house_value"].copy()

# % % codecell
# Row counts of the two feature frames.
(len(train_feed), len(test_feed))
# % % markdown
# <a id="pipeline"></a>
# ### Pipeline

# % % codecell
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Numeric preprocessing: median imputation followed by standard scaling.
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("std_scaler", StandardScaler()),
])


# % % codecell
# Display the ocean_proximity column of the training features.
train_feed['ocean_proximity']
# % % markdown
# <blockquote> Ocean Proximity is not a numerical feature, as you can see. <br>
# I will transform it using OneHotEncoder. </blockquote>

# % % codecell
# Training features without the ocean_proximity column.
housing_num = train_feed.drop(columns="ocean_proximity")
# % % markdown
# <a id="column_transformer"></a>
# ### Column Transformer

# % % codecell
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Column names routed to the numeric pipeline and to the one-hot encoder.
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    # Categories unseen during fitting are encoded as all zeros instead of raising.
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_attribs),
])

# Fit on the training features only. The test features must be transformed
# with the statistics and categories learned from the training data;
# re-fitting on the test set leaks information and can change the column layout.
train_prepared = full_pipeline.fit_transform(train_feed)
test_prepared = full_pipeline.transform(test_feed)
# % % markdown
# <a id="random_forest"></a>
# ### Random Forest Regressor

# % % codecell
from sklearn.ensemble import RandomForestRegressor

# Seeded so the scores reported below are repeatable between runs.
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(train_prepared, train_labels)
# Predictions on the same data the model was fitted on.
housing_predictions_train = forest_reg.predict(train_prepared)

# % % codecell
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation on the training set.
forest_scores_tr = cross_val_score(
    forest_reg,
    train_prepared,
    train_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
# The scorer returns negated MSE, so flip the sign before taking the root.
# This must use the training scores computed just above.
forest_rmse_scores_tr = np.sqrt(-forest_scores_tr)

# % % codecell
def display_scores(scores):
    """Print the mean and standard deviation of ``scores``.

    Args:
        scores: Object exposing ``mean()`` and ``std()`` methods, such as a
            NumPy array of per-fold scores.
    """
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

# % % codecell
# Print mean and standard deviation of the training cross-validation RMSE scores.
display_scores(forest_rmse_scores_tr)
# % % markdown
# <a id="train_predict"></a>
# ## Train Set Prediction

# % % codecell
# This cell reports training results, so it uses the training predictions and
# training cross-validation scores; the test-set variables are not defined yet.
forest_rmse_tr = np.sqrt(mean_squared_error(train_labels, housing_predictions_train))
display_scores(forest_rmse_scores_tr), forest_rmse_tr
# % % markdown
# <a id="test_predict"></a>
# ## Test Set Prediction

# % % codecell
housing_predictions_test = forest_reg.predict(test_prepared)

# % % codecell
# RMSE computed as sqrt(MSE): the `squared=False` flag is deprecated and
# removed in recent scikit-learn releases.
forest_rmse = np.sqrt(mean_squared_error(test_labels, housing_predictions_test))
# NOTE(review): this cross-validation refits the forest on folds of the test
# set rather than scoring the already-fitted model; confirm that is intended.
forest_scores = cross_val_score(
    forest_reg,
    test_prepared,
    test_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
forest_rmse_scores = np.sqrt(-forest_scores)

# % % codecell
display_scores(forest_rmse_scores), forest_rmse
# % % markdown
# <br>
# <blockquote> To understand the results better, let's examine the <code>median_house_value</code> in detail. </blockquote>

# % % codecell
# Summary statistics of the target column.
housing['median_house_value'].describe()
# % % markdown
# <blockquote> Since the standard deviation of 'median_house_value' is <code>115,395</code>, the prediction error of <code>53,000</code> is moderately good. </blockquote>

# % % codecell
#
import dill
# dill.dump_session('housing.db')
# dill.load_session('housing.db')

# % % codecell