## Note

#### I used this notebook to follow along with Chapter 2 of the book ['Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow, 2nd Edition' by Aurélien Géron](https://www.oreilly.com/library/view/hands-on-machine-learning/9781492032632/)

#### The original notebook may be found [here](https://github.com/ageron/handson-ml2/blob/master/02_end_to_end_machine_learning_project.ipynb)

#### This notebook has only been made to understand and internalize the content in the book. A few variable names or methods may be different.

#### With that out of the way: it took me quite a while to understand what was going on, but I can now safely say that I do. Boy, this was a fun journey.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## Reading the data

In [None]:
# Load the housing CSV from the Kaggle input directory.
housing = pd.read_csv('../input/california-housing-data/housing.csv')
# Preview the first rows to check that the file parsed as expected.
housing.head()

## Manual inspection

In [None]:
# Column dtypes and non-null counts: a quick way to spot missing values.
housing.info()

In [None]:
# Number of rows in each ocean_proximity category.
housing['ocean_proximity'].value_counts()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the numeric columns.
housing.describe()

In [None]:
# One 50-bin histogram per numeric column to eyeball the distributions.
housing.hist(bins=50,figsize=(20,15))
plt.show()

## Preparing data for passing to model(s)

In [None]:
from sklearn.model_selection import train_test_split

# Purely random 80/20 split; the fixed random_state makes it reproducible.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

# NOTE: purely random sampling may introduce sampling bias. The stratified
# split built in the following cells is the one the rest of this notebook uses.

In [None]:
# Bucket median_income into five labelled categories so that the split in the
# next cell can be stratified on income.
income_bin_edges = [0., 1.5, 3.0, 4.5, 6., np.inf]
income_bin_labels = list(range(1, 6))
housing["income_cat"] = pd.cut(
    housing["median_income"],
    bins=income_bin_edges,
    labels=income_bin_labels,
)

# Histogram of the category counts, with the grid lines switched off.
income_axes = housing["income_cat"].hist()
income_axes.grid(False)

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Stratified 80/20 split on income_cat, so that both sets follow the income
# distribution of the full dataset.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so take the single (train, test) pair directly instead of looping.
train_index, test_index = next(sss.split(housing, housing["income_cat"]))
# split() yields *positional* row indices, so select with .iloc; .loc is
# label-based and only gives the same rows while the index is the default
# 0..n-1 range. The explicit .copy() makes both sets independent frames, so
# the in-place column drop that follows does not act on a slice of `housing`.
strat_train_set = housing.iloc[train_index].copy()
strat_test_set = housing.iloc[test_index].copy()

In [None]:
# income_cat was only needed for stratification; remove it from both sets
# (in place, so strat_train_set / strat_test_set themselves are modified).
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)

## More visualisations

In [None]:
# Explore on a copy so the training set itself is left untouched.
copy_set = strat_train_set.copy()

In [None]:
# Geographic scatter plot; the low alpha makes densely populated areas stand out.
copy_set.plot(kind='scatter',x='longitude',y='latitude',alpha=0.1)
plt.show()

In [None]:
# Same map, with marker size following population and marker colour following
# the median house value.
copy_set.plot(
    kind='scatter',
    x='longitude',
    y='latitude',
    alpha=0.4,
    s=copy_set['population'] / 100,
    label='Population',
    c=copy_set['median_house_value'],
    cmap='jet',
    colorbar=True,
    figsize=(10, 8),
)

plt.show()

In [None]:
# Correlation of every numeric feature with the target.
# Select the numeric columns explicitly: pandas >= 2.0 no longer drops the text
# column (ocean_proximity) silently, and corr() raises on it instead.
numeric_corr = copy_set.select_dtypes(include="number").corr()
# Remove the target's trivial self-correlation by label rather than by
# position, so the result does not depend on the column order.
numeric_corr['median_house_value'].drop('median_house_value')

In [None]:
import seaborn as sns

# Compute the correlation matrix once, on the numeric columns only (pandas
# >= 2.0 raises if the text column ocean_proximity is handed to corr()).
corr_matrix = copy_set.select_dtypes(include="number").corr()
# The matrix is symmetric, so hide the upper triangle (diagonal included).
upper_triangle = np.triu(np.ones_like(corr_matrix, dtype=bool))

plt.figure(figsize=(10,8))
sns.heatmap(corr_matrix, annot=True, mask=upper_triangle)
plt.show()

In [None]:
from pandas.plotting import scatter_matrix

# Pairwise scatter plots of the target and three of its candidate predictors.
attributes = ["median_house_value", "median_income", "total_rooms",
              "housing_median_age"]
# Plot the exploration copy of the training set, like the other cells in this
# section: at this point `housing` still holds the full dataset, test rows
# included, and those should stay out of the exploration.
scatter_matrix(copy_set[attributes], figsize=(15, 10))
plt.show()

In [None]:
# Median income against median house value; the very low alpha keeps the
# overlapping points readable.
copy_set.plot(kind='scatter',x='median_income',y='median_house_value',alpha=0.05)
plt.show()

In [None]:
# Ratio features derived from the raw per-district counts.
copy_set["rooms_per_household"] = copy_set["total_rooms"]/copy_set["households"]
copy_set["bedrooms_per_room"] = copy_set["total_bedrooms"]/copy_set["total_rooms"]
copy_set["population_per_household"]=copy_set["population"]/copy_set["households"]
# Correlation of every numeric column (new ratios included) with the target.
# Numeric columns are selected explicitly because pandas >= 2.0 raises when
# corr() meets the text column ocean_proximity.
copy_set.select_dtypes(include="number").corr()['median_house_value']

In [None]:
# Separate predictors from the target. From here on `housing` is rebound to the
# training predictors only; it no longer refers to the full dataset.
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

In [None]:
# Re-check dtypes and non-null counts, now on the training predictors.
housing.info()

In [None]:
from sklearn.impute import SimpleImputer

# The median strategy only applies to numbers, so leave the text column out.
housing_num = housing.drop("ocean_proximity", axis=1)

# Learn each column's median and fill the missing values in one step.
imputer = SimpleImputer(strategy="median")
X = imputer.fit_transform(housing_num)

# The imputer returns a plain array; put the column names and row index back.
housing_tr = pd.DataFrame(X, index=housing.index, columns=housing_num.columns)

In [None]:
housing_cat = housing[["ocean_proximity"]]

from sklearn.preprocessing import OneHotEncoder

# Keep the encoder's default sparse output and densify it explicitly: the
# `sparse=False` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, whereas `.toarray()` works on every version.
cat_encoder = OneHotEncoder()
housing_cat_1hot = pd.DataFrame(
    cat_encoder.fit_transform(housing_cat).toarray(),
    # Label each indicator column with its category instead of 0..n-1.
    columns=cat_encoder.categories_[0],
    index=housing_cat.index,
)
housing_cat_1hot

In [None]:
# Display the imputed numeric features.
housing_tr

In [None]:
# Numeric features side by side with the one-hot columns, aligned on the row index.
housing_tr.join(housing_cat_1hot)

In [None]:
# Column positions of the count features that CombinedAttributesAdder reads.
col_names = ("total_rooms", "total_bedrooms", "population", "households")
rooms_ix, bedrooms_ix, population_ix, households_ix = (
    housing.columns.get_loc(name) for name in col_names
)

from sklearn.base import BaseEstimator, TransformerMixin

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append per-household ratio columns to a 2-D feature array.

    The source columns are located through the module-level positions
    ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and ``households_ix``.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default=True
        Whether to also append the bedrooms-per-room ratio as the last column.
    """

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Return ``self`` unchanged; nothing is learned from the data."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio columns appended on the right.

        Column order: rooms per household, population per household, then
        bedrooms per room when ``add_bedrooms_per_room`` is set.
        """
        households = X[:, households_ix]
        rooms = X[:, rooms_ix]
        columns = [X, rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            columns.append(X[:, bedrooms_ix] / rooms)
        # np.c_ turns every 1-D ratio into a column before concatenating.
        return np.c_[tuple(columns)]


# Try the transformer on its own, without the bedrooms-per-room column.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
extra_attrib_names = ["rooms_per_household", "population_per_household"]
housing_extra_attribs = pd.DataFrame(
    attr_adder.transform(housing.values),
    columns=[*housing.columns, *extra_attrib_names],
    index=housing.index,
)
housing_extra_attribs.head()

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numeric columns: impute medians, append the ratio features, then standardise.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

num_attribs = housing_num.columns.tolist()
cat_attribs = ["ocean_proximity"]

# Route each group of columns through its own preprocessing and concatenate
# the results: numeric output first, one-hot columns after it.
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared

## Finally, a model 

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest of 100 trees; the fixed random_state makes runs reproducible.
forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation on the prepared training data.
scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
                         scoring="neg_mean_squared_error", cv=10)
# The scorer reports *negative* MSE, so flip the sign before taking the root
# to obtain one RMSE per fold.
forest_rmse_scores = np.sqrt(-scores)

In [None]:
# Mean, spread and range of the per-fold RMSE values.
pd.Series(forest_rmse_scores).describe()

In [None]:
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

# Integer search ranges for the two tuned hyper-parameters.
param_distribs = {
    'n_estimators': randint(low=1, high=200),
    'max_features': randint(low=1, high=8),
}

# 10 sampled candidates, each scored with 10-fold cross-validation.
forest_reg = RandomForestRegressor(random_state=42)
rnd_search = RandomizedSearchCV(
    forest_reg,
    param_distributions=param_distribs,
    n_iter=10,
    cv=10,
    scoring='neg_mean_squared_error',
    random_state=42,
)
rnd_search.fit(housing_prepared, housing_labels)

# Print the cross-validated RMSE of every sampled parameter combination
# (scores are negative MSE, hence the sign flip).
cvres = rnd_search.cv_results_
rmse_per_candidate = np.sqrt(-cvres["mean_test_score"])
for rmse, params in zip(rmse_per_candidate, cvres["params"]):
    print(rmse, params)

In [None]:
feature_importances = rnd_search.best_estimator_.feature_importances_
# Take the category names from the encoder fitted inside full_pipeline, the one
# that actually produced the model's input columns, rather than from the
# separately fitted stand-alone encoder.
pipeline_cat_encoder = full_pipeline.named_transformers_["cat"]
# Column order mirrors full_pipeline's output: numeric attributes, the three
# ratios appended by CombinedAttributesAdder, then the one-hot categories.
final_col_list = (
    num_attribs
    + ['rooms_per_hhold', 'pop_per_hhold', 'bedrooms_per_room']
    + list(pipeline_cat_encoder.categories_[0])
)
# Most important features first.
sorted(zip(feature_importances, final_col_list), reverse=True)

In [None]:
from sklearn.metrics import mean_squared_error

# The best candidate found by the randomized search, refit on all training data.
final_model = rnd_search.best_estimator_

# Split the held-out set into target and predictors.
y_test = strat_test_set["median_house_value"].copy()
X_test = strat_test_set.drop(columns="median_house_value")

# transform only (no fitting): the test set goes through the preprocessing
# that was fitted on the training data.
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

# Root-mean-squared error on the test set.
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse