In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pandas.plotting import scatter_matrix

from sklearn.base import BaseEstimator , TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion,Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV , RandomizedSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)


In [None]:
# Load the housing CSV from the Kaggle input directory and preview the first rows.
housing = pd.read_csv("/kaggle/input/california-housing-prices/housing.csv")
housing.head()

In [None]:
# Summary statistics of the loaded dataset.
housing.describe()

In [None]:
import matplotlib.pyplot as plt

# One histogram per numeric column. The figure size is passed directly to
# DataFrame.hist, which creates its own figure; a separate plt.figure() call
# beforehand would not affect these plots (and `size` is not a valid keyword).
housing.hist(figsize=(20, 18))
plt.show()

In [None]:
# Hold out 20% of the rows as a test set; random_state pins the shuffle so the
# split is reproducible.
# NOTE(review): despite the `strat_` prefix, no `stratify` argument is passed,
# so this is a plain random split.
strat_train_data, strat_test_data = train_test_split(housing, test_size = 0.2, random_state= 42)

In [None]:
# Explore on a copy so that columns added below do not touch the training split.
housing = strat_train_data.copy()

In [None]:
# Correlation of every numeric column with the target, strongest first.
# Non-numeric columns (here the text column `ocean_proximity`) are excluded
# explicitly: pandas >= 2.0 no longer drops them silently and DataFrame.corr()
# raises when it meets strings.
corr = housing.select_dtypes(include="number").corr()
corr["median_house_value"].sort_values(ascending=False)

In [None]:
# Scatter of longitude vs. latitude; the low alpha lets dense regions show up darker.
housing.plot(kind="scatter", x="longitude", y = "latitude", alpha=.05)

In [None]:
# Pairwise scatter plots for the target and a few hand-picked numeric features.
sct_features = ["median_house_value", "median_income","total_rooms","housing_median_age"]
scatter_matrix(housing[sct_features],figsize=(12,8))

In [None]:
# Derive per-household and per-room ratio columns on the exploration copy.
n_households = housing["households"]
n_rooms = housing["total_rooms"]

housing["room_per_household"] = n_rooms / n_households
housing["bedroom_per_rooms"] = housing["total_bedrooms"] / n_rooms
housing["population_per_household"] = housing["population"] / n_households

In [None]:
# Re-check the correlations now that the ratio columns exist.
# Non-numeric columns (here the text column `ocean_proximity`) are excluded
# explicitly: pandas >= 2.0 no longer drops them silently and DataFrame.corr()
# raises when it meets strings.
corr = housing.select_dtypes(include="number").corr()
corr["median_house_value"].sort_values(ascending=False)

We have now added three derived features, and the correlations show that bedroom_per_rooms is also a promising feature.
With the data explored, we will now create transformers for cleaning the data, filling in missing values, adding the derived features and transforming the data.

In [None]:
# Start again from the untouched training split: features without the target,
# and the target kept separately as the label vector.
housing = strat_train_data.drop(columns=["median_house_value"])
housing_label = strat_train_data["median_house_value"].copy()

In [None]:
# First custom transformer: FeatureSelector.
# Used to pick either the numeric or the categorical columns of a DataFrame.

class FeatureSelector(BaseEstimator, TransformerMixin):
    """Select the columns listed in ``attr`` and return them as a NumPy array."""

    def __init__(self, attr):
        # Stored under the same name as the parameter.
        self.attr = attr

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` so the step can sit in a pipeline."""
        return self

    def transform(self, X, y=None):
        """Return ``X[self.attr]`` as a plain array (index and column labels dropped)."""
        selected = X[self.attr]
        return selected.values

In [None]:
# Second custom transformer: CustomFeatureAdder.
# Positions of the source columns inside the 2-D array handed to transform().
# NOTE(review): assumes the array keeps the input frame's column order with
# total_rooms, total_bedrooms, population and households at 3..6 - confirm.
total_rooms, total_bedrooms , population, household = 3, 4, 5, 6

class CustomFeatureAdder(BaseEstimator,TransformerMixin):
    """Append ratio features to a 2-D array.

    Always appends rooms-per-household and population-per-household;
    bedrooms-per-room is appended last when ``add_bedroom_per_rooms`` is true.

    Parameters
    ----------
    add_bedroom_per_rooms : bool, default=True
        Whether to append the bedrooms-per-room column. A default is provided
        so the estimator can be built without arguments, as scikit-learn
        expects of estimator constructors.
    """

    def __init__(self, add_bedroom_per_rooms=True):
        self.add_bedroom_per_rooms = add_bedroom_per_rooms

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right."""
        rooms_per_household = X[:, total_rooms] / X[:, household]
        population_per_household = X[:, population] / X[:, household]
        extra_columns = [rooms_per_household, population_per_household]

        if self.add_bedroom_per_rooms:
            extra_columns.append(X[:, total_bedrooms] / X[:, total_rooms])

        # np.c_ turns each 1-D ratio into a column and concatenates it to X.
        return np.c_[(X, *extra_columns)]

In [None]:
# Try the feature adder on the raw values and wrap the result back into a
# DataFrame so the new columns can be inspected by name.
add_feature = CustomFeatureAdder(add_bedroom_per_rooms=True)
housing_with_customFeature = add_feature.transform(housing.values)

extended_columns = [
    *housing.columns,
    "rooms_per_household",
    "population_per_household",
    "bedroom_per_rooms",
]
housingpd = pd.DataFrame(
    housing_with_customFeature,
    columns=extended_columns,
    index=housing.index,
)
housingpd.head()

Now create a pipeline consisting of an imputer, the custom transformer, and normalization or standardization.
We will create two pipelines: the first for the numeric features and the second for the categorical feature.

In [None]:
# Column groups for the two pipelines: the text column is categorical,
# every other column is treated as numeric.
cat_attr = ["ocean_proximity"]
num_attr = list(housing.columns.drop(cat_attr))

In [None]:
# Numeric branch: select the numeric columns, fill missing values with the
# column median, append the ratio features, then standardise.
numerical_pipeline = Pipeline([
    ("selector", FeatureSelector(num_attr)),
    ("imputer", SimpleImputer(strategy="median")),
    ("add_feature", CustomFeatureAdder(add_bedroom_per_rooms=True)),
    ("scaler", StandardScaler())
])

# A dense one-hot encoder is needed so both branches yield plain arrays.
# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in
# 1.4, so try the new spelling first and fall back for older releases.
try:
    dense_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    dense_encoder = OneHotEncoder(sparse=False)

# Categorical branch: select the categorical column and one-hot encode it.
# The step keeps its original key "endcoder" because later cells look it up by that name.
categorical_pipeline = Pipeline([
    ("selector", FeatureSelector(cat_attr)),
    ("endcoder", dense_encoder)
])

In [None]:
# Merge the numeric and categorical branches into one feature matrix with FeatureUnion.
combined_pipeline = FeatureUnion(
    [
        ("num_p", numerical_pipeline),
        ("cat_p", categorical_pipeline),
    ]
)

In [None]:
# Fit the combined pipeline on the training features and keep the transformed
# matrix; it is the model input for every cell below.
housing_transformed = combined_pipeline.fit_transform(housing)
housing_transformed


In [None]:
# Baseline model: linear regression, scored on the same data it was fitted on.
lr = LinearRegression().fit(housing_transformed, housing_label)
pred = lr.predict(housing_transformed)

# Root of the mean squared error, i.e. the error in the target's own units.
mse = mean_squared_error(y_true=housing_label, y_pred=pred)
rms = np.sqrt(mse)
rms

In [None]:
# Decision tree regressor, again scored on the same data it was fitted on.
dtr = DecisionTreeRegressor(random_state=42).fit(housing_transformed, housing_label)
pred = dtr.predict(housing_transformed)

# Root of the mean squared error, i.e. the error in the target's own units.
mse = mean_squared_error(y_true=housing_label, y_pred=pred)
rms = np.sqrt(mse)
rms

Since the RMSE above is zero, it looks like the model is overfitting. Now we will use cross_val_score to verify the model's behaviour.

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the decision tree. The scorer returns negated
# MSE values, so the sign is flipped before taking the square root.
score = cross_val_score(
    dtr,
    housing_transformed,
    housing_label,
    scoring="neg_mean_squared_error",
    cv=10,
)
tree_rmse = np.sqrt(-score)
print(tree_rmse.mean(), tree_rmse.std())

cross_val_score shows that the model is overfitted and performs badly under cross-validation. Next we will try a random forest regressor.

In [None]:
# Random forest scored with 10-fold cross-validation.
# The forest is seeded (same seed as the decision tree above) so the reported
# RMSE is reproducible from run to run.
rf = RandomForestRegressor(random_state=42)
score = cross_val_score(rf, housing_transformed, housing_label, scoring='neg_mean_squared_error', cv= 10)

# The scorer returns negated MSE values; flip the sign before taking the root.
rmse = np.sqrt(-score)
print(rmse.mean(),rmse.std())

In [None]:
# Hyper-parameter tuning with GridSearchCV.
# Two grids: the first with the default bootstrap setting, the second with
# bootstrapping switched off.
grid_param = [
    {'n_estimators': [3,10,30,40], 'max_features' : [2,4,6,8,10]},
    {'bootstrap' : [False],'n_estimators': [3,10,30], 'max_features' : [2,4,6,8]}
]

# Seed the forest so every candidate is compared on the same footing and the
# selected best parameters are reproducible.
rf = RandomForestRegressor(random_state=42)
grid = GridSearchCV(rf, grid_param, cv= 5, scoring= 'neg_mean_squared_error')
grid.fit(housing_transformed,housing_label)

In [None]:
# Best configuration found, followed by the RMSE of every candidate tried.
print(grid.best_params_)
print(grid.best_estimator_)

cv_result = grid.cv_results_
# mean_test_score holds negated MSE values, hence the sign flip.
for params, neg_mse in zip(cv_result["params"], cv_result["mean_test_score"]):
    print(np.sqrt(-neg_mse), params)


In [None]:
# Names of the one-hot columns produced by the fitted encoder.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back on older releases.
encoder = categorical_pipeline.named_steps["endcoder"]
if hasattr(encoder, "get_feature_names_out"):
    print(encoder.get_feature_names_out())
else:
    print(encoder.get_feature_names())

In [None]:
# Importance of each input feature according to the best forest found by the grid search.
important_feature = grid.best_estimator_.feature_importances_
print(important_feature)

# Labels for the three ratio columns, in the order CustomFeatureAdder appends them.
extra_attr = ["room_per_hh", "pop_per_hh", "bd_per_rm"]

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back on older releases.
encoder = categorical_pipeline.named_steps["endcoder"]
if hasattr(encoder, "get_feature_names_out"):
    cat_val = encoder.get_feature_names_out()
else:
    cat_val = encoder.get_feature_names()

# Kept under its own name: rebinding `cat_attr` here would replace the input
# column list the categorical pipeline is built from, breaking a re-run of
# that cell.
cat_feature_names = list(cat_val)

# Same column order as the combined pipeline output: numeric, derived, one-hot.
total_attr = num_attr + extra_attr + cat_feature_names

sorted(zip(important_feature,total_attr), reverse= True)
