In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor


from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline

from preprocessing import preprocessing

from sklearn.metrics import mean_squared_error

In [2]:
# Load the housing dataset from CSV and show which columns it provides

housing = pd.read_csv(filepath_or_buffer="files/housing.csv")
housing.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [3]:
# Hold out 20% of the rows as a test set. The split is stratified on the
# ocean_proximity column and uses a fixed seed so it is repeatable.

housing_train, housing_test = train_test_split(
    housing,
    test_size=0.2,
    stratify=housing["ocean_proximity"],
    random_state=42,
)

In [6]:
# Separate the training set into features (X) and the target column (y)

X_train = housing_train.drop(columns="median_house_value")
y_train = housing_train["median_house_value"]

In [7]:
# Try a more complex/powerful model: a random forest regressor chained after
# the imported `preprocessing` step, with a fixed seed for repeatable forests.

rnd_forest = Pipeline(
    steps=[
        ("preprocessing", preprocessing),
        ("randomforest", RandomForestRegressor(random_state=42)),
    ],
)

In [8]:
%%time
# Fit the whole pipeline (preprocessing + random forest) on the training set
rnd_forest.fit(X_train, y_train)

  super()._check_params_vs_input(X, default_n_init=10)


CPU times: total: 1min 17s
Wall time: 1min 27s


In [11]:
# Fine-tune the model: search space for the grid search.
#
# Keys follow the pipeline's "<step>__<parameter>" naming. Each dict is expanded
# on its own, so the search covers 9 + 4 = 13 combinations in total.
param_grid = [
    # fewer clusters x smaller feature subsets: 3 * 3 = 9
    {
        "preprocessing__geo__n_clusters": [5, 8, 10],
        "randomforest__max_features": [4, 6, 8],
    },
    # more clusters x larger feature subsets: 2 * 2 = 4
    {
        "preprocessing__geo__n_clusters": [12, 15],
        "randomforest__max_features": [8, 10],
    },
]

In [12]:
# Exhaustive search over param_grid using 3-fold cross-validation, scored with
# negative RMSE (values closer to zero are better). `refit` is left at its default.
grid_search = GridSearchCV(
    estimator=rnd_forest,
    param_grid=param_grid,
    scoring="neg_root_mean_squared_error",
    cv=3,
)

In [15]:
%%time

# Hyperparameter tuning: fit the pipeline for every combination in param_grid
# on each cross-validation split to fine-tune the model
grid_search.fit(X_train, y_train)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


CPU times: total: 9min 59s
Wall time: 10min 48s


In [16]:
# Grid search scores: raw cross-validation results as a dict of arrays,
# with one entry per parameter combination (fit/score times, params, scores)

grid_search.cv_results_

{'mean_fit_time': array([ 9.74567246, 13.11986423, 16.74664227,  9.52694861, 13.52523239,
        17.49796629,  9.65554436, 14.02453748, 18.01770345, 18.31400188,
        22.79093981, 18.74310939, 22.99604464]),
 'std_fit_time': array([0.24576821, 0.11002032, 0.15191482, 0.04637894, 0.16710557,
        0.03064849, 0.03258354, 0.25362632, 0.01094994, 0.0140682 ,
        0.21271723, 0.05458229, 0.07339288]),
 'mean_score_time': array([0.16323113, 0.12640723, 0.12833746, 0.1292932 , 0.12866195,
        0.12753248, 0.12795162, 0.12797228, 0.12680403, 0.12581062,
        0.13159609, 0.12858335, 0.12817812]),
 'std_score_time': array([0.04117413, 0.00088211, 0.00594457, 0.00441156, 0.00332189,
        0.00380969, 0.00416031, 0.0023018 , 0.00142099, 0.00360645,
        0.00310529, 0.00255235, 0.00107137]),
 'param_preprocessing__geo__n_clusters': masked_array(data=[5, 5, 5, 8, 8, 8, 10, 10, 10, 12, 12, 15, 15],
              mask=[False, False, False, False, False, False, False, False,
      

In [17]:
# Same results as a table: one row per parameter combination, including the
# per-split test scores, their mean/std, and the resulting rank
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_preprocessing__geo__n_clusters,param_randomforest__max_features,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,9.745672,0.245768,0.163231,0.041174,5,4,"{'preprocessing__geo__n_clusters': 5, 'randomf...",-46917.676439,-46946.512555,-48552.19815,-47472.129048,763.814912,12
1,13.119864,0.11002,0.126407,0.000882,5,6,"{'preprocessing__geo__n_clusters': 5, 'randomf...",-47178.392368,-46911.218133,-48258.9777,-47449.5294,582.666531,11
2,16.746642,0.151915,0.128337,0.005945,5,8,"{'preprocessing__geo__n_clusters': 5, 'randomf...",-47887.056663,-47620.044622,-48915.504541,-48140.868609,558.491706,13
3,9.526949,0.046379,0.129293,0.004412,8,4,"{'preprocessing__geo__n_clusters': 8, 'randomf...",-45616.341081,-44841.572836,-46832.758477,-45763.557465,819.536276,6
4,13.525232,0.167106,0.128662,0.003322,8,6,"{'preprocessing__geo__n_clusters': 8, 'randomf...",-45624.974572,-45203.486363,-46624.212041,-45817.557659,595.780467,7
5,17.497966,0.030648,0.127532,0.00381,8,8,"{'preprocessing__geo__n_clusters': 8, 'randomf...",-46172.571529,-45992.214909,-47317.15468,-46493.980373,586.710651,10
6,9.655544,0.032584,0.127952,0.00416,10,4,"{'preprocessing__geo__n_clusters': 10, 'random...",-44773.369954,-44003.131309,-46341.327927,-45039.276397,972.906482,2
7,14.024537,0.253626,0.127972,0.002302,10,6,"{'preprocessing__geo__n_clusters': 10, 'random...",-44969.849649,-44448.503306,-46421.713336,-45280.022097,834.883092,5
8,18.017703,0.01095,0.126804,0.001421,10,8,"{'preprocessing__geo__n_clusters': 10, 'random...",-45648.26902,-45329.165273,-46898.849117,-45958.761137,677.387559,9
9,18.314002,0.014068,0.125811,0.003606,12,8,"{'preprocessing__geo__n_clusters': 12, 'random...",-44843.165851,-44440.886655,-46121.885411,-45135.312639,716.682875,3


In [18]:
# Best parameter combination found by the grid search

grid_search.best_params_

{'preprocessing__geo__n_clusters': 15, 'randomforest__max_features': 8}

In [19]:
# Best estimator found by the grid search (the pipeline set up with best_params_)

grid_search.best_estimator_

In [21]:
# Final model: keep a reference to the best pipeline from the grid search

final_model = grid_search.best_estimator_

In [22]:
# Interactive reference lookup: print the Pipeline class documentation
# (e.g. the `memory` caching argument); can be dropped from a finished notebook
help(Pipeline)

Help on class Pipeline in module sklearn.pipeline:

class Pipeline(sklearn.utils.metaestimators._BaseComposition)
 |  Pipeline(steps, *, memory=None, verbose=False)
 |  
 |  Pipeline of transforms with a final estimator.
 |  
 |  Sequentially apply a list of transforms and a final estimator.
 |  Intermediate steps of the pipeline must be 'transforms', that is, they
 |  must implement `fit` and `transform` methods.
 |  The final estimator only needs to implement `fit`.
 |  The transformers in the pipeline can be cached using ``memory`` argument.
 |  
 |  The purpose of the pipeline is to assemble several steps that can be
 |  cross-validated together while setting different parameters. For this, it
 |  enables setting parameters of the various steps using their names and the
 |  parameter name separated by a `'__'`, as in the example below. A step's
 |  estimator may be replaced entirely by setting the parameter with its name
 |  to another estimator, or a transformer removed by setting