In [1]:
%%time


from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor


from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline

from preprocessing import preprocessing

from sklearn.metrics import mean_squared_error

# Load the housing dataset (path is relative to the notebook's working directory).
housing = pd.read_csv("files/housing.csv")

# Hold out 20% of the rows as a test set. The split is stratified on
# `ocean_proximity` and uses a fixed seed so it is reproducible.
housing_train, housing_test = train_test_split(
    housing,
    test_size=0.2,
    stratify=housing["ocean_proximity"],
    random_state=42,
)

# Separate the target column from the training features.
y_train = housing_train["median_house_value"]
X_train = housing_train.drop("median_house_value", axis=1)

# Candidate model: the imported `preprocessing` step followed by a random forest.
rnd_forest = Pipeline([
    ("preprocessing", preprocessing),
    ("randomforest", RandomForestRegressor(random_state=42)),
])

rnd_forest.fit(X_train, y_train)

# Scores noted from earlier experiments (presumably training RMSE, then
# cross-validation RMSE -- TODO confirm which column is which):
# lin_reg -->  68000,   NA
# tree --> 0.0 ,  67000
# forest --> 17000,  47000

  super()._check_params_vs_input(X, default_n_init=10)


CPU times: total: 1min 5s
Wall time: 1min 15s


### Grid search -- fine for a small number of combinations; for a large number of combinations it is computationally expensive

## Randomized Search CV -- for larger hyperparameter search spaces

In [2]:
from scipy.stats import randint

# Search space for the randomized search. Each key follows the pipeline's
# `<step>__<parameter>` naming; each value is a frozen discrete-uniform
# distribution (scipy's `high` bound is exclusive).
# NOTE(review): the fitted forest reports 59 features with 45 clusters, which
# suggests 14 + n_clusters columns; a draw with few clusters and a large
# max_features could then exceed the feature count -- confirm the bounds.
param_distribs = dict(
    preprocessing__geo__n_clusters=randint(low=3, high=50),  # integers 3..49
    randomforest__max_features=randint(low=2, high=20),      # integers 2..19
)

In [3]:
# Inspect the frozen distribution object that randint returns (only its repr is shown; no values are drawn).
randint(low= 3, high= 50)

<scipy.stats._distn_infrastructure.rv_discrete_frozen at 0x1e47680a750>

In [4]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized hyperparameter search over the whole pipeline: 10 sampled
# candidates, each scored with 3-fold cross-validation on negative RMSE
# (closer to zero is better). With refit=True the best candidate is refit
# on all the data passed to `fit`.
rnd_search = RandomizedSearchCV(
    estimator=rnd_forest,
    param_distributions=param_distribs,
    n_iter=10,
    cv=3,
    scoring="neg_root_mean_squared_error",
    refit=True,
    random_state=42,
)

In [5]:
%%time
# Run the search on the training data. This is the slow step: every sampled candidate is fit once per CV fold.
rnd_search.fit(X_train, y_train)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


CPU times: total: 8min 45s
Wall time: 9min 38s


In [6]:
# Hyperparameter combination with the best mean cross-validation score.
rnd_search.best_params_

{'preprocessing__geo__n_clusters': 45, 'randomforest__max_features': 9}

In [7]:
# Raw cross-validation results as a dict of arrays (timings, sampled parameters, scores).
rnd_search.cv_results_

{'mean_fit_time': array([36.51776695, 21.15816204, 17.94288111, 26.16334502, 10.88876462,
        10.04418429,  7.47594595, 28.78998764, 16.29181918,  5.13750879]),
 'std_fit_time': array([0.0879856 , 0.03336598, 0.1233689 , 0.14960318, 0.03574447,
        0.02049133, 0.04536164, 0.1515989 , 0.05885948, 0.71347334]),
 'mean_score_time': array([0.13212633, 0.13326971, 0.1354332 , 0.13988868, 0.12970797,
        0.14557759, 0.13837186, 0.13411283, 0.13486433, 0.13290707]),
 'std_score_time': array([0.00327601, 0.01485794, 0.00379048, 0.00582676, 0.00306085,
        0.00729426, 0.00640228, 0.00259057, 0.00278987, 0.00719646]),
 'param_preprocessing__geo__n_clusters': masked_array(data=[41, 45, 23, 21, 13, 42, 24, 26, 32, 4],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'param_randomforest__max_features': masked_array(data=[16, 9, 8, 12, 5, 4, 3, 13, 7, 2],
              m

In [8]:
# The same results as a table, one row per sampled candidate. Scores are negative RMSE, so values closer to zero are better.
pd.DataFrame(rnd_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_preprocessing__geo__n_clusters,param_randomforest__max_features,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,36.517767,0.087986,0.132126,0.003276,41,16,"{'preprocessing__geo__n_clusters': 41, 'random...",-43882.840412,-42786.34972,-44645.796818,-43771.662316,763.175953,6
1,21.158162,0.033366,0.13327,0.014858,45,9,"{'preprocessing__geo__n_clusters': 45, 'random...",-42527.850297,-41244.685762,-43918.711374,-42563.749144,1091.961473,1
2,17.942881,0.123369,0.135433,0.00379,23,8,"{'preprocessing__geo__n_clusters': 23, 'random...",-43523.601279,-42672.409144,-44868.028695,-43688.01304,903.86567,5
3,26.163345,0.149603,0.139889,0.005827,21,12,"{'preprocessing__geo__n_clusters': 21, 'random...",-44686.473688,-43775.442343,-45932.734005,-44798.216679,884.247967,9
4,10.888765,0.035744,0.129708,0.003061,13,5,"{'preprocessing__geo__n_clusters': 13, 'random...",-43966.190649,-43409.019451,-45552.306923,-44309.172341,907.982242,7
5,10.044184,0.020491,0.145578,0.007294,42,4,"{'preprocessing__geo__n_clusters': 42, 'random...",-42903.642386,-42081.781035,-44829.925899,-43271.783107,1151.729349,3
6,7.475946,0.045362,0.138372,0.006402,24,3,"{'preprocessing__geo__n_clusters': 24, 'random...",-43458.090713,-42481.477149,-45025.886733,-43655.151532,1048.055265,4
7,28.789988,0.151599,0.134113,0.002591,26,13,"{'preprocessing__geo__n_clusters': 26, 'random...",-44137.724574,-43363.297405,-45516.180406,-44339.067461,890.367161,8
8,16.291819,0.058859,0.134864,0.00279,32,7,"{'preprocessing__geo__n_clusters': 32, 'random...",-43020.723569,-41982.645573,-44419.003143,-43140.790761,998.255695,2
9,5.137509,0.713473,0.132907,0.007196,4,2,"{'preprocessing__geo__n_clusters': 4, 'randomf...",-49578.660984,-49800.428581,-51813.090379,-50397.393315,1005.134767,10


# Final Model To Launch

In [9]:
# Keep the best pipeline found by the search (available because the search was built with refit=True).
final_model = rnd_search.best_estimator_

In [10]:
# Real-World Scenario


In [11]:
# Number of training rows in each ocean_proximity category.
X_train["ocean_proximity"].value_counts()

ocean_proximity
<1H OCEAN     7309
INLAND        5241
NEAR OCEAN    2126
NEAR BAY      1832
ISLAND           4
Name: count, dtype: int64

In [12]:
# Spot check on the "NEAR BAY" districts only.
# NOTE: these rows are taken from X_train, so the model has already seen
# them during fitting. Any error measured on them is optimistic and is not a
# held-out estimate, despite the `test` in the variable names.
near_bay_mask = X_train["ocean_proximity"] == "NEAR BAY"
df_test1 = X_train[near_bay_mask]
y_df_test1 = y_train[near_bay_mask]

In [13]:
# Predict house values for the NEAR BAY subset with the tuned pipeline.
pred = final_model.predict(df_test1)
pred

array([497319.96, 368835.05, 242621.  , ..., 244060.  , 379228.  ,
       306291.01])

In [14]:
# True target values for the same subset, for comparison with the predictions.
y_df_test1

2382     500001.0
15542    348700.0
4452     245500.0
3423     169900.0
19954    294100.0
           ...   
2972     323500.0
11670    338000.0
8940     238600.0
2816     389200.0
11253    310900.0
Name: median_house_value, Length: 1832, dtype: float64

In [15]:
# RMSE on the NEAR BAY subset. The `squared=False` flag of mean_squared_error
# is deprecated in scikit-learn 1.4 and removed in 1.6, so the square root of
# the MSE is taken explicitly; this works on every version.
np.sqrt(mean_squared_error(y_df_test1, pred))

17732.722363782545

In [16]:
# Display the training features (pandas truncates the middle rows of a large frame).
X_train

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
2931,-121.88,38.00,22.0,721.0,117.0,367.0,129.0,5.3098,INLAND
4175,-117.82,33.85,18.0,1810.0,305.0,1189.0,326.0,5.2227,<1H OCEAN
12703,-118.31,34.09,34.0,2065.0,839.0,2626.0,775.0,1.8214,<1H OCEAN
16281,-120.76,38.60,14.0,2925.0,625.0,1226.0,437.0,2.5865,INLAND
12420,-118.09,33.92,36.0,847.0,185.0,713.0,194.0,4.8542,<1H OCEAN
...,...,...,...,...,...,...,...,...,...
3666,-118.21,34.13,52.0,2465.0,611.0,1433.0,570.0,3.2500,<1H OCEAN
17452,-122.88,38.34,20.0,3404.0,628.0,1641.0,585.0,5.0574,<1H OCEAN
3006,-118.03,33.90,36.0,1143.0,193.0,826.0,188.0,5.3184,<1H OCEAN
3297,-118.36,34.09,36.0,1390.0,458.0,874.0,468.0,2.5812,<1H OCEAN


In [17]:
# Evaluate Your Final Model/ML system on test set

In [18]:
# Split the held-out test set into features and target.
X_test = housing_test.drop(columns="median_house_value")
y_test = housing_test["median_house_value"]

In [19]:
# Display the test features.
X_test

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
7050,-121.81,38.01,47.0,1942.0,430.0,1074.0,393.0,2.2361,INLAND
18040,-116.05,33.33,17.0,290.0,94.0,135.0,57.0,1.7292,INLAND
19045,-119.76,36.71,29.0,1745.0,441.0,1530.0,391.0,1.5611,INLAND
2724,-122.25,37.80,42.0,4120.0,1065.0,1715.0,1015.0,2.9345,NEAR BAY
464,-121.98,38.34,18.0,3876.0,916.0,2386.0,867.0,2.5938,INLAND
...,...,...,...,...,...,...,...,...,...
11702,-124.17,41.76,20.0,2673.0,538.0,1282.0,514.0,2.4605,NEAR OCEAN
20128,-122.06,36.98,15.0,3385.0,669.0,1571.0,615.0,4.2254,NEAR OCEAN
16143,-122.31,37.94,38.0,2172.0,403.0,945.0,384.0,4.3958,NEAR BAY
9591,-122.47,37.69,30.0,837.0,213.0,606.0,199.0,4.8750,NEAR OCEAN


In [20]:
# Display the test targets.
y_test

7050     105100.0
18040     81300.0
19045     44400.0
2724     225000.0
464      129500.0
           ...   
11702    105900.0
20128    320900.0
16143    194200.0
9591     258800.0
18688    325600.0
Name: median_house_value, Length: 4128, dtype: float64

In [21]:
# Predict house values for the held-out test features with the tuned pipeline.
y_pred_test = final_model.predict(X_test)
y_pred_test

array([113232.  ,  64942.  ,  51921.  , ..., 189774.  , 265435.  ,
       303276.02])

In [22]:
# evaluate

In [23]:
# RMSE of the final model on the held-out test set. The `squared=False` flag
# of mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6,
# so the square root of the MSE is taken explicitly; this works on every version.
final_rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_test))
final_rmse

40728.68702174712

# Approved for launch

## Some system improvement approaches
1. Specific error analysis
2. Add extra features
3. Remove uninformative attributes
4. Handle outliers, round, cap


In [28]:
# Feature importances of the fitted random forest step, one value per
# column produced by the preprocessing step.

fi = final_model.named_steps["randomforest"].feature_importances_
fi

array([6.76320216e-02, 5.39686176e-02, 4.69953006e-02, 5.89021169e-03,
       6.91094202e-03, 6.65295994e-03, 6.34116822e-03, 1.79898285e-01,
       5.71207183e-03, 9.17630612e-03, 1.09555924e-02, 1.89907102e-02,
       8.67658378e-03, 9.06207982e-03, 1.69379709e-02, 1.01244390e-02,
       1.22062570e-02, 1.77967053e-02, 1.19258346e-02, 6.98753245e-03,
       6.52568980e-03, 4.12324851e-03, 1.39062463e-02, 7.12379928e-03,
       2.55412041e-02, 4.20001606e-03, 9.31392296e-03, 7.78753843e-03,
       5.34988592e-03, 6.26047749e-03, 8.55927808e-03, 1.44928544e-02,
       1.54996104e-02, 1.83488124e-02, 1.43640027e-02, 9.55352852e-03,
       1.20735613e-02, 6.67291227e-03, 1.31795964e-02, 1.02413896e-02,
       8.62381999e-03, 6.84301536e-03, 1.06615537e-02, 5.25009297e-03,
       1.88474100e-02, 6.00266176e-03, 1.89985969e-02, 1.65321814e-02,
       1.30597992e-02, 1.01133126e-02, 1.13106713e-02, 1.73320650e-02,
       4.00355088e-02, 2.10220477e-03, 8.07879761e-02, 1.04619976e-05,
      

In [26]:
# Number of importance values, i.e. the number of features the forest was trained on.
len(final_model["randomforest"].feature_importances_)

59

In [27]:
# Re-display the winning hyperparameters for reference next to the feature count.
rnd_search.best_params_

{'preprocessing__geo__n_clusters': 45, 'randomforest__max_features': 9}

In [29]:
# Pair each importance with the name of its preprocessed feature and list
# the pairs from most to least important.
feature_names = final_model["preprocessing"].get_feature_names_out()
sorted(zip(fi, feature_names), reverse=True)

[(0.1798982848817322, 'log__median_income'),
 (0.08078797611790621, 'cat__ocean_proximity_INLAND'),
 (0.06763202161064677, 'bedrooms__ratio'),
 (0.053968617569761054, 'rooms_per_house__ratio'),
 (0.04699530064167568, 'people_per_house__ratio'),
 (0.04003550880690142, 'geo__similarity with 45 cluster'),
 (0.02554120412165464, 'geo__similarity with 17 cluster'),
 (0.01899859692781792, 'geo__similarity with 39 cluster'),
 (0.018990710170044132, 'geo__similarity with 4 cluster'),
 (0.01884741004215697, 'geo__similarity with 37 cluster'),
 (0.018348812393772437, 'geo__similarity with 26 cluster'),
 (0.01779670529416739, 'geo__similarity with 10 cluster'),
 (0.017332065049168354, 'geo__similarity with 44 cluster'),
 (0.016937970930815487, 'geo__similarity with 7 cluster'),
 (0.01653218138832321, 'geo__similarity with 40 cluster'),
 (0.015499610351593746, 'geo__similarity with 25 cluster'),
 (0.014492854389366686, 'geo__similarity with 24 cluster'),
 (0.014364002734117526, 'geo__similarity wi

# Launch, Monitor and Maintain Our System

In [30]:
# Display the full dataset as loaded, including the median_house_value target column.
housing

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [32]:
# deployment

# save the model

import joblib

In [33]:
# Persist the whole fitted pipeline (preprocessing + random forest) to disk.
# NOTE(review): loading this file later presumably requires the local `preprocessing`
# module to be importable and a compatible scikit-learn version -- confirm in the serving environment.
joblib.dump(final_model,"HouseXprice.pkl")

['HouseXprice.pkl']