In [78]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor


from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline

from preprocessing import preprocessing

from sklearn.metrics import mean_squared_error



In [2]:
# import data

housing = pd.read_csv("files/housing.csv")
housing.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [3]:
# train test split

housing_train, housing_test = train_test_split(housing, test_size= 0.2, stratify= housing["ocean_proximity"], random_state= 42)

In [4]:
# target and feature dataset

y_train = housing_train["median_house_value"]
X_train = housing_train.drop("median_house_value", axis=1)

In [5]:
X_train

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
2931,-121.88,38.00,22.0,721.0,117.0,367.0,129.0,5.3098,INLAND
4175,-117.82,33.85,18.0,1810.0,305.0,1189.0,326.0,5.2227,<1H OCEAN
12703,-118.31,34.09,34.0,2065.0,839.0,2626.0,775.0,1.8214,<1H OCEAN
16281,-120.76,38.60,14.0,2925.0,625.0,1226.0,437.0,2.5865,INLAND
12420,-118.09,33.92,36.0,847.0,185.0,713.0,194.0,4.8542,<1H OCEAN
...,...,...,...,...,...,...,...,...,...
3666,-118.21,34.13,52.0,2465.0,611.0,1433.0,570.0,3.2500,<1H OCEAN
17452,-122.88,38.34,20.0,3404.0,628.0,1641.0,585.0,5.0574,<1H OCEAN
3006,-118.03,33.90,36.0,1143.0,193.0,826.0,188.0,5.3184,<1H OCEAN
3297,-118.36,34.09,36.0,1390.0,458.0,874.0,468.0,2.5812,<1H OCEAN


In [6]:
y_train

2931     151900.0
4175     213500.0
12703    211100.0
16281    133800.0
12420    167400.0
           ...   
3666     214200.0
17452    276200.0
3006     171100.0
3297     200000.0
19932    244900.0
Name: median_house_value, Length: 16512, dtype: float64

In [7]:
lin_reg = Pipeline([
    ("preprocessing", preprocessing),
    ("linearregression", LinearRegression())
])

In [8]:
lin_reg.fit(X_train, y_train)

  super()._check_params_vs_input(X, default_n_init=10)


In [11]:
# rough idea about 

lin_reg.predict(X_train[:5])

array([253053.26431766, 240398.94118412, 139236.05570526,  92050.60467359,
       235655.86833388])

In [12]:
y_train[:5]

2931     151900.0
4175     213500.0
12703    211100.0
16281    133800.0
12420    167400.0
Name: median_house_value, dtype: float64

In [58]:
x = pd.DataFrame(np.array(X_train.iloc[10]).reshape(1,-1), columns= X_train.columns)
x

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-117.36,34.1,33.0,1904.0,343.0,1366.0,338.0,3.6227,INLAND


In [59]:
y_train.iloc[10]

92800.0

In [42]:
lin_reg.predict(x)

array([118364.67573624])

In [19]:
[X_train.iloc[10]]

[longitude            -117.36
 latitude                34.1
 housing_median_age      33.0
 total_rooms           1904.0
 total_bedrooms         343.0
 population            1366.0
 households             338.0
 median_income         3.6227
 ocean_proximity       INLAND
 Name: 12158, dtype: object]

In [20]:
type([X_train.iloc[10]])

list

In [21]:
len([X_train.iloc[10]])

1

In [39]:
x = pd.DataFrame(np.array(X_train.iloc[10]).reshape(1,-1), columns= X_train.columns)
x

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-117.36,34.1,33.0,1904.0,343.0,1366.0,338.0,3.6227,INLAND


In [44]:
# X_train_array = np.array(X_train)
# y_train_array = np.array(y_train)
# X_train_array.shape

(16512, 9)

In [46]:
# y_train_array.shape

(16512,)

In [48]:
# lin_reg_array = Pipeline([
#     ("preprocessing", preprocessing),
#     ("linearregression", LinearRegression())
# ])

In [49]:
# lin_reg_array.fit(X_train_array, y_train_array)

ValueError: Specifying the columns using strings is only supported for pandas DataFrames

In [61]:
# Evalute the Model on Training Set

# from sklearn.metrics import mean_squared_error

predictions = lin_reg.predict(X_train)
predictions


array([253053.26431766, 240398.94118412, 139236.05570526, ...,
       237910.56470335, 238325.64501465, 306783.3705227 ])

In [62]:
y_train

2931     151900.0
4175     213500.0
12703    211100.0
16281    133800.0
12420    167400.0
           ...   
3666     214200.0
17452    276200.0
3006     171100.0
3297     200000.0
19932    244900.0
Name: median_house_value, Length: 16512, dtype: float64

In [63]:
lin_rmse = mean_squared_error(y_train, predictions, squared= False)
lin_rmse

68686.20679970192

In [64]:
y_train.describe()

count     16512.000000
mean     207443.464571
std      116229.200413
min       14999.000000
25%      119400.000000
50%      179500.000000
75%      266025.000000
max      500001.000000
Name: median_house_value, dtype: float64

> Model UnderFitting

1. data not informative
2. model not powerful

In [65]:
housing 

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [67]:
# using more complex model

# from sklearn.tree import DecisionTreeRegressor

tree = Pipeline([
    ("preprocessing", preprocessing),
    ("decisiontree", DecisionTreeRegressor())
])

In [68]:
tree.fit(X_train, y_train)

  super()._check_params_vs_input(X, default_n_init=10)


In [69]:
# performance measure of tree

y_pred_tree = tree.predict(X_train)
y_pred_tree

# tree_rmse = mean_squared_error(y_train, y_pred_tree, squared= False)

array([151900., 213500., 211100., ..., 171100., 200000., 244900.])

In [70]:
y_train

2931     151900.0
4175     213500.0
12703    211100.0
16281    133800.0
12420    167400.0
           ...   
3666     214200.0
17452    276200.0
3006     171100.0
3297     200000.0
19932    244900.0
Name: median_house_value, Length: 16512, dtype: float64

> Data
1. trend
2. Noice

In [71]:
tree_rmse = mean_squared_error(y_train, y_pred_tree, squared= False)
tree_rmse

0.0

> model Overfit

In [73]:
# better Evaluaton using cross-validation

# from sklearn.model_selection import cross_val_score

tree_rmses =  -cross_val_score(tree,
                            X_train,
                            y_train,
                            scoring= "neg_root_mean_squared_error",
                            cv = 10,)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [74]:
tree_rmses

array([68205.47035066, 67505.6966167 , 67181.79997961, 67968.6949277 ,
       71909.86192421, 65945.432096  , 68732.79269791, 67517.21233164,
       65421.30784825, 71957.77031841])

In [76]:
tree_rmses

array([68205.47035066, 67505.6966167 , 67181.79997961, 67968.6949277 ,
       71909.86192421, 65945.432096  , 68732.79269791, 67517.21233164,
       65421.30784825, 71957.77031841])

In [77]:
np.mean(tree_rmses)

68234.60390910893

In [88]:
# looking for new complex/powerful model

# from sklearn.ensemble import RandomForestRegressor

rnd_forest = Pipeline([
    ("preprocessing", preprocessing),
    ("randomforest", RandomForestRegressor(random_state= 42))
])



In [97]:
%%time
rnd_forest.fit(X_train, y_train)

  super()._check_params_vs_input(X, default_n_init=10)


CPU times: total: 1min 4s
Wall time: 1min 12s


In [100]:
rnd_pred = rnd_forest.predict(X_train)

In [99]:
y_train

2931     151900.0
4175     213500.0
12703    211100.0
16281    133800.0
12420    167400.0
           ...   
3666     214200.0
17452    276200.0
3006     171100.0
3297     200000.0
19932    244900.0
Name: median_house_value, Length: 16512, dtype: float64

In [102]:
forest_rmse = mean_squared_error(y_train, rnd_pred, squared= False)
forest_rmse

17767.15550411849

In [103]:
%%time
forest_cv_score = -cross_val_score(rnd_forest,
                                 X_train,
                                 y_train,
                                 cv = 10,
                                 scoring= "neg_root_mean_squared_error")

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


CPU times: total: 10min 9s
Wall time: 10min 45s


In [104]:
forest_cv_score

array([-47380.02612638, -47935.22827007, -48717.79271727, -46293.11172668,
       -47705.47018626, -48375.31460549, -49184.70227588, -46488.34012421,
       -45235.67389311, -51381.4358163 ])

In [105]:
np.mean(forest_cv_score)

-47869.709574164954

In [106]:
## Final Model :--> Random Forest Model

In [108]:
rnd_forest.get_params()

{'memory': None,
 'steps': [('preprocessing',
   ColumnTransformer(remainder=Pipeline(steps=[('impute',
                                                SimpleImputer(strategy='median')),
                                               ('scale', StandardScaler())]),
                     transformers=[('bedrooms',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer(strategy='median')),
                                                    ('ratio',
                                                     FunctionTransformer(feature_names_out=<function column_name at 0x0000029F261493A0>,
                                                                         func=<function ratio at 0x0000029F26149260>)),
                                                    ('standa...
                                                     OneHotEncoder(sparse=False))]),
                                    ['ocean_proximity']),
   

# Evaluate Your Final Model/ML system on test set

In [1]:
from sklearn.preprocessing import OneHotEncoder

In [13]:
one_hot = OneHotEncoder(sparse_output= False)

In [14]:
import pandas as pd
import numpy as np
housing = pd.read_csv("files/housing.csv")
arr = np.array(housing.ocean_proximity).reshape(-1,1)
clms = ["ocean_proximity"]
df = pd.DataFrame(arr, columns= clms)
df

Unnamed: 0,ocean_proximity
0,NEAR BAY
1,NEAR BAY
2,NEAR BAY
3,NEAR BAY
4,NEAR BAY
...,...
20635,INLAND
20636,INLAND
20637,INLAND
20638,INLAND


In [15]:
one_hot.fit_transform(df)

array([[0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       ...,
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.]])