###  Predicting California Housing Prices using all the features

In [137]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
from sklearn.tree import export_text
from sklearn.ensemble import RandomForestRegressor

##### Data Ingestion 

In [32]:
url = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv'
df = pd.read_csv(url)
df.head(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


### Feature Engineering

In [34]:
subset = df.loc[df['ocean_proximity'].isin(['<1H OCEAN','INLAND'])]
subset.reset_index(drop=True)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-121.97,37.64,32.0,1283.0,194.0,485.0,171.0,6.0574,431000.0,<1H OCEAN
1,-121.99,37.61,9.0,3666.0,711.0,2341.0,703.0,4.6458,217000.0,<1H OCEAN
2,-121.97,37.57,21.0,4342.0,783.0,2172.0,789.0,4.6146,247600.0,<1H OCEAN
3,-121.96,37.58,15.0,3575.0,597.0,1777.0,559.0,5.7192,283500.0,<1H OCEAN
4,-121.98,37.58,20.0,4126.0,1031.0,2079.0,975.0,3.6832,216900.0,<1H OCEAN
...,...,...,...,...,...,...,...,...,...,...
15682,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
15683,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
15684,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
15685,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


### Preparing the dataset
###### Preparation:

######  Fill missing values with zeros.
######  Apply the log transform to median_house_value.
######  Do train/validation/test split with 60%/20%/20% distribution.
######  Use the train_test_split function and set the random_state parameter to 1.
######  Use DictVectorizer(sparse=True) to turn the dataframes into matrices.

In [35]:
#check for duplicated values
subset.duplicated().sum()

0

In [36]:
#check for null values
subset.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        157
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [37]:
#fill the null value with zero
subset = subset.fillna(0)

In [38]:
subset.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [39]:
#apply log1p transformation to median_house_value
subset   

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
701,-121.97,37.64,32.0,1283.0,194.0,485.0,171.0,6.0574,431000.0,<1H OCEAN
830,-121.99,37.61,9.0,3666.0,711.0,2341.0,703.0,4.6458,217000.0,<1H OCEAN
859,-121.97,37.57,21.0,4342.0,783.0,2172.0,789.0,4.6146,247600.0,<1H OCEAN
860,-121.96,37.58,15.0,3575.0,597.0,1777.0,559.0,5.7192,283500.0,<1H OCEAN
861,-121.98,37.58,20.0,4126.0,1031.0,2079.0,975.0,3.6832,216900.0,<1H OCEAN
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [42]:
subset.median_house_value = np.log1p(subset.median_house_value.values)
subset.head(2)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
701,-121.97,37.64,32.0,1283.0,194.0,485.0,171.0,6.0574,2.637189,<1H OCEAN
830,-121.99,37.61,9.0,3666.0,711.0,2341.0,703.0,4.6458,2.586836,<1H OCEAN


In [44]:
#split the data into 60%-20%-20%
SEED = 1
df_full_train, df_test = train_test_split(subset, test_size=0.2, random_state=SEED) 
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=SEED)
len(subset), len(df_train),len(df_test),len(df_val), len(df_train+df_val+df_test)

(15687, 9411, 3138, 3138, 15687)

In [45]:
# separate the predicted values / responses
y_train = df_train.median_house_value
y_val = df_val.median_house_value
y_test = df_test.median_house_value

In [46]:
#remove the median_house_value values
del(df_train['median_house_value'])
del(df_val['median_house_value'])
del(df_test['median_house_value'])

In [51]:
# use DictVectorizer(sparse=True) to turn the dataframes into matrices.
train_dict = df_train.to_dict(orient='records')
val_dict = df_val.to_dict(orient='records')
full_dict = df_full_train.to_dict(orient='records')

In [54]:
dv = DictVectorizer(sparse=True)

In [112]:
X_train = dv.fit_transform(train_dict)
#X_val = dv.fit_transform(val_dict)
#X_full = dv.fit_transform(full_dict)

#### Question 1
###### Let's train a decision tree regressor to predict the median_house_value variable.

###### Train a model with max_depth=1.

In [138]:
dtr = DecisionTreeRegressor(max_depth=1)

In [139]:
dtr.fit(X_train, y_train)

In [115]:
print(export_text(dtr,feature_names=dv.get_feature_names_out()))

|--- ocean_proximity=INLAND <= 0.50
|   |--- value: [2.59]
|--- ocean_proximity=INLAND >  0.50
|   |--- value: [2.53]



#### Question 2
###### Train a random forest model with these parameters:

###### n_estimators=10
###### random_state=1
###### n_jobs=-1 (optional - to make training faster)
###### What's the RMSE of this model on validation?

In [140]:
dtr = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)

In [141]:
dtr.fit(X_train, y_train)

In [118]:
val_dicts = df_val.to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [119]:
y_pred = rfr.predict(X_val)
y_pred

array([2.58061477, 2.56914688, 2.52765688, ..., 2.55951132, 2.50124724,
       2.54185335])

In [120]:
score = mean_squared_error(y_test, y_pred, squared=False)
score.round(3)

0.059

#### Question 3

#### Now let's experiment with the n_estimators parameter
###### Try different values of this parameter from 10 to 200 with step 10.
###### Set random_state to 1.
###### Evaluate the model on the validation dataset.
###### After which value of n_estimators does RMSE stop improving? Consider 3 decimal places for retrieving the answer.

In [126]:
def evaluate_rmse(n, max_depth=None, random_state=1):
    
    rfr = RandomForestRegressor(n_estimators=n, random_state=random_state, n_jobs=-1, max_depth=max_depth)
    rfr.fit(X_train, y_train)
    
    val_dicts = df_val.to_dict(orient='records')
    X_val = dv.transform(val_dicts)
    
    y_pred = rfr.predict(X_val)
    
    score = mean_squared_error(y_test, y_pred, squared=False).round(3)
    return score

In [128]:
for n in range(10, 200, 10):
    rmse = evaluate_rmse(n)
    print(f"At n: {n} the rmse: {rmse}")

At n: 10 the rmse: 0.059
At n: 20 the rmse: 0.059
At n: 30 the rmse: 0.059
At n: 40 the rmse: 0.059
At n: 50 the rmse: 0.059
At n: 60 the rmse: 0.059
At n: 70 the rmse: 0.059
At n: 80 the rmse: 0.059
At n: 90 the rmse: 0.059
At n: 100 the rmse: 0.059
At n: 110 the rmse: 0.059
At n: 120 the rmse: 0.059
At n: 130 the rmse: 0.059
At n: 140 the rmse: 0.059
At n: 150 the rmse: 0.059
At n: 160 the rmse: 0.059
At n: 170 the rmse: 0.059
At n: 180 the rmse: 0.059
At n: 190 the rmse: 0.059


#### Question 4
###### Let's select the best max_depth:
###### Try different values of max_depth: [10, 15, 20, 25]
###### For each of these values,
###### try different values of n_estimators from 10 till 200 (with step 10)
###### calculate the mean RMSE
###### Fix the random seed: random_state=1
###### What's the best max_depth, using the mean RMSE?

In [129]:
for m in [10, 15, 20, 25]:
    for n in range(10, 200, 10):
        rmse = evaluate_rmse(n, m)
        print(f"At m: {m} the rmse: {rmse}") 

At m: 10 the rmse: 0.058
At m: 10 the rmse: 0.058
At m: 10 the rmse: 0.058
At m: 10 the rmse: 0.058
At m: 10 the rmse: 0.058
At m: 10 the rmse: 0.058
At m: 10 the rmse: 0.058
At m: 10 the rmse: 0.058
At m: 10 the rmse: 0.058
At m: 10 the rmse: 0.058
At m: 10 the rmse: 0.058
At m: 10 the rmse: 0.058
At m: 10 the rmse: 0.058
At m: 10 the rmse: 0.058
At m: 10 the rmse: 0.058
At m: 10 the rmse: 0.058
At m: 10 the rmse: 0.058
At m: 10 the rmse: 0.058
At m: 10 the rmse: 0.058
At m: 15 the rmse: 0.059
At m: 15 the rmse: 0.059
At m: 15 the rmse: 0.059
At m: 15 the rmse: 0.059
At m: 15 the rmse: 0.059
At m: 15 the rmse: 0.059
At m: 15 the rmse: 0.059
At m: 15 the rmse: 0.059
At m: 15 the rmse: 0.059
At m: 15 the rmse: 0.059
At m: 15 the rmse: 0.059
At m: 15 the rmse: 0.059
At m: 15 the rmse: 0.059
At m: 15 the rmse: 0.059
At m: 15 the rmse: 0.059
At m: 15 the rmse: 0.059
At m: 15 the rmse: 0.059
At m: 15 the rmse: 0.059
At m: 15 the rmse: 0.059
At m: 20 the rmse: 0.059
At m: 20 the rmse: 0.059


#### Question 5
###### We can extract feature importance information from tree-based models.

###### At each step of the decision tree learning algorithm, it finds the best split. When doing it, we can calculate "gain" - the reduction in impurity before and after the split. This gain is quite useful in understanding what are the important features for tree-based models.

###### In Scikit-Learn, tree-based models contain this information in the feature_importances_ field.

###### For this homework question, we'll find the most important feature:

###### Train the model with these parameters:
###### n_estimators=10,
###### max_depth=20,
###### random_state=1,
###### n_jobs=-1 (optional)
###### Get the feature importance information from this model
###### What's the most important feature (among these 4)? 

In [143]:
rfr= RandomForestRegressor(max_depth=20, n_estimators=10, random_state=1, n_jobs=-1)

In [144]:
feture_importance = rfr.feature_importances_

NotFittedError: This RandomForestRegressor instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [145]:
for i,v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i,v))

NameError: name 'importance' is not defined