In [None]:
import pandas as pd

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [2]:
# Load the California housing bundle; the code below reads its feature
# matrix (`data.data`), column names (`data.feature_names`) and target vector.
data = fetch_california_housing()

# Single frame with every feature column plus the target appended last.
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)

print(df.head())

# Separate the value being predicted from the predictors.
y = df['target']
X = df.drop(columns='target')

   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  target  
0    -122.23   4.526  
1    -122.22   3.585  
2    -122.24   3.521  
3    -122.25   3.413  
4    -122.25   3.422  


In [14]:
# Hold out a validation split; the fixed seed keeps the split reproducible.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Baseline: a random forest with default hyperparameters and a fixed seed.
model = RandomForestRegressor(random_state=1)

model.fit(train_X, train_y)

predictions = model.predict(val_X)
# scikit-learn metrics take (y_true, y_pred); keep that order so this call
# matches get_mae() and stays correct if the metric is ever swapped for an
# asymmetric one.
mae = mean_absolute_error(val_y, predictions)

print(mae)

0.3317725336821707


# First Optimization: Max number of leaves

In [None]:
def get_mae(max_number_of_leaves, train_X, val_X, train_y, val_y):
    """Train a leaf-capped random forest and score it on the validation split.

    Args:
        max_number_of_leaves: Value passed to the regressor as ``max_leaf_nodes``.
        train_X: Training features.
        val_X: Validation features.
        train_y: Training targets.
        val_y: Validation targets.

    Returns:
        Mean absolute error of the fitted model's predictions on ``val_X``
        measured against ``val_y``.
    """
    # Seed is fixed so repeated calls with the same cap give the same score.
    forest = RandomForestRegressor(
        random_state=1,
        max_leaf_nodes=max_number_of_leaves,
    ).fit(train_X, train_y)
    return mean_absolute_error(val_y, forest.predict(val_X))

In [None]:
# Validation MAE for each candidate leaf cap, keyed by the cap itself.
scores = dict(
    (leaf_cap, get_mae(leaf_cap, train_X, val_X, train_y, val_y))
    for leaf_cap in (5, 50, 500, 5000)
)
print(scores)

{5: 0.6111935999227869, 50: 0.4367870719399872, 500: 0.3538815036221463, 5000: 0.3308410005839568}


In [None]:
# Evaluate a leaf cap of 10000, above the largest value (5000) in the sweep above.
print(get_mae(10000, train_X, val_X, train_y, val_y))

0.3307828782364343


In [None]:
# Evaluate an intermediate cap of 6000, between the 5000 and 10000 already tried.
print(get_mae(6000, train_X, val_X, train_y, val_y))

0.33075140254771396


# Second Optimization: Data Cleaning

In [16]:
# Learn the per-column medians from the training split only, then apply the
# same fill values to both splits. Training and validation features must go
# through identical preprocessing, and the validation data must not influence
# the imputation values.
train_medians = train_X.median()
train_X_clean = train_X.fillna(train_medians)
val_X_clean = val_X.fillna(train_medians)

# Same seed as the earlier runs, with the leaf cap fixed at 6000.
model = RandomForestRegressor(max_leaf_nodes=6000, random_state=1)
model.fit(train_X_clean, train_y)
predictions = model.predict(val_X_clean)

print(mean_absolute_error(val_y, predictions))

0.33075140254771396


In [15]:
# Total number of missing values in the training features: per-column null
# counts, summed into a single figure.
print(train_X.isnull().sum().sum())

0
