## Loading libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

## Loading data

In [2]:
# Fetch the California housing dataset and wrap it in pandas containers.
housing = fetch_california_housing()

# Feature matrix with named columns; the target is a one-column frame called 'Price'.
X = pd.DataFrame(data=housing["data"], columns=housing["feature_names"])
y = pd.DataFrame(data=housing["target"], columns=["Price"])

# Preview the first rows of the features and of the target.
display(X.head(), y.head())

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


Unnamed: 0,Price
0,4.526
1,3.585
2,3.521
3,3.413
4,3.422


## Train-test split

In [3]:
# Build the modelling inputs idempotently: re-running this cell on a live kernel
# must not fail once the columns have already been dropped / selected.

# The geographic coordinates are excluded from the feature set.
# errors="ignore" makes the drop a no-op when they are already gone.
X = X.drop(columns=["Latitude", "Longitude"], errors="ignore")

# The target was loaded as a one-column DataFrame; reduce it to a 1-D Series.
# On a re-run `y` is already that Series, so the selection is skipped.
if isinstance(y, pd.DataFrame):
    y = y['Price']

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
# train_test_split hands back pandas objects when given pandas inputs, so the
# column names carry over without re-wrapping the results.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

## Gradient boosting

In [4]:
from sklearn.ensemble import GradientBoostingRegressor

# Baseline model with default hyper-parameters. The seed is fixed (same value as
# the other GradientBoostingRegressor instances in this notebook) so the scores
# printed below are reproducible from one run to the next.
gb = GradientBoostingRegressor(random_state=100)
gb.fit(X_train, y_train)

# Predictions on both splits, kept available for further inspection.
y_train_pred = gb.predict(X_train)
y_test_pred = gb.predict(X_test)

# Compare the fit on seen vs. unseen data to gauge over-fitting.
print(f"The R2 of the model in the TRAIN set is: {gb.score(X_train, y_train):.2f}")
print(f"The R2 of the model in the TEST  set is: {gb.score(X_test, y_test):.2f}")

The R2 of the model in the TRAIN set is: 0.72
The R2 of the model in the TEST  set is: 0.67


## Tuning the hyperparameters with a grid search



In [5]:
from sklearn.model_selection import GridSearchCV

# Candidate values per hyper-parameter.
# Number of combinations: 3 * 2 * 2 * 2 * 1 = 24.
param_grid = {
    'n_estimators': [50, 100, 150],
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 2],
    'max_depth': [3, 5],
    'max_features': ['sqrt'],  # based on the square root of the number of columns
}

# Seeded base estimator, so every candidate is fitted reproducibly.
gb = GradientBoostingRegressor(random_state=100)

# Exhaustive search with 5-fold cross-validation; n_jobs=-1 runs the fits in parallel.
grid_search = GridSearchCV(
    estimator=gb,
    param_grid=param_grid,
    cv=5,
    return_train_score=True,
    n_jobs=-1,
    verbose=20,
)
grid_search.fit(X_train, y_train)

# Best parameter combination found by the search.
grid_search.best_params_

Fitting 5 folds for each of 24 candidates, totalling 120 fits


{'max_depth': 5,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 150}

In [16]:
# Show the best estimator selected by the grid search.
# NOTE(review): this cell's execution count is out of sequence — re-run the notebook top to bottom.
grid_search.best_estimator_

## Fitting the model with the best parameter set

In [6]:
# Sanity check: size of the training target.
y_train.shape

(15480,)

In [11]:
from sklearn.model_selection import cross_val_score

# Build the final model from the grid-search winner instead of re-typing its
# parameters by hand, so this cell cannot drift out of sync with the search.
# set_params applies the best combination on top of the seeded estimator.
gb = GradientBoostingRegressor(random_state=100).set_params(**grid_search.best_params_)

# 10-fold cross-validation on the training split only; the test split stays untouched.
cross_val_scores = cross_val_score(gb, X_train, y_train, cv=10)
print("The mean R2 of over the folds was {:.2f}".format(np.mean(cross_val_scores)))

The mean R2 of over the folds was 0.69


In [12]:
# Per-fold scores from the cross-validation above, to inspect their spread.
cross_val_scores

array([0.7053156 , 0.69792547, 0.69867663, 0.68740998, 0.70986852,
       0.67291244, 0.69055837, 0.69762931, 0.66073958, 0.72878517])

In [13]:
# Refit on the whole training split, then report the score on the held-out test split.
gb.fit(X_train, y_train).score(X_test, y_test)

0.6851049104363882

In [14]:
# Sanity check: size of the test target.
y_test.shape

(5160,)