#### Use house_boston dataset. Construct a prediction model to predict the price of the house based on given features. Use GBRegressor to construct a regression model and analyze the performance of your model.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline

In [2]:
data = pd.read_csv('https://raw.githubusercontent.com/rahul96rajan/sample_datasets/master/boston_housing.csv')
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [3]:
X = data.drop(labels='MEDV', axis=1)
y = data['MEDV']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)

In [5]:
display(X_train.describe())
print(X_train.info())

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
count,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0
mean,3.609125,11.569307,10.98505,0.071782,0.556484,6.315891,68.556436,3.808195,9.356436,404.032178,18.318317,356.278342,12.457351
std,8.875058,23.152481,6.894618,0.258447,0.117704,0.709452,27.994922,2.131226,8.589721,166.172655,2.228701,91.566533,7.110381
min,0.00906,0.0,0.74,0.0,0.385,3.863,2.9,1.1296,1.0,187.0,12.6,0.32,1.73
25%,0.081437,0.0,5.13,0.0,0.452,5.8905,45.55,2.087875,4.0,279.0,16.8,375.4725,6.7725
50%,0.26139,0.0,8.56,0.0,0.538,6.21,77.7,3.17575,5.0,330.0,18.7,391.305,10.925
75%,3.202962,20.0,18.1,0.0,0.631,6.63675,93.65,5.4008,12.0,666.0,20.2,395.755,16.3725
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97


<class 'pandas.core.frame.DataFrame'>
Int64Index: 404 entries, 477 to 102
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     404 non-null    float64
 1   ZN       404 non-null    float64
 2   INDUS    404 non-null    float64
 3   CHAS     404 non-null    float64
 4   NOX      404 non-null    float64
 5   RM       404 non-null    float64
 6   AGE      404 non-null    float64
 7   DIS      404 non-null    float64
 8   RAD      404 non-null    float64
 9   TAX      404 non-null    float64
 10  PTRATIO  404 non-null    float64
 11  B        404 non-null    float64
 12  LSTAT    404 non-null    float64
dtypes: float64(13)
memory usage: 44.2 KB
None


In [6]:
preproc_pipe = Pipeline(steps=[('scaler', StandardScaler())])

In [7]:
X_train_proc = preproc_pipe.fit_transform(X_train, y_train)
X_test_proc = preproc_pipe.transform(X_test)

In [8]:
gbr = GradientBoostingRegressor(learning_rate=0.01, n_estimators=1000)

In [9]:
params = dict(learning_rate = np.arange(0.01, 0.11, 0.01),
            n_estimators = np.arange(200, 1200, 200))

gscv = GridSearchCV(gbr, params, scoring='r2', cv=5, n_jobs=-1)

In [10]:
gscv.fit(X_train_proc, y_train)
gscv.best_estimator_, gscv.best_score_

(GradientBoostingRegressor(learning_rate=0.09, n_estimators=200),
 0.8561989579854343)

In [11]:
y_pred = gscv.predict(X_test_proc)
print('R2 Score : {0:.2f}'.format(r2_score(y_test, y_pred)))

R2 Score : 0.91
