In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.datasets import load_boston

In [3]:
boston=load_boston()

In [5]:
dir(boston)

['DESCR', 'data', 'feature_names', 'filename', 'target']

In [7]:
print(boston['DESCR'])

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [8]:
x=pd.DataFrame(boston['data'],columns=boston.feature_names)

In [9]:
x.shape

(506, 13)

In [10]:
x.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [11]:
y=pd.Series(boston['target'])

In [12]:
y.head()

0    24.0
1    21.6
2    34.7
3    33.4
4    36.2
dtype: float64

In [13]:
y.shape

(506,)

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=101)

In [16]:
x_train.shape

(404, 13)

In [17]:
from sklearn.ensemble import GradientBoostingRegressor

In [31]:
grdb=GradientBoostingRegressor(max_depth=2,n_estimators=100,learning_rate=1)

In [32]:
model=grdb.fit(x_train,y_train)

In [33]:
y_pred=model.predict(x_test)

In [34]:
comp_df=pd.DataFrame({'Actual':y_test,'Predicted':y_pred})

In [35]:
comp_df

Unnamed: 0,Actual,Predicted
195,50.0,51.941366
4,36.2,30.195156
434,11.7,11.925438
458,14.9,14.522910
39,30.8,29.732870
...,...,...
227,31.6,31.065548
405,5.0,12.330574
69,20.9,20.073650
231,31.7,37.592665


In [36]:
from sklearn.metrics import r2_score,mean_squared_error

In [37]:
mse=mean_squared_error(y_test,y_pred)

In [38]:
mse

33.31319687063656

In [39]:
rmse=np.sqrt(mse)

In [40]:
rmse

5.771758559627782

In [41]:
r2_score(y_test,y_pred)

0.7030676393172021

In [43]:
feature_importance=model.feature_importances_

In [45]:
feature_importance

array([2.81979352e-02, 4.95870694e-04, 4.91645042e-03, 2.04385300e-03,
       1.44577627e-02, 5.41077259e-01, 1.88416491e-02, 3.40427543e-02,
       4.45005789e-02, 7.97065529e-03, 1.77664812e-02, 1.15719327e-02,
       2.74116817e-01])

In [47]:
LR={'learning_rate':[0.15,0.10,0.20,0.05],'n_estimators':[100,200,150,250]}

In [48]:
from sklearn.model_selection import GridSearchCV

In [51]:
gd=GridSearchCV(estimator=GradientBoostingRegressor(),param_grid=LR,scoring='r2')

In [52]:
gd.fit(x_train,y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0,
                                                 criterion='friedman_mse',
                                                 init=None, learning_rate=0.1,
                                                 loss='ls', max_depth=3,
                                                 max_features=None,
                                                 max_leaf_nodes=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=100,
                                                 n_iter_no_change=None,
              

In [53]:
gd.best_params_

{'learning_rate': 0.1, 'n_estimators': 100}

In [54]:
gd.best_score_

0.8667951812058405

In [55]:
grdb1=GradientBoostingRegressor(n_estimators=100,learning_rate=0.1)

In [56]:
model1=grdb1.fit(x_train,y_train)

In [57]:
y_pred1=model1.predict(x_test)

In [58]:
mse1=mean_squared_error(y_test,y_pred1)

In [59]:
mse1

14.425157778173798

In [60]:
rmse1=np.sqrt(mse1)

In [61]:
rmse1

3.7980465739869222

In [62]:
r2_score(y_test,y_pred1)

0.871423443119912

In [63]:
model1.feature_importances_

array([2.70420805e-02, 3.20371903e-04, 3.32718225e-03, 5.04651899e-04,
       3.43534347e-02, 3.31952331e-01, 5.11903354e-03, 8.00331182e-02,
       4.27551671e-03, 1.36542075e-02, 2.39849429e-02, 1.04138959e-02,
       4.65019233e-01])