In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

from sklearn.datasets import load_boston

In [191]:
boston = load_boston()

In [192]:
print(boston.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [193]:
features = pd.DataFrame(boston.data,columns=boston.feature_names)
target = boston.target

In [194]:
features.shape, target.shape

((506, 13), (506,))

In [195]:
bostonDataFrame = features.copy()
bostonDataFrame["Price"] = boston.target
bostonDataFrame.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,Price
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [None]:
#pip install pandas_profiling==2.8.0

Collecting pandas_profiling==2.8.0
[?25l  Downloading https://files.pythonhosted.org/packages/b9/94/ef8ef4517540d13406fcc0b8adfd75336e014242c69bd4162ab46931f36a/pandas_profiling-2.8.0-py2.py3-none-any.whl (259kB)
[K     |████████████████████████████████| 266kB 4.9MB/s 
[?25hCollecting tangled-up-in-unicode>=0.0.6
[?25l  Downloading https://files.pythonhosted.org/packages/4a/e2/e588ab9298d4989ce7fdb2b97d18aac878d99dbdc379a4476a09d9271b68/tangled_up_in_unicode-0.0.6-py3-none-any.whl (3.1MB)
[K     |████████████████████████████████| 3.1MB 8.4MB/s 
Collecting tqdm>=4.43.0
[?25l  Downloading https://files.pythonhosted.org/packages/73/d5/f220e0c69b2f346b5649b66abebb391df1a00a59997a7ccf823325bd7a3e/tqdm-4.49.0-py2.py3-none-any.whl (69kB)
[K     |████████████████████████████████| 71kB 6.9MB/s 
Collecting visions[type_image_path]==0.4.4
[?25l  Downloading https://files.pythonhosted.org/packages/4a/03/5a45d542257830cf1d9da2cdc1c0bc6f55a9212937b70fdd6d7031b46d6c/visions-0.4.4-py3-none-any

In [None]:
'''
import pandas_profiling
bostonReport = pandas_profiling.ProfileReport(bostonDataFrame)
bostonReport.to_file("bostonReport.html")
'''

HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=28.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Export report to file', max=1.0, style=ProgressStyle(desc…




# **EDA via Pandas profiling report**

Features TAX & RAD are highly correlated, so we can ignore 1 column

Feature ZN has 73.5 % Zeros, so we can exclude this feature instead of computing

In [196]:
features.drop(["RAD","ZN"],inplace=True,axis=1)

In [197]:
features.head()

Unnamed: 0,CRIM,INDUS,CHAS,NOX,RM,AGE,DIS,TAX,PTRATIO,B,LSTAT
0,0.00632,2.31,0.0,0.538,6.575,65.2,4.09,296.0,15.3,396.9,4.98
1,0.02731,7.07,0.0,0.469,6.421,78.9,4.9671,242.0,17.8,396.9,9.14
2,0.02729,7.07,0.0,0.469,7.185,61.1,4.9671,242.0,17.8,392.83,4.03
3,0.03237,2.18,0.0,0.458,6.998,45.8,6.0622,222.0,18.7,394.63,2.94
4,0.06905,2.18,0.0,0.458,7.147,54.2,6.0622,222.0,18.7,396.9,5.33


In [198]:
features.describe()

Unnamed: 0,CRIM,INDUS,CHAS,NOX,RM,AGE,DIS,TAX,PTRATIO,B,LSTAT
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,408.237154,18.455534,356.674032,12.653063
std,8.601545,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,168.537116,2.164946,91.294864,7.141062
min,0.00632,0.46,0.0,0.385,3.561,2.9,1.1296,187.0,12.6,0.32,1.73
25%,0.082045,5.19,0.0,0.449,5.8855,45.025,2.100175,279.0,17.4,375.3775,6.95
50%,0.25651,9.69,0.0,0.538,6.2085,77.5,3.20745,330.0,19.05,391.44,11.36
75%,3.677083,18.1,0.0,0.624,6.6235,94.075,5.188425,666.0,20.2,396.225,16.955
max,88.9762,27.74,1.0,0.871,8.78,100.0,12.1265,711.0,22.0,396.9,37.97


# **Scaling the features**

In [199]:
scaler = StandardScaler()

In [200]:
scaled_features = scaler.fit_transform(features)

In [201]:
x_train, x_test, y_train, y_test = train_test_split(scaled_features,target,test_size=0.2,random_state=42)

# **HyperParameter Tuning**

In [202]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

In [203]:
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [204]:
rf = RandomForestRegressor()

In [205]:
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, cv = 3, verbose=2, random_state=42, n_jobs = -1)

rf_random.fit(x_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   50.3s finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                              

In [206]:
print(rf_random.best_params_)

{'n_estimators': 400, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 60, 'bootstrap': False}


In [207]:
print(rf_random.best_score_)

0.8302715263236897


In [208]:
rf_random1 = RandomForestRegressor(n_estimators=400,max_features='sqrt',max_depth=60,min_samples_split=10,min_samples_leaf=1,bootstrap=False,random_state=42)

In [209]:
rf_random1.fit(x_train,y_train)

RandomForestRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                      max_depth=60, max_features='sqrt', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=10, min_weight_fraction_leaf=0.0,
                      n_estimators=400, n_jobs=None, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)

In [210]:
rf_random1.score(x_test,y_test)

0.8727773295676874

In [211]:
y_predict = rf_random1.predict(x_test)

In [212]:
from sklearn.metrics import mean_squared_error,r2_score
print("R2 value:" , r2_score(y_test,y_predict))
print("MSE vlaue:" , mean_squared_error(y_test,y_predict))

R2 value: 0.8727773295676874
MSE vlaue: 9.329719726560445
