In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

  import pandas.util.testing as tm


In [3]:
from sklearn.datasets import load_boston

In [4]:
boston = load_boston()
print(boston['DESCR']);

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [5]:
boston = pd.DataFrame(boston['data'],columns=boston['feature_names'])

In [6]:
boston['Target'] = pd.Series(load_boston()['target'])

In [7]:
#size of sample
boston.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,Target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [8]:
X = boston.drop("Target",axis=1)
y = boston['Target']

In [9]:
from sklearn.model_selection import train_test_split 

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Choosing the right estimator

Ridge Regression Model

In [11]:
from sklearn.linear_model import Ridge

np.random.seed(42)
model = Ridge()
model.fit(X_train,y_train)
model.score(X_test,y_test)

0.6662221670168522

Random Forest Regressor

In [12]:
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)
model = RandomForestRegressor(n_estimators=100)
model.fit(X_train,y_train)
model.score(X_test,y_test)

0.8896648705127477

#Evaluating the model

In [13]:
y_pred = model.predict(X_test)

**R^2**

In [14]:
from sklearn.metrics import r2_score

In [15]:
print(f"R-squred value: {r2_score(y_test,y_pred):.2%}")

R-squred value: 88.97%


In [16]:
print(f"R-squred value: {model.score(X_test,y_test):.2%}")

R-squred value: 88.97%


**Mean Absolute Error**

In [17]:
from sklearn.metrics import mean_absolute_error


In [20]:
print(f"Mean Absolute Error: {mean_absolute_error(y_test,model.predict(X_test)):.2f}")

Mean Absolute Error: 2.05


**Mean Squared Root Error**

In [28]:
from sklearn.metrics import mean_squared_error
from math import sqrt

In [29]:
print(f"Mean Squared Root Error: {sqrt(mean_squared_error(y_test,y_pred)):.2f}")

Mean Squared Root Error: 2.84


Evaluate Model using Scoring Parameter

**R Squared**

In [35]:
from sklearn.model_selection import cross_val_score

<generator object <genexpr> at 0x7f838d2c97d8>


In [58]:
print("R Squared:\n {:.0%}".format(cross_val_score(model,X,y).mean()))
print(["{:.0%}".format(i) for i in cross_val_score(model,X,y)])

R Squared:
 62%
['77%', '86%', '74%', '47%', '31%']


**Mean Absolute Error**

In [64]:
print("Mean Absolute Error:\n {:.2f}".format(-cross_val_score(model,X,y,scoring='neg_mean_absolute_error').mean()))
print(["{:.2f}".format(-i) for i in cross_val_score(model,X,y,scoring='neg_mean_absolute_error')])

Mean Absolute Error:
 3.01
['2.11', '2.66', '3.37', '3.73', '3.10']


**Mean Squared Root Error**

In [65]:
print("Mean Absolute Error:\n {:.2f}".format(sqrt(-cross_val_score(model,X,y,scoring='neg_mean_squared_error').mean())))
print(["{:.2f}".format(sqrt(-i)) for i in cross_val_score(model,X,y,scoring='neg_mean_squared_error')])

Mean Absolute Error:
 4.66
['2.88', '3.66', '4.52', '6.75', '4.19']
