In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [2]:
dataset=pd.read_csv("boston.csv")
dataset.head(3)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7


 Data Cleaning and analyzing

In [21]:
dataset.shape


(506, 13)

In [4]:
dataset.isnull().sum().sum()

np.int64(0)

In [5]:
correlation=dataset.corr()["MEDV"]
correlation

CRIM      -0.388305
ZN         0.360445
INDUS     -0.483725
CHAS       0.175260
NOX       -0.427321
RM         0.695360
AGE       -0.376955
DIS        0.249929
RAD       -0.381626
TAX       -0.468536
PTRATIO   -0.507787
B          0.333461
LSTAT     -0.737663
MEDV       1.000000
Name: MEDV, dtype: float64

In [6]:
dataset.drop("B",axis=1,inplace=True) #dropping axis B because it has low correlation with output and also presence of outlier

Data Seperation

In [7]:
x=dataset.iloc[:,:-1]
y=dataset[["MEDV"]]

Feature Scalling

In [8]:
ss=StandardScaler()
x=pd.DataFrame(ss.fit_transform(x),columns=x.columns)

Train Test Split of Data

In [9]:
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=42,test_size=0.2)

In [10]:
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((404, 12), (102, 12), (404, 1), (102, 1))

In [11]:
x_train["CHAS"].value_counts()

CHAS
-0.272599    375
 3.668398     29
Name: count, dtype: int64

Fitting the model

In [12]:
model=RandomForestRegressor(criterion="poisson",n_estimators=14,random_state=42)
model.fit(x_train,y_train)

  return fit_method(estimator, *args, **kwargs)


In [13]:
model.score(x_test,y_test)*100,model.score(x_train,y_train)*100

(92.35216344029268, 97.35075862430196)

Calculating error of the model

In [14]:
mean_squared_error(y_test,model.predict(x_test)),mean_squared_error(y_train,model.predict(x_train))

(5.608447879151661, 2.3014861588199635)

HyperParameter Tuining Using GridSearchCV to improve accuracy of model

In [15]:
df = {
    "criterion":["firedman_mse", "absolute_error", "friedman_mse", "poisson"],
    "n_estimators":[i for i in range(1,20)]
    }

In [None]:
grid_search = GridSearchCV(RandomForestRegressor(), param_grid=df, cv=3, n_jobs=2, verbose=1)
grid_search.fit(x_train, y_train)

In [17]:
grid_search.best_params_

{'criterion': 'poisson', 'n_estimators': 8}

Creating Pipeline to Automate the model Prediction

In [18]:
pipe=Pipeline([
    ("ss",StandardScaler()),
    ("model",RandomForestRegressor(criterion="poisson",n_estimators=14,random_state=42))
])

In [19]:
pipe.fit(x_train,y_train)

  return fit_method(estimator, *args, **kwargs)


Predicting the model

In [20]:
pipe.predict([[0.171842,-0.487722,1.015999,-0.272599,1.367490,0.017617,0.825898,-0.678277,1.661245,1.530926,0.806576,0.647173]])



array([15.02857143])