Linear regression using the scikit-learn library.

In [1]:
import pandas as pd
import numpy as np
# Load the housing dataset from a CSV file addressed relative to the notebook's location.
housing_df = pd.read_csv("../data/housing.csv")
# A bare last expression makes the frame this cell's displayed output.
housing_df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273.0,21.0,393.45,6.48,22.0


In [2]:
# Feature matrix: keep only the CRIM, TAX and RM columns as model inputs.
x_rows = housing_df[['CRIM', 'TAX', 'RM']]
# Preview the first rows of the selected features.
x_rows.head()


Unnamed: 0,CRIM,TAX,RM
0,0.00632,296.0,6.575
1,0.02731,242.0,6.421
2,0.02729,242.0,7.185
3,0.03237,222.0,6.998
4,0.06905,222.0,7.147


In [3]:
# Regression target: the MEDV column.
y_labels = housing_df["MEDV"]
# Preview the first target values.
y_labels.head()

0    24.0
1    21.6
2    34.7
3    33.4
4    36.2
Name: MEDV, dtype: float64

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the shuffle (and so the split) repeatable.
x_train, x_test, y_train, y_test = train_test_split(x_rows, y_labels, test_size=0.3, random_state=1)
# Display the training features.
x_train


Unnamed: 0,CRIM,TAX,RM
13,0.62976,307.0,5.949
61,0.17171,284.0,5.966
377,9.82349,666.0,6.794
39,0.02763,252.0,6.595
365,4.55587,666.0,3.561
...,...,...,...
255,0.03548,315.0,5.876
72,0.09164,305.0,6.065
396,5.87205,666.0,6.405
235,0.33045,307.0,6.086


In [5]:
# Display the held-out testing features produced by the split.
x_test


Unnamed: 0,CRIM,TAX,RM
307,0.04932,222.0,6.849
343,0.02543,370.0,6.696
47,0.22927,233.0,6.030
67,0.05789,345.0,5.878
362,3.67822,666.0,5.362
...,...,...,...
467,4.42228,666.0,6.003
95,0.12204,276.0,6.625
122,0.09299,188.0,5.961
260,0.54011,264.0,7.203


In [6]:
from sklearn.linear_model import LinearRegression
# Fit a linear regression with default settings on the training split only.
model = LinearRegression()
model.fit(x_train, y_train)

LinearRegression()

In [7]:
# For scikit-learn regressors, model.score reports the coefficient of determination (R^2);
# it is printed for the training split and then for the testing split.
print(f"Score of how good model predicts training set is equal to {model.score(x_train,y_train)}")
print(f"Score of how good model predicts testing set is equal to {model.score(x_test,y_test)}")

Score of how good model predicts training set is equal to 0.5096603576929334
Score of how good model predicts testing set is equal to 0.6901893330926419


In [25]:
def calculate_MAPE(model, x, y):
    """Return the mean absolute percentage error (MAPE) of `model` on `(x, y)`.

    Parameters
    ----------
    model : object
        Any fitted estimator exposing a ``predict(x)`` method.
    x : array-like
        Inputs handed to ``model.predict``.
    y : array-like
        True target values that the predictions are compared against.

    Returns
    -------
    float
        ``100 * mean(|(y - prediction) / y|)``, i.e. the error expressed in percent.

    Notes
    -----
    Entries of ``y`` equal to zero are not special-cased.
    """
    predictions = model.predict(x)
    # Signed error of each prediction, relative to the true value.
    relative_errors = (y - predictions) / y
    return 100 * np.mean(np.abs(relative_errors))


In [27]:
# Report MAPE (a percentage, formatted to 3 decimals) on the training and testing splits for the current `model`.
print("Mape value for training set is {:.3f}%".format(calculate_MAPE(model,x_train,y_train)))
print("Mape value for testing set is {:.3f}%".format(calculate_MAPE(model,x_test,y_test)))

Mape value for training set is 21.552%
Mape value for testing set is 20.784%


Since the error margin in the example above is fairly high (a MAPE of roughly 20%), we'll try a random forest regressor instead.


In [32]:
from sklearn.ensemble import RandomForestRegressor
# A random forest is stochastic, so fix random_state to keep the reported MAPE values
# identical across re-runs; 1 matches the seed already used for train_test_split.
model = RandomForestRegressor(random_state=1)
model.fit(x_train, y_train)
print("Mape value for training set is {:.3f}%".format(calculate_MAPE(model,x_train,y_train)))
print("Mape value for testing set is {:.3f}%".format(calculate_MAPE(model,x_test,y_test)))

# The training-set MAPE is far lower than the testing-set MAPE, which means the model has
# overfit the training data. To reduce that, require more samples per leaf via the
# min_samples_leaf parameter of RandomForestRegressor.

model = RandomForestRegressor(min_samples_leaf=15, random_state=1)
model.fit(x_train, y_train)
print("Mape value for training set is {:.3f}%".format(calculate_MAPE(model,x_train,y_train)))
print("Mape value for testing set is {:.3f}%".format(calculate_MAPE(model,x_test,y_test)))

Mape value for training set is 7.331%
Mape value for testing set is 16.732%
Mape value for training set is 17.274%
Mape value for testing set is 18.947%
