## Importing Libraries

In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import mean_squared_error

## Loading Data

In [3]:
data = pd.read_csv("boston.csv")

In [4]:
data.head()

Unnamed: 0.1,Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,Price
0,0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  506 non-null    int64  
 1   CRIM        506 non-null    float64
 2   ZN          506 non-null    float64
 3   INDUS       506 non-null    float64
 4   CHAS        506 non-null    float64
 5   NOX         506 non-null    float64
 6   RM          506 non-null    float64
 7   AGE         506 non-null    float64
 8   DIS         506 non-null    float64
 9   RAD         506 non-null    float64
 10  TAX         506 non-null    float64
 11  PTRATIO     506 non-null    float64
 12  B           506 non-null    float64
 13  LSTAT       506 non-null    float64
 14  Price       506 non-null    float64
dtypes: float64(14), int64(1)
memory usage: 59.4 KB


In [6]:
data.describe()

Unnamed: 0.1,Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,Price
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,252.5,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,146.213884,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.0,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,126.25,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,252.5,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,378.75,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,505.0,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


## Data Spliting

In [7]:
x = data.drop(["Price"], axis = 1)
x

Unnamed: 0.1,Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98
1,1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14
2,2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67
502,502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08
503,503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64
504,504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48


In [8]:
y = data["Price"]
y

0      24.0
1      21.6
2      34.7
3      33.4
4      36.2
       ... 
501    22.4
502    20.6
503    23.9
504    22.0
505    11.9
Name: Price, Length: 506, dtype: float64

In [9]:
# normalize our data
sc_x = StandardScaler()
x = sc_x.fit_transform(x)
x

array([[-1.72863116, -0.41978194,  0.28482986, ..., -1.45900038,
         0.44105193, -1.0755623 ],
       [-1.72178509, -0.41733926, -0.48772236, ..., -0.30309415,
         0.44105193, -0.49243937],
       [-1.71493903, -0.41734159, -0.48772236, ..., -0.30309415,
         0.39642699, -1.2087274 ],
       ...,
       [ 1.71493903, -0.41344658, -0.48772236, ...,  1.17646583,
         0.44105193, -0.98304761],
       [ 1.72178509, -0.40776407, -0.48772236, ...,  1.17646583,
         0.4032249 , -0.86530163],
       [ 1.72863116, -0.41500016, -0.48772236, ...,  1.17646583,
         0.44105193, -0.66905833]])

In [10]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size = 0.3, random_state = 42)

In [11]:
xtrain.shape

(354, 14)

## Decision Tree Regression

In [12]:
dtr = DecisionTreeRegressor()
dtr.fit(xtrain, ytrain)
dtr

In [13]:
y_pred = dtr.predict(xtest)

In [14]:
y_pred

array([21.6, 32. , 15.2, 24.8, 17.7, 20.5, 22.5, 17.8, 23. , 21.2, 21.7,
       16. , 10.5, 21.4, 16.2, 25. , 17.3,  7.2, 48.3, 14.6, 25. , 23.1,
       16.6, 23.8, 16.7, 14.6, 21.2, 14.1, 16. , 24.3, 21.7, 23.1, 50. ,
       15. , 17.7, 13.1, 33.1, 18.2, 22.4, 24.8, 19.8, 28.4, 48.3, 18.7,
       22. ,  8.3, 16.6, 24.8, 19.8, 32.5, 22. , 34.9, 16.6, 28.4, 43.1,
       20.2, 15.4, 22.8, 23.9, 22.4, 24.8, 32.7, 29.4, 19.3, 26.6, 14.4,
       12.5, 23. , 22.8, 14.1, 24.7, 28.7,  9.5, 23.7, 28.1,  5.6, 19.8,
       44. , 10.2, 11.5, 22. ,  6.3, 17.5,  6.3, 20.3, 32. , 14.9, 23.1,
       28.7, 18. , 23.3,  7.5, 19.2, 22.9, 23.3, 19.2, 50. , 10.5, 15.4,
        7.2, 22.2, 28.1, 14.6, 20.4, 17.5,  8.3, 19.3, 24.8, 19.6, 28.7,
        8.4, 12.1, 22.2, 21.4, 31.5, 13.5, 50. , 14.8, 16.1, 23.7, 16.2,
       24.8,  5. , 18.5, 24.7, 23.1, 28.7, 37.2, 17.8, 46. , 13.3, 24.1,
       21.7, 19.4, 14.6, 20.2, 19.6, 31.1, 29.8, 16.6, 20.4, 23.7, 22. ,
       14.4,  5.6, 21.4, 13.1, 14.9, 16.6, 44.8, 14

In [15]:
ytest

173    23.6
274    32.4
491    13.6
72     22.8
452    16.1
       ... 
441    17.1
23     14.5
225    50.0
433    14.3
447    12.6
Name: Price, Length: 152, dtype: float64

In [16]:
mean_squared_error(y_pred, ytest)

22.77743421052632

## Random Forest Regression

In [25]:
from sklearn.ensemble import RandomForestRegressor

ref_reg = RandomForestRegressor(n_estimators = 500)
ref_reg.fit(xtrain, ytrain)

In [26]:
y_pred = ref_reg.predict(xtest)

In [27]:
mean_squared_error(ytest, y_pred)

9.921098225263163