# Boston Housing Classification Random Forest

In [11]:
import sys
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
sys.path.append("..")

In [12]:
inputFile = "../data/Boston_Housing_Data.csv"

## DataFrame creation using an ifered Schema 

In [13]:
df = pd.read_csv(inputFile,delimiter=";")

print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 15 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    int64  
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
 14  CAT      506 non-null    int64  
dtypes: float64(11), int64(4)
memory usage: 59.4 KB
None


## Feature selection

In [14]:
df_features = df.drop(["MEDV","CAT"],axis=1) # drop label attribute from the features
df_labels = df[["CAT"]].copy()
display(df_features)
display(df_labels)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48


Unnamed: 0,CAT
0,0
1,0
2,1
3,1
4,1
...,...
501,0
502,0
503,0
504,0


## Data preparation

In [15]:
num_column_names = df_features.select_dtypes(include=np.number).columns.tolist()
df_stand = df_features.copy()

print (num_column_names)
scaler = StandardScaler()
df_stand[num_column_names] = scaler.fit_transform(df_stand[num_column_names])
display(df_stand)

['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,-0.419782,0.284830,-1.287909,-0.272599,-0.144217,0.413672,-0.120013,0.140214,-0.982843,-0.666608,-1.459000,0.441052,-1.075562
1,-0.417339,-0.487722,-0.593381,-0.272599,-0.740262,0.194274,0.367166,0.557160,-0.867883,-0.987329,-0.303094,0.441052,-0.492439
2,-0.417342,-0.487722,-0.593381,-0.272599,-0.740262,1.282714,-0.265812,0.557160,-0.867883,-0.987329,-0.303094,0.396427,-1.208727
3,-0.416750,-0.487722,-1.306878,-0.272599,-0.835284,1.016303,-0.809889,1.077737,-0.752922,-1.106115,0.113032,0.416163,-1.361517
4,-0.412482,-0.487722,-1.306878,-0.272599,-0.835284,1.228577,-0.511180,1.077737,-0.752922,-1.106115,0.113032,0.441052,-1.026501
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,-0.413229,-0.487722,0.115738,-0.272599,0.158124,0.439316,0.018673,-0.625796,-0.982843,-0.803212,1.176466,0.387217,-0.418147
502,-0.415249,-0.487722,0.115738,-0.272599,0.158124,-0.234548,0.288933,-0.716639,-0.982843,-0.803212,1.176466,0.441052,-0.500850
503,-0.413447,-0.487722,0.115738,-0.272599,0.158124,0.984960,0.797449,-0.773684,-0.982843,-0.803212,1.176466,0.441052,-0.983048
504,-0.407764,-0.487722,0.115738,-0.272599,0.158124,0.725672,0.736996,-0.668437,-0.982843,-0.803212,1.176466,0.403225,-0.865302


## Train test split

In [16]:
X_train, X_test, y_train, y_test = train_test_split(df_stand,df_labels,test_size=0.3,random_state=1234)
display (X_train)
display (X_test) 
display (y_train)
display (y_test)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
99,-0.412534,-0.487722,-1.203282,-0.272599,-0.947582,1.611810,-0.216027,-0.142536,-0.867883,-0.785394,-0.210622,0.441052,-0.905952
102,-0.393896,-0.487722,-0.375976,-0.272599,-0.299707,0.171480,0.598310,-0.513562,-0.523001,-0.143951,1.130230,-3.134425,-0.283580
416,0.840293,-0.487722,1.015999,-0.272599,1.073787,0.708576,0.790337,-0.939112,1.661245,1.530926,0.806576,-3.674201,1.841454
266,-0.329083,0.370669,-1.045700,-0.272599,0.797361,1.039097,0.569862,-0.790131,-0.523001,-0.856665,-2.522434,0.300379,0.299543
101,-0.407214,-0.487722,-0.375976,-0.272599,-0.299707,0.707152,0.096906,-0.446344,-0.523001,-0.143951,1.130230,0.426579,-0.698495
...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,-0.418179,3.589637,-1.233923,-0.272599,-1.197230,2.492249,-1.304180,0.628893,-0.637962,-1.094237,-1.736418,0.371428,-1.369927
53,-0.414721,0.413589,-0.802031,-0.272599,-0.999412,-0.408356,-1.677566,1.435452,-0.637962,-0.981390,-0.765457,0.441052,-0.591963
294,-0.410976,-0.487722,0.406098,-0.272599,-1.016689,-0.392685,-0.934351,0.811768,-0.637962,-0.708183,-1.135347,0.441052,-0.315820
211,-0.376787,-0.487722,-0.079780,3.668398,-0.567496,-1.254603,0.712104,-0.061818,-0.637962,-0.779455,0.066796,0.422851,1.587740


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
64,-0.418247,0.263370,-1.423605,-0.272599,-1.197230,1.167316,-0.322709,2.580236,-0.752922,-1.141751,0.066796,0.400922,-0.645229
100,-0.403217,-0.487722,-0.375976,-0.272599,-0.299707,0.630220,0.402727,-0.483566,-0.523001,-0.143951,1.130230,0.417588,-0.453191
400,2.494178,-0.487722,1.015999,-0.272599,1.194724,-0.424027,1.117494,-1.048780,1.661245,1.530926,0.806576,0.441052,1.978824
485,0.006999,-0.487722,1.015999,-0.272599,0.244507,0.038987,-0.592969,0.093485,1.661245,1.530926,0.806576,0.350267,-0.290589
454,0.686614,-0.487722,1.015999,-0.272599,1.367490,0.631645,0.907687,-0.617477,1.661245,1.530926,0.806576,-3.837460,0.849024
...,...,...,...,...,...,...,...,...,...,...,...,...,...
314,-0.377552,-0.487722,-0.180458,-0.272599,-0.092387,0.402275,0.665875,-0.091624,-0.637962,-0.619094,-0.025677,0.427785,-0.472815
287,-0.416013,1.765555,-0.848722,-0.272599,-1.293115,-0.107753,-1.325517,1.674325,-0.408041,-0.684426,-0.857929,0.441052,-0.772787
384,1.916827,-0.487722,1.015999,-0.272599,1.255192,-2.730550,0.804561,-1.119752,1.661245,1.530926,0.806576,-0.776759,2.519895
108,-0.405619,-0.487722,-0.375976,-0.272599,-0.299707,0.269781,1.014369,-0.647521,-0.523001,-0.143951,1.130230,0.422851,-0.053695


Unnamed: 0,CAT
99,1
102,0
416,0
266,1
101,0
...,...
204,1
53,0
294,0
211,0


Unnamed: 0,CAT
64,1
100,0
400,0
485,0
454,0
...,...
314,0
287,0
384,0
108,0


## Random Forest Classifier

In [17]:
rf = RandomForestClassifier(n_estimators=200,criterion="gini",max_features="sqrt",random_state=1234,max_depth=5)

Train the model 

In [18]:
rfModel = rf.fit(X_train,y_train["CAT"])

Test the model

In [19]:
y_pred = rfModel.predict(X_test)
print (y_pred)

[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 0 0 1 1 0 1 1 0 1 0 0 1 0 0 1 0 1 1 0 0 0 0
 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 1 0 0 0 0 1 0 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0
 0 0 0 0]


In [20]:
accuracy = accuracy_score(y_test,y_pred)
print("Test Error = " ,(1.0 - accuracy))

Test Error =  0.03289473684210531
