# Boston Housing Classification SVM Evaluation

In [1]:
import sys
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, f1_score, precision_recall_fscore_support
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline, Pipeline
sys.path.append("..")

In [2]:
inputFile = "../data/Boston_Housing_Data.csv"

## Read the data into DataFrame

In [3]:
df = pd.read_csv(inputFile,delimiter=";")
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 15 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    int64  
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
 14  CAT      506 non-null    int64  
dtypes: float64(11), int64(4)
memory usage: 59.4 KB
None


In [4]:
df_features = df.drop(["MEDV","CAT"],axis=1) # drop label attribute from the features
df_labels = df[["CAT"]].copy()
display(df_features)
display(df_labels)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48


Unnamed: 0,CAT
0,0
1,0
2,1
3,1
4,1
...,...
501,0
502,0
503,0
504,0


## Train test split

In [5]:
X_train, X_test, y_train, y_test = train_test_split(df_features,df_labels,test_size=0.3,random_state=1234)
display (X_train)
display (X_test) 
display (y_train)
display (y_test)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
99,0.06860,0.0,2.89,0,0.4450,7.416,62.5,3.4952,2,276,18.0,396.90,6.19
102,0.22876,0.0,8.56,0,0.5200,6.405,85.4,2.7147,5,384,20.9,70.80,10.63
416,10.83420,0.0,18.10,0,0.6790,6.782,90.8,1.8195,24,666,20.2,21.57,25.79
266,0.78570,20.0,3.97,0,0.6470,7.014,84.6,2.1329,5,264,13.0,384.07,14.79
101,0.11432,0.0,8.56,0,0.5200,6.781,71.3,2.8561,5,384,20.9,395.58,7.67
...,...,...,...,...,...,...,...,...,...,...,...,...,...
204,0.02009,95.0,2.68,0,0.4161,8.034,31.9,5.1180,4,224,14.7,390.55,2.88
53,0.04981,21.0,5.64,0,0.4390,5.998,21.4,6.8147,4,243,16.8,396.90,8.43
294,0.08199,0.0,13.92,0,0.4370,6.009,42.3,5.5027,4,289,16.0,396.90,10.40
211,0.37578,0.0,10.59,1,0.4890,5.404,88.6,3.6650,4,277,18.6,395.24,23.98


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
64,0.01951,17.5,1.38,0,0.4161,7.104,59.5,9.2229,3,216,18.6,393.24,8.05
100,0.14866,0.0,8.56,0,0.5200,6.727,79.9,2.7778,5,384,20.9,394.76,9.42
400,25.04610,0.0,18.10,0,0.6930,5.987,100.0,1.5888,24,666,20.2,396.90,26.77
485,3.67367,0.0,18.10,0,0.5830,6.312,51.9,3.9917,24,666,20.2,388.62,10.58
454,9.51363,0.0,18.10,0,0.7130,6.728,94.1,2.4961,24,666,20.2,6.68,18.71
...,...,...,...,...,...,...,...,...,...,...,...,...,...
314,0.36920,0.0,9.90,0,0.5440,6.567,87.3,3.6023,4,304,18.4,395.69,9.28
287,0.03871,52.5,5.32,0,0.4050,6.209,31.3,7.3172,6,293,16.6,396.90,7.14
384,20.08490,0.0,18.10,0,0.7000,4.368,91.2,1.4395,24,666,20.2,285.83,30.63
108,0.12802,0.0,8.56,0,0.5200,6.474,97.1,2.4329,5,384,20.9,395.24,12.27


Unnamed: 0,CAT
99,1
102,0
416,0
266,1
101,0
...,...
204,1
53,0
294,0
211,0


Unnamed: 0,CAT
64,1
100,0
400,0
485,0
454,0
...,...
314,0
287,0
384,0
108,0


## Pipelining

Support Vector Machine Classifier

In [6]:
# balancing the instances with class weights
lsvc = LinearSVC(class_weight="balanced",dual='auto') 


Build the pipeline

In [7]:
scaler = StandardScaler()
pipeline = make_pipeline(scaler, lsvc)
print (pipeline)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearsvc', LinearSVC(class_weight='balanced'))])


Build the paramGrid

In [8]:
param_grid = dict(linearsvc__max_iter=[11000,12000,15000],\
                linearsvc__C=[1.0, 0.5, 0.1],\
                linearsvc__loss=["hinge","squared_hinge"])                 \
                                 

Build the CrossValidator 

In [9]:
grid_search = GridSearchCV(pipeline, param_grid=param_grid,cv=5, scoring="accuracy",return_train_score=True)
grid_search.fit(X_train, y_train["CAT"])

## Show best Model 

In [10]:
print (grid_search.best_params_)
print (grid_search.best_estimator_)
print (grid_search.best_score_)

{'linearsvc__C': 1.0, 'linearsvc__loss': 'squared_hinge', 'linearsvc__max_iter': 11000}
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearsvc',
                 LinearSVC(class_weight='balanced', max_iter=11000))])
0.9180281690140845


## Evaluate the model

In [11]:
y_pred = grid_search.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
print("Test Error = " ,(1.0 - accuracy))

Test Error =  0.06578947368421051


## Confusion Matrix 

In [12]:
cm =confusion_matrix(y_test,y_pred,labels=[0,1])
print(cm)
tn, fp, fn, tp = cm.ravel()
print("True Positives (Pred. = 0 & Label = 0) %f " %tp)
print("True Negatives (Pred. = 1 & Label = 1) %f " %tn)
print("False Positives (Pred. = 0 & Label = 1) %f " %fp)
print("False Negatives (Pred = 0 & Label = 0) %f " %fn)

[[116   8]
 [  2  26]]
True Positives (Pred. = 0 & Label = 0) 26.000000 
True Negatives (Pred. = 1 & Label = 1) 116.000000 
False Positives (Pred. = 0 & Label = 1) 8.000000 
False Negatives (Pred = 0 & Label = 0) 2.000000 


### Area under ROC

In [13]:
print(roc_auc_score(y_test,y_pred))

0.9320276497695854


### F1

In [14]:
f1_macro = f1_score(y_test,y_pred,average="macro")
print("Macro F1 ",f1_macro)
f1_w= f1_score(y_test,y_pred,average="weighted")
print("Weighted F1 ",f1_w)


Macro F1  0.8986936816848841
Weighted F1  0.9365783159578499


## Summary stats

In [15]:
print (precision_recall_fscore_support(y_test,y_pred,average="macro"))

print (precision_recall_fscore_support(y_test,y_pred,average=None))




(0.8738783649052841, 0.9320276497695852, 0.8986936816848841, None)
(array([0.98305085, 0.76470588]), array([0.93548387, 0.92857143]), array([0.95867769, 0.83870968]), array([124,  28]))
