# Evaluating SVM on Multiple Datasets

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

from ipywidgets import *
from IPython.display import display

from sklearn.svm import SVC

%matplotlib inline

### Breast Cancer Dataset

In [2]:
# Read the breast cancer data; '?' entries are parsed as missing values
df_bc = pd.read_csv('./breast_cancer.csv', na_values=['?'])

# Report (rows, columns), then preview the first five records
print(df_bc.shape)
df_bc.head(5)

(699, 11)


Unnamed: 0,Sample_code_number,Clump_Thickness,Uniformity_of_Cell_Size,Uniformity_of_Cell_Shape,Marginal_Adhesion,Single_Epithelial_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1.0,3,1,1,2
1,1002945,5,4,4,5,7,10.0,3,2,1,2
2,1015425,3,1,1,1,2,2.0,3,1,1,2
3,1016277,6,8,8,1,3,4.0,3,7,1,2
4,1017023,4,1,1,3,2,1.0,3,1,1,2


#### EDA

In [3]:
# Count missing values per column
df_bc.isna().sum()

Sample_code_number              0
Clump_Thickness                 0
Uniformity_of_Cell_Size         0
Uniformity_of_Cell_Shape        0
Marginal_Adhesion               0
Single_Epithelial_Cell_Size     0
Bare_Nuclei                    16
Bland_Chromatin                 0
Normal_Nucleoli                 0
Mitoses                         0
Class                           0
dtype: int64

In [4]:
# Drop every row that contains at least one missing value.
# inplace=True mutates df_bc itself, so no new frame is returned here.
df_bc.dropna(inplace=True)

In [5]:
# Distinct labels present in the target column
pd.unique(df_bc['Class'])

array([2, 4])

There are 2 unique classes, so we'll recode them: 2 becomes 0 and 4 becomes 1.

In [6]:
# Recode the target in place: label 4 becomes 1, every other label becomes 0
df_bc['Class'] = df_bc['Class'].map(lambda label: int(label == 4))

# Target vector and feature matrix (every column except the target)
# NOTE(review): Sample_code_number looks like a record identifier rather than
# a measurement -- consider excluding it from X; confirm against the dataset docs.
y = df_bc['Class']
X = df_bc.drop(['Class'], axis=1)

# Hold out 20% of the rows for testing; fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1337
)

# Share of class-1 rows in the training split
y_train.mean()

0.3553113553113553

Baseline accuracy is ~64%. 

In [7]:
# Standardize features: learn the scaling on the training split only,
# then apply that same scaling to both splits
ss = StandardScaler().fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)

#### Model

In [8]:
def eval_model(model):
    '''Print a performance report for an already-fitted classifier.

    Reads X_train, X_test, y_train and y_test from the enclosing
    (notebook-level) scope rather than taking them as arguments.

    Printed, in order:
      * mean and standard deviation of the 5-fold cross-validation
        scores computed on the training split,
      * a confusion table (with margins) of actual vs. predicted
        labels on the test split,
      * the classification report for the test split,
      * the model's score on the test split.

    model: estimator exposing predict() and score(); predict() is called
        directly, so the model must have been fitted beforehand.
    '''
    cv_scores = cross_val_score(model, X_train, y_train, cv=5)
    test_pred = model.predict(X_test)

    print('Crossfold mean:', cv_scores.mean())
    print('Crossfold std:', cv_scores.std())
    print('')

    # Rows are the true labels, columns the predicted ones
    print(pd.crosstab(y_test, test_pred,
                      rownames=['Actual'], colnames=['Predicted'],
                      margins=True))
    print('')
    print(classification_report(y_test, test_pred))
    print('Accuracy:', model.score(X_test, y_test))

In [9]:
# Fit a linear-kernel SVM on the training split and report its performance
lin_model = SVC(kernel='linear').fit(X_train, y_train)

eval_model(lin_model)

Crossfold mean: 0.961567046613
Crossfold std: 0.0133619881916

Predicted   0   1  All
Actual                
0          89   3   92
1           2  43   45
All        91  46  137

             precision    recall  f1-score   support

          0       0.98      0.97      0.97        92
          1       0.93      0.96      0.95        45

avg / total       0.96      0.96      0.96       137

Accuracy: 0.963503649635


In [10]:
# Fit an RBF-kernel SVM on the training split and report its performance
rbf_model = SVC(kernel='rbf').fit(X_train, y_train)

eval_model(rbf_model)

Crossfold mean: 0.970725295771
Crossfold std: 0.0146687124117

Predicted   0   1  All
Actual                
0          89   3   92
1           2  43   45
All        91  46  137

             precision    recall  f1-score   support

          0       0.98      0.97      0.97        92
          1       0.93      0.96      0.95        45

avg / total       0.96      0.96      0.96       137

Accuracy: 0.963503649635


In the breast cancer dataset example, the linear and rbf models were identical in predictive power on the test set (their cross-validation means differed only slightly). This may indicate that there was a clear separation between most of the data points except for the 5 misclassified points.

### Car dataset

In [11]:
# Read the car evaluation data
df_car = pd.read_csv('./car.csv')

# Report (rows, columns), then preview the first five records
print(df_car.shape)
df_car.head(5)

(1728, 7)


Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,acceptability
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


#### EDA

In [12]:
# Count missing values per column
df_car.isna().sum()

buying           0
maint            0
doors            0
persons          0
lug_boot         0
safety           0
acceptability    0
dtype: int64

In [13]:
# Print unique values for all columns
for col in df_car.columns:
    print(df_car[col].unique())

['vhigh' 'high' 'med' 'low']
['vhigh' 'high' 'med' 'low']
['2' '3' '4' '5more']
['2' '4' 'more']
['small' 'med' 'big']
['low' 'med' 'high']
['unacc' 'acc' 'vgood' 'good']


We'll binarize the acceptability column to make this a binary classification problem: 1 for 'vgood' and 'good', and 0 for 'unacc' and 'acc'.

In [14]:
# Binarize the target in place: 'vgood'/'good' -> 1, everything else -> 0
df_car['acceptability'] = df_car['acceptability'].map(lambda x: 1 if x in ['vgood', 'good'] else 0)

# Set feature matrix and target vector
X = df_car.drop('acceptability', axis=1)
y = df_car['acceptability']

# One-hot encode every feature column. `columns=` is passed by keyword:
# the second positional argument of get_dummies is `prefix`, not `columns`,
# so the positional form only works while every column is object-typed.
X = pd.get_dummies(X, columns=list(X.columns))

# Train test split (20% held out, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=1337)

In [15]:
# Fraction of training rows belonging to each class
y_train.value_counts().div(len(y_train))

0    0.918234
1    0.081766
Name: acceptability, dtype: float64

Baseline accuracy is ~92%.

#### Model

In [16]:
# Fit a linear-kernel SVM on the car training split
lin_model = SVC(kernel='linear').fit(X_train, y_train)

# Labelled performance report
print('Linear Model:')
eval_model(lin_model)

Linear Model:
Crossfold mean: 0.975393577907
Crossfold std: 0.00953671236549

Predicted    0   1  All
Actual                 
0          323   2  325
1            4  17   21
All        327  19  346

             precision    recall  f1-score   support

          0       0.99      0.99      0.99       325
          1       0.89      0.81      0.85        21

avg / total       0.98      0.98      0.98       346

Accuracy: 0.982658959538


In [17]:
# Fit an RBF-kernel SVM on the car training split
rbf_model = SVC(kernel='rbf').fit(X_train, y_train)

# Labelled performance report
print('RBF Model:')
eval_model(rbf_model)

RBF Model:
Crossfold mean: 0.952246933311
Crossfold std: 0.00950742234325

Predicted    0   1  All
Actual                 
0          325   0  325
1           11  10   21
All        336  10  346

             precision    recall  f1-score   support

          0       0.97      1.00      0.98       325
          1       1.00      0.48      0.65        21

avg / total       0.97      0.97      0.96       346

Accuracy: 0.968208092486


In this car dataset example, the linear model did marginally better than the rbf model in terms of overall accuracy. Through the confusion matrices we can see both models did well at predicting the majority class 0 ('unacc'/'acc'), but struggled to predict the minority class 1 ('good'/'vgood') - in fact the rbf model had a recall of less than 50% on that class.