### WE03 SVM

## 1. Setup

Import Modules

In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix
np.random.seed(1)

In [2]:
# import os
# print(os.getcwd())

In [3]:
# Load the Data
data = pd.read_csv('RidingMowers.csv')
data.head(4)

Unnamed: 0,Income,Lot_Size,Ownership
0,60.0,18.4,Owner
1,85.5,16.8,Owner
2,64.8,21.6,Owner
3,61.5,20.8,Owner


In [4]:
data.dtypes

Income       float64
Lot_Size     float64
Ownership     object
dtype: object

In [5]:
data['Ownership'].unique()

array(['Owner', 'Nonowner'], dtype=object)

## Data Exploration

In [6]:
data.describe()

Unnamed: 0,Income,Lot_Size
count,24.0,24.0
mean,68.4375,18.95
std,19.793144,2.428275
min,33.0,14.0
25%,52.35,17.5
50%,64.8,19.0
75%,83.1,20.8
max,110.1,23.6


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Income     24 non-null     float64
 1   Lot_Size   24 non-null     float64
 2   Ownership  24 non-null     object 
dtypes: float64(2), object(1)
memory usage: 704.0+ bytes


In [8]:
data.Ownership.value_counts()

Owner       12
Nonowner    12
Name: Ownership, dtype: int64

In [9]:
import summarytools
from summarytools import dfSummary
dfSummary(data)

No,Variable,Stats / Values,Freqs / (% of Valid),Graph,Missing
1,Income [float64],Mean (sd) : 68.4 (19.8) min < med < max: 33.0 < 64.8 < 110.1 IQR (CV) : 30.8 (3.5),22 distinct values,,0 (0.0%)
2,Lot_Size [float64],Mean (sd) : 19.0 (2.4) min < med < max: 14.0 < 19.0 < 23.6 IQR (CV) : 3.3 (7.8),18 distinct values,,0 (0.0%)
3,Ownership [object],1. Owner 2. Nonowner,12 (50.0%) 12 (50.0%),,0 (0.0%)


In [10]:
#Checking missing values
data.isna().sum()

Income       0
Lot_Size     0
Ownership    0
dtype: int64

## Data Encode

In [11]:
dummies_data = pd.get_dummies(data['Ownership'], prefix = 'Ownership', drop_first=True)

In [12]:
df = data.join(dummies_data)
df.drop('Ownership', axis = 1, inplace = True)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Income           24 non-null     float64
 1   Lot_Size         24 non-null     float64
 2   Ownership_Owner  24 non-null     uint8  
dtypes: float64(2), uint8(1)
memory usage: 536.0 bytes


## Normalization

In [14]:
df[['Income', 'Lot_Size']] = df[['Income', 'Lot_Size']].apply(lambda iterator: ((iterator - iterator.mean())/iterator.std()).round(2))
df

Unnamed: 0,Income,Lot_Size,Ownership_Owner
0,-0.43,-0.23,1
1,0.86,-0.89,1
2,-0.18,1.09,1
3,-0.35,0.76,1
4,0.94,1.91,1
5,2.1,0.1,1
6,2.0,-0.56,1
7,0.73,1.42,1
8,0.03,0.43,1
9,1.24,0.76,1


In [15]:
X = df[['Income','Lot_Size']]
y = df[['Ownership_Owner']]

## Train Test SPLIT

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test , y_train , y_test = train_test_split(X,y,test_size = 0.3)

## Modelling the DATA

In [26]:
performance = pd.DataFrame({"model": [], "Accuracy": [], "Precision": [], "Recall": [], "F1": []})

### 3.1 Fit a SVM classification model using linear kernal

In [35]:
svm_lin_model = SVC(kernel="linear",probability = True)
_ = svm_lin_model.fit(X_train, np.ravel(y_train))

In [36]:
model_preds = svm_lin_model.predict(X_test)
c_matrix = confusion_matrix(y_test, model_preds)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"linear svm", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

### 3.2 Fit a SVM classification model using rbf kernal

In [29]:
svm_rbf_model = SVC(kernel="rbf", C=10, gamma='scale')
_ = svm_rbf_model.fit(X_train, np.ravel(y_train))

In [30]:
model_preds = svm_rbf_model.predict(X_test)
c_matrix = confusion_matrix(y_test, model_preds)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"rbf svm", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

### 3.3 Fit a SVM classification model using polynomial kernal

In [31]:
svm_poly_model = SVC(kernel="poly", degree=3, coef0=1, C=10, probability = True)
_ = svm_poly_model.fit(X_train, np.ravel(y_train))

In [32]:
model_preds = svm_poly_model.predict(X_test)
c_matrix = confusion_matrix(y_test, model_preds)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"poly svm", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

In [33]:
### Summary

performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,linear svm,0.875,0.75,1.0,0.857143
0,rbf svm,0.75,0.666667,0.666667,0.666667
0,poly svm,0.875,0.75,1.0,0.857143


In [68]:
df['predict_Prob'] = svm_poly_model.predict_proba(X)[:,1]
df.head(3)

Unnamed: 0,Income,Lot_Size,Ownership_Owner,predict_Prob
0,-0.43,-0.23,1,0.578737
1,0.86,-0.89,1,0.569973
2,-0.18,1.09,1,0.553024


### Save model to disk

In [37]:
import pickle

# save model
pickle.dump(svm_lin_model, open('Srikar_svm_winning_model.pkl', "wb"))

# If you wish to load this model later, simply use pickle.load method
#loaded_model = pickle.load(open('logistic_model_example01.pkl', "rb"))