# RidingMowers Ownership Prediction with SVM


## 1. Setup

In [48]:
# Common imports
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import r2_score
np.random.seed(1)

# 2. Load the data

In [49]:
# Uncomment the following snippet of code to debug problems with finding the .csv file path
# This snippet of code will exit the program and print the current working directory.
#import os
#print(os.getcwd())

In [50]:
RidingMowers = pd.read_csv('RidingMowers.csv')
RidingMowers.head(20)

Unnamed: 0,Income,Lot_Size,Ownership
0,60.0,18.4,Owner
1,85.5,16.8,Owner
2,64.8,21.6,Owner
3,61.5,20.8,Owner
4,87.0,23.6,Owner
5,110.1,19.2,Owner
6,108.0,17.6,Owner
7,82.8,22.4,Owner
8,69.0,20.0,Owner
9,93.0,20.8,Owner


In [51]:
RidingMowers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Income     24 non-null     float64
 1   Lot_Size   24 non-null     float64
 2   Ownership  24 non-null     object 
dtypes: float64(2), object(1)
memory usage: 704.0+ bytes


In [52]:
RidingMowers.describe()

Unnamed: 0,Income,Lot_Size
count,24.0,24.0
mean,68.4375,18.95
std,19.793144,2.428275
min,33.0,14.0
25%,52.35,17.5
50%,64.8,19.0
75%,83.1,20.8
max,110.1,23.6


In [53]:
# Check the missing values by summing the total na's for each variable
RidingMowers.isna().sum()

Income       0
Lot_Size     0
Ownership    0
dtype: int64

In [54]:
#Encoding the Target variable using label encoder
labelencoder = LabelEncoder()
RidingMowers['Ownership'] = labelencoder.fit_transform(RidingMowers['Ownership'])

In [55]:
RidingMowers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Income     24 non-null     float64
 1   Lot_Size   24 non-null     float64
 2   Ownership  24 non-null     int32  
dtypes: float64(2), int32(1)
memory usage: 608.0 bytes


In [56]:
RidingMowers.head(27)

Unnamed: 0,Income,Lot_Size,Ownership
0,60.0,18.4,1
1,85.5,16.8,1
2,64.8,21.6,1
3,61.5,20.8,1
4,87.0,23.6,1
5,110.1,19.2,1
6,108.0,17.6,1
7,82.8,22.4,1
8,69.0,20.0,1
9,93.0,20.8,1


# Split data (train/test)

In [57]:
# split the data into validation and training set
df_train, df_test = train_test_split(RidingMowers, test_size=0.3)

# to reduce repetition in later code, create variables to represent the columns
# that are our predictors and target
target = 'Ownership'
predictors = list(RidingMowers.columns)
predictors.remove(target)

In [58]:
X_train = df_train[predictors]
y_train = df_train[target] # train_target is now a series objecttrain_df.to_csv('RidingMowers_train_df.csv', index=False)
X_test = df_test[predictors]
y_test = df_test[target] # validation_target is now a series object


## 3. Model the data

First, we will create a dataframe to hold all the results of our models.

In [59]:
performance = pd.DataFrame({"model": [], "Accuracy": [], "Precision": [], "Recall": [], "F1": []})

# 3.1 Fit a SVM classification model using linear kernal

In [60]:
svm_linear_model = SVC(kernel="linear", probability=True)
_ = svm_linear_model.fit(X_train, np.ravel(y_train))

In [61]:
model_preds = svm_linear_model.predict(X_test)
c_matrix = confusion_matrix(y_test, model_preds)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"linear svm", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

# 3.2 Fit a SVM classification model using rbf kernal

In [62]:
svm_rbf_model = SVC(kernel="rbf", C=10, gamma='scale',probability=True)
_ = svm_rbf_model.fit(X_train, np.ravel(y_train))

In [63]:
model_preds = svm_rbf_model.predict(X_test)
c_matrix = confusion_matrix(y_test, model_preds)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"rbf svm", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

# 3.3 Fit a SVM classification model using polynomial kernal

In [64]:
svm_poly_model = SVC(kernel="poly", degree=3, coef0=1, C=10,probability=True)
_ = svm_poly_model.fit(X_train, np.ravel(y_train))

In [65]:
model_preds = svm_poly_model.predict(X_test)
c_matrix = confusion_matrix(y_test, model_preds)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"poly svm", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

## 4.0 Summary

Sorted by accuracy, the best models are:

In [66]:
performance.sort_values(by=['Accuracy'])

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,rbf svm,0.75,0.666667,0.666667,0.666667
0,poly svm,0.875,1.0,0.666667,0.8
0,linear svm,1.0,1.0,1.0,1.0


Sorted by Precision, the best models are:

# 5.0 Analysis
Based on the above-obtained results we can see that the poly SVM model predicts the output correctly with  85% percent accuracy, precision 100%, Recall with 66%, F call with 80% whereas the linear SVM model predicts the output with 100% accuracy, 100% precision,100% recall,100% F-call may not be considered as the winning model because with 100% accuracy, prediction, recall, F-call the data may become overfit and may not be considered as winning model and also with real-world large data it may or may not be possible for any model to predict accuracy, precision, recall and f-call with 100% accuracy. Hence we can consider the Poly SVM model as the winning model to predict the outputs.

Hence by considering the above result I will go with the poly svm model as my winning model


# 6.0 Winning model predictions

In [67]:
df_test['predicted'] = svm_poly_model.predict(X_test)
df_test.head(10)

Unnamed: 0,Income,Lot_Size,Ownership,predicted
13,52.8,20.8,0,0
18,59.4,16.0,0,0
3,61.5,20.8,1,1
14,64.8,17.2,0,0
20,47.4,16.4,0,0
17,49.2,17.6,0,0
10,51.0,22.0,1,0
4,87.0,23.6,1,1


In [68]:
df_test['pred_prob'] = svm_poly_model.predict_proba(X_test)[:,1]
df_test.head(10)

Unnamed: 0,Income,Lot_Size,Ownership,predicted,pred_prob
13,52.8,20.8,0,0,0.558848
18,59.4,16.0,0,0,0.554622
3,61.5,20.8,1,1,0.562769
14,64.8,17.2,0,0,0.558164
20,47.4,16.4,0,0,0.551064
17,49.2,17.6,0,0,0.553169
10,51.0,22.0,1,0,0.559606
4,87.0,23.6,1,1,0.578904


# Saving the model to disk using pickle
Once you train a model, you want to reused it in other notebooks or applications. You can save the model to disk using the pickle module.

In [71]:
import pickle

# save model
pickle.dump(svm_poly_model, open('C:/Users/Shanthi/Desktop/DSP/WP03-DSP/W03/SVM_poly_model_pickle.pkl', "wb"))

# If you wish to load this model later, simply use pickle.load method
#loaded_model = pickle.load(open('logistic_model_example01.pkl', "rb"))