In [151]:
import pandas as pd
from sklearn.svm import SVC
from matplotlib import pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report  
from sklearn.model_selection import train_test_split

np.random.seed(1)

In [152]:
df = pd.read_csv('C:/Users/Srinidhi/Documents/USF/Data_Science_Programming/Week3_Assignments/RidingMowers.csv')
df.head(3)

Unnamed: 0,Income,Lot_Size,Ownership
0,60.0,18.4,Owner
1,85.5,16.8,Owner
2,64.8,21.6,Owner


In [153]:
df.shape

(24, 3)

In [180]:
from sklearn import preprocessing

#label encoding the target variable
labelencoder = preprocessing.LabelEncoder()
df['Ownership'] = labelencoder.fit_transform(df['Ownership'])

In [155]:
X = df.iloc[:, 0:-1]
y = df.iloc[:, -1]

In [156]:
#splitting the model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30)# Train a SVC model using different kernal


In [158]:
performance = pd.DataFrame({"model": [], "Accuracy": [], "Precision": [], "Recall": [], "F1": []})

### Fit a SVM classification model using linear kernal

In [159]:
svm_lin_model = SVC(kernel="linear", probability =True)
_ = svm_lin_model.fit(X_train, np.ravel(y_train))

In [160]:
model_preds = svm_lin_model.predict(X_test)
c_matrix = confusion_matrix(y_test, model_preds)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"linear svm", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,linear svm,1.0,1.0,1.0,1.0


### Fit a SVM classification model using rbf kernal

In [161]:
svm_rbf_model = SVC(kernel="rbf", probability =True, C=10, gamma='scale')
_ = svm_rbf_model.fit(X_train, np.ravel(y_train))

In [162]:
model_preds = svm_rbf_model.predict(X_test)
c_matrix = confusion_matrix(y_test, model_preds)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"rbf svm", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,linear svm,1.0,1.0,1.0,1.0
0,rbf svm,0.75,0.666667,0.666667,0.666667


### Fit a SVM classification model using Polynomial kernal

In [163]:
svm_poly_model = SVC(kernel="poly", probability =True, degree=3, coef0=1, C=10)
_ = svm_poly_model.fit(X_train, np.ravel(y_train))

In [164]:
model_preds = svm_poly_model.predict(X_test)
c_matrix = confusion_matrix(y_test, model_preds)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"poly svm", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,linear svm,1.0,1.0,1.0,1.0
0,rbf svm,0.75,0.666667,0.666667,0.666667
0,poly svm,0.875,1.0,0.666667,0.8


### Model performance based on Accuracy

In [165]:
performance.sort_values(by=['Accuracy'])

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,rbf svm,0.75,0.666667,0.666667,0.666667
0,poly svm,0.875,1.0,0.666667,0.8
0,linear svm,1.0,1.0,1.0,1.0


### Model performance based on Precision

In [166]:
performance.sort_values(by=['Precision'])

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,rbf svm,0.75,0.666667,0.666667,0.666667
0,linear svm,1.0,1.0,1.0,1.0
0,poly svm,0.875,1.0,0.666667,0.8


### Model performance based on Recall

In [167]:
performance.sort_values(by=['Recall'])

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,rbf svm,0.75,0.666667,0.666667,0.666667
0,poly svm,0.875,1.0,0.666667,0.8
0,linear svm,1.0,1.0,1.0,1.0


### Model performance based on F1

In [168]:
performance.sort_values(by=['F1'])

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,rbf svm,0.75,0.666667,0.666667,0.666667
0,poly svm,0.875,1.0,0.666667,0.8
0,linear svm,1.0,1.0,1.0,1.0


Based on the above, we have split the data into 30% test and 70% train. We notice that in most of the cases, based on the classification model performances the linear SVM is overfitting. This is because of the fact that the dataset is too small to make a perfect split. I have also noticed an issue of underfitting at 60-40% split. 

Hence based on the above scores alone: Precision, Accuracy, Recall and F1 score, I can confirm that poly svm is the best performing model.

We will be further validating this using the cross validation and grid search efforts by tuning the parameters further to find out the best estimators and performance models. I will consider 3 folds cross validation and use gridsearch to tune my hyperparameters. But this part is not yet covered in its entirity in the class.

### SVM Gridsearch and Hyperparameter tuning using Gridsearch with a cross validation set to 3

In [169]:
#Apply kernels to transform the data to a higher dimension


kernels = ['Polynomial', 'RBF','Linear']
def getClassifier(ktype):
    if ktype == 0:
        # Polynomial kernal
        return SVC(kernel='poly', gamma="auto")
    elif ktype == 1:
        # Radial Basis Function kernal
        return SVC(kernel='rbf', gamma="auto")
    elif ktype == 2:
        # Linear kernal
        return SVC(kernel='linear', gamma="auto")

In [170]:
#Call the SVC() model from sklearn and fit the model to the training data
for i in range(3):
    # Separate data into test and training sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30)# Train a SVC model using different kernal
    svclassifier = getClassifier(i) 
    svclassifier.fit(X_train, y_train)# Make prediction
    y_pred = svclassifier.predict(X_test)# Evaluate our model
    print("Evaluation:", kernels[i], "kernel")
    print(classification_report(y_test,y_pred))


Evaluation: Polynomial kernel
              precision    recall  f1-score   support

           0       0.75      0.75      0.75         4
           1       0.75      0.75      0.75         4

    accuracy                           0.75         8
   macro avg       0.75      0.75      0.75         8
weighted avg       0.75      0.75      0.75         8

Evaluation: RBF kernel
              precision    recall  f1-score   support

           0       1.00      0.25      0.40         4
           1       0.57      1.00      0.73         4

    accuracy                           0.62         8
   macro avg       0.79      0.62      0.56         8
weighted avg       0.79      0.62      0.56         8

Evaluation: Linear kernel
              precision    recall  f1-score   support

           0       0.86      1.00      0.92         6
           1       1.00      0.50      0.67         2

    accuracy                           0.88         8
   macro avg       0.93      0.75      0.79      

From the above grid results, it is easily shown that the Linear SVC is showing the best results for the observed values. 

To further evaluate this, we can tune the hyperparameters to find the best parameters and best performing model.

In [171]:
from sklearn.model_selection import GridSearchCV #Importing GridSearchCV : used to evaluate the best model combinitions and for hyperparameter tuning.

In [172]:
param_grid = [{"kernel": ["linear"], "C": [0.0001, 0.001, 0.1, 1, 10, 100, 1000]},{"kernel": ["poly"], "degree": [2, 3, 4], "C": [0.0001, 0.001, 0.1, 1, 10, 100, 1000]}, {"kernel": ["rbf"],"gamma": ["auto", "scale"],"C": [0.0001, 0.001, 0.1, 1, 10, 100, 1000]}] #Create a dictionary called param_grid and fill out some parameters for kernels, C and gamma


In [173]:
grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=2, cv=3, scoring='accuracy')
grid.fit(X_train,y_train)

Fitting 3 folds for each of 42 candidates, totalling 126 fits
[CV] END ............................C=0.0001, kernel=linear; total time=   0.0s
[CV] END ............................C=0.0001, kernel=linear; total time=   0.0s
[CV] END ............................C=0.0001, kernel=linear; total time=   0.0s
[CV] END .............................C=0.001, kernel=linear; total time=   0.0s
[CV] END .............................C=0.001, kernel=linear; total time=   0.0s
[CV] END .............................C=0.001, kernel=linear; total time=   0.0s
[CV] END ...............................C=0.1, kernel=linear; total time=   0.0s
[CV] END ...............................C=0.1, kernel=linear; total time=   0.0s
[CV] END ...............................C=0.1, kernel=linear; total time=   0.0s
[CV] END .................................C=1, kernel=linear; total time=   0.0s
[CV] END .................................C=1, kernel=linear; total time=   0.0s
[CV] END .................................C=1, 

GridSearchCV(cv=3, estimator=SVC(),
             param_grid=[{'C': [0.0001, 0.001, 0.1, 1, 10, 100, 1000],
                          'kernel': ['linear']},
                         {'C': [0.0001, 0.001, 0.1, 1, 10, 100, 1000],
                          'degree': [2, 3, 4], 'kernel': ['poly']},
                         {'C': [0.0001, 0.001, 0.1, 1, 10, 100, 1000],
                          'gamma': ['auto', 'scale'], 'kernel': ['rbf']}],
             scoring='accuracy', verbose=2)

In [174]:
print(grid.best_estimator_) #Finding the optimal parameters

SVC(C=1000, degree=4, kernel='poly')


In [175]:
# print best parameter after tuning
print(grid.best_params_)

{'C': 1000, 'degree': 4, 'kernel': 'poly'}


In [176]:
best_result = grid.best_score_    #Grid best score
print(best_result)

0.8111111111111112


As per the above, it can be determined that the Polynomial kernel is being the best estimater so far with the best parameters as shown under best_estimator results.The accuracy score of the best parameters prediction is 81%.

<b>Conclusion</b>: Based on the results when the CV of 3 folds evaluates the best performing while using the grid search with optimized parameters, we see that overall, polynomial svm appears to be working the best.

But since we are considering the model performance based on the precision, recall, accuracy and F1 scores alone, based on the observed values for the classification model performance for SVM, polynomial kernel is always at the top for accuracy, precision, recall and f1 score if we neglect the linear svm model which is overfitting. Hence considering poly to be the best performing model.

In [179]:
#Saving the winning model using pickle to a csv file.

import pickle

pickle.dump(svm_poly_model, open('C:/Users/Srinidhi/Documents/USF/Data_Science_Programming/Week3_Assignments/RidingMowers_pickle.csv', 'wb'))



