In [9]:
# This script is written to implement support vector machine (SVM)
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns; sns.set()
import pandas as pd
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

df = pd.read_csv('morpho_data.csv')
features = list(df.drop("target", axis=1))
X = df.drop("target", axis=1)
Y = df['target']
#X = StandardScaler.fit_transform(X)


X_train, X_test, Y_train, Y_test = train_test_split(X, df['target'], test_size = 0.3, random_state =4)

scaler = StandardScaler()
#cols = X_train.columns
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

#X_train_new = pd.DataFrame(X_train_std, columns = [cols])
#X_test_new = pd.DataFrame(X_test_std, columns = [cols])


In [10]:
# Instantiate classifier with default hyperparameters
svc = SVC()
svc.fit(X_train, Y_train)
y_pred = svc.predict(X_test)
print('Model accuracy score with default hyperparameters: {0:0.4f}'.format(accuracy_score(Y_test, y_pred)))


Model accuracy score with default hyperparameters: 0.9371


In [13]:
# Explore the cost function
# Run SVM with rbf kernel and C = 1000.00
svc.fit(X_train, Y_train)
Y_pred=svc.predict(X_test)
print('Model accuracy score with rbf kernel and C=1000.00 : {0:0.4f}'.format(accuracy_score(Y_test, Y_pred)))



Model accuracy score with rbf kernel and C=1000.00 : 0.9371


In [12]:
# Run SVM with linear kernel
linear_svc = SVC(kernel='linear', C=1.0)
linear_svc.fit(X_train, Y_train)
Y_pred_test = linear_svc.predict(X_test)
print('Model accuracy score with linear kernel and C=1.0:{0:0.4f}'.format(accuracy_score(Y_test, Y_pred_test)))

Model accuracy score with linear kernel and C=1.0:0.8868


In [None]:
# Run SVM with linear kernel and C=100.00
linear_svc100 = SVC(kernel='linear', C=100.00)
linear_svc100.fit(X_train, Y_train)
Y_pred = linear_svc100.predict(X_test)
print('Model accuracy score with linear kernel and C=100.0:{0:0.4f}'.format(accuracy_score(Y_test, Y_pred)))


In [None]:
# Run SVM with linear kernel and C=1000.00
linear_svc1000=SVC(kernel='linear', C = 1000)
linear_svc1000.fit(X_train, Y_train)
Y_pred = linear_svc1000.predict(X_test)
print('Model accuracy score with linear kernel and C=1000:{0:0.4f}'.format(accuracy_score(Y_test, Y_pred)))

In [None]:
# Compare the train-set and test-set accuracy
# Now, I will compare the train-set and test-set accuracy to check for overfitting
Y_pred_train = linear_svc.predict(X_train)
print('Training-set accuracy score: {0:0.4f}'.format(accuracy_score(Y_train, Y_pred_train)))

In [None]:
# instantiate classifier with sigmoid kernel and C=100.0
sigmoid_svc100=SVC(kernel='sigmoid', C=100.0) 


# fit classifier to training set
sigmoid_svc100.fit(X_train,Y_train)


# make predictions on test set
Y_pred=sigmoid_svc100.predict(X_test)


# compute and print accuracy score
print('Model accuracy score with sigmoid kernel and C=100.0 : {0:0.4f}'. format(accuracy_score(Y_test, Y_pred)))


In [5]:
# instantiate classifier with polynomial kernel and C=1.0
poly_svc=SVC(kernel='poly', C=1.0) 


# fit classifier to training set
poly_svc.fit(X_train,Y_train)


# make predictions on test set
Y_pred=poly_svc.predict(X_test)


# compute and print accuracy score
print('Model accuracy score with polynomial kernel and C=1.0 : {0:0.4f}'. format(accuracy_score(Y_test, Y_pred)))

Model accuracy score with polynomial kernel and C=1.0 : 0.3836


In [14]:
# Print the Confusion Matrix and slice it into four pieces

from sklearn.metrics import confusion_matrix

cm = confusion_matrix(Y_test, Y_pred)

print('Confusion matrix\n\n', cm)

print('\nTrue Positives(TP) = ', cm[0,0])

print('\nTrue Negatives(TN) = ', cm[1,1])

print('\nFalse Positives(FP) = ', cm[0,1])

print('\nFalse Negatives(FN) = ', cm[1,0])

Confusion matrix

 [[23  1  0  0  0]
 [ 3  3  2  0  1]
 [ 0  0 45  0  2]
 [ 1  0  0 17  0]
 [ 0  0  0  0 61]]

True Positives(TP) =  23

True Negatives(TN) =  3

False Positives(FP) =  1

False Negatives(FN) =  3


In [16]:
# Classification report
from sklearn.metrics import classification_report

print(classification_report(Y_test, Y_pred))




              precision    recall  f1-score   support

    S_macrop       0.85      0.96      0.90        24
    S_nepale       0.75      0.33      0.46         9
    S_progra       0.96      0.96      0.96        47
    S_raraen       1.00      0.94      0.97        18
    S_richar       0.95      1.00      0.98        61

    accuracy                           0.94       159
   macro avg       0.90      0.84      0.85       159
weighted avg       0.93      0.94      0.93       159



In [19]:
TP = cm[0,0]
TN = cm[1,1]
FP = cm[0,1]
FN = cm[1,0]

# print classification accuracy

classification_accuracy = (TP + TN) / float(TP + TN + FP + FN)

print('Classification accuracy : {0:0.4f}'.format(classification_accuracy))

# print classification error

classification_error = (FP + FN) / float(TP + TN + FP + FN)
print('Classification error : {0:0.4f}'.format(classification_error))


Classification accuracy : 0.8667
Classification error : 0.1333


In [20]:
# print precision score

precision = TP / float(TP + FP)


print('Precision : {0:0.4f}'.format(precision))
Precision : 0.9949
    
    
    

Precision : 0.9583


In [21]:
recall = TP / float(TP + FN)

print('Recall or Sensitivity : {0:0.4f}'.format(recall))

Recall or Sensitivity : 0.8846


In [22]:
true_positive_rate = TP / float(TP + FN)


print('True Positive Rate : {0:0.4f}'.format(true_positive_rate))

True Positive Rate : 0.8846


In [23]:
false_positive_rate = FP / float(FP + TN)


print('False Positive Rate : {0:0.4f}'.format(false_positive_rate))

False Positive Rate : 0.2500


In [24]:
specificity = TN / (TN + FP)

print('Specificity : {0:0.4f}'.format(specificity))

Specificity : 0.7500


In [26]:
# plot ROC Curve

from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(Y_test, Y_pred)

plt.figure(figsize=(6,4))

plt.plot(fpr, tpr, linewidth=2)

plt.plot([0,1], [0,1], 'k--' )

plt.rcParams['font.size'] = 12

plt.title('ROC curve for Predicting a Pulsar Star classifier')

plt.xlabel('False Positive Rate (1 - Specificity)')

plt.ylabel('True Positive Rate (Sensitivity)')

plt.show()

ValueError: multiclass format is not supported