In [1]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
# Pandas is used for data manipulation
import pandas as pd
from sklearn import preprocessing
from sklearn import svm
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_recall_fscore_support, f1_score, roc_curve, roc_auc_score, RocCurveDisplay, auc
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.model_selection import RandomizedSearchCV

def convert(data, encoders=None):
    """Label-encode the categorical columns of ``data`` in place.

    The ``operator``, ``methodReturn`` and ``isKilled`` columns are replaced
    by the integer codes produced by ``preprocessing.LabelEncoder``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the three columns named above. It is modified in place.
    encoders : dict, optional
        Mapping of column name -> fitted ``LabelEncoder``. A column that is
        missing from the mapping is fitted on ``data`` and its encoder is
        stored in the mapping; a column that is already present is only
        transformed, so a second frame (for example a test set) receives the
        same codes as the frame the encoders were fitted on. When omitted,
        every column is fitted afresh on ``data``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with the three columns encoded.

    Raises
    ------
    ValueError
        Raised by ``LabelEncoder.transform`` when a reused encoder meets a
        value it was not fitted on.
    """
    if encoders is None:
        # No shared state requested: fit throw-away encoders on this frame.
        encoders = {}
    for column in ('operator', 'methodReturn', 'isKilled'):
        if column in encoders:
            # Reuse the existing mapping so codes stay consistent across frames.
            data[column] = encoders[column].transform(data[column])
        else:
            encoders[column] = preprocessing.LabelEncoder()
            data[column] = encoders[column].fit_transform(data[column])
    return data

# Load the training set from CSV
training_frame = pd.read_csv('py_files/training_data.csv', encoding='unicode_escape')

# Label-encode the operator, methodReturn and isKilled columns
training_frame = convert(training_frame)

# The target to predict is the encoded isKilled column
labels = np.array(training_frame['isKilled'])

# Every remaining column is a predictor; keep the column names for later use
predictor_frame = training_frame.drop(columns='isKilled')
feature_list = list(predictor_frame.columns)

# The models are trained on a plain numpy matrix
features = np.array(predictor_frame)

In [2]:
##Validation set create training and testing vars
# Hold out 30% of the rows for validation. The split is seeded so that
# re-running the notebook reproduces the same partition and therefore the
# same scores in every cell below.
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=42)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(414269, 14) (414269,)
(177544, 14) (177544,)


In [None]:
# Baseline model: an SVM classifier with a linear kernel.
# fit() returns the estimator itself, so construction and training are chained.
clf = svm.SVC(kernel='linear').fit(X_train, y_train)

# Class predictions for the held-out split
y_pred = clf.predict(X_test)

# Report the fraction of held-out rows that were classified correctly
print("Accuracy:", accuracy_score(y_test, y_pred))

In [None]:
# Fit one SVC per kernel configuration on the training split (C=1 throughout).
# The gamma=0.1 model is named `rbf_0_1`: a dot is not allowed in a Python
# identifier, so `rbf_0.1 = ...` is a SyntaxError.
linear = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
rbf_0_1 = svm.SVC(kernel='rbf', gamma=0.1, C=1).fit(X_train, y_train)
rbf_1 = svm.SVC(kernel='rbf', gamma=1, C=1).fit(X_train, y_train)
rbf_5 = svm.SVC(kernel='rbf', gamma=5, C=1).fit(X_train, y_train)
rbf_10 = svm.SVC(kernel='rbf', gamma=10, C=1).fit(X_train, y_train)
poly_1 = svm.SVC(kernel='poly', degree=1, C=1).fit(X_train, y_train)
poly_2 = svm.SVC(kernel='poly', degree=2, C=1).fit(X_train, y_train)
poly_3 = svm.SVC(kernel='poly', degree=3, C=1).fit(X_train, y_train)
poly_4 = svm.SVC(kernel='poly', degree=4, C=1).fit(X_train, y_train)
poly_5 = svm.SVC(kernel='poly', degree=5, C=1).fit(X_train, y_train)
sig = svm.SVC(kernel='sigmoid', C=1).fit(X_train, y_train)

#Predict the response for test dataset
y_pred_linear = linear.predict(X_test)
y_pred_rbf_0_1 = rbf_0_1.predict(X_test)
y_pred_rbf_1 = rbf_1.predict(X_test)
y_pred_rbf_5 = rbf_5.predict(X_test)
y_pred_rbf_10 = rbf_10.predict(X_test)
# Each polynomial prediction comes from the model of the matching degree
# (there is no model called `poly`).
y_pred_poly_1 = poly_1.predict(X_test)
y_pred_poly_2 = poly_2.predict(X_test)
y_pred_poly_3 = poly_3.predict(X_test)
y_pred_poly_4 = poly_4.predict(X_test)
y_pred_poly_5 = poly_5.predict(X_test)
y_pred_sig = sig.predict(X_test)

# Accuracy of every configuration on the held-out split
print("SVM Linear kernel Accuracy:",accuracy_score(y_test, y_pred_linear))
print("SVM rbf kernel gamma=0.1 Accuracy:",accuracy_score(y_test, y_pred_rbf_0_1))
print("SVM rbf kernel gamma=1 Accuracy:",accuracy_score(y_test, y_pred_rbf_1))
print("SVM rbf kernel gamma=5 Accuracy:",accuracy_score(y_test, y_pred_rbf_5))
print("SVM rbf kernel gamma=10 Accuracy:",accuracy_score(y_test, y_pred_rbf_10))
print("SVM polynomial kernel degree=1 Accuracy:",accuracy_score(y_test, y_pred_poly_1))
print("SVM polynomial kernel degree=2 Accuracy:",accuracy_score(y_test, y_pred_poly_2))
print("SVM polynomial kernel degree=3 Accuracy:",accuracy_score(y_test, y_pred_poly_3))
print("SVM polynomial kernel degree=4 Accuracy:",accuracy_score(y_test, y_pred_poly_4))
print("SVM polynomial kernel degree=5 Accuracy:",accuracy_score(y_test, y_pred_poly_5))
print("SVM sigmoid Accuracy:",accuracy_score(y_test, y_pred_sig))

In [None]:
##Plot feature importance
# SVC has no `feature_importances_` attribute. With kernel='linear' the fitted
# model exposes `coef_` (one row of per-feature weights per pair of classes);
# the mean absolute weight is used as the importance of each feature.
# NOTE(review): the features are passed to the model unscaled, so weight
# magnitudes also reflect each column's scale — confirm this ranking is wanted.
feature_names = ['DepthTree', 'NumSubclass', 'McCabe', 'LOC','DepthNested','CA','CE','Instability','numCovered','operator','methodReturn','numTestsCover','mutantAssert','classAssert']
importances_linear = np.abs(linear.coef_).mean(axis=0)
linear_importances = pd.Series(importances_linear, index=feature_names).sort_values(ascending=False)

#Feature importance
fig, ax = plt.subplots()
linear_importances.plot.bar(ax=ax)
ax.set_title("SVM:Linear Feature importances")
ax.set_ylabel("Merit")
fig.tight_layout()

In [None]:
##Plot feature importance
# SVC has no `feature_importances_` attribute, and `coef_` is only available
# with kernel='linear', so importance is estimated by permutation: the mean
# drop in held-out accuracy when a single column is shuffled.
# Cost: n_repeats * n_features prediction passes over X_test; pass a row
# subsample instead if that is too slow.
# The gamma=0.1 model is referred to as `rbf_0_1` because a dot is not allowed
# in a Python identifier.
feature_names = ['DepthTree', 'NumSubclass', 'McCabe', 'LOC','DepthNested','CA','CE','Instability','numCovered','operator','methodReturn','numTestsCover','mutantAssert','classAssert']
perm_rbf_0_1 = permutation_importance(rbf_0_1, X_test, y_test, n_repeats=5, random_state=0)
importances_rbf_0_1 = perm_rbf_0_1.importances_mean
rbf_0_1_importances = pd.Series(importances_rbf_0_1, index=feature_names).sort_values(ascending=False)

#Feature importance
fig, ax = plt.subplots()
rbf_0_1_importances.plot.bar(ax=ax)
ax.set_title("SVM:rbf:0.1 Feature importances")
ax.set_ylabel("Merit")
fig.tight_layout()

In [None]:
##Plot feature importance
# SVC has no `feature_importances_` attribute, and `coef_` is only available
# with kernel='linear', so importance is estimated by permutation: the mean
# drop in held-out accuracy when a single column is shuffled.
# Cost: n_repeats * n_features prediction passes over X_test; pass a row
# subsample instead if that is too slow.
feature_names = ['DepthTree', 'NumSubclass', 'McCabe', 'LOC','DepthNested','CA','CE','Instability','numCovered','operator','methodReturn','numTestsCover','mutantAssert','classAssert']
perm_rbf_1 = permutation_importance(rbf_1, X_test, y_test, n_repeats=5, random_state=0)
importances_rbf_1 = perm_rbf_1.importances_mean
rbf_1_importances = pd.Series(importances_rbf_1, index=feature_names).sort_values(ascending=False)

#Feature importance
fig, ax = plt.subplots()
rbf_1_importances.plot.bar(ax=ax)
ax.set_title("SVM:rbf:1 Feature importances")
ax.set_ylabel("Merit")
fig.tight_layout()

In [None]:
##Plot feature importance
# SVC has no `feature_importances_` attribute, and `coef_` is only available
# with kernel='linear', so importance is estimated by permutation: the mean
# drop in held-out accuracy when a single column is shuffled.
# Cost: n_repeats * n_features prediction passes over X_test; pass a row
# subsample instead if that is too slow.
feature_names = ['DepthTree', 'NumSubclass', 'McCabe', 'LOC','DepthNested','CA','CE','Instability','numCovered','operator','methodReturn','numTestsCover','mutantAssert','classAssert']
perm_rbf_5 = permutation_importance(rbf_5, X_test, y_test, n_repeats=5, random_state=0)
importances_rbf_5 = perm_rbf_5.importances_mean
rbf_5_importances = pd.Series(importances_rbf_5, index=feature_names).sort_values(ascending=False)

#Feature importance
fig, ax = plt.subplots()
rbf_5_importances.plot.bar(ax=ax)
ax.set_title("SVM:rbf:5 Feature importances")
ax.set_ylabel("Merit")
fig.tight_layout()

In [None]:
##Plot feature importance
# SVC has no `feature_importances_` attribute, and `coef_` is only available
# with kernel='linear', so importance is estimated by permutation: the mean
# drop in held-out accuracy when a single column is shuffled.
# Cost: n_repeats * n_features prediction passes over X_test; pass a row
# subsample instead if that is too slow.
feature_names = ['DepthTree', 'NumSubclass', 'McCabe', 'LOC','DepthNested','CA','CE','Instability','numCovered','operator','methodReturn','numTestsCover','mutantAssert','classAssert']
perm_rbf_10 = permutation_importance(rbf_10, X_test, y_test, n_repeats=5, random_state=0)
importances_rbf_10 = perm_rbf_10.importances_mean
rbf_10_importances = pd.Series(importances_rbf_10, index=feature_names).sort_values(ascending=False)

#Feature importance
fig, ax = plt.subplots()
rbf_10_importances.plot.bar(ax=ax)
ax.set_title("SVM:rbf:10 Feature importances")
ax.set_ylabel("Merit")
fig.tight_layout()

In [None]:
##Plot feature importance
# SVC has no `feature_importances_` attribute, and `coef_` is only available
# with kernel='linear', so importance is estimated by permutation: the mean
# drop in held-out accuracy when a single column is shuffled.
# Cost: n_repeats * n_features prediction passes over X_test; pass a row
# subsample instead if that is too slow.
feature_names = ['DepthTree', 'NumSubclass', 'McCabe', 'LOC','DepthNested','CA','CE','Instability','numCovered','operator','methodReturn','numTestsCover','mutantAssert','classAssert']
perm_poly_1 = permutation_importance(poly_1, X_test, y_test, n_repeats=5, random_state=0)
importances_poly_1 = perm_poly_1.importances_mean
poly_1_importances = pd.Series(importances_poly_1, index=feature_names).sort_values(ascending=False)

#Feature importance
fig, ax = plt.subplots()
poly_1_importances.plot.bar(ax=ax)
ax.set_title("SVM:poly:degree:1 Feature importances")
ax.set_ylabel("Merit")
fig.tight_layout()

In [None]:
##Plot feature importance
# SVC has no `feature_importances_` attribute, and `coef_` is only available
# with kernel='linear', so importance is estimated by permutation: the mean
# drop in held-out accuracy when a single column is shuffled.
# Cost: n_repeats * n_features prediction passes over X_test; pass a row
# subsample instead if that is too slow.
feature_names = ['DepthTree', 'NumSubclass', 'McCabe', 'LOC','DepthNested','CA','CE','Instability','numCovered','operator','methodReturn','numTestsCover','mutantAssert','classAssert']
perm_poly_2 = permutation_importance(poly_2, X_test, y_test, n_repeats=5, random_state=0)
importances_poly_2 = perm_poly_2.importances_mean
poly_2_importances = pd.Series(importances_poly_2, index=feature_names).sort_values(ascending=False)

#Feature importance
fig, ax = plt.subplots()
poly_2_importances.plot.bar(ax=ax)
ax.set_title("SVM:poly:degree:2 Feature importances")
ax.set_ylabel("Merit")
fig.tight_layout()

In [None]:
##Plot feature importance
# SVC has no `feature_importances_` attribute, and `coef_` is only available
# with kernel='linear', so importance is estimated by permutation: the mean
# drop in held-out accuracy when a single column is shuffled.
# Cost: n_repeats * n_features prediction passes over X_test; pass a row
# subsample instead if that is too slow.
feature_names = ['DepthTree', 'NumSubclass', 'McCabe', 'LOC','DepthNested','CA','CE','Instability','numCovered','operator','methodReturn','numTestsCover','mutantAssert','classAssert']
perm_poly_3 = permutation_importance(poly_3, X_test, y_test, n_repeats=5, random_state=0)
importances_poly_3 = perm_poly_3.importances_mean
poly_3_importances = pd.Series(importances_poly_3, index=feature_names).sort_values(ascending=False)

#Feature importance
fig, ax = plt.subplots()
poly_3_importances.plot.bar(ax=ax)
ax.set_title("SVM:poly:degree:3 Feature importances")
ax.set_ylabel("Merit")
fig.tight_layout()

In [None]:
##Plot feature importance
# SVC has no `feature_importances_` attribute, and `coef_` is only available
# with kernel='linear', so importance is estimated by permutation: the mean
# drop in held-out accuracy when a single column is shuffled.
# Cost: n_repeats * n_features prediction passes over X_test; pass a row
# subsample instead if that is too slow.
feature_names = ['DepthTree', 'NumSubclass', 'McCabe', 'LOC','DepthNested','CA','CE','Instability','numCovered','operator','methodReturn','numTestsCover','mutantAssert','classAssert']
perm_poly_4 = permutation_importance(poly_4, X_test, y_test, n_repeats=5, random_state=0)
importances_poly_4 = perm_poly_4.importances_mean
poly_4_importances = pd.Series(importances_poly_4, index=feature_names).sort_values(ascending=False)

#Feature importance
fig, ax = plt.subplots()
poly_4_importances.plot.bar(ax=ax)
ax.set_title("SVM:poly:degree:4 Feature importances")
ax.set_ylabel("Merit")
fig.tight_layout()

In [None]:
##Plot feature importance
# SVC has no `feature_importances_` attribute, and `coef_` is only available
# with kernel='linear', so importance is estimated by permutation: the mean
# drop in held-out accuracy when a single column is shuffled.
# Cost: n_repeats * n_features prediction passes over X_test; pass a row
# subsample instead if that is too slow.
feature_names = ['DepthTree', 'NumSubclass', 'McCabe', 'LOC','DepthNested','CA','CE','Instability','numCovered','operator','methodReturn','numTestsCover','mutantAssert','classAssert']
perm_poly_5 = permutation_importance(poly_5, X_test, y_test, n_repeats=5, random_state=0)
importances_poly_5 = perm_poly_5.importances_mean
poly_5_importances = pd.Series(importances_poly_5, index=feature_names).sort_values(ascending=False)

#Feature importance
fig, ax = plt.subplots()
poly_5_importances.plot.bar(ax=ax)
ax.set_title("SVM:poly:degree:5 Feature importances")
ax.set_ylabel("Merit")
fig.tight_layout()

In [None]:
##Plot feature importance
# SVC has no `feature_importances_` attribute, and `coef_` is only available
# with kernel='linear', so importance is estimated by permutation: the mean
# drop in held-out accuracy when a single column is shuffled.
# Cost: n_repeats * n_features prediction passes over X_test; pass a row
# subsample instead if that is too slow.
feature_names = ['DepthTree', 'NumSubclass', 'McCabe', 'LOC','DepthNested','CA','CE','Instability','numCovered','operator','methodReturn','numTestsCover','mutantAssert','classAssert']
perm_sig = permutation_importance(sig, X_test, y_test, n_repeats=5, random_state=0)
importances_sig = perm_sig.importances_mean
sig_importances = pd.Series(importances_sig, index=feature_names).sort_values(ascending=False)

#Feature importance
fig, ax = plt.subplots()
sig_importances.plot.bar(ax=ax)
ax.set_title("SVM:sigmoid Feature importances")
ax.set_ylabel("Merit")
fig.tight_layout()

In [None]:
##Input test data
test_frame = pd.read_csv('py_files/testing_data.csv', encoding='unicode_escape')

# Label-encode the operator, methodReturn and isKilled columns.
# NOTE(review): this call fits fresh encoders on the test file, so its integer
# codes match the training codes only if both files contain the same set of
# category values — confirm.
test_frame = convert(test_frame)

# The target to predict is the encoded isKilled column
test_labels = np.array(test_frame['isKilled'])

# Every remaining column is a predictor; keep the column names for later use
test_predictors = test_frame.drop(columns='isKilled')
feature_list = list(test_predictors.columns)

# The models predict on a plain numpy matrix
test = np.array(test_predictors)

In [None]:
#Predict test
# Score every fitted kernel configuration on the separate test file.
# The gamma=0.1 model is referred to as `rbf_0_1` because a dot is not allowed
# in a Python identifier (`rbf_0.1` is a SyntaxError).
y_pred_test_linear = linear.predict(test)
y_pred_test_rbf_0_1 = rbf_0_1.predict(test)
y_pred_test_rbf_1 = rbf_1.predict(test)
y_pred_test_rbf_5 = rbf_5.predict(test)
y_pred_test_rbf_10 = rbf_10.predict(test)
# Each polynomial prediction comes from the model of the matching degree
# (there is no model called `poly`).
y_pred_test_poly_1 = poly_1.predict(test)
y_pred_test_poly_2 = poly_2.predict(test)
y_pred_test_poly_3 = poly_3.predict(test)
y_pred_test_poly_4 = poly_4.predict(test)
y_pred_test_poly_5 = poly_5.predict(test)
y_pred_test_sig = sig.predict(test)


print("SVM Linear kernel Accuracy on test:", accuracy_score(test_labels, y_pred_test_linear))
print("SVM rbf kernel gamma=0.1 Accuracy on test:",accuracy_score(test_labels, y_pred_test_rbf_0_1))
print("SVM rbf kernel gamma=1 Accuracy on test:",accuracy_score(test_labels, y_pred_test_rbf_1))
print("SVM rbf kernel gamma=5 Accuracy on test:",accuracy_score(test_labels, y_pred_test_rbf_5))
print("SVM rbf kernel gamma=10 Accuracy on test:",accuracy_score(test_labels, y_pred_test_rbf_10))
print("SVM polynomial kernel degree=1 Accuracy on test:",accuracy_score(test_labels, y_pred_test_poly_1))
print("SVM polynomial kernel degree=2 Accuracy on test:",accuracy_score(test_labels, y_pred_test_poly_2))
print("SVM polynomial kernel degree=3 Accuracy on test:",accuracy_score(test_labels, y_pred_test_poly_3))
print("SVM polynomial kernel degree=4 Accuracy on test:",accuracy_score(test_labels, y_pred_test_poly_4))
print("SVM polynomial kernel degree=5 Accuracy on test:",accuracy_score(test_labels, y_pred_test_poly_5))
print("SVM sigmoid Accuracy on test:",accuracy_score(test_labels, y_pred_test_sig))