In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_recall_fscore_support, f1_score, roc_curve, roc_auc_score, RocCurveDisplay, auc
from collections import Counter
from sklearn import preprocessing
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier


def convert(data):
    number = preprocessing.LabelEncoder()
    data['operator'] = number.fit_transform(data.operator)
    data['methodReturn'] = number.fit_transform(data.methodReturn)
    data['isKilled'] = number.fit_transform(data.isKilled)
    return data

# Read in data and display first 5 rows
features = pd.read_csv('training_data.csv',encoding= 'unicode_escape')
#print(features)
#print('The shape of our features is:', features.shape)

#Convert string to float
features=convert(features)
print(features)
print('The shape of our features is:', features.shape) 

# Use numpy to convert to arrays
# Labels are the values we want to predict
labels = np.array(features['isKilled'])
# Remove the labels from the features
# axis 1 refers to the columns
features= features.drop('isKilled', axis = 1)
# Saving feature names for later use
feature_list = list(features.columns)
# Convert to numpy array
features = np.array(features)

        DepthTree  NumSubclass  McCabe  LOC  DepthNested   CA  CE  \
0               2            0       1    3            1  364  16   
1               2            0       1    3            1  364  16   
2               2            0       1    3            1  364  16   
3               2            0       1    3            1  364  16   
4               2            0       1    3            1  364  16   
...           ...          ...     ...  ...          ...  ...  ..   
591808          3            0       1    3            1    1  11   
591809          2            0       1    3            1    1  11   
591810          3            0       1    3            1    1  11   
591811          1            0       1    3            1    1  11   
591812          2            0       1    3            1    1  11   

        Instability  numCovered  operator  methodReturn  numTestsCover  \
0             0.042          12         7             7             17   
1             0.042    

In [2]:
##Input test data
positive_prob = []
test = pd.read_csv('testing_data.csv',encoding= 'unicode_escape')

#Convert string to float
test=convert(test)

# ## Labels are the values we want to predict
test_labels = np.array(test['isKilled'])

# # Remove the labels from the features
# # axis 1 refers to the columns
test= test.drop('isKilled', axis = 1)
# # Saving feature names for later use
feature_list = list(test.columns)
# # Convert to numpy array
test = np.array(test)

In [3]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.5,random_state=0)
print (X_train.shape, y_train.shape)
print (X_test.shape, y_test.shape)

(295906, 14) (295906,)
(295907, 14) (295907,)


In [10]:
logregSGD = SGDClassifier(loss="log")

logregSGD.fit(X_train, y_train)
y_pred_train_logreg = logregSGD.predict(X_test)
print(accuracy_score(y_test, y_pred_train_logreg))
print(confusion_matrix(y_test, y_pred_train_logreg))

y_pred_test = logregSGD.predict(test)

print(accuracy_score(test_labels, y_pred_test))
confusion = confusion_matrix(test_labels, y_pred_test)
print("Confusion_matrix:\n",confusion)
print(precision_recall_fscore_support(test_labels, y_pred_test, average='micro'))
print(f1_score(test_labels, y_pred_test, average='micro'))


0.9420223245817098
[[217966  17150]
 [     6  60785]]
0.8925733372841634
Confusion_matrix:
 [[77030 15952]
 [    0 55510]]
(0.8925733372841634, 0.8925733372841634, 0.8925733372841634, None)
0.8925733372841634


In [16]:
rfe = RFE(estimator=logregSGD, n_features_to_select=6)
rfe_sgd_pipeline = Pipeline(steps=[('r',rfe),('m',logregSGD)])

rfe_sgd_pipeline.fit(X_train,y_train)

print("********* CROSS VALIDATION *************")
y_pred_train_logreg = rfe_sgd_pipeline.predict(X_test)
print(accuracy_score(y_test, y_pred_train_logreg))
print(confusion_matrix(y_test, y_pred_train_logreg))


print("********* TEST *************")
y_pred_test = rfe_sgd_pipeline.predict(test)

print(accuracy_score(test_labels, y_pred_test))
confusion = confusion_matrix(test_labels, y_pred_test)
print("Confusion_matrix:\n",confusion)
print(precision_recall_fscore_support(test_labels, y_pred_test, average='micro'))
print(f1_score(test_labels, y_pred_test, average='micro'))

important_features = rfe.get_feature_names_out()
print(important_features)

********* CROSS VALIDATION *************
0.9421034311455965
[[217990  17126]
 [     6  60785]]
********* TEST *************
0.892957196347278
Confusion_matrix:
 [[77087 15895]
 [    0 55510]]
(0.892957196347278, 0.892957196347278, 0.892957196347278, None)
0.892957196347278
['x0' 'x1' 'x3' 'x4' 'x5' 'x8']
