In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_recall_fscore_support, f1_score, roc_curve, roc_auc_score, RocCurveDisplay, auc
from collections import Counter
from sklearn import preprocessing
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.pipeline import Pipeline


def convert(data):
    number = preprocessing.LabelEncoder()
    data['operator'] = number.fit_transform(data.operator)
    data['methodReturn'] = number.fit_transform(data.methodReturn)
    data['isKilled'] = number.fit_transform(data.isKilled)
    return data

# Read in data and display first 5 rows
features = pd.read_csv('training_data.csv',encoding= 'unicode_escape')
#print(features)
#print('The shape of our features is:', features.shape)

#Convert string to float
features=convert(features)
print(features)
print('The shape of our features is:', features.shape) 

# Use numpy to convert to arrays
# Labels are the values we want to predict
labels = np.array(features['isKilled'])
# Remove the labels from the features
# axis 1 refers to the columns
features= features.drop('isKilled', axis = 1)
# Saving feature names for later use
feature_list = list(features.columns)
# Convert to numpy array
features = np.array(features)

        DepthTree  NumSubclass  McCabe  LOC  DepthNested   CA  CE  \
0               2            0       1    3            1  364  16   
1               2            0       1    3            1  364  16   
2               2            0       1    3            1  364  16   
3               2            0       1    3            1  364  16   
4               2            0       1    3            1  364  16   
...           ...          ...     ...  ...          ...  ...  ..   
591808          3            0       1    3            1    1  11   
591809          2            0       1    3            1    1  11   
591810          3            0       1    3            1    1  11   
591811          1            0       1    3            1    1  11   
591812          2            0       1    3            1    1  11   

        Instability  numCovered  operator  methodReturn  numTestsCover  \
0             0.042          12         7             7             17   
1             0.042    

In [2]:
##Input test data
positive_prob = []
test = pd.read_csv('testing_data.csv',encoding= 'unicode_escape')

#Convert string to float
test=convert(test)

# ## Labels are the values we want to predict
test_labels = np.array(test['isKilled'])

# # Remove the labels from the features
# # axis 1 refers to the columns
test= test.drop('isKilled', axis = 1)
# # Saving feature names for later use
feature_list = list(test.columns)
# # Convert to numpy array
test = np.array(test)

In [3]:

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.5,random_state=0)
print (X_train.shape, y_train.shape)
print (X_test.shape, y_test.shape)

(295906, 14) (295906,)
(295907, 14) (295907,)


In [4]:
logreg = LogisticRegression(solver='lbfgs', penalty='none', class_weight=None, max_iter=1000)

#default = lbfgs 0.941 for training 0.891 for testing
#No change in using l2 regularization
#Using balanced class weight: 0.917 for training 0.849 for testing


#liblinear:  0.798 for training, 0.639 for testing
#l1 regularization: 0.800 for training, 0.690 for testing

#sag: 0.93 for training 0.844 for testing

#saga: 0.939 for training 0.867 for testing - reduces after increasing max_iter
#no change using l1 regularization
#no change using l2 regularization
#elasticnet -> does not apply

#newton-cg: line search algorithm does not converge

logreg.fit(X_train, y_train)
y_pred_train_logreg = logreg.predict(X_test)
print(accuracy_score(y_test, y_pred_train_logreg))
print(confusion_matrix(y_test, y_pred_train_logreg))

y_pred_test = logreg.predict(test)

print(accuracy_score(test_labels, y_pred_test))
confusion = confusion_matrix(test_labels, y_pred_test)
print("Confusion_matrix:\n",confusion)
print(precision_recall_fscore_support(test_labels, y_pred_test, average='micro'))
print(f1_score(test_labels, y_pred_test, average='micro'))




0.9409882158921553
[[217779  17337]
 [   125  60666]]
0.8915766505939714
Confusion_matrix:
 [[76882 16100]
 [    0 55510]]
(0.8915766505939714, 0.8915766505939714, 0.8915766505939714, None)
0.8915766505939714


Using Recursive Feature Elimination

In [79]:
print("------------------------- RFE with 6 features ----------------------------------")
rfe6 = RFE(estimator=logreg, n_features_to_select=6)
pipeline6 = Pipeline(steps=[('s',rfe6),('m',logreg)])

pipeline6.fit(X_train,y_train)

print("********* CROSS VALIDATION *************")
y_pred_train_logreg = pipeline6.predict(X_test)
print(accuracy_score(y_test, y_pred_train_logreg))
print(confusion_matrix(y_test, y_pred_train_logreg))


print("********* TEST *************")
y_pred_test = pipeline6.predict(test)

print(accuracy_score(test_labels, y_pred_test))
confusion = confusion_matrix(test_labels, y_pred_test)
print("Confusion_matrix:\n",confusion)
print(precision_recall_fscore_support(test_labels, y_pred_test, average='micro'))
print(f1_score(test_labels, y_pred_test, average='micro'))

important_features = rfe6.get_feature_names_out()
print(important_features)

------------------------- RFE with 6 features ----------------------------------
********* CROSS VALIDATION *************
0.9419243208170135
[[218056  17060]
 [   125  60666]]
********* TEST *************
0.892957196347278
Confusion_matrix:
 [[77087 15895]
 [    0 55510]]
(0.892957196347278, 0.892957196347278, 0.892957196347278, None)
0.892957196347278
['x2' 'x3' 'x5' 'x6' 'x8' 'x10']


OPTIMAL?

In [80]:
print("------------------------- RFE with 8 features ----------------------------------")
rfe4 = RFE(estimator=logreg, n_features_to_select=8)
pipeline4 = Pipeline(steps=[('s',rfe4),('m',logreg)])

pipeline4.fit(X_train,y_train)

print("********* CROSS VALIDATION *************")
y_pred_train_logreg = pipeline4.predict(X_test)
print(accuracy_score(y_test, y_pred_train_logreg))
print(confusion_matrix(y_test, y_pred_train_logreg))


print("********* TEST *************")
y_pred_test = pipeline4.predict(test)

print(accuracy_score(test_labels, y_pred_test))
confusion = confusion_matrix(test_labels, y_pred_test)
print("Confusion_matrix:\n",confusion)
print(precision_recall_fscore_support(test_labels, y_pred_test, average='micro'))
print(f1_score(test_labels, y_pred_test, average='micro'))

important_features = rfe4.get_feature_names_out()
print(important_features)

------------------------- RFE with 8 features ----------------------------------
********* CROSS VALIDATION *************
0.9419243208170135
[[218056  17060]
 [   125  60666]]
********* TEST *************
0.892957196347278
Confusion_matrix:
 [[77087 15895]
 [    0 55510]]
(0.892957196347278, 0.892957196347278, 0.892957196347278, None)
0.892957196347278
['x2' 'x3' 'x5' 'x6' 'x8' 'x9' 'x10' 'x13']


In [5]:
print("------------------------- RFE with 3 features ----------------------------------")
rfe3 = RFE(estimator=logreg, n_features_to_select=1)
pipeline3 = Pipeline(steps=[('s',rfe3),('m',logreg)])

pipeline3.fit(X_train,y_train)

print("********* CROSS VALIDATION *************")
y_pred_train_logreg = pipeline3.predict(X_test)
print(accuracy_score(y_test, y_pred_train_logreg))
print(confusion_matrix(y_test, y_pred_train_logreg))


print("********* TEST *************")
y_pred_test = pipeline3.predict(test)

print(accuracy_score(test_labels, y_pred_test))
confusion = confusion_matrix(test_labels, y_pred_test)
print("Confusion_matrix:\n",confusion)
print(precision_recall_fscore_support(test_labels, y_pred_test, average='micro'))
print(f1_score(test_labels, y_pred_test, average='micro'))

important_features = rfe3.get_feature_names_out()
print(important_features)

------------------------- RFE with 3 features ----------------------------------
********* CROSS VALIDATION *************
0.9421034311455965
[[217990  17126]
 [     6  60785]]
********* TEST *************
0.892957196347278
Confusion_matrix:
 [[77087 15895]
 [    0 55510]]
(0.892957196347278, 0.892957196347278, 0.892957196347278, None)
0.892957196347278
['x8']


In [13]:
X1_train, X1_test, y1_train, y1_test = train_test_split(features, labels, test_size=0.99,random_state=0)
print (X1_train.shape, y1_train.shape)
print (X1_test.shape, y1_test.shape)

(5918, 14) (5918,)
(585895, 14) (585895,)


In [14]:
logreg = LogisticRegression(solver='lbfgs', penalty='none', class_weight=None, max_iter=1000)


logreg.fit(X1_train, y1_train)
y1_pred_train_logreg = logreg.predict(X1_test)
print(accuracy_score(y1_test, y1_pred_train_logreg))
print(confusion_matrix(y1_test, y1_pred_train_logreg))

y_pred_test = logreg.predict(test)

print(accuracy_score(test_labels, y_pred_test))
confusion = confusion_matrix(test_labels, y_pred_test)
print("Confusion_matrix:\n",confusion)
print(precision_recall_fscore_support(test_labels, y_pred_test, average='micro'))
print(f1_score(test_labels, y_pred_test, average='micro'))

0.8028844758873177
[[458774   6710]
 [108779  11632]]
0.6960981062952887
Confusion_matrix:
 [[89858  3124]
 [42003 13507]]
(0.6960981062952887, 0.6960981062952887, 0.6960981062952887, None)
0.6960981062952887
