In [None]:
import numpy as np
# linear algebra
import pandas as pd
# data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
# Select features => Fit features to model => Model prediction => Model validation

import os
# Kaggle boilerplate: list every file mounted under the input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


vulnerability_data = pd.read_csv("/kaggle/input/apachevulnerabilities/finalDataset.csv")
# Never elide elements when numpy arrays are printed.
np.set_printoptions(threshold=np.inf)


from sklearn.model_selection import train_test_split

# Creating the training and test data
# dropna() and drop_duplicates() return NEW frames; the result has to be
# assigned, otherwise the cleaning silently never happens.
# NOTE(review): dropna() removes a row when ANY column is missing - confirm
# that label columns such as severity/title are populated for every sample.
df = pd.DataFrame(vulnerability_data).dropna().drop_duplicates(keep='first')
# Seeded so that the split (and every score derived from it) is reproducible.
train, test = train_test_split(df, test_size=0.3, random_state=1)

from sklearn.model_selection import KFold
# prepare cross validation
kfold = KFold(10, shuffle=True, random_state=1)

# Candidate predictors: columns 4..51 of the frame, minus the label and
# metadata columns that fall inside that range. drop() builds a new frame,
# so nothing is mutated through a slice of df.
feature_columns = df.iloc[:, 4:52].drop(
    columns=['severity', 'title', 'version', 'vulnerable']
)

# Predictors and the three targets (vulnerable / severity / title) for the
# training and the test partition.
X = train[feature_columns.columns]
test_X = test[feature_columns.columns]
y = train.vulnerable
severity_y = train.severity
title_y = train.title
test_Y = test.vulnerable
severity_test_Y = test.severity
title_test_Y = test.title

In [None]:
def fit_feature_set(feature_set):
    X =train[feature_set_1]
    test_X = test[feature_set_1]
    y = train.vulnerable
    severity_y = train.severity
    title_y = train.title
    test_Y = test.vulnerable
    severity_test_Y = test.severity
    title_test_Y= test.title

In [None]:
# FIRST FEATURE SET SELECTION
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LinearRegression
# Sequential Forward Selection(sfs)
sfs = SFS(LinearRegression(), 
           k_features='best', 
           forward=True, # if forward = True then SFS otherwise SBS
           floating=False, 
           scoring='r2')
sfs.fit(X,y)
feature_set_1 = np.array(sfs.k_feature_names_)
print("Feature set 1 ready...")

In [None]:
# FOR SECOND FEATURE SET
from sklearn.feature_selection import RFE
rfe = RFE(estimator=DecisionTreeClassifier(), n_features_to_select=None)
# fit the model
rfe.fit(X, y)
# transform the data
feature_set_2= rfe.fit_transform(X,y)
for i in range(X.shape[1]):
    if(rfe.support_[i] == 1):
        #print('Column: %d, Selected %s, Rank: %.3f' % (i, rfe.support_[i], rfe.ranking_[i]))
        final_features = np.array(X.columns[i])
        print(final_features)
        #np.array(X.columns[i])

In [None]:
# Modern Feature selection techniques
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
#print(X)
X = X.astype(int)
chi2_features = SelectKBest(f_classif, k=3)
best_features = chi2_features.fit_transform(X, y)
print(best_features.scores)

In [None]:
fit_feature_set(feature_set_1)

In [None]:
fit_feature_set(feature_set_2)

In [None]:
# FOR HANDLING THE IMBALANCED VULNERABILITY STATUS
from imblearn.over_sampling  import RandomOverSampler

# Seeded so the randomly duplicated minority rows - and every score computed
# from them - are the same on each run.
# NOTE: X_ros / y_ros are also assigned by the severity and title oversampling
# cells; the model cells see whichever of the three ran last.
ros = RandomOverSampler(random_state=1)
X_ros, y_ros = ros.fit_resample(X, y)


In [None]:
# FOR HANDLING THE IMBALANCED VULNERABILITY SEVERITY
from imblearn.over_sampling  import RandomOverSampler

# Seeded so the randomly duplicated minority rows - and every score computed
# from them - are the same on each run.
# NOTE: X_ros / y_ros are also assigned by the status and title oversampling
# cells; the model cells see whichever of the three ran last.
ros = RandomOverSampler(random_state=1)
X_ros, y_ros = ros.fit_resample(X, severity_y)


In [None]:
# FOR HANDLING THE IMBALANCED VULNERABILITY TITLE
from imblearn.over_sampling  import RandomOverSampler

# Seeded so the randomly duplicated minority rows - and every score computed
# from them - are the same on each run.
# NOTE: X_ros / y_ros are also assigned by the status and severity oversampling
# cells; the model cells see whichever of the three ran last.
ros = RandomOverSampler(random_state=1)
X_ros, y_ros = ros.fit_resample(X, title_y)


In [None]:
from sklearn.metrics import confusion_matrix
def plot_confusionmatrix(y_train_pred,y_train,dom):
    """Print a heading and draw the confusion matrix as an annotated heat-map.

    Parameters
    ----------
    y_train_pred : array-like
        Labels predicted by the model.
    y_train : array-like
        Ground-truth labels for the same samples.
    dom : str
        Name of the data partition (e.g. 'Train' or 'Test'); only used in the
        printed heading.
    """
    print(f'{dom} Confusion matrix')
    # confusion_matrix expects (y_true, y_pred): rows are then the true labels
    # and columns the predicted ones. Passing them the other way round
    # transposes the matrix.
    cf = confusion_matrix(y_train, y_train_pred)
    ax = sns.heatmap(cf,annot=True,cmap='Blues', fmt='g')
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')
    plt.tight_layout()
    plt.show()

In [None]:
# Decision Tree Classifier (No PRUNING done)
from numpy import mean
from numpy import std
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

dt_model = DecisionTreeClassifier(random_state=0)
dt_model.fit(X_ros, y_ros)
y_train_pred = dt_model.predict(X)
y_test_pred = dt_model.predict(test_X)

# accuracy_score takes (y_true, y_pred).
print(f'Train score {accuracy_score(y, y_train_pred)}')
print(f'Test score {accuracy_score(test_Y, y_test_pred)}')

# NOTE(review): the folds are drawn from the already-oversampled data, so
# copies of one row can land in both the fitting and the validation part of a
# fold; treat this accuracy as optimistic.
scores = cross_val_score(dt_model, X_ros, y_ros, scoring='accuracy', cv=kfold, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

# Derive the candidate pruning strengths from the same (oversampled) data the
# pruned trees below are fitted on, so each alpha is an effective alpha of
# that tree.
path = dt_model.cost_complexity_pruning_path(X_ros, y_ros)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
# ccp_alpha must be non-negative; clamp in case round-off yields a tiny
# negative value.
ccp_alphas = np.maximum(ccp_alphas, 0.0)
print(mean(ccp_alphas))

# One tree per candidate alpha.
clfs = []
for ccp_alpha in ccp_alphas:
    clf = tree.DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    clf.fit(X_ros, y_ros)
    clfs.append(clf)

# Drop the largest alpha (and its tree) from the curves.
clfs = clfs[:-1]
ccp_alphas = ccp_alphas[:-1]
node_counts = [fitted.tree_.node_count for fitted in clfs]
depth = [fitted.tree_.max_depth for fitted in clfs]
plt.scatter(ccp_alphas,node_counts)
plt.scatter(ccp_alphas,depth)
plt.plot(ccp_alphas,node_counts,label='no of nodes',drawstyle="steps-post")
plt.plot(ccp_alphas,depth,label='depth',drawstyle="steps-post")
plt.legend()
plt.show()

# Accuracy of every pruned tree on the original training rows and on the test
# rows. y_train_pred / y_test_pred keep the unpruned tree's predictions.
train_acc = [accuracy_score(y, fitted.predict(X)) for fitted in clfs]
test_acc = [accuracy_score(test_Y, fitted.predict(test_X)) for fitted in clfs]

plt.scatter(ccp_alphas,train_acc)
plt.scatter(ccp_alphas,test_acc)
plt.plot(ccp_alphas,train_acc,label='train_accuracy',drawstyle="steps-post")
plt.plot(ccp_alphas,test_acc,label='test_accuracy',drawstyle="steps-post")
plt.legend()
plt.title('Accuracy vs alpha')
plt.show()


In [None]:
def decision_tree_results(X_ros, y_ros, test_Y, test_features=None, ccp_alpha=0.060):
    """Fit a cost-complexity-pruned decision tree and report its performance.

    Prints cross-validated precision, recall and accuracy on the training
    data, the train and test accuracy, both confusion matrices and - for a
    binary target - the precision-recall curve.

    Parameters
    ----------
    X_ros, y_ros : array-like
        Training features and labels (the oversampled data in this notebook).
    test_Y : array-like
        Ground-truth labels of the test partition, for the same target as
        ``y_ros``.
    test_features : array-like, optional
        Test features to predict on. Defaults to the module-level ``test_X``.
    ccp_alpha : float, optional
        Pruning strength passed to ``DecisionTreeClassifier``.
    """
    from numpy import mean
    from numpy import std
    from sklearn import metrics as sk_metrics
    from sklearn.model_selection import cross_val_score

    if test_features is None:
        test_features = test_X

    model = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    model.fit(X_ros, y_ros)

    # 'precision' / 'recall' are binary-only scorers; for the multiclass
    # targets (severity, title) fall back to the weighted averages instead of
    # producing NaN scores.
    is_binary = pd.Series(y_ros).nunique() == 2
    precision_scoring = 'precision' if is_binary else 'precision_weighted'
    recall_scoring = 'recall' if is_binary else 'recall_weighted'

    scores = cross_val_score(model, X_ros, y_ros, scoring='accuracy', cv=kfold, n_jobs=-1)
    print('Precision: %.3f' % mean(cross_val_score(model, X_ros, y_ros, scoring=precision_scoring, cv=kfold, n_jobs=-1)))
    print('Recall: %.3f' % mean(cross_val_score(model, X_ros, y_ros, scoring=recall_scoring, cv=kfold, n_jobs=-1)))
    # report performance
    print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

    y_train_pred = model.predict(X_ros)
    y_test_pred = model.predict(test_features)

    # accuracy_score takes (y_true, y_pred).
    print(f'Train score {accuracy_score(y_ros, y_train_pred)}')
    print(f'Test score {accuracy_score(test_Y, y_test_pred)}')
    plot_confusionmatrix(y_train_pred,y_ros,dom='Train')
    plot_confusionmatrix(y_test_pred,test_Y,dom='Test')

    if not is_binary:
        print('Precision-recall curve skipped (target is not binary)')
        return

    # plot_precision_recall_curve was removed from newer scikit-learn releases;
    # prefer the Display API and keep the old helper as a fallback.
    if hasattr(getattr(sk_metrics, 'PrecisionRecallDisplay', None), 'from_estimator'):
        sk_metrics.PrecisionRecallDisplay.from_estimator(model, X_ros, y_ros)
    else:
        sk_metrics.plot_precision_recall_curve(model, X_ros, y_ros)
    plt.show()

In [None]:
# Report the pruned decision tree on the current X_ros / y_ros, scoring its
# test predictions against test_Y (test.vulnerable).
# NOTE(review): X_ros / y_ros are reassigned by each oversampling cell; confirm
# the vulnerability-status oversampling cell was the last one executed.
decision_tree_results(X_ros, y_ros, test_Y)

In [None]:
# Report the pruned decision tree on the current X_ros / y_ros, scoring its
# test predictions against severity_test_Y (test.severity).
# NOTE(review): X_ros / y_ros are reassigned by each oversampling cell; confirm
# the severity oversampling cell was the last one executed.
decision_tree_results(X_ros, y_ros, severity_test_Y)

In [None]:
# Report the pruned decision tree on the current X_ros / y_ros, scoring its
# test predictions against title_test_Y (test.title).
# NOTE(review): X_ros / y_ros are reassigned by each oversampling cell; confirm
# the title oversampling cell was the last one executed.
decision_tree_results(X_ros, y_ros, title_test_Y)

In [None]:
# KNN
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn import tree
from numpy import mean
from numpy import std
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

model = KNN()
model.fit(X_ros, y_ros)

# Single test target for the whole cell, so the test score and the test
# confusion matrix are computed against the same labels.
# NOTE(review): this must be the target y_ros was resampled from. The title
# oversampling cell is the last one in notebook order, hence title_test_Y -
# change it if a different oversampling cell was run.
knn_test_Y = title_test_Y

# 'precision' / 'recall' are binary-only scorers; use the weighted averages
# when the target has more than two classes.
knn_is_binary = pd.Series(y_ros).nunique() == 2
knn_precision_scoring = 'precision' if knn_is_binary else 'precision_weighted'
knn_recall_scoring = 'recall' if knn_is_binary else 'recall_weighted'

scores = cross_val_score(model, X_ros, y_ros, scoring='accuracy', cv=kfold, n_jobs=-1)
print('Precision: %.3f' % mean(cross_val_score(model, X_ros, y_ros, scoring=knn_precision_scoring, cv=kfold, n_jobs=-1)))
print('Recall: %.3f' % mean(cross_val_score(model, X_ros, y_ros, scoring=knn_recall_scoring, cv=kfold, n_jobs=-1)))
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))
y_train_pred = model.predict(X_ros)
y_test_pred = model.predict(test_X)

# accuracy_score takes (y_true, y_pred).
print(f'Train score {accuracy_score(y_ros, y_train_pred)}')
print(f'Test score {accuracy_score(knn_test_Y, y_test_pred)}')

# plot_confusionmatrix is defined once, in the confusion-matrix helper cell
# above; it is not redefined here so the two copies cannot drift apart.
plot_confusionmatrix(y_train_pred,y_ros,dom='Train')
plot_confusionmatrix(y_test_pred,knn_test_Y,dom='Test')


In [None]:
# Logistic Regression (For explainable)
from sklearn import tree
from numpy import mean
from numpy import std
from sklearn import metrics as sk_metrics
from sklearn.model_selection import cross_val_score

# max_iter has to be a finite integer (np.inf is rejected) and the class_weight
# keyword is the lowercase string 'balanced'.
lr_model = LogisticRegression(max_iter=1000, solver='sag', class_weight='balanced')
lr_model.fit(X_ros, y_ros)
predictions2 = lr_model.predict(test_X)

scores = cross_val_score(lr_model, X_ros, y_ros, scoring='accuracy', cv=kfold, n_jobs=-1)
print('Precision: %.3f' % mean(cross_val_score(lr_model, X_ros, y_ros, scoring='precision', cv=kfold, n_jobs=-1)))
print('Recall: %.3f' % mean(cross_val_score(lr_model, X_ros, y_ros, scoring='recall', cv=kfold, n_jobs=-1)))
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

# plot_precision_recall_curve was removed from newer scikit-learn releases;
# prefer the Display API and keep the old helper as a fallback.
if hasattr(getattr(sk_metrics, 'PrecisionRecallDisplay', None), 'from_estimator'):
    sk_metrics.PrecisionRecallDisplay.from_estimator(lr_model, X_ros, y_ros)
else:
    sk_metrics.plot_precision_recall_curve(lr_model, X_ros, y_ros)
plt.show()



In [None]:
from sklearn import linear_model, decomposition, datasets
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# Pipeline: standardise -> PCA -> logistic regression.
std_slc = StandardScaler()
pca = decomposition.PCA()
# max_iter has to be a finite integer; np.inf is rejected.
logistic_Reg = linear_model.LogisticRegression(max_iter=1000)

pipe = Pipeline(steps=[('std_slc', std_slc),
                           ('pca', pca),
                           ('logistic_Reg', logistic_Reg)])

# Search space: every possible number of principal components, 50
# log-spaced regularisation strengths, l2 penalty only.
n_components = list(range(1,X.shape[1]+1,1))

C = np.logspace(-4, 4, 50)
penalty = ['l2']

parameters = dict(pca__n_components=n_components,
                      logistic_Reg__C=C,
                      logistic_Reg__penalty=penalty)
# The grid is large (len(n_components) * 50 candidates per fold); fit the
# candidates in parallel like the cross-validation calls elsewhere.
clf_GS = GridSearchCV(pipe, parameters, n_jobs=-1)
clf_GS.fit(X, y)

best_params = clf_GS.best_params_
print('Best Penalty:', best_params['logistic_Reg__penalty'])
print('Best C:', best_params['logistic_Reg__C'])
print('Best Number Of Components:', best_params['pca__n_components'])
print(clf_GS.best_estimator_.named_steps['logistic_Reg'])

In [None]:
# Logistic Regression (For explainable)
from sklearn import tree
from numpy import mean
from numpy import std
from sklearn.model_selection import cross_val_score

# Class-weighted logistic regression fitted on the oversampled training data.
lr_model = LogisticRegression(class_weight='balanced', solver='sag')
lr_model.fit(X_ros, y_ros)
predictions2 = lr_model.predict(test_X)

# 10-fold cross-validation on the training data: accuracy is kept in `scores`,
# precision and recall are printed as soon as they are computed.
scores = cross_val_score(lr_model, X_ros, y_ros, scoring='accuracy', cv=kfold, n_jobs=-1)
for template, metric in (('Precision: %.3f', 'precision'), ('Recall: %.3f', 'recall')):
    print(template % mean(cross_val_score(lr_model, X_ros, y_ros, scoring=metric, cv=kfold, n_jobs=-1)))
# report performance
accuracy_mean, accuracy_std = mean(scores), std(scores)
print('Accuracy: %.3f (%.3f)' % (accuracy_mean, accuracy_std))
#from sklearn.metrics import confusion_matrix
#cm = confusion_matrix(test_Y, predictions2)
#print(cm)
#from sklearn.metrics import r2_score
#print(r2_score(test_Y, predictions2))


In [None]:
# Naive Bayes Algorithm (Since the dataset is a bit too large too)
from sklearn import tree
from numpy import mean
from numpy import std
from sklearn import metrics as sk_metrics
from sklearn.model_selection import cross_val_score

nb_model = GaussianNB()
nb_model.fit(X_ros,y_ros)

scores = cross_val_score(nb_model, X_ros, y_ros, scoring='accuracy', cv=kfold, n_jobs=-1)
print('Precision: %.3f' % mean(cross_val_score(nb_model, X_ros, y_ros, scoring='precision', cv=kfold, n_jobs=-1)))
print('Recall: %.3f' % mean(cross_val_score(nb_model, X_ros, y_ros, scoring='recall', cv=kfold, n_jobs=-1)))
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

# Hard test-set predictions, kept for the ROC comparison cell.
predictions3 = nb_model.predict(test_X)

# plot_precision_recall_curve was removed from newer scikit-learn releases;
# prefer the Display API and keep the old helper as a fallback.
if hasattr(getattr(sk_metrics, 'PrecisionRecallDisplay', None), 'from_estimator'):
    sk_metrics.PrecisionRecallDisplay.from_estimator(nb_model, X_ros, y_ros)
else:
    sk_metrics.plot_precision_recall_curve(nb_model, X_ros, y_ros)
plt.show()

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score


def _positive_class_scores(model, features, pos_label=1):
    """Return the probability a fitted classifier assigns to ``pos_label``.

    Raises ValueError if ``pos_label`` is not one of ``model.classes_``.
    """
    class_index = list(model.classes_).index(pos_label)
    return model.predict_proba(features)[:, class_index]


# A ROC curve needs a continuous score per sample; hard 0/1 predictions give
# only a single operating point. The scores are taken from each fitted model
# explicitly: y_test_pred is reassigned by several earlier cells and does not
# reliably hold the decision tree's output at this point.
dt_scores = _positive_class_scores(dt_model, test_X)
lr_scores = _positive_class_scores(lr_model, test_X)
nb_scores = _positive_class_scores(nb_model, test_X)

# roc curve for models
fpr1, tpr1, thresh1 = roc_curve(test_Y, dt_scores, pos_label=1)
fpr2, tpr2, thresh2 = roc_curve(test_Y, lr_scores, pos_label=1)
fpr3, tpr3, thresh3 = roc_curve(test_Y, nb_scores, pos_label=1)
# roc curve for tpr = fpr
random_probs = [0 for i in range(len(test_Y))]
p_fpr, p_tpr, _ = roc_curve(test_Y, random_probs, pos_label=1)

# auc scores
auc_score1 = roc_auc_score(test_Y, dt_scores)
auc_score2 = roc_auc_score(test_Y, lr_scores)
auc_score3 = roc_auc_score(test_Y, nb_scores)
print('AUC Decision Tree: %.3f' % auc_score1)
print('AUC Logistic Regression: %.3f' % auc_score2)
print('AUC Naive Bayes: %.3f' % auc_score3)

# The bundled seaborn style was renamed to 'seaborn-v0_8' in newer matplotlib
# and the old name later removed; use whichever this installation provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# plot roc curves
plt.plot(fpr1, tpr1, linestyle='--',color='orange', label='Decision Tree')
plt.plot(fpr2, tpr2, linestyle='--',color='green', label='Logistic Regression')
plt.plot(fpr3, tpr3, linestyle='--',color='yellow', label='Naive Bayes')
plt.plot(p_fpr, p_tpr, linestyle='--', color='blue')
# title
plt.title('ROC curve')
# x label
plt.xlabel('False Positive Rate')
# y label
plt.ylabel('True Positive rate')

plt.legend(loc='best')
plt.savefig('ROC.pdf',dpi=300)
plt.show();