In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Record library and interpreter versions so the run can be reproduced later.
print(np.__version__)
print(pd.__version__)
import sys
print(sys.version)
print(sns.__version__)

In [None]:
# Load the cleaned CVSS dataset.
# NOTE(review): absolute, machine-specific Windows path — the notebook cannot run on another machine as written.
df = pd.read_csv("D:\\repos\\APDSProject\\MLTuning\\datacleanup\\cvss_final_dataset.csv")
df.head()  # not rendered: only the last expression of a cell is displayed
df.shape
#df.info()
#df.columns



# Remove Unwanted Columns

In [None]:

# Metadata, version and raw vector-string columns that are dropped from the frame.
remove_list = [
    'timestamp',
    'data_type',
    'data_format',
    'data_version',
    'data_meta_ASSIGNER',
    'cvssV3_version',
    'cvssV3_vectorString',
    'cvssV2_version',
    'cvssV2_vectorString',
]

# Reassign rather than mutate in place so the lineage of `df` stays explicit.
df = df.drop(columns=remove_list)
# The original referenced `df.info` without calling it, so no summary was ever printed.
df.info()
df.shape



# Prepare CVSS3 Data Frame




In [None]:
# CVSS v3 base-metric columns plus the score and severity columns kept for modelling.
cvss3_features = [
    'cvssV3_attackVector',
    'cvssV3_attackComplexity',
    'cvssV3_privilegesRequired',
    'cvssV3_userInteraction',
    'cvssV3_scope',
    'cvssV3_confidentialityImpact',
    'cvssV3_integrityImpact',
    'cvssV3_availabilityImpact',
    'cvssV3_baseScore',
    'cvssV3_baseSeverity',
    'baseMetricV3_exploitabilityScore',
    'baseMetricV3_impactScore',
]

# Explicit copy so cvss3_df is an independent frame rather than a slice of df.
cvss3_df = df[cvss3_features].copy()

# The original referenced `cvss3_df.info` without calling it, so nothing was printed.
cvss3_df.info()
cvss3_df.shape


# Extract the numerical and categorical columns from the CVSS Version 3 Feature Set

In [None]:
# describe() summarises only the numeric columns of a mixed-type frame by default,
# so its column labels are the numeric features.
cvssv3_numerical_col = list(cvss3_df.describe().columns)
# Keep the frame's own column order. A set difference yields an order that can change
# between kernel sessions (string hashing is randomised), which would reorder the
# one-hot feature columns from run to run.
cvssv3_categorical_col = [col for col in cvss3_df.columns if col not in cvssv3_numerical_col]

cvssv3_numerical_col


# Prepare the features for modelling

In [None]:
# Exclude the base-severity label from the categorical feature list.
# Rebuilding the list (instead of list.remove) keeps the cell idempotent:
# list.remove raises ValueError when the cell is run a second time.
cvssv3_categorical_col = [col for col in cvssv3_categorical_col if col != 'cvssV3_baseSeverity']
cvssv3_categorical_col


In [None]:

# Take an explicit copy: later cells add and rename columns on this frame, and doing
# that on a slice of cvss3_df triggers SettingWithCopyWarning and ambiguous results.
cvssv3_model_df = cvss3_df[cvssv3_categorical_col].copy()
cvssv3_model_df.shape

#cvssv3_model_df.info

In [None]:
# Per-column flag: True where the column contains at least one missing value.
cvssv3_model_df.isnull().any()

# Perform One Hot Encoding
# https://towardsdatascience.com/encoding-categorical-features-21a2651a065c

In [None]:
# One-hot encode the categorical feature columns into indicator columns.
cvssv3_model_encoded_df = pd.get_dummies(cvssv3_model_df[cvssv3_categorical_col])


In [None]:
# Display settings: two-decimal floats, up to 500 columns, 1000-character width.
pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
cvssv3_model_encoded_df.describe()  # not rendered: only the last expression of a cell is displayed
cvssv3_model_encoded_df.shape

In [None]:
#X = pd.concat([df['cvssV3_baseScore'], df_v3_model], axis=1)
# X is a second name for the encoded frame (no copy is made), so in-place changes
# to X also change cvssv3_model_encoded_df.
X = cvssv3_model_encoded_df
X.shape
#X.columns

# Prepare the features with basic categorization

In [None]:


# Load the CWE table that carries the 'Consequences' text.
# NOTE(review): absolute, machine-specific Windows path.
CWE_df = pd.read_csv("D:\\repos\\APDSProject\\MLTuning\\datacollection\\CVSS-Base.csv")

# Rows of CWE_df whose 'Consequences' value is present.
# NOTE(review): CWE_df_new is not used by the assignment below, which reads the unfiltered CWE_df — confirm which was intended.
CWE_df_new = CWE_df[CWE_df['Consequences'].notnull()]

# Column assignment aligns on index labels: row label i of CWE_df is paired with row label i of the model frame.
# NOTE(review): verify that the two files really share a row order/index; nothing here joins them on a key.
cvssv3_model_df['Consequences'] = CWE_df['Consequences']

cvssv3_model_df.shape

In [None]:
# Call the method: the bare attribute `.unique` only displays the bound-method repr.
cvssv3_model_df['Consequences'].unique()

[('DoS', 373),
 ('Bypass Protection Mechanism', 286),
 ('Read Application Data', 282),
 ('Execute Unauthorized Code or Commands', 248),
 ('Gain Privileges or Assume Identity', 234),
 (' Crash, Exit, or Restart', 181),
 ('Modify Memory', 172),
 ('Modify Application Data', 163),
 ('Varies by Context', 150),
 ('Unexpected State', 146),
 ('Read Memory', 129),
 ('Read Files or Directories', 100),
 ('Modify Files or Directories', 87),
 ('LIKELIHOOD', 86),
 ('Alter Execution Logic', 81),
 ('Reduce Maintainability', 77),
 ('Quality Degradation', 75),
 ('Hide Activities', 60),
 (' Resource Consumption (CPU)', 57),
 ('High', 57),
 (' Resource Consumption (Other)', 51),
 ('Reduce Reliability', 45),
 ('Authorization', 39),
 (' Resource Consumption (Memory)', 39),
 ('Reduce Performance', 36),
 (' Instability', 32),
 ('Accountability', 27),
 ('HighSCOPE', 14),
 ('Read MemorySCOPE', 10),
 (' Amplification', 9)]

In [None]:
def checkConsequence(severity):
    """Flag a consequence description that names both terms of any listed pair.

    Parameters
    ----------
    severity : str or other
        Text to search. Values without a ``find`` method (for example a NaN
        float from a missing cell, or None) are treated as "no match".

    Returns
    -------
    int
        1 when both substrings of at least one pair occur in ``severity``,
        otherwise 0.
    """
    paired_terms = (
        # Stability
        (' Crash, Exit, or Restart', 'Instability'),
        # Access
        ('Read Files or Directories', 'Modify Files or Directories'),
        # Authorization
        ('Execute Unauthorized Code or Commands', 'Gain Privileges or Assume Identity'),
    )
    try:
        # any/all short-circuit in the same order as a chained and/or expression.
        matched = any(
            all(severity.find(term) != -1 for term in pair)
            for pair in paired_terms
        )
    except AttributeError:
        # Non-string input has no .find
        return 0
    return 1 if matched else 0
      
# Run checkConsequence on each row's 'Consequences' value and store the 1/0 result per row.
cvssv3_model_df['Super_Severity_Score'] = cvssv3_model_df.apply(lambda row: checkConsequence(row['Consequences']),axis=1)
    

In [None]:
# Number of rows flagged 1 versus 0.
cvssv3_model_df['Super_Severity_Score'].value_counts()
#cvssv3_model_df.shape

In [None]:
# Add the flag as an extra column of the feature matrix (aligned on index labels).
X['Super_Severity_Score'] = cvssv3_model_df['Super_Severity_Score']

In [None]:
# Feature-matrix shape after the extra column was added.
X.shape
#X.columns

In [None]:
def calculateSeverityScore(row):
    """Assign a coarse severity label to one CVSS v3 record.

    The original if/elif ladder contained an unreachable duplicate branch, a
    condition with the same operand on both sides of ``or``, and several
    branches returning the same label. This version returns the same label for
    every input and expresses the rules directly.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or dict)
        Must provide the keys 'cvssV3_attackVector', 'cvssV3_availabilityImpact',
        'cvssV3_integrityImpact', 'cvssV3_privilegesRequired',
        'cvssV3_userInteraction' and 'cvssV3_confidentialityImpact'.

    Returns
    -------
    str
        'CRITICAL', 'HIGH' or 'LOW'.
    """
    # Network-reachable with high integrity impact: always at least HIGH.
    if row['cvssV3_attackVector'] == 'NETWORK' and row['cvssV3_integrityImpact'] == 'HIGH':
        # NOTE(review): the original tested availabilityImpact == "HIGH" twice
        # (joined by `or`); the second operand may have been meant to be
        # confidentialityImpact. Behaviour is kept as written — confirm intent.
        unauthenticated_full_impact = (
            row['cvssV3_availabilityImpact'] == 'HIGH'
            and row['cvssV3_privilegesRequired'] == 'NONE'
            and row['cvssV3_userInteraction'] == 'NONE'
        )
        return 'CRITICAL' if unauthenticated_full_impact else 'HIGH'

    # Remaining HIGH rule: no privileges and no user interaction needed,
    # confidentiality not LOW, and neither availability nor integrity HIGH.
    if (row['cvssV3_availabilityImpact'] != 'HIGH'
            and row['cvssV3_confidentialityImpact'] != 'LOW'
            and row['cvssV3_userInteraction'] == 'NONE'
            and row['cvssV3_integrityImpact'] != 'HIGH'
            and row['cvssV3_privilegesRequired'] == 'NONE'):
        return 'HIGH'

    # Every other combination was LOW in the original ladder.
    return 'LOW'

#severity = calculateSeverityScore(cvss3_df.iloc[0]['cvssV3_baseScore'])
#severity

# Label every row with calculateSeverityScore and store the text label.
cvssv3_model_df['Severity_Score'] = cvssv3_model_df.apply(lambda row: calculateSeverityScore(row),axis=1)


In [None]:
def updateSeverity(row):
    """Adjust a row's severity label using its Super_Severity_Score flag.

    A 'CRITICAL' label is kept when the flag equals 1 and becomes 'HIGH' when
    the flag equals 0. In every other case the existing 'Severity_Score' value
    is returned unchanged. An AttributeError raised while reading the row
    yields 0.
    """
    try:
        if row['Severity_Score'] == "CRITICAL":
            flag = row['Super_Severity_Score']
            if flag == 1:
                return 'CRITICAL'
            if flag == 0:
                return 'HIGH'
        return row['Severity_Score']
    except AttributeError:
        return 0
# Store the adjusted label for every row.
cvssv3_model_df['New_Severity_Score'] = cvssv3_model_df.apply(lambda row: updateSeverity(row),axis=1)

In [None]:
# Numeric code for each severity label.
severity_to_num = {'CRITICAL': 1,
                    'HIGH': 2,
                    'LOW': 3}
# Series.map leaves NaN for any value that is not a key of the mapping.
cvssv3_model_df['Severity_Score_Num'] = cvssv3_model_df['New_Severity_Score'].map(severity_to_num)

In [None]:
# Distinct numeric codes present (a NaN here would mean an unmapped label).
cvssv3_model_df['Severity_Score_Num'].unique()


In [None]:
# Rename the text label column, which frees the name 'New_Severity_Score'.
cvssv3_model_df.rename(columns = {'New_Severity_Score':'Severity_Score_Text'}, inplace = True)


In [None]:
# Column labels after the rename.
cvssv3_model_df.columns

In [None]:
# Give the numeric code the name 'New_Severity_Score' (the text labels were
# renamed to 'Severity_Score_Text' in an earlier cell).
cvssv3_model_df.rename(columns = {'Severity_Score_Num':'New_Severity_Score'}, inplace = True)
# Y is the numeric severity code used as the classification target.
Y = cvssv3_model_df['New_Severity_Score']
Y.shape



In [None]:
# Write the feature matrix to features.csv in the working directory, without the index.
X.to_csv("features.csv", index=False, encoding='utf8')


In [None]:
# Write the target codes to classes.csv in the working directory, without the index.
Y.to_csv("classes.csv",index=False,encoding='utf8')

In [None]:
# NOTE(review): `X.head` is referenced but not called, and as a non-final
# expression it would not be displayed anyway; only X.columns is rendered.
X.head
X.columns

In [None]:
# Call the method: the bare attribute `.head` displays the bound-method repr instead of the first rows.
Y.head()

# Split the data set into 90% training and 10% test (test_size=0.1)

In [None]:
import pandas_profiling
X_train.describe().T
X_test.describe().T
pandas_profiling.ProfileReport(X_train)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=42) # 70% training and 30% test



In [None]:
import numpy as np

# Create correlation matrix (absolute values, computed on the training rows only)
corr_matrix = X_train.corr().abs()

# Select upper triangle of correlation matrix (k=1 excludes the diagonal).
# Use the builtin bool: the np.bool alias was deprecated in NumPy 1.20 and removed
# in NumPy 1.24, where `np.bool` raises AttributeError.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find features whose absolute correlation with an earlier column is greater than 0.70
to_drop = [column for column in upper.columns if any(upper[column] > 0.70)]


# Drop features from the training set, the test set and the full feature matrix
X_train.drop(to_drop, axis=1, inplace=True)
X_test.drop(to_drop,axis=1, inplace=True)
X.drop(to_drop,axis=1, inplace=True)

to_drop

In [None]:
# Training-set shape after the correlated columns were dropped.
X_train.shape

In [None]:
# Create the decision tree classifier: entropy criterion, depth capped at 7, fixed seed
clf = DecisionTreeClassifier(criterion="entropy",splitter="best",max_depth=7,random_state=42)

# Train the decision tree classifier
clf = clf.fit(X_train,y_train)

# Predict the response for the test dataset
y_pred = clf.predict(X_test)



In [None]:
# Sparse indicator of which tree nodes each training sample passes through.
node_indicator = clf.decision_path(X_train)
node_indicator


In [None]:
# Total number of nodes in the fitted tree.
n_nodes = clf.tree_.node_count
n_nodes

In [None]:
# Notebook-level state read by print_condition below.
node_indicator = clf.decision_path(X_train)  # nodes visited by each training sample
n_nodes = clf.tree_.node_count
feature = clf.tree_.feature  # per-node index of the column used for the split
threshold = clf.tree_.threshold  # per-node split threshold
leave_id = clf.apply(X_train)  # id of the leaf each training sample ends in


def value2prob(value):
    """Divide each row of ``value`` by that row's sum.

    In this notebook it is called with ``clf.tree_.value[node_id]``.
    Assumes ``value`` is a 2-D NumPy array — TODO confirm for other callers.
    """
    row_totals = value.sum(axis=1)
    # Column vector so the division broadcasts across each row.
    return value / row_totals.reshape(-1, 1)


def print_condition(sample_id):
    """Print the chain of split conditions that leads one training sample to its leaf.

    Reads the notebook-level names ``clf``, ``X_train``, ``node_indicator``,
    ``leave_id``, ``feature`` and ``threshold``, and calls ``value2prob``.

    Parameters
    ----------
    sample_id : int
        Positional (0-based) row number of the sample in ``X_train``.

    Output has the form ``WHEN <cond> && <cond> ... THEN Y=<argmax> (...)``.
    """
    # The original used an undefined `feature_names`; the split indices in
    # `feature` refer to the columns the tree was trained on.
    feature_names = list(X_train.columns)

    print("WHEN", end=' ')
    node_index = node_indicator.indices[node_indicator.indptr[sample_id]:
                                        node_indicator.indptr[sample_id + 1]]

    for n, node_id in enumerate(node_index):
        if leave_id[sample_id] == node_id:
            values = clf.tree_.value[node_id]
            probs = value2prob(values)
            print('THEN Y={} (probability={}) (values={})'.format(
                probs.argmax(), probs.max(), values))
            continue
        if n > 0:
            print('&& ', end='')
        # Positional lookup: X_train is a DataFrame, so X_train[row, col] is
        # interpreted as a column label and raises KeyError.
        if X_train.iloc[sample_id, feature[node_id]] <= threshold[node_id]:
            threshold_sign = "<="
        else:
            threshold_sign = ">"
        # Only split nodes reach this point (the leaf is handled above), so
        # feature[node_id] is a valid column position. The original guarded this
        # with `_tree.TREE_UNDEFINED`, but `_tree` is imported only in a later cell.
        print(
            "%s %s %s" % (
                feature_names[feature[node_id]],
                threshold_sign,
                threshold[node_id]),
            end=' ')

In [None]:
# Show the rule path for the training sample at position 2.
print_condition(2)

In [None]:
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, roc_auc_score
def print_score(clf, X_train, X_test, y_train, y_test, train=True):
    '''
    Print accuracy, classification report, confusion matrix and ROC AUC of a classifier.

    clf: fitted classifier exposing predict()
    X_train, y_train: training features and labels (the label binarizer is always fitted on y_train)
    X_test, y_test: test features and labels
    train: truthy -> report on the training data; equal to False -> report on the test data;
           any other value prints nothing
    '''
    binarizer = preprocessing.LabelBinarizer()
    binarizer.fit(y_train)

    def _report(heading, features, y_true):
        # Predict once, then print the four metrics in a fixed order.
        y_hat = clf.predict(features)
        print(heading)
        print("accuracy score: {0:.4f}\n".format(accuracy_score(y_true, y_hat)))
        print("Classification Report: \n {}\n".format(classification_report(y_true, y_hat)))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_true, y_hat)))
        # ROC AUC is computed on the binarized hard predictions, not on probabilities.
        print("ROC AUC: {0:.4f}\n".format(roc_auc_score(binarizer.transform(y_true),
                                                      binarizer.transform(y_hat))))

    if train:
        _report("Train Result:\n", X_train, y_train)
    elif train == False:
        _report("Test Result:\n", X_test, y_test)

In [None]:
# Report metrics on the training data, then on the held-out test data.
print_score(clf, X_train, X_test, y_train, y_test, train=True)
print_score(clf, X_train, X_test, y_train, y_test, train=False)

In [None]:
# Snapshot the structure arrays of the fitted tree; later cells display them.
n_nodes = clf.tree_.node_count
children_left = clf.tree_.children_left  # left-child node id per node
children_right = clf.tree_.children_right  # right-child node id per node
feature = clf.tree_.feature  # split-column index per node
threshold = clf.tree_.threshold  # split threshold per node
n_nodes



In [None]:
# The score method returns the accuracy of the model (here on the held-out test set)
score = clf.score(X_test, y_test)
print(score)

In [None]:
# Sweep max_depth from 1 to 9 and record the test-set accuracy for each depth.
max_depth_range = list(range(1, 10))
# Test-set accuracy for each value of max_depth (same order as max_depth_range)
accuracy = []
for depth in max_depth_range:
    # Use sweep-local names: the original rebound `clf` and `score`, so every later
    # cell (rule export, plotting, prediction) silently used the last sweep model
    # instead of the classifier that was evaluated above.
    depth_clf = DecisionTreeClassifier(max_depth=depth, random_state=0)
    depth_clf.fit(X_train, y_train)
    accuracy.append(depth_clf.score(X_test, y_test))
accuracy

In [None]:
# export_text is part of the public sklearn.tree namespace; the sklearn.tree.export
# module was deprecated in scikit-learn 0.22 and removed in 0.24.
from sklearn.tree import export_text
tree_rules = export_text(clf, feature_names = list(X.columns))
# print renders the line breaks; a bare string expression would show an escaped repr.
print(tree_rules)

In [None]:
def get_code(tree, feature_names):
        """Print a fitted decision tree as nested C-style if/else pseudo-code.

        Parameters
        ----------
        tree : fitted scikit-learn decision tree (anything exposing ``tree_``)
        feature_names : sequence of str
            Name of each input column, in training order.

        Each leaf is printed as ``return <value array of that node>``.
        """
        left      = tree.tree_.children_left
        right     = tree.tree_.children_right
        threshold = tree.tree_.threshold
        value = tree.tree_.value
        # Leaves carry the sentinel feature index -2. Looking that up directly would
        # silently read feature_names[-2] (or raise IndexError for a single feature).
        features  = [feature_names[i] if i != -2 else None for i in tree.tree_.feature]

        def recurse(left, right, threshold, features, node):
                # A node is a leaf when it has no left child (-1). Comparing the
                # threshold with -2 would misread a genuine split at -2.0 as a leaf.
                if left[node] == -1:
                        print("return " + str(value[node]))
                        return
                print("if ( " + features[node] + " <= " + str(threshold[node]) + " ) {")
                recurse(left, right, threshold, features, left[node])
                print("} else {")
                recurse(left, right, threshold, features, right[node])
                print("}")

        recurse(left, right, threshold, features, 0)



In [None]:
# Print the fitted tree as if/else pseudo-code using the feature column names.
get_code(clf,X.columns)

In [None]:
# Translate clf.predict into Python source via the third-party skompiler package.
from skompiler import skompile
skompile(clf.predict).to('python/code')

In [None]:
import numpy as np
from sklearn.tree import _tree

def tree_to_code(tree, feature_names):
    """Print the fitted tree as the source of a Python function named ``tree``.

    tree: fitted scikit-learn decision tree (exposes ``tree_``)
    feature_names: sequence of column names, in training order; they become
        the parameters of the printed function

    Leaves print ``return <index of the largest entry in the node's value array>``.
    """
    fitted = tree.tree_
    # One label per node; leaves (TREE_UNDEFINED) get a placeholder.
    split_names = []
    for idx in fitted.feature:
        split_names.append(
            feature_names[idx] if idx != _tree.TREE_UNDEFINED else "undefined!")
    print("def tree({}):".format(", ".join(feature_names)))

    def emit(node_id, level):
        pad = "    " * level
        # Leaf first: nothing to recurse into.
        if fitted.feature[node_id] == _tree.TREE_UNDEFINED:
            print("{}return {}".format(pad, np.argmax(fitted.value[node_id])))
            return
        split_name = split_names[node_id]
        cutoff = fitted.threshold[node_id]
        print("{}if {} <= {}:".format(pad, split_name, cutoff))
        emit(fitted.children_left[node_id], level + 1)
        print("{}else:  # if {} > {}".format(pad, split_name, cutoff))
        emit(fitted.children_right[node_id], level + 1)

    emit(0, 1)

In [None]:
# Print the fitted tree as a Python function whose parameters are the feature columns.
tree_to_code(clf,X.columns)

In [None]:
def print_decision_tree(tree, feature_names=None, offset_unit='    '):
    '''Print a textual, indented if/else representation of a decision tree's rules.

    tree: scikit-learn representation of tree (anything exposing ``tree_``)
    feature_names: list of feature names. They are set to f0,f1,f2,... (by column
        index) if not specified
    offset_unit: a string of offset of the conditional block

    Each leaf is printed as ``return <value array of that node>``.
    '''

    left      = tree.tree_.children_left
    right     = tree.tree_.children_right
    threshold = tree.tree_.threshold
    value = tree.tree_.value
    if feature_names is None:
        features  = ['f%d'%i for i in tree.tree_.feature]
    else:
        # Leaves carry the sentinel feature index -2. Looking that up directly would
        # silently read feature_names[-2] (or raise IndexError for a single feature).
        features  = [feature_names[i] if i != -2 else None for i in tree.tree_.feature]

    def recurse(left, right, threshold, features, node, depth=0):
            offset = offset_unit*depth
            # A node is a leaf when it has no left child (-1). Comparing the
            # threshold with -2 would misread a genuine split at -2.0 as a leaf.
            if left[node] == -1:
                    print(offset+"return " + str(value[node]))
                    return
            print(offset+"if ( " + features[node] + " <= " + str(threshold[node]) + " ) {")
            recurse(left, right, threshold, features, left[node], depth+1)
            print(offset+"} else {")
            recurse(left, right, threshold, features, right[node], depth+1)
            print(offset+"}")

    recurse(left, right, threshold, features, 0,0)

In [None]:
# Print the fitted tree as indented if/else pseudo-code.
print_decision_tree(clf,X.columns)

In [None]:
# Draw the fitted tree with matplotlib.
from sklearn import tree
plt.figure(figsize=(40,20))  # customize according to the size of your tree
_ = tree.plot_tree(clf, feature_names = X.columns)  # assign to _ to suppress the returned artist list
plt.show()

In [None]:
# Left-child node ids captured from clf.tree_ in an earlier cell.
children_left


In [None]:
# Right-child node ids captured from clf.tree_ in an earlier cell.
children_right


In [None]:
# Split-column index per node, captured from clf.tree_ in an earlier cell.
feature


In [None]:
# Split threshold per node, captured from clf.tree_ in an earlier cell.
threshold

In [None]:
#Y['CVE_ID'] = df['data_meta_ID']
#Y.index = df['data_meta_ID']
# Wrap the target Series in a frame so an identifier column can be attached.
classes = pd.DataFrame(Y)
#Y['CVE_ID'] = df['data_meta_ID']
#Y.size
# Attach the CVE identifier (column assignment aligns on index labels).
classes['CVE_ID'] = df['data_meta_ID']
classes.head  # NOTE(review): referenced but not called — this line has no effect
# Overwrites the classes.csv written by an earlier cell, now with the extra CVE_ID column.
classes.to_csv("classes.csv",index=False,encoding='utf8')


In [None]:
#X.drop(to_drop,axis=1, inplace=True)
# Shape of the full feature matrix that is scored below.
X.shape
#X.columns

In [None]:
# Predict a severity code for every row of X (training and test rows alike).
ynew = clf.predict(X)
ynew.size

In [None]:
# Name the prediction column when the frame is built, so classes_new.csv is written
# with a 'Severity_Score' header. The original saved the file while the column still
# had the default integer label 0, because the rename ran only in a later cell.
# Reusing X's index keeps the CVE IDs aligned with the rows that were scored.
classes_new = pd.DataFrame({'Severity_Score': ynew}, index=X.index)
classes_new['CVE_ID'] = df['data_meta_ID']
classes_new.to_csv("classes_new.csv",index=False,encoding='utf8')

In [None]:


# Rename column 0, if present, to 'Severity_Score'.
classes_new.rename(columns = {0:'Severity_Score'}, inplace = True)
classes_new.columns
# Number of rows predicted for each severity code.
classes_new['Severity_Score'].value_counts()



In [None]:
# Load the vendor-to-CVE mapping.
# NOTE(review): absolute, machine-specific Windows path.
vendor = pd.read_csv("D:\\repos\\APDSProject\\MLTuning\\datacollection\\vendor_cve_map.csv")
# Rename the key column so it matches df's identifier column for the merge below.
vendor.rename(columns = {'CVE ID':'data_meta_ID'}, inplace = True)
vendor.columns

In [None]:
# Column labels of the main frame (to check the merge key is present).
df.columns

In [None]:
#vendor = pd.read_csv("D:\\repos\\APDSProject\\MLTuning\\datacleanup\\vendor_cve_map.csv")

# Left join: keep every row of df and attach vendor columns where the CVE id matches.
# A CVE id that appears several times in `vendor` yields several output rows.
df_outer = pd.merge(df, vendor, on='data_meta_ID', how='left')


In [None]:
# drop_duplicates returns a new frame; assign it back so the de-duplication
# actually takes effect (the original discarded the result).
df_outer = df_outer.drop_duplicates(subset=['data_meta_ID'])
df_outer.shape

In [None]:
# Call the method: the bare attribute `.head` displays the bound-method repr instead of the first rows.
df_outer['Vendor'].head()
#df_outer['# of Exploits'].value_counts()
# of Exploits