In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

print(np.__version__)
print(pd.__version__)
import sys
print(sys.version)
print(sns.__version__)

In [None]:
df = pd.read_csv("D:\\repos\\APDSProject\\MLTuning\\datacleanup\\cvss_final_dataset.csv")
df.head()
df.shape
#df.info()
#df.columns



# Remove Unwanted Columns

In [None]:

remove_list = ['timestamp','data_type','data_format', 'data_version','data_meta_ASSIGNER','cvssV3_version','cvssV3_vectorString','cvssV2_version','cvssV2_vectorString']

df.drop(remove_list,axis = 1, inplace = True)
df.info
df.shape




In [None]:
df.columns

# Prepare CVSS3 Data Frame




In [None]:
df.columns
cvss2_features = ['cvssV2_accessVector', 'cvssV2_accessComplexity',
       'cvssV2_authentication', 'cvssV2_confidentialityImpact',
       'cvssV2_integrityImpact', 'cvssV2_availabilityImpact',
       'cvssV2_baseScore', 'baseMetricV2_severity',
       'baseMetricV2_exploitabilityScore', 'baseMetricV2_impactScore',
       'baseMetricV2_acInsufInfo', 'baseMetricV2_obtainAllPrivilege',
       'baseMetricV2_obtainUserPrivilege', 'baseMetricV2_obtainOtherPrivilege',
       'baseMetricV2_userInteractionRequired', ]

cvss2_df = df[cvss2_features]

cvss2_df.columns
cvss2_df.info
cvss2_df.shape


# Extract the numerical and categorical columns from CVSS Version 3 Featue Set

In [None]:
cvssv2_numerical_col = list(cvss2_df.describe().columns)
cvssv2_categorical_col = list(set(cvss2_df.columns).difference(cvssv2_numerical_col))

cvssv2_numerical_col


In [None]:
cvssv2_categorical_col

# Prepare the features for modelling

In [None]:

cvssv2_model_df= cvss2_df[cvssv2_categorical_col]
cvssv2_model_df.shape

#cvssv3_model_df.info

In [None]:

cvss2_remove_list = ['baseMetricV2_userInteractionRequired','baseMetricV2_obtainAllPrivilege','baseMetricV2_obtainOtherPrivilege','baseMetricV2_obtainUserPrivilege','baseMetricV2_acInsufInfo','baseMetricV2_severity']

cvssv2_model_df.drop(cvss2_remove_list,axis = 1, inplace = True)



In [None]:
cvssv2_model_df.shape

In [None]:
cvssv2_model_df.isnull().any()

# Perform One Hot Encoding
# https://towardsdatascience.com/encoding-categorical-features-21a2651a065c

In [None]:
cvssv2_model_encoded_df = pd.get_dummies(cvssv2_model_df)


In [None]:
pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
cvssv2_model_encoded_df.describe()
cvssv2_model_encoded_df.shape

In [None]:
#X = pd.concat([df['cvssV3_baseScore'], df_v3_model], axis=1)
X = cvssv2_model_encoded_df
X.shape
#X.columns

In [None]:
cvssv2_model_df.columns

In [None]:
pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
cvssv2_model_df.describe

# Prepare the features with basic categorization

In [None]:
cvssv2_model_df['cvssV2_availabilityImpact'].unique()
cvssv2_model_df['cvssV2_authentication'].unique()
cvssv2_model_df['cvssV2_integrityImpact'].unique()

In [None]:
def calculateSeverityScore(row):
    if ( (row['cvssV2_availabilityImpact'] == "COMPLETE" or row['cvssV2_availabilityImpact'] == "PARTIAL")  and 
         (row['cvssV2_confidentialityImpact'] == "COMPLETE" or row['cvssV2_confidentialityImpact'] == "PARTIAL")  and 
         (row['cvssV2_accessComplexity'] == "HIGH" or row['cvssV2_accessComplexity'] == "MEDIUM")  and 
         (row['cvssV2_accessVector'] == "NETWORK" or row['cvssV2_accessVector'] == "REMOTE")  and 
         (row['cvssV2_integrityImpact'] == "COMPLETE") ):
        return 'CRITICAL'
    elif ( (row['cvssV2_availabilityImpact'] == "COMPLETE" or row['cvssV2_availabilityImpact'] == "PARTIAL")  and 
            (row['cvssV2_confidentialityImpact'] == "COMPLETE" or row['cvssV2_confidentialityImpact'] == "PARTIAL")  and 
            (row['cvssV2_accessComplexity'] == "LOW")  and 
            (row['cvssV2_accessVector'] == "NETWORK" or row['cvssV2_accessVector'] == "REMOTE")  and 
            (row['cvssV2_authentication'] == "NONE") and
            (row['cvssV2_integrityImpact'] == "COMPLETE") or row['cvssV2_integrityImpact'] == "REMOTE" ):
        return 'CRITICAL'    
    elif (  (row['cvssV2_availabilityImpact'] == "COMPLETE" or row['cvssV2_availabilityImpact'] == "PARTIAL")  and 
            (row['cvssV2_confidentialityImpact'] == "COMPLETE" or row['cvssV2_confidentialityImpact'] == "PARTIAL")  and 
            (row['cvssV2_accessComplexity'] == "LOW")  and 
            (row['cvssV2_authentication'] == "NETWORK" or row['cvssV2_authentication'] == "REMOTE") ):
        return 'CRITICAL'
    elif ( (row['cvssV2_availabilityImpact'] == "COMPLETE" or row['cvssV2_availabilityImpact'] == "PARTIAL")  and 
            (row['cvssV2_confidentialityImpact'] == "COMPLETE" or row['cvssV2_confidentialityImpact'] == "PARTIAL")  and 
            (row['cvssV2_accessComplexity'] == "LOW")  and 
            (row['cvssV2_accessVector'] == "NETWORK" or row['cvssV2_accessVector'] == "REMOTE")  and 
            (row['cvssV2_authentication'] == "NONE") and
            (row['cvssV2_integrityImpact'] == "NONE")):
        return 'CRITICAL'
    elif ( (row['cvssV2_availabilityImpact'] == "COMPLETE" or row['cvssV2_availabilityImpact'] == "PARTIAL")  and 
            (row['cvssV2_confidentialityImpact'] == "NONE")  and 
            (row['cvssV2_accessComplexity'] == "LOW")  and 
            (row['cvssV2_accessVector'] == "NETWORK" or row['cvssV2_accessVector'] == "REMOTE")  and 
            (row['cvssV2_authentication'] == "NONE")):
        return 'CRITICAL'
    elif ( (row['cvssV2_availabilityImpact'] == "COMPLETE" or row['cvssV2_availabilityImpact'] == "PARTIAL")  and 
            (row['cvssV2_confidentialityImpact'] == "NONE")  and 
            (row['cvssV2_accessComplexity'] == "HIGH" or row['cvssV2_accessComplexity'] == "MEDIUM") and 
            (row['cvssV2_accessVector'] == "NETWORK" or row['cvssV2_accessVector'] == "REMOTE") ):
        return 'CRITICAL' 
    elif ( (row['cvssV2_availabilityImpact'] == "NONE")  and 
            (row['cvssV2_confidentialityImpact'] == "COMPLETE")):
        return 'CRITICAL'  


    elif ( (row['cvssV2_availabilityImpact'] == "COMPLETE" or row['cvssV2_availabilityImpact'] == "PARTIAL")  and 
            (row['cvssV2_confidentialityImpact'] == "COMPLETE" or row['cvssV2_confidentialityImpact'] == "PARTIAL")   and 
            (row['cvssV2_accessComplexity'] == "HIGH" or row['cvssV2_accessComplexity'] == "MEDIUM")  and 
            (row['cvssV2_accessVector'] == "LOCAL")  and 
            (row['cvssV2_integrityImpact'] == "COMPLETE")):   
        return 'HIGH'              
    elif ( (row['cvssV2_availabilityImpact'] == "COMPLETE" or row['cvssV2_availabilityImpact'] == "PARTIAL")  and 
            (row['cvssV2_confidentialityImpact'] == "COMPLETE" or row['cvssV2_confidentialityImpact'] == "PARTIAL")   and 
            (row['cvssV2_accessComplexity'] == "HIGH" or row['cvssV2_accessComplexity'] == "MEDIUM")  and 
            (row['cvssV2_integrityImpact'] == "NONE" or row['cvssV2_integrityImpact'] == "PARTIAL") and 
            (row['cvssV2_accessVector'] == "LOCAL" or row['cvssV2_accessVector'] == "NETWORK")):   
        return 'HIGH'
    elif ( (row['cvssV2_availabilityImpact'] == "COMPLETE" or row['cvssV2_availabilityImpact'] == "PARTIAL")  and 
            (row['cvssV2_confidentialityImpact'] == "COMPLETE" or row['cvssV2_confidentialityImpact'] == "PARTIAL")   and 
            (row['cvssV2_accessComplexity'] == "HIGH" or row['cvssV2_accessComplexity'] == "MEDIUM")  and 
            (row['cvssV2_integrityImpact'] == "NONE" or row['cvssV2_integrityImpact'] == "PARTIAL") and 
            (row['cvssV2_accessVector'] == "REMOTE")):   
        return 'HIGH'        
    elif ( (row['cvssV2_availabilityImpact'] == "COMPLETE" or row['cvssV2_availabilityImpact'] == "PARTIAL")  and 
            (row['cvssV2_confidentialityImpact'] == "COMPLETE" or row['cvssV2_confidentialityImpact'] == "PARTIAL")   and 
            (row['cvssV2_accessComplexity'] == "LOW")  and 
            (row['cvssV2_authentication'] == "MULTIPLE" or row['cvssV2_authentication'] == "SINGLE")):   
        return 'HIGH'        
    elif ( (row['cvssV2_availabilityImpact'] == "COMPLETE" or row['cvssV2_availabilityImpact'] == "PARTIAL")  and 
            (row['cvssV2_confidentialityImpact'] == "COMPLETE" or row['cvssV2_confidentialityImpact'] == "PARTIAL")   and 
            (row['cvssV2_accessComplexity'] == "LOW")  and 
            (row['cvssV2_authentication'] == "NONE") and 
            (row['cvssV2_accessVector'] == "REMOTE" or row['cvssV2_accessVector'] == "NETWORK") and
            (row['cvssV2_integrityImpact'] == "NONE")): 
        return 'HIGH'
    elif ( (row['cvssV2_availabilityImpact'] == "COMPLETE" or row['cvssV2_availabilityImpact'] == "PARTIAL")  and 
            (row['cvssV2_confidentialityImpact'] == "NONE")   and 
            (row['cvssV2_accessVector'] == "REMOTE" or row['cvssV2_accessVector'] == "LOCAL")): 
        return 'HIGH'        
    elif ( (row['cvssV2_availabilityImpact'] == "COMPLETE" or row['cvssV2_availabilityImpact'] == "PARTIAL")  and 
            (row['cvssV2_confidentialityImpact'] == "NONE")   and 
            (row['cvssV2_accessComplexity'] == "LOW")  and 
            (row['cvssV2_authentication'] == "SINGLE") and 
            (row['cvssV2_accessVector'] == "REMOTE")): 
        return 'HIGH'        
    elif ( (row['cvssV2_availabilityImpact'] == "COMPLETE" or row['cvssV2_availabilityImpact'] == "PARTIAL")  and 
            (row['cvssV2_confidentialityImpact'] == "NONE")   and 
            (row['cvssV2_accessComplexity'] == "LOW" or row['cvssV2_accessComplexity'] == "MEDIUM" )  and 
            (row['cvssV2_authentication'] == "NONE") and 
            (row['cvssV2_accessVector'] == "REMOTE")):        
        return 'HIGH'
    elif ( (row['cvssV2_availabilityImpact'] == "COMPLETE" or row['cvssV2_availabilityImpact'] == "PARTIAL")  and 
            (row['cvssV2_confidentialityImpact'] == "NONE")   and 
            (row['cvssV2_accessComplexity'] == "LOW" or row['cvssV2_accessComplexity'] == "MEDIUM" )  and 
            (row['cvssV2_authentication'] == "SINGLE") and 
            (row['cvssV2_accessVector'] == "REMOTE")):        
        return 'HIGH'        
    elif ( (row['cvssV2_availabilityImpact'] == "NONE")  and 
            (row['cvssV2_confidentialityImpact'] == "NONE" or row['cvssV2_confidentialityImpact'] == "PARTIAL")   and 
            (row['cvssV2_integrityImpact'] == "COMPLETE" or row['cvssV2_integrityImpact'] == "NONE") and 
            (row['cvssV2_accessComplexity'] == "LOW" or row['cvssV2_accessComplexity'] == "MEDIUM" )  and 
            (row['cvssV2_authentication'] == "NONE") and 
            (row['cvssV2_accessVector'] == "REMOTE")):        
        return 'HIGH'              
    elif ( (row['cvssV2_availabilityImpact'] == "NONE")  and 
            (row['cvssV2_confidentialityImpact'] == "NONE" or row['cvssV2_confidentialityImpact'] == "PARTIAL")   and 
            (row['cvssV2_integrityImpact'] == "PARTIAL") and 
            (row['cvssV2_accessComplexity'] == "LOW" or row['cvssV2_accessComplexity'] == "MEDIUM" )  and 
            (row['cvssV2_authentication'] == "NONE") and 
            (row['cvssV2_accessVector'] == "REMOTE")):        
        return 'HIGH'          
    elif ( (row['cvssV2_availabilityImpact'] == "NONE")  and 
            (row['cvssV2_confidentialityImpact'] == "NONE" or row['cvssV2_confidentialityImpact'] == "PARTIAL")   and 
            (row['cvssV2_integrityImpact'] == "PARTIAL") and 
            (row['cvssV2_accessComplexity'] == "LOW")  and 
            (row['cvssV2_authentication'] == "SINGLE")):        
        return 'HIGH'        
    else:
        return 'LOW'

#severity = calculateSeverityScore(cvss3_df.iloc[0]['cvssV3_baseScore'])
#severity

cvssv2_model_df['Severity_Score'] = cvssv2_model_df.apply(lambda row: calculateSeverityScore(row),axis=1)


In [None]:
cvssv2_model_df['Severity_Score'].unique()
cvssv2_model_df.Severity_Score.value_counts()


In [None]:
severity_to_num = {'CRITICAL': 1,
                    'HIGH': 2,
                    'LOW': 3}
cvssv2_model_df['Severity_Score_Num'] = cvssv2_model_df['Severity_Score'].map(severity_to_num)

In [None]:
cvssv2_model_df['Severity_Score_Num'].unique()


In [None]:
cvssv2_model_df.rename(columns = {'Severity_Score':'Severity_Score_Text'}, inplace = True)


In [None]:
cvssv2_model_df.columns

In [None]:
cvssv2_model_df.rename(columns = {'Severity_Score_Num':'Severity_Score'}, inplace = True)
Y = cvssv2_model_df['Severity_Score']
Y.head


In [None]:
X.to_csv("features_v2.csv", index=False, encoding='utf8')


In [None]:
Y.to_csv("classes_v2.csv",index=False,encoding='utf8')

In [None]:
X.head

In [None]:
Y.head

# Split the data set into 70% training and 30% test 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=42) # 70% training and 30% test

# Create Decision Tree classifer object
clf = DecisionTreeClassifier()

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

In [None]:
clf.fit(X_train, y_train)



In [None]:
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, roc_auc_score
def print_score(clf, X_train, X_test, y_train, y_test, train=True):
    '''
    v0.1 Follow the scikit learn library format in terms of input
    print the accuracy score, classification report and confusion matrix of classifier
    '''
    lb = preprocessing.LabelBinarizer()
    lb.fit(y_train)
    if train:
        '''
        training performance
        '''
        res = clf.predict(X_train)
        print("Train Result:\n")
        print("accuracy score: {0:.4f}\n".format(accuracy_score(y_train, 
                                                                res)))
        print("Classification Report: \n {}\n".format(classification_report(y_train, 
                                                                            res)))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_train, 
                                                                  res)))
        print("ROC AUC: {0:.4f}\n".format(roc_auc_score(lb.transform(y_train), 
                                                      lb.transform(res))))

        #res = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
        #print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
        #print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))
        
    elif train==False:
        '''
        test performance
        '''
        res_test = clf.predict(X_test)
        print("Test Result:\n")        
        print("accuracy score: {0:.4f}\n".format(accuracy_score(y_test, 
                                                                res_test)))
        print("Classification Report: \n {}\n".format(classification_report(y_test, 
                                                                            res_test)))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_test, 
                                                                  res_test)))   
        print("ROC AUC: {0:.4f}\n".format(roc_auc_score(lb.transform(y_test), 
                                                      lb.transform(res_test))))

In [100]:
#print_score(clf, X_train, X_test, y_train, y_test, train=True)
print_score(clf, X_train, X_test, y_train, y_test, train=False)

Test Result:

accuracy score: 1.0000

Classification Report: 
               precision    recall  f1-score   support

           1       1.00      1.00      1.00      1345
           2       1.00      1.00      1.00      1450
           3       1.00      1.00      1.00      3917

    accuracy                           1.00      6712
   macro avg       1.00      1.00      1.00      6712
weighted avg       1.00      1.00      1.00      6712


Confusion Matrix: 
 [[1345    0    0]
 [   0 1450    0]
 [   0    0 3917]]

ROC AUC: 1.0000



In [None]:
# The score method returns the accuracy of the model
score = clf.score(X_test, y_test)
print(score)

In [None]:
max_depth_range = list(range(1, 8))# List to store the average RMSE for each value of max_depth:
accuracy = []
for depth in max_depth_range:
    
    clf = DecisionTreeClassifier(max_depth = depth, 
                             random_state = 0)
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    accuracy.append(score)
    #print(score)
accuracy

In [None]:
tree.export_graphviz(clf.tree_, out_file='tree.dot', feature_names=list(X.columns))

In [None]:
from sklearn.tree.export import export_text
tree_rules = export_text(clf, feature_names = list(X.columns))
tree_rules

In [None]:
from sklearn import tree
plt.figure(figsize=(40,20))  # customize according to the size of your tree
_ = tree.plot_tree(clf, feature_names = X.columns)
plt.show()

In [92]:
n_nodes = clf.tree_.node_count
children_left = clf.tree_.children_left
children_right = clf.tree_.children_right
feature = clf.tree_.feature
threshold = clf.tree_.threshold
n_nodes



55

In [93]:
children_left


array([ 1,  2,  3,  4,  5,  6, -1, -1,  9, -1, -1, 12, -1, -1, 15, 16, 17,
       -1, 19, -1, -1, 22, 23, -1, -1, -1, 27, 28, -1, -1, 31, 32, -1, -1,
       -1, 36, 37, -1, -1, 40, -1, 42, 43, -1, -1, -1, 47, 48, -1, 50, -1,
       52, -1, -1, -1], dtype=int64)

In [None]:
children_right


In [None]:
feature


In [None]:
threshold

In [94]:
# The tree structure can be traversed to compute various properties such
# as the depth of each node and whether or not it is a leaf.
node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
is_leaves = np.zeros(shape=n_nodes, dtype=bool)
stack = [(0, -1)]  # seed is the root node id and its parent depth
while len(stack) > 0:
    node_id, parent_depth = stack.pop()
    node_depth[node_id] = parent_depth + 1

    # If we have a test node
    if (children_left[node_id] != children_right[node_id]):
        stack.append((children_left[node_id], parent_depth + 1))
        stack.append((children_right[node_id], parent_depth + 1))
    else:
        is_leaves[node_id] = True

print("The binary tree structure has %s nodes and has "
      "the following tree structure:"
      % n_nodes)
for i in range(n_nodes):
    if is_leaves[i]:
        print("%snode=%s leaf node." % (node_depth[i] * "\t", i))
    else:
        print("%snode=%s test node: go to node %s if X[:, %s] <= %s else to "
              "node %s."
              % (node_depth[i] * "\t",
                 i,
                 children_left[i],
                 feature[i],
                 threshold[i],
                 children_right[i],
                 ))
print()

# First let's retrieve the decision path of each sample. The decision_path
# method allows to retrieve the node indicator functions. A non zero element of
# indicator matrix at the position (i, j) indicates that the sample i goes
# through the node j.

node_indicator = clf.decision_path(X_test)

# Similarly, we can also have the leaves ids reached by each sample.

leave_id = clf.apply(X_test)

# Now, it's possible to get the tests that were used to predict a sample or
# a group of samples. First, let's make it for the sample.

sample_id = 0
node_index = node_indicator.indices[node_indicator.indptr[sample_id]:
                                    node_indicator.indptr[sample_id + 1]]

print('Rules used to predict sample %s: ' % sample_id)
for node_id in node_index:
    if leave_id[sample_id] == node_id:
        continue

    if (X_test[sample_id, feature[node_id]] <= threshold[node_id]):
        threshold_sign = "<="
    else:
        threshold_sign = ">"

    print("decision id node %s : (X_test[%s, %s] (= %s) %s %s)"
          % (node_id,
             sample_id,
             feature[node_id],
             X_test[sample_id, feature[node_id]],
             threshold_sign,
             threshold[node_id]))

# For a group of samples, we have the following common node.
sample_ids = [0, 1]
common_nodes = (node_indicator.toarray()[sample_ids].sum(axis=0) ==
                len(sample_ids))

common_node_id = np.arange(n_nodes)[common_nodes]

print("\nThe following samples %s share the node %s in the tree"
      % (sample_ids, common_node_id))
print("It is %s %% of all nodes." % (100 * len(common_node_id) / n_nodes,))

The binary tree structure has 55 nodes and has the following tree structure:
node=0 test node: go to node 1 if X[:, 10] &lt;= 0.5 else to node 46.
	node=1 test node: go to node 2 if X[:, 5] &lt;= 0.5 else to node 35.
		node=2 test node: go to node 3 if X[:, 8] &lt;= 0.5 else to node 14.
			node=3 test node: go to node 4 if X[:, 4] &lt;= 0.5 else to node 11.
				node=4 test node: go to node 5 if X[:, 16] &lt;= 0.5 else to node 8.
					node=5 test node: go to node 6 if X[:, 6] &lt;= 0.5 else to node 7.
						node=6 leaf node.
						node=7 leaf node.
					node=8 test node: go to node 9 if X[:, 13] &lt;= 0.5 else to node 10.
						node=9 leaf node.
						node=10 leaf node.
				node=11 test node: go to node 12 if X[:, 6] &lt;= 0.5 else to node 13.
					node=12 leaf node.
					node=13 leaf node.
			node=14 test node: go to node 15 if X[:, 14] &lt;= 0.5 else to node 26.
				node=15 test node: go to node 16 if X[:, 2] &lt;= 0.5 else to node 21.
					node=16 test node: go to node 17 if X[:, 12]

KeyError: (0, 10)

In [96]:
#Y['CVE_ID'] = df['data_meta_ID']
#Y.index = df['data_meta_ID']
classes = pd.DataFrame(Y)
#Y['CVE_ID'] = df['data_meta_ID']
#Y.size
classes['CVE_ID'] = df['data_meta_ID']
classes.head
classes.to_csv("classes_v2.csv",index=False,encoding='utf8')


In [97]:
ynew = clf.predict(X)
ynew.size

67113

In [98]:
classes_new = pd.DataFrame(ynew)
#Y['CVE_ID'] = df['data_meta_ID']
#Y.size
classes_new['CVE_ID'] = df['data_meta_ID']
classes_new.head
classes_new.to_csv("classes_v2_new.csv",index=False,encoding='utf8')