In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Log the interpreter and library versions so results can be tied to an environment.
import sys

for version_string in (np.__version__, pd.__version__, sys.version, sns.__version__):
    print(version_string)

In [None]:
# NOTE(review): absolute, machine-specific Windows path. The notebook will not
# run on another machine until this points at a local copy of the dataset.
DATA_PATH = "D:\\repos\\APDSProject\\MLTuning\\datacleanup\\cvss_final_dataset.csv"

df = pd.read_csv(DATA_PATH)

# Only a cell's last expression is rendered, so print the shape explicitly and
# leave the preview as the final expression (previously head() was discarded).
print(df.shape)
df.head()



# Remove Unwanted Columns

In [None]:

# Metadata / raw-vector columns that are not used in the analysis.
remove_list = [
    'timestamp',
    'data_type',
    'data_format',
    'data_version',
    'data_meta_ASSIGNER',
    'cvssV3_version',
    'cvssV3_vectorString',
    'cvssV2_version',
    'cvssV2_vectorString',
]

# Rebind instead of mutating in place so the lineage of `df` stays explicit.
df = df.drop(columns=remove_list)

# info() has to be *called*; the bare attribute only shows the bound method.
df.info()
df.shape



# Prepare CVSS3 Data Frame




In [None]:
# CVSS v3 columns kept for the analysis (metrics, scores and severity).
cvss3_features = [
    'cvssV3_attackVector',
    'cvssV3_attackComplexity',
    'cvssV3_privilegesRequired',
    'cvssV3_userInteraction',
    'cvssV3_scope',
    'cvssV3_confidentialityImpact',
    'cvssV3_integrityImpact',
    'cvssV3_availabilityImpact',
    'cvssV3_baseScore',
    'cvssV3_baseSeverity',
    'baseMetricV3_exploitabilityScore',
    'baseMetricV3_impactScore',
]

# .copy() yields an independent frame rather than a view-like slice of `df`.
cvss3_df = df[cvss3_features].copy()

# info() prints its summary as a side effect; the shape is the cell's output.
cvss3_df.info()
cvss3_df.shape


# Extract the numerical and categorical columns from the CVSS Version 3 Feature Set

In [None]:
# Pick numeric columns by dtype directly instead of computing summary
# statistics with describe() just to read the resulting column names.
cvssv3_numerical_col = list(cvss3_df.select_dtypes(include="number").columns)

# Keep the DataFrame's own column order. A set difference has no stable order
# (string hashing is randomised per process), which made the one-hot column
# order, and everything built on it, vary between kernel restarts.
cvssv3_categorical_col = [
    col for col in cvss3_df.columns if col not in cvssv3_numerical_col
]

cvssv3_numerical_col


In [None]:
# Compute the correlation matrix once and reuse it for the plot (the first
# computation used to be thrown away because it was not the last expression).
numeric_corr = cvss3_df[cvssv3_numerical_col].corr()

plt.figure(figsize=(24,8))
sns.heatmap(numeric_corr, annot=True, fmt=".2f");

In [None]:
# Columns of cvss3_df that are not in cvssv3_numerical_col (the non-numeric features).
cvssv3_categorical_col

In [None]:
# Display tuning: two decimals with thousands separators, wide frames unwrapped.
pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# info() prints as a side effect, so it can run first; describe() must be the
# last expression or its table is never rendered.
cvss3_df.info()
cvss3_df.describe()

# Prepare the features for modelling

In [None]:

# Modelling inputs: only the categorical CVSS v3 columns. .copy() decouples the
# frame from cvss3_df.
cvssv3_model_df = cvss3_df[cvssv3_categorical_col].copy()

# info() must be called to print column names, dtypes and non-null counts;
# the bare attribute only displays the bound method.
cvssv3_model_df.info()

In [None]:
# Per-column flag: True when the column holds at least one missing value.
cvssv3_model_df.isna().any()

# Perform One Hot Encoding
# https://towardsdatascience.com/encoding-categorical-features-21a2651a065c

In [None]:
# One-hot encode every categorical CVSS v3 column into indicator columns.
# NOTE(review): cvssv3_categorical_col appears to still include
# 'cvssV3_baseSeverity', while the training label is later derived from
# 'cvssV3_baseScore'. That looks like target leakage -- confirm, and consider
# excluding the severity column from the model features.
df_v3_model = pd.get_dummies(cvssv3_model_df[cvssv3_categorical_col])


In [None]:
# Display tuning: two decimals with thousands separators, wide frames unwrapped.
pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# info() must be called to print anything, and describe() must come last to be
# rendered as the cell output.
df_v3_model.info()
df_v3_model.describe()

In [None]:
# Correlation between the one-hot indicator columns, drawn on a wide canvas so
# the per-cell annotations stay legible.
plt.subplots(figsize=(24, 8))
sns.heatmap(data=df_v3_model.corr(), annot=True, fmt=".2f");

In [None]:
# Feature matrix. Take a copy: X later gains and loses a temporary
# 'Severity_Score' column in place, and without the copy those edits would also
# mutate df_v3_model (both names pointed at the same object).
X = df_v3_model.copy()
X.head()

In [None]:
plt.figure(figsize=(24,8))

# Compute the correlation matrix once; it drives both the mask and the plot.
feature_corr = X.corr()

# Mask for the upper triangle (diagonal included). The builtin `bool` replaces
# the `np.bool` alias, which was deprecated in NumPy 1.20 and removed in 1.24.
mask = np.zeros_like(feature_corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Light sequential palette built from a colour given in HUSL space.
cmap = sns.light_palette((210, 90, 60), input="husl")

# Heatmap with mask
sns.heatmap(feature_corr, mask=mask, cmap=cmap, annot=True, fmt=".2f");

In [None]:
# Collect every ordered pair of distinct features whose correlation is strongly
# positive (> 0.7) or strongly negative (< -0.7). Each pair is printed and
# recorded as a {"column", "row", "val"} dict.
posthreshold = 0.7
negthreshold = -0.7
corr = X.corr()
columns = corr.columns
indices = corr.index
correlatedColumns = []

for c in columns:
    for r in indices:
        if c == r:
            continue  # the diagonal is always 1.0 and carries no information
        val = corr[c][r]
        if not (val > posthreshold or val < negthreshold):
            continue
        print(f"column {c}   row {r}  val {val!s}")
        correlatedColumns.append({"column": c, "row": r, "val": val})
            

#print(correlatedColumns)

In [None]:
def calculateSeverityScore(baseScore):
    """Map a numeric CVSS base score to a severity label.

    Bands are lower-inclusive so every score in [4.0, 10.0] lands in exactly
    one of them:

        8.0 <= score <= 10.0  -> 'CRITICAL'
        6.0 <= score <  8.0   -> 'HIGH'
        4.0 <= score <  6.0   -> 'MEDIUM'
        anything else         -> 'LOW'

    The previous strict comparisons on both sides (e.g. ``> 8.0 and < 10.0``)
    left the boundary values 10.0, 8.0, 6.0 and 4.0 in no band, so they were
    all labelled 'LOW'. Scores outside [4.0, 10.0] and NaN still fall through
    to 'LOW', as before.

    NOTE(review): the 8/6/4 cut-offs differ from the published CVSS v3
    qualitative scale (Critical 9.0-10.0, High 7.0-8.9, Medium 4.0-6.9,
    Low 0.1-3.9) -- confirm they are intentional.

    Args:
        baseScore: numeric base score.

    Returns:
        One of 'CRITICAL', 'HIGH', 'MEDIUM' or 'LOW'.
    """
    if 8.0 <= baseScore <= 10.0:
        return 'CRITICAL'
    if 6.0 <= baseScore < 8.0:
        return 'HIGH'
    if 4.0 <= baseScore < 6.0:
        return 'MEDIUM'
    return 'LOW'

# Label every row by passing its CVSS v3 base score through
# calculateSeverityScore. The label is stored on X as a 'Severity_Score'
# column; the assignment aligns on the row index of X and cvss3_df
# (presumably identical, since X is derived from cvss3_df -- verify).
X['Severity_Score'] = cvss3_df['cvssV3_baseScore'].apply(calculateSeverityScore)


In [None]:
# Target vector: the severity label derived from the base score.
Y = X['Severity_Score']

# head() must be called; the bare attribute displays the bound method.
Y.head()


In [None]:
# Remove the label from the feature matrix so it cannot be used as a predictor.
X.drop(columns='Severity_Score', inplace=True)

# head() must be called; the bare attribute displays the bound method.
X.head()

# Split the data set into 70% training and 30% test 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# 70% training / 30% test, with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1)

# Create the Decision Tree classifier. The seed is fixed here as well: features
# are randomly permuted at each split, so an unseeded tree can differ from one
# fit to the next.
clf = DecisionTreeClassifier(random_state=1)

# Train the Decision Tree classifier.
clf = clf.fit(X_train, y_train)

# Predict the response for the test dataset.
y_pred = clf.predict(X_test)

In [None]:
# clf was already fitted on (X_train, y_train) when it was created; display the
# fitted estimator instead of training it a second time.
clf

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Fraction of training rows the fitted tree classifies correctly.
accuracy_score(y_true=y_train, y_pred=clf.predict(X_train))

In [None]:
# Per-class precision / recall / F1 on the training split.
print(classification_report(y_true=y_train, y_pred=clf.predict(X_train)))

In [None]:
# Confusion matrix on the training split (rows: true labels, columns: predictions).
confusion_matrix(y_true=y_train, y_pred=clf.predict(X_train))

In [None]:
# Fraction of held-out test rows the fitted tree classifies correctly.
accuracy_score(y_true=y_test, y_pred=clf.predict(X_test))

In [None]:
# Per-class precision / recall / F1 on the held-out test split.
print(classification_report(y_true=y_test, y_pred=clf.predict(X_test)))

In [None]:
# Confusion matrix on the held-out test split (rows: true labels, columns: predictions).
confusion_matrix(y_true=y_test, y_pred=clf.predict(X_test))

In [None]:
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, roc_auc_score
def _print_split_metrics(heading, y_true, y_pred, binarizer):
    """Print accuracy, classification report, confusion matrix and ROC AUC for one split."""
    print(heading)
    print("accuracy score: {0:.4f}\n".format(accuracy_score(y_true, y_pred)))
    print("Classification Report: \n {}\n".format(classification_report(y_true, y_pred)))
    print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_true, y_pred)))
    # ROC AUC is computed from the binarized hard predictions, not from
    # predicted probabilities.
    print("ROC AUC: {0:.4f}\n".format(
        roc_auc_score(binarizer.transform(y_true), binarizer.transform(y_pred))))


def print_score(clf, X_train, X_test, y_train, y_test, train=True):
    """Print the accuracy score, classification report, confusion matrix and
    ROC AUC of a fitted classifier for either the training or the test split.

    The argument order follows the scikit-learn convention.

    Args:
        clf: fitted classifier exposing ``predict``.
        X_train: training features.
        X_test: test features.
        y_train: training labels; also used to fit the label binarizer.
        y_test: test labels.
        train: truthy -> report on the training split; falsy -> report on the
            test split. (Previously a falsy value other than ``False``/``0``,
            e.g. ``None``, silently printed nothing.)

    Returns:
        None. All results are written to stdout.
    """
    # The binarizer is always fitted on the training labels, whichever split
    # is reported.
    lb = preprocessing.LabelBinarizer()
    lb.fit(y_train)

    if train:
        _print_split_metrics("Train Result:\n", y_train, clf.predict(X_train), lb)
    else:
        _print_split_metrics("Test Result:\n", y_test, clf.predict(X_test), lb)

In [None]:
# Report the training-split metrics first, then the test-split metrics.
for show_train in (True, False):
    print_score(clf, X_train, X_test, y_train, y_test, train=show_train)

In [39]:
from sklearn import tree

# Pass the real column names, as the plot_tree / export_graphviz cells do, so
# the rules read "<column> <= ..." instead of the generic "feature_N <= ...".
text_representation = tree.export_text(clf, feature_names=list(X.columns))
print(text_representation)

TypeError: an integer is required (got type bytes)

In [None]:
fig = plt.figure(figsize=(25,20))

# class_names must hold one label per class, in the same (ascending) order as
# clf.classes_. The bare string 'Severity_Score' is not a valid value:
# depending on the scikit-learn version it is either indexed character by
# character (nodes labelled "S", "e", "v", ...) or rejected outright.
tree.plot_tree(clf,
               feature_names=list(X.columns),
               class_names=[str(label) for label in clf.classes_],
               filled=True);

In [None]:
# Persist the figure held in `fig` as a PNG in the notebook's working directory.
fig.savefig(fname="cvss3_output_tree.png")

In [None]:
import graphviz

# DOT data. class_names must hold one label per class, in the same (ascending)
# order as clf.classes_; the bare string 'Severity_Score' is not a valid value.
dot_data = tree.export_graphviz(clf, out_file=None,
                                feature_names=list(X.columns),
                                class_names=[str(label) for label in clf.classes_],
                                filled=True)

# Draw graph
graph = graphviz.Source(dot_data, format="png")
graph
