<a href="https://colab.research.google.com/github/sho6210/SecDev2023/blob/main/MLR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Libraries

In [1]:
import sklearn
print(sklearn.__version__)
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from numpy.lib.function_base import vectorize
from sklearn.feature_extraction.text import CountVectorizer
from numpy.core.fromnumeric import size
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import tqdm
import time
import json
import re
import csv
import urllib.request
import shutil

1.2.2


# Data

In [2]:
# Load data
metric = 'cvssV3_attackVector'
df = pd.read_csv("https://raw.githubusercontent.com/sho6210/SecDev2023/main/data/cve_2018-2020_complete_dataset.csv", header=0)

# Make a data set

In [3]:
def DataSetMake(metric):
  X = pd.DataFrame(df[['Description']])
  y = pd.DataFrame(df[[metric]])
  train_sentence, test_sentence, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0, stratify=y)

  # Display data distribution.
  print("----------y_train:", y_train.value_counts())
  print("----------y_test:", y_test.value_counts())

  return train_sentence, test_sentence, y_train, y_test


# Natural Language Processing (train)

In [4]:
def NLP_train(train_sentence):
  # Natural Language Processing
  # Creating BoW features with sklearn's CountVectorizer
  vectorizer = CountVectorizer(stop_words="english")
  X_train = vectorizer.fit_transform(train_sentence['Description'].values)
  print('dimensions:', X_train.shape)
  
  return X_train, vectorizer

# Multinomial Logistic Regression (train)

In [5]:
def MLR_train(X_train, y_train): 
  # Create a classification model for MLR using vectorized features
  lr = LogisticRegression(C=0.1, random_state=0, n_jobs=-1)
  lr.fit(X_train, y_train)

  return lr



# Natural Language Processing (test)

In [6]:
def NLP_test(test_sentence, vectorizer):
  # Natural Language Processing
  # Creating BoW features with sklearn's CountVectorizer.
  X_test = vectorizer.transform(test_sentence['Description'].values)
  
  return X_test

# Multinomial Logistic Regression (test)

In [7]:
def MLR_test(metric, X_test, y_test, lr):
  # Test data to confirm accuracy.
  y_pred = lr.predict(X_test)
  cm = confusion_matrix(y_test, y_pred)

  # Branching when creating a table.
  if metric == 'cvssV3_attackVector':
    table = pd.DataFrame(cm, columns=['Predicted A', 'Predicted L', 'Predicted N', 'Predicted P'], index=['Actual A', 'Actual L', 'Actual N', 'Actual P'])
  elif metric == 'cvssV3_attackComplexity':
    table = pd.DataFrame(cm, columns=['Predicted H', 'Predicted L'], index=['Actual H', 'Actual L'])
  elif metric == 'cvssV3_privilegesRequired':
    table = pd.DataFrame(cm, columns=['Predicted H', 'Predicted L', 'Predicted N'], index=['Actual H', 'Actual L', 'Actual N'])
  elif metric == 'cvssV3_userInteraction':
    table = pd.DataFrame(cm, columns=['Predicted N', 'Predicted R'], index=['Actual N', 'Actual R'])
  elif metric == 'cvssV3_scope':
    table = pd.DataFrame(cm, columns=['Predicted C', 'Predicted U'], index=['Actual C', 'Actual U'])
  else:
    table = pd.DataFrame(cm, columns=['Predicted H', 'Predicted L', 'Predicted N'], index=['Actual H', 'Actual L', 'Actual N'])

  # Accuracy
  print("accuracy:", accuracy_score(y_test, y_pred))
  print(classification_report(y_test, y_pred, digits=4))
  print(table)
  print('-'*70)

  # Returns a list containing the prediction results and a data frame for testing.
  return y_pred



## Extraction of input tokens

In [8]:
def ExtractionTokens(vectorizer, lr):
  # Get the name of the feature (word).
  feature_names = vectorizer.get_feature_names_out() 
  
  # Binary time.
  if lr.coef_.shape[0] == 1:
    coef = lr.coef_[0] 

   # Extraction of positively and negatively impacted words.
    positive_words = [feature_names[i] for i in coef.argsort()[-10:]]
    negative_words = [feature_names[i] for i in coef.argsort()[:10]]

    print('Positive words:', positive_words)
    print('Negative words:', negative_words)


  # At three values or more.
  if lr.coef_.shape[0] != 1:

    # 分類区分ごとのポジティブとネガティブな単語の抽出
    coef_matrix = lr.coef_
    for i, class_name in enumerate(lr.classes_):
      coef = coef_matrix[i]
      positive_words = [feature_names[j] for j in coef.argsort()[-10:]]
      negative_words = [feature_names[j] for j in coef.argsort()[:10]]
      
      print("Class:", class_name)
      print("Positive words:", positive_words)
      print("Negative words:", negative_words)
    print('-' * 50)

# Example

In [11]:
df_tmp = pd.DataFrame()
df_result = pd.DataFrame()

metrics = ['cvssV3_attackVector','cvssV3_attackComplexity','cvssV3_privilegesRequired','cvssV3_userInteraction',
           'cvssV3_scope','cvssV3_confidentialityImpact','cvssV3_integrityImpact','cvssV3_availabilityImpact']

for metric in metrics:
  train_sentence, test_sentence, y_train, y_test = DataSetMake(metric)

  X_train, vectorizer = NLP_train(train_sentence)
  lr = MLR_train(X_train, y_train)

  X_test = NLP_test(test_sentence, vectorizer)
  y_pred = MLR_test(metric, X_test, y_test, lr)

  ExtractionTokens(vectorizer, lr)

  df_tmp[metric] = y_test[metric]
  df_tmp[f'pred_{metric}'] = y_pred
  df_result[metric] = df_tmp[metric] == df_tmp[f'pred_{metric}']

print(df_result.value_counts())

----------y_train: cvssV3_attackVector
NETWORK                17045
LOCAL                   5046
ADJACENT_NETWORK         575
PHYSICAL                 297
dtype: int64
----------y_test: cvssV3_attackVector
NETWORK                17045
LOCAL                   5047
ADJACENT_NETWORK         575
PHYSICAL                 296
dtype: int64
dimensions: (22963, 36446)


  y = column_or_1d(y, warn=True)


accuracy: 0.9033227365762313
                  precision    recall  f1-score   support

ADJACENT_NETWORK     0.7989    0.4974    0.6131       575
           LOCAL     0.8613    0.7741    0.8154      5047
         NETWORK     0.9165    0.9643    0.9398     17045
        PHYSICAL     0.8382    0.3851    0.5278       296

        accuracy                         0.9033     22963
       macro avg     0.8537    0.6552    0.7240     22963
    weighted avg     0.9004    0.9033    0.8990     22963

          Predicted A  Predicted L  Predicted N  Predicted P
Actual A          286           30          256            3
Actual L           16         3907         1114           10
Actual N           54          546        16436            9
Actual P            2           53          127          114
----------------------------------------------------------------------
Class: ADJACENT_NETWORK
Positive words: ['authenticated', 'devices', 'segment', 'wireless', 'zte', 'network', 'hyper', 'bluetoot

  y = column_or_1d(y, warn=True)


accuracy: 0.9588903888864695
              precision    recall  f1-score   support

        HIGH     0.8591    0.4483    0.5892      1510
         LOW     0.9624    0.9948    0.9784     21453

    accuracy                         0.9589     22963
   macro avg     0.9108    0.7216    0.7838     22963
weighted avg     0.9556    0.9589    0.9528     22963

          Predicted H  Predicted L
Actual H          677          833
Actual L          111        21342
----------------------------------------------------------------------
Positive words: ['msm', 'html', 'multiple', 'parameter', 'jet', 'cross', 'injection', 'exploitable', 'xss', 'easily']
Negative words: ['race', 'difficult', 'middle', 'man', 'condition', 'mitm', 'timing', 'freertos', 'enabled', 'time']
----------y_train: cvssV3_privilegesRequired
NONE                         14712
LOW                           6490
HIGH                          1761
dtype: int64
----------y_test: cvssV3_privilegesRequired
NONE                      

  y = column_or_1d(y, warn=True)


accuracy: 0.8282889866306667
              precision    recall  f1-score   support

        HIGH     0.7466    0.4617    0.5705      1761
         LOW     0.7786    0.6892    0.7312      6490
        NONE     0.8515    0.9335    0.8906     14712

    accuracy                         0.8283     22963
   macro avg     0.7922    0.6948    0.7308     22963
weighted avg     0.8229    0.8283    0.8210     22963

          Predicted H  Predicted L  Predicted N
Actual H          813          407          541
Actual L          163         4473         1854
Actual N          113          865        13734
----------------------------------------------------------------------
Class: HIGH
Positive words: ['admins', 'hyper', 'netgear', 'admin', 'authenticated', 'administrative', 'administrator', 'administrators', 'high', 'privileged']
Negative words: ['low', 'additional', 'unauthenticated', 'csrf', 'elevate', 'non', 'wordpress', 'crafted', 'unprivileged', 'firefox']
Class: LOW
Positive words: ['ioct

  y = column_or_1d(y, warn=True)


accuracy: 0.9190436789618082
              precision    recall  f1-score   support

        NONE     0.9185    0.9616    0.9395     15019
    REQUIRED     0.9203    0.8386    0.8776      7944

    accuracy                         0.9190     22963
   macro avg     0.9194    0.9001    0.9085     22963
weighted avg     0.9191    0.9190    0.9181     22963

          Predicted N  Predicted R
Actual N        14442          577
Actual R         1282         6662
----------------------------------------------------------------------
Positive words: ['successful', 'human', 'site', 'interaction', 'redirect', 'crafted', 'person', 'cross', 'csrf', 'xss']
Negative words: ['traversal', 'sql', 'packet', 'authenticated', 'ssrf', 'local', 'guest', 'packets', 'kernel', 'upload']
----------y_train: cvssV3_scope
UNCHANGED       19028
CHANGED          3935
dtype: int64
----------y_test: cvssV3_scope
UNCHANGED       19027
CHANGED          3936
dtype: int64
dimensions: (22963, 36193)


  y = column_or_1d(y, warn=True)


accuracy: 0.9610678047293472
              precision    recall  f1-score   support

     CHANGED     0.9557    0.8105    0.8771      3936
   UNCHANGED     0.9620    0.9922    0.9769     19027

    accuracy                         0.9611     22963
   macro avg     0.9588    0.9013    0.9270     22963
weighted avg     0.9609    0.9611    0.9598     22963

          Predicted C  Predicted U
Actual C         3190          746
Actual U          148        18879
----------------------------------------------------------------------
Positive words: ['improperly', 'foxit', 'needed', 'cvss', 'password', 'objects', 'attackers', 'forgery', 'sql', 'csrf']
Negative words: ['xss', 'significantly', 'scripting', 'redirect', 'sandbox', 'escape', 'cross', 'hyper', 'javascript', 'ssrf']
----------y_train: cvssV3_confidentialityImpact
HIGH                            13531
NONE                             4914
LOW                              4518
dtype: int64
----------y_test: cvssV3_confidentialityImpact

  y = column_or_1d(y, warn=True)


accuracy: 0.8603405478378261
              precision    recall  f1-score   support

        HIGH     0.8567    0.9402    0.8965     13532
         LOW     0.9019    0.7408    0.8134      4517
        NONE     0.8378    0.7503    0.7916      4914

    accuracy                         0.8603     22963
   macro avg     0.8654    0.8104    0.8338     22963
weighted avg     0.8615    0.8603    0.8577     22963

          Predicted H  Predicted L  Predicted N
Actual H        12723          280          529
Actual L          986         3346          185
Actual N         1143           84         3687
----------------------------------------------------------------------
Class: HIGH
Positive words: ['possibly', 'critical', 'takeover', 'sql', 'escalation', 'execute', 'unspecified', 'confidentiality', 'complete', 'execution']
Negative words: ['subset', 'xss', 'delete', 'denial', 'vectors', 'deletion', 'qemu', 'javascript', 'spoofing', 'overall']
Class: LOW
Positive words: ['javascript', 'access

  y = column_or_1d(y, warn=True)


accuracy: 0.8687889213081914
              precision    recall  f1-score   support

        HIGH     0.8641    0.9105    0.8867     11759
         LOW     0.9284    0.8094    0.8649      4119
        NONE     0.8464    0.8340    0.8402      7085

    accuracy                         0.8688     22963
   macro avg     0.8797    0.8513    0.8639     22963
weighted avg     0.8702    0.8688    0.8684     22963

          Predicted H  Predicted L  Predicted N
Actual H        10707          198          854
Actual L          567         3334          218
Actual N         1117           59         5909
----------------------------------------------------------------------
Class: HIGH
Positive words: ['deletion', 'modification', 'write', 'corruption', 'unspecified', 'takeover', 'execute', 'csrf', 'escalation', 'execution']
Negative words: ['xss', 'needed', 'leak', 'denial', 'information', 'disclose', 'redirect', 'need', 'javascript', 'enumerate']
Class: LOW
Positive words: ['reflected', 'inject

  y = column_or_1d(y, warn=True)


accuracy: 0.8810695466620215
              precision    recall  f1-score   support

        HIGH     0.8901    0.9143    0.9021     13071
         LOW     0.8349    0.3182    0.4608       572
        NONE     0.8691    0.8690    0.8690      9320

    accuracy                         0.8811     22963
   macro avg     0.8647    0.7005    0.7440     22963
weighted avg     0.8802    0.8811    0.8777     22963

          Predicted H  Predicted L  Predicted N
Actual H        11951           31         1089
Actual L          259          182          131
Actual N         1216            5         8099
----------------------------------------------------------------------
Class: HIGH
Positive words: ['escalation', 'escalate', 'credentials', 'availability', 'execution', 'execute', 'corruption', 'takeover', 'crash', 'denial']
Negative words: ['xss', 'partial', 'scripting', 'disclosure', 'information', 'disclose', 'needed', 'redirect', 'cpanel', 'expose']
Class: LOW
Positive words: ['cause', 'pro