<a href="https://colab.research.google.com/github/sho6210/SecDev2023/blob/main/MLR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Libraries

In [13]:
import sklearn
print(sklearn.__version__)
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from numpy.lib.function_base import vectorize
from sklearn.feature_extraction.text import CountVectorizer
from numpy.core.fromnumeric import size
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import tqdm
import time
import json
import re
import csv
import urllib.request
import shutil

1.2.2


# Data

In [14]:
# Load the CVE dataset (file name: cve_2018-2020_complete_dataset.csv) from the project repository.
df = pd.read_csv(
    "https://raw.githubusercontent.com/sho6210/SecDev2023/main/data/cve_2018-2020_complete_dataset.csv",
    header=0,
)

# Input: the 'Description' column, kept as a one-column frame.
X = df[['Description']]

# Targets: the eight cvssV3_* columns, one classifier target each.
y = df[[
    'cvssV3_attackVector',
    'cvssV3_attackComplexity',
    'cvssV3_privilegesRequired',
    'cvssV3_userInteraction',
    'cvssV3_scope',
    'cvssV3_confidentialityImpact',
    'cvssV3_integrityImpact',
    'cvssV3_availabilityImpact',
]]

# Half of the rows are held out for testing; random_state pins the shuffle.
train_sentence, test_sentence, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0
)

# Show how often each combination of target values occurs in either split.
print("----------y_train:", y_train.value_counts())
print("----------y_test:", y_test.value_counts())


----------y_train: cvssV3_attackVector  cvssV3_attackComplexity  cvssV3_privilegesRequired  cvssV3_userInteraction  cvssV3_scope  cvssV3_confidentialityImpact  cvssV3_integrityImpact  cvssV3_availabilityImpact
NETWORK              LOW                      NONE                       NONE                    UNCHANGED     HIGH                          HIGH                    HIGH                         2914
                                                                         REQUIRED                CHANGED       LOW                           LOW                     NONE                         1701
LOCAL                LOW                      LOW                        NONE                    UNCHANGED     HIGH                          HIGH                    HIGH                         1551
NETWORK              LOW                      NONE                       REQUIRED                UNCHANGED     HIGH                          HIGH                    HIGH                        

# Make a data set

# Natural Language Processing (train)

In [15]:
def NLP_train(train_sentence):
  """Fit a bag-of-words vectorizer on the training descriptions.

  Args:
    train_sentence: Table-like object whose 'Description' column holds the
      raw texts (read through ``['Description'].values``).

  Returns:
    A ``(X_train, vectorizer)`` tuple: the matrix returned by
    ``fit_transform`` and the fitted ``CountVectorizer`` itself, so the same
    vocabulary can be reused on other data.
  """
  # English stop words are removed before counting tokens.
  bow = CountVectorizer(stop_words="english")
  descriptions = train_sentence['Description'].values
  features = bow.fit_transform(descriptions)

  # Report the (rows, vocabulary size) shape of the feature matrix.
  print('dimensions:', features.shape)

  return features, bow

# Multinomial Logistic Regression (train)

In [16]:
def MLR_train(X_train, y_train):
  """Fit a logistic-regression classifier on vectorized features.

  Args:
    X_train: Feature matrix passed to ``LogisticRegression.fit``.
    y_train: Target labels for the rows of ``X_train``.

  Returns:
    The fitted ``LogisticRegression`` instance.
  """
  # Fixed hyper-parameters used for every target in this notebook.
  hyperparams = {'C': 0.1, 'random_state': 0, 'n_jobs': -1}
  classifier = LogisticRegression(**hyperparams)
  classifier.fit(X_train, y_train)
  return classifier



# Natural Language Processing (test)

In [17]:
def NLP_test(test_sentence, vectorizer):
  """Vectorize descriptions with an already-fitted vectorizer.

  Only ``transform`` is called, so the vocabulary learned elsewhere is
  reused unchanged.

  Args:
    test_sentence: Table-like object whose 'Description' column holds the
      raw texts (read through ``['Description'].values``).
    vectorizer: Object exposing ``transform``; in this notebook, the
      ``CountVectorizer`` returned by ``NLP_train``.

  Returns:
    Whatever ``vectorizer.transform`` returns for the descriptions.
  """
  descriptions = test_sentence['Description'].values
  return vectorizer.transform(descriptions)

# Multinomial Logistic Regression (test)

In [18]:
def MLR_test(metric, X_test, y_test, lr):
  """Evaluate a fitted classifier on held-out data and print a report.

  Prints, in order: the accuracy, the classification report (4 digits), a
  labelled confusion matrix, and a 70-character separator line.

  Args:
    metric: Name of the target being evaluated. Kept so existing callers
      keep working; the confusion-matrix labels are derived from the data
      rather than looked up from this name.
    X_test: Feature matrix accepted by ``lr.predict``.
    y_test: True labels for the rows of ``X_test``.
    lr: Fitted classifier exposing ``predict``.

  Returns:
    The predictions produced by ``lr.predict(X_test)``.
  """
  y_pred = lr.predict(X_test)

  # Pin the label order explicitly (sorted union of true and predicted
  # labels) so the matrix rows/columns and the table headers below are
  # guaranteed to agree, even when a class is missing from this split.
  labels = np.unique(np.concatenate([np.ravel(y_test), np.ravel(y_pred)]))
  cm = confusion_matrix(y_test, y_pred, labels=labels)

  # Headers use the first letter of each class (e.g. NETWORK -> N). If two
  # classes share an initial, fall back to the full names to stay unambiguous.
  initials = [str(label)[:1].upper() for label in labels]
  if len(set(initials)) == len(initials):
    names = initials
  else:
    names = [str(label) for label in labels]
  table = pd.DataFrame(
      cm,
      columns=[f'Predicted {name}' for name in names],
      index=[f'Actual {name}' for name in names],
  )

  # Accuracy
  print("accuracy:", accuracy_score(y_test, y_pred))
  print(classification_report(y_test, y_pred, digits=4))
  print(table)
  print('-'*70)

  # Only the predictions are returned; the report is print-only.
  return y_pred



## Extraction of input tokens

In [19]:
def ExtractionTokens(vectorizer, lr):
  """Print the tokens with the largest and smallest model coefficients.

  For each coefficient row, the ten features with the highest weights are
  printed as "Positive words" and the ten with the lowest weights as
  "Negative words", both in ascending weight order.

  Args:
    vectorizer: Object exposing ``get_feature_names_out()``; feature ``i``
      names column ``i`` of the coefficient matrix.
    lr: Fitted model exposing ``coef_`` (2-D array) and ``classes_``.

  Returns:
    None. All results are printed.
  """
  # Get the name of the feature (word).
  vocab = vectorizer.get_feature_names_out()

  def ranked_words(weights):
    # Sort once; the tail holds the largest weights, the head the smallest.
    order = weights.argsort()
    strongest = [vocab[k] for k in order[-10:]]
    weakest = [vocab[k] for k in order[:10]]
    return strongest, weakest

  # Binary target: a single coefficient row, printed without a class header.
  if lr.coef_.shape[0] == 1:
    strongest, weakest = ranked_words(lr.coef_[0])
    print('Positive words:', strongest)
    print('Negative words:', weakest)
    return

  # Three or more classes: extract the positive and negative words for
  # each class, one coefficient row per class.
  weight_rows = lr.coef_
  for row_idx, label in enumerate(lr.classes_):
    strongest, weakest = ranked_words(weight_rows[row_idx])
    print("Class:", label)
    print("Positive words:", strongest)
    print("Negative words:", weakest)

  # Separator is printed once, after all classes.
  print('-' * 50)

# Example: train and evaluate one classifier per CVSS v3 metric

In [20]:
# Targets to model; one classifier is trained and evaluated per entry.
metrics = ['cvssV3_attackVector','cvssV3_attackComplexity','cvssV3_privilegesRequired','cvssV3_userInteraction',
           'cvssV3_scope','cvssV3_confidentialityImpact','cvssV3_integrityImpact','cvssV3_availabilityImpact']

# The bag-of-words features depend only on the descriptions, never on the
# metric being predicted, so vectorize once here instead of once per metric.
# (As a result the "dimensions:" line is printed a single time.)
X_train, vectorizer = NLP_train(train_sentence)
X_test = NLP_test(test_sentence, vectorizer)

# df_tmp holds true and predicted labels side by side; df_result holds one
# boolean column per metric telling whether each test row was predicted correctly.
df_tmp = pd.DataFrame()
df_result = pd.DataFrame()

for metric in metrics:
  lr = MLR_train(X_train, y_train[metric])
  y_pred = MLR_test(metric, X_test, y_test[metric], lr)

  ExtractionTokens(vectorizer, lr)

  df_tmp[metric] = y_test[metric]
  df_tmp[f'pred_{metric}'] = y_pred
  df_result[metric] = df_tmp[metric] == df_tmp[f'pred_{metric}']

# Count how often each combination of per-metric correct/incorrect flags occurs.
print(df_result.value_counts())

dimensions: (22963, 36301)
accuracy: 0.9011888690502112
                  precision    recall  f1-score   support

ADJACENT_NETWORK     0.7880    0.4885    0.6031       563
           LOCAL     0.8504    0.7645    0.8051      4959
         NETWORK     0.9164    0.9630    0.9392     17128
        PHYSICAL     0.8471    0.4249    0.5660       313

        accuracy                         0.9012     22963
       macro avg     0.8505    0.6602    0.7283     22963
    weighted avg     0.8981    0.9012    0.8969     22963

          Predicted A  Predicted L  Predicted N  Predicted P
Actual A          275           32          251            5
Actual L           12         3791         1146           10
Actual N           58          566        16495            9
Actual P            4           69          107          133
----------------------------------------------------------------------
Class: ADJACENT_NETWORK
Positive words: ['wi', 'communication', 'segment', 'zte', 'devices', 'hyper',