<a href="https://colab.research.google.com/github/sho6210/MLR/blob/main/MLR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Libraries

In [None]:
import sklearn
print(sklearn.__version__)
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from numpy.lib.function_base import vectorize
from sklearn.feature_extraction.text import CountVectorizer
from numpy.core.fromnumeric import size
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import tqdm
import time
import json
import re
import csv
import urllib.request
import shutil

1.2.2


# Data load

In [None]:
# timer (read again at the end of the notebook to report total processing time)
time_sta = time.time()

# header
HEADER = ['CVE-ID', 'Description', 'CVSS Base Score', 'AV', 'AC', 'PR', 'UI', 'S', 'C', 'I', 'A']
# Two-digit years of the NVD yearly feeds to fetch (2007-2019).
filelist = ['07', '08', '09', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19']

# One pattern per CVSS v3 base metric, in HEADER order (AV, AC, PR, UI, S, C, I, A).
# The single-letter metrics keep their leading '/' so that e.g. '/A:' cannot match
# inside '/AV:' or '/AC:'. The matched text (e.g. 'AV:N', '/S:U') is written as-is.
VECTOR_PATTERNS = [
  re.compile(pattern)
  for pattern in (r"AV:.", r"AC:.", r"PR:.", r"UI:.", r"/S:.", r"/C:.", r"/I:.", r"/A:.")
]


def _cve_item_to_row(item):
  """Convert one entry of ``CVE_Items`` into a CSV row.

  Args:
    item: dict for a single CVE as found in the NVD JSON feed.

  Returns:
    A list ``[id, description, base score, AV, AC, PR, UI, S, C, I, A]``,
    or ``None`` when the entry has no usable CVSS v3 data (a key is missing,
    the vector string is ``None``, or a metric is absent from the vector).
  """
  try:
    cve = item["cve"]
    description = "".join(
      entry['value'] for entry in cve["description"]['description_data'])
    cve_id = cve["CVE_data_meta"]["ID"]
    cvss_v3 = item["impact"]["baseMetricV3"]["cvssV3"]
    score = cvss_v3["baseScore"]
    vector = cvss_v3["vectorString"]
  except KeyError:
    # Entries without CVSS v3 scoring are skipped.
    return None

  if vector is None:
    # Without a vector there is nothing to label the row with; skipping avoids
    # writing metric values left over from the previously processed CVE.
    return None

  matches = [pattern.search(vector) for pattern in VECTOR_PATTERNS]
  if any(match is None for match in matches):
    # Incomplete vector: skip rather than crash on `.group()` of None.
    return None

  return [cve_id, description, score] + [match.group() for match in matches]


# Reading json files
for year in filelist:
  # Download and unpack through the same path (the original downloaded to the
  # current directory but unpacked from /content, which only agree on Colab).
  zip_path = '/content/nvdcve-1.1-20{}.json.zip'.format(year)
  urllib.request.urlretrieve('https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-20{}.json.zip'.format(year), zip_path)
  shutil.unpack_archive(zip_path, '/content')
  filename_json = '/content/nvdcve-1.1-20{}.json'.format(year)
  with open(filename_json, 'r', encoding='utf-8') as json_open:
    json_load = json.load(json_open)

  # Output to csv file
  filename_csv = './data_set_20{0}.csv'.format(year)
  # newline='' is what the csv module expects, so that it controls line endings itself.
  with open(filename_csv, 'w', encoding='utf-8', newline='') as f:
    # Header output
    writer = csv.writer(f)
    writer.writerow(HEADER)

    # Output of contents
    for item in json_load["CVE_Items"]:
      row = _cve_item_to_row(item)
      if row is not None:
        writer.writerow(row)


# Data-Set

In [None]:
# Load the per-year CSV files (2007 through 2019) in chronological order.
# df_0000 storage list
list_df = [
  pd.read_csv("/content/data_set_{}.csv".format(year), header=0)
  for year in range(2007, 2020)
]

# Keep the individual per-year names available as well.
(df_2007, df_2008, df_2009, df_2010, df_2011, df_2012, df_2013,
 df_2014, df_2015, df_2016, df_2017, df_2018, df_2019) = list_df

# Reading and data splitting

In [None]:
def DataMake(split, metric, list_df):
  """Split a list of data frames into training and test arrays.

  The first ``split`` frames of ``list_df`` form the training set and the
  remaining frames form the test set. The sizes of both parts are printed.

  Args:
    split: number of leading frames used for training.
    metric: name of the column holding the labels (e.g. 'AV').
    list_df: list of data frames, each with ``metric`` and 'Description' columns.

  Returns:
    Tuple ``(y_train, y_test, train_sentence, test_sentence, df_test)``:
    the concatenated label arrays, the concatenated description arrays, and
    the test frames concatenated into a single data frame.
  """
  train_frames, test_frames = list_df[:split], list_df[split:]
  print('metric:', metric)
  print('train:', len(train_frames), 'test:', len(test_frames))

  # Single data frame holding every test row.
  df_test = pd.concat(test_frames)

  # (labels, descriptions) per frame, training part first, then test part.
  train_pairs = [
    (frame[metric].values, frame['Description'].values) for frame in train_frames
  ]
  test_pairs = [
    (frame[metric].values, frame['Description'].values) for frame in test_frames
  ]

  # Flatten the per-frame arrays into one array each.
  y_train = np.concatenate([labels for labels, _ in train_pairs], axis=0)
  y_test = np.concatenate([labels for labels, _ in test_pairs], axis=0)
  train_sentence = np.concatenate([texts for _, texts in train_pairs], axis=0)
  test_sentence = np.concatenate([texts for _, texts in test_pairs], axis=0)

  return y_train, y_test, train_sentence, test_sentence, df_test

# Natural Language Processing (train)

In [None]:
def NLP_train(train_sentence):
  """Build bag-of-words features from the training sentences.

  A CountVectorizer with English stop words is fitted on ``train_sentence``.

  Returns:
    Tuple ``(X_train, vectorizer)``: the feature matrix for the training
    sentences and the fitted vectorizer, to be reused on the test sentences.
  """
  bow = CountVectorizer(stop_words="english")
  features = bow.fit_transform(train_sentence)
  return features, bow

# Multinomial Logistic Regression (train)

In [None]:
def MLR_train(X_train, y_train):
  """Fit a logistic regression classifier on the vectorized training data.

  Args:
    X_train: feature matrix of the training sentences.
    y_train: labels aligned with the rows of ``X_train``.

  Returns:
    The fitted LogisticRegression model (C=0.1, random_state=0, n_jobs=-1).
  """
  # fit() returns the estimator itself, so the fitted model can be returned directly.
  return LogisticRegression(C=0.1, random_state=0, n_jobs=-1).fit(X_train, y_train)



# Natural Language Processing (test)

In [None]:
def NLP_test(test_sentence, vectorizer):
  """Turn the test sentences into features with an already-fitted vectorizer.

  Only ``transform`` is called, so ``vectorizer`` itself is not re-fitted here.

  Returns:
    Whatever ``vectorizer.transform(test_sentence)`` produces.
  """
  return vectorizer.transform(test_sentence)

# Multinomial Logistic Regression (test)

In [None]:
def MLR_test(metric, X_test, y_test, lr, df_test=None):
  """Evaluate a fitted classifier on the test data and print the results.

  Prints the accuracy and a confusion-matrix table, followed by a separator.

  Args:
    metric: name of the metric being evaluated (e.g. 'AV'). Kept for
      interface compatibility; the table labels are taken from the data.
    X_test: feature matrix of the test sentences.
    y_test: true labels aligned with the rows of ``X_test``.
    lr: fitted model exposing ``predict``.
    df_test: optional data frame of the test rows, returned unchanged. When
      omitted, the notebook-level ``df_test`` global is used if it exists
      (this is what the function implicitly returned before), else ``None``.

  Returns:
    Tuple ``(y_pred, df_test)``: the predicted labels and the test data frame.
  """
  # Test data to confirm accuracy.
  y_pred = lr.predict(X_test)

  # Use the labels that actually occur, in sorted order, and pass them to
  # confusion_matrix explicitly. Hard-coding the expected classes per metric
  # broke the table construction whenever a class was absent from the test data.
  labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))
  cm = confusion_matrix(y_test, y_pred, labels=labels)

  # Labels such as 'AV:N' or '/S:U' are shown by their value part ('N', 'U');
  # labels without a ':' are shown unchanged.
  short_names = [str(label).rsplit(':', 1)[-1] for label in labels]
  table = pd.DataFrame(
    cm,
    columns=['Predicted ' + name for name in short_names],
    index=['Actual ' + name for name in short_names],
  )

  # Accuracy
  print("accuracy:", accuracy_score(y_test, y_pred))
  print(table)
  print('-'*70)

  # Backward compatibility: earlier callers relied on the global df_test.
  if df_test is None:
    df_test = globals().get('df_test')

  # Returns a list containing the prediction results and a data frame for testing.
  return y_pred, df_test



# Example

In [None]:

# Column names of the CVSS metrics; one classifier is trained and evaluated per metric.
metrics = ['AV', 'AC', 'PR', 'UI', 'S', 'C', 'I', 'A']
# Number of leading data frames in list_df used for training; the rest are used for testing.
split = 12

for metric in metrics:
  # Labels and descriptions for this metric, split into training and test parts.
  y_train, y_test, train_sentence, test_sentence, df_test = DataMake(split, metric, list_df)

  # Fit the bag-of-words vectorizer and the classifier on the training part only.
  X_train, vectorizer = NLP_train(train_sentence)
  lr = MLR_train(X_train, y_train)

  # Vectorize the test sentences with the fitted vectorizer, then print accuracy and confusion matrix.
  X_test = NLP_test(test_sentence, vectorizer)
  MLR_test(metric, X_test, y_test, lr)

# timer
# time_sta was recorded at the start of the data-load cell, so this covers everything since then.
time_end = time.time()
print('Processing time:', time_end - time_sta)

metric: AV
train: 12 test: 1
accuracy: 0.8685452077812964
          Predicted A  Predicted L  Predicted N  Predicted P
Actual A          151           27          201            1
Actual L            9         2299          902            8
Actual N           47          714        10894            5
Actual P            1           42           77           95
----------------------------------------------------------------------
metric: AC
train: 12 test: 1
accuracy: 0.9555354488463775
          Predicted H  Predicted L
Actual H          374          524
Actual L          164        14411
----------------------------------------------------------------------
metric: PR
train: 12 test: 1
accuracy: 0.7880178375234279
          Predicted H  Predicted L  Predicted N
Actual H          372          378          392
Actual L          118         2516         1692
Actual N           61          639         9305
----------------------------------------------------------------------
metric: UI
