In [97]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import LabelEncoder

In [98]:
file = "CVSSv3.1.csv"
dataset = pd.read_csv(file)
df = pd.DataFrame(dataset)
print(df.head())

          cve_id     published_date  \
0  CVE-2019-0001  2019-01-15T21:29Z   
1  CVE-2019-0002  2019-01-15T21:29Z   
2  CVE-2019-0003  2019-01-15T21:29Z   
3  CVE-2019-0004  2019-01-15T21:29Z   
4  CVE-2019-0006  2019-01-15T21:29Z   

                                         description  \
0  Receipt of a malformed packet on MX Series dev...   
1  On EX2300 and EX3400 series, stateless firewal...   
2  When a specific BGP flowspec configuration is ...   
3  On Juniper ATP, the API key and the device key...   
4  A certain crafted HTTP packet can trigger an u...   

                                  vector_string attack_vector  \
0  CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:N/I:N/A:H       NETWORK   
1  CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H       NETWORK   
2  CVSS:3.1/AV:N/AC:H/PR:N/UI:N/S:U/C:N/I:N/A:H       NETWORK   
3  CVSS:3.1/AV:L/AC:L/PR:L/UI:N/S:U/C:H/I:N/A:N         LOCAL   
4  CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H       NETWORK   

  attack_complexity privileges_requir

In [99]:
LE = LabelEncoder()
df_le = df.copy()
#df_le = df_le.drop(['cve_id', 'published_date', 'description', 'vector_string'], axis=1)
df_le['attack_vector'] = LE.fit_transform(df_le['attack_vector'])
df_le['attack_complexity'] = LE.fit_transform(df_le['attack_complexity'])
df_le['privileges_required'] = LE.fit_transform(df_le['privileges_required'])
df_le['user_interaction'] = LE.fit_transform(df_le['user_interaction'])
df_le['scope'] = LE.fit_transform(df_le['scope'])
df_le['confidentiality_impact'] = LE.fit_transform(df_le['confidentiality_impact'])
df_le['integrity_impact'] = LE.fit_transform(df_le['integrity_impact'])
df_le['availability_impact'] = LE.fit_transform(df_le['availability_impact'])
df_le['base_severity'] = LE.fit_transform(df_le['base_severity'])
df_le.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39533 entries, 0 to 39532
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   attack_vector           39533 non-null  int64  
 1   attack_complexity       39533 non-null  int64  
 2   privileges_required     39533 non-null  int64  
 3   user_interaction        39533 non-null  int64  
 4   scope                   39533 non-null  int64  
 5   confidentiality_impact  39533 non-null  int64  
 6   integrity_impact        39533 non-null  int64  
 7   availability_impact     39533 non-null  int64  
 8   exploitability_score    39533 non-null  float64
 9   impact_score            39533 non-null  float64
 10  base_score              39533 non-null  float64
 11  base_severity           39533 non-null  int64  
dtypes: float64(3), int64(9)
memory usage: 3.6 MB


In [100]:
df_le.isnull().sum()

attack_vector             0
attack_complexity         0
privileges_required       0
user_interaction          0
scope                     0
confidentiality_impact    0
integrity_impact          0
availability_impact       0
exploitability_score      0
impact_score              0
base_score                0
base_severity             0
dtype: int64

In [101]:
df_le

Unnamed: 0,attack_vector,attack_complexity,privileges_required,user_interaction,scope,confidentiality_impact,integrity_impact,availability_impact,exploitability_score,impact_score,base_score,base_severity
0,2,1,2,0,1,2,2,0,3.9,3.6,7.5,1
1,2,1,2,0,1,0,0,0,3.9,5.9,9.8,0
2,2,0,2,0,1,2,2,0,2.2,3.6,5.9,3
3,1,1,1,0,1,0,2,2,1.8,3.6,5.5,3
4,2,1,2,0,1,0,0,0,3.9,5.9,9.8,0
...,...,...,...,...,...,...,...,...,...,...,...,...
39528,2,1,0,0,1,0,0,0,1.2,5.9,7.2,1
39529,2,1,1,0,1,1,2,2,2.8,1.4,4.3,3
39530,2,1,2,1,0,1,1,2,2.8,2.7,6.1,3
39531,2,1,0,0,1,0,0,0,1.2,5.9,7.2,1


In [102]:
X = df_le.drop('base_score', axis=1)
y = df_le['base_score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [103]:
model = LinearRegression()

model.fit(X_train, y_train)

In [104]:
y_pred = model.predict(X_test)

In [105]:
reg_model_diff = pd.DataFrame({'Actual value': y_test, 'Predicted value': y_pred})
reg_model_diff

Unnamed: 0,Actual value,Predicted value
4884,7.5,7.529612
624,7.8,7.791355
6491,4.4,4.341084
18985,8.8,8.741002
7600,8.8,8.762516
...,...,...
8504,3.3,3.264983
6384,7.5,7.529612
25912,3.7,3.641095
11275,9.1,9.136829


In [106]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = np.sqrt(mean_squared_error(y_test, y_pred))

print('Mean Absolute Error:', mae)
print('Mean Square Error:', mse)
print('Root Mean Square Error:', r2)

Mean Absolute Error: 0.04646216091583318
Mean Square Error: 0.003947842766852442
Root Mean Square Error: 0.06283186108060497
