In [1]:
# import packages
import pandas as pd
pd.set_option('display.max_columns', None)
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import joblib
import json

TRAINING DATASET PREPROCESSING

In [2]:
#import the previously cleaned dataset
train_data_path = r'Processed_datasets/cleaned_train_data.csv'
train_data = pd.read_csv(train_data_path)

In [3]:
train_data.head()

Unnamed: 0,incidentid,alertid,detectorid,alerttitle,category,incidentgrade,entitytype,evidencerole,filename,folderpath,resourceidname,osfamily,osversion,countrycode,state,city,year,month,day,hour
0,612,123247,7,6,initialaccess,truepositive,ip,related,289573,117668,3586,5,66,31,6,3,2024,6,4,6
1,326,210035,58,43,exfiltration,falsepositive,user,impacted,289573,117668,3586,5,66,242,1445,10630,2024,6,14,3
2,58352,712507,423,298,initialaccess,falsepositive,url,related,289573,117668,3586,5,66,242,1445,10630,2024,6,13,4
3,32992,774301,2,2,commandandcontrol,benignpositive,url,related,289573,117668,3586,5,66,242,1445,10630,2024,6,10,16
4,4359,188041,9,74,execution,truepositive,user,impacted,289573,117668,3586,5,66,242,1445,10630,2024,6,15,1


In [4]:
train_data.nunique()

incidentid        14378
alertid           20621
detectorid         1181
alerttitle         3974
category             17
incidentgrade         3
entitytype           20
evidencerole          2
filename           1530
folderpath          936
resourceidname       23
osfamily              4
osversion            10
countrycode          79
state               193
city                343
year                  2
month                 6
day                  29
hour                 24
dtype: int64

In [5]:
# check for outliers in date manually
unique_year = train_data['year'].unique()
unique_month = train_data['month'].unique()
unique_day = train_data['day'].unique()
unique_hour = train_data['hour'].unique()

print("Unique values in 'year' column:", unique_year)
print("Unique values in 'month' column:", unique_month)
print("Unique values in 'day' column:", unique_day)
print("Unique values in 'hour' column:", unique_hour)

Unique values in 'year' column: [2024 2023]
Unique values in 'month' column: [ 6  5  2  1 12  3]
Unique values in 'day' column: [ 4 14 13 10 15  6  9  8 12  7 26  3  5 11 29 21 24  1 25 16 28 27 30 31
 23 20  2 22 17]
Unique values in 'hour' column: [ 6  3  4 16  1 13 23 10  2 18  7  0 21 14 17 20  5  8 22 19 12 11 15  9]


outlier treatment

In [6]:
numeric_columns = train_data.select_dtypes(include=['number']).columns

def cap_outliers(df, column, factor=1.5):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - factor * IQR
    upper_bound = Q3 + factor * IQR
    return df[column].clip(lower_bound, upper_bound)

# Apply outlier treatment only to numerical columns
# numerical_columns = train_data.select_dtypes(include=['number']).columns
# for col in numeric_columns:
#     train_data[col] = cap_outliers(train_data, col)

# # Check the result
# print(train_data.nunique())

WE DONT NEED OUTLIER TREATMENT FOR THESE COLUMNS BECAUSE THEY ONLY HAVE DISCRETE IDENTIFIERS AND CATEGORICAL COLUMNS. AS WE MANUALLY CHECKED THE DATETIME WHICH DOESNT CONTAIN ANY OUTLIERS

ENCODING

In [7]:
train_data.select_dtypes(include=['object']).columns

Index(['category', 'incidentgrade', 'entitytype', 'evidencerole'], dtype='object')

one hot encoding

In [8]:
# One-Hot Encoding
train_data = pd.get_dummies(train_data, columns=['evidencerole'], drop_first=True)

In [9]:
train_data.head(2)

Unnamed: 0,incidentid,alertid,detectorid,alerttitle,category,incidentgrade,entitytype,filename,folderpath,resourceidname,osfamily,osversion,countrycode,state,city,year,month,day,hour,evidencerole_related
0,612,123247,7,6,initialaccess,truepositive,ip,289573,117668,3586,5,66,31,6,3,2024,6,4,6,True
1,326,210035,58,43,exfiltration,falsepositive,user,289573,117668,3586,5,66,242,1445,10630,2024,6,14,3,False


label encoding

In [24]:
# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Identify categorical columns
categorical_columns = ['category', 'incidentgrade', 'entitytype']

# Apply LabelEncoder to each categorical column and save it
for col in categorical_columns:
    # Fit and transform the column using LabelEncoder
    train_data[col] = label_encoder.fit_transform(train_data[col])
    
    # Save the LabelEncoder for this column
    joblib.dump(label_encoder, f"D:\\Classifying-Cybersecurity-Incidents\\PKL_Files\\TRAIN\\{col}.pkl")

# Print confirmation message
print("The DataFrame `train_data` now has label-encoded categorical columns.")
print("The LabelEncoders for each column are saved as .pkl files in the 'PKL' folder.")


The DataFrame `train_data` now has label-encoded categorical columns.
The LabelEncoders for each column are saved as .pkl files in the 'PKL' folder.


In [26]:
train_data.head()

Unnamed: 0,incidentid,alertid,detectorid,alerttitle,category,incidentgrade,entitytype,filename,folderpath,resourceidname,osfamily,osversion,countrycode,state,city,year,month,day,hour,evidencerole_related
0,612,123247,7,6,9,2,7,289573,117668,3586,5,66,31,6,3,2024,6,4,6,True
1,326,210035,58,43,6,1,19,289573,117668,3586,5,66,242,1445,10630,2024,6,14,3,False
2,58352,712507,423,298,9,1,18,289573,117668,3586,5,66,242,1445,10630,2024,6,13,4,True
3,32992,774301,2,2,1,0,18,289573,117668,3586,5,66,242,1445,10630,2024,6,10,16,True
4,4359,188041,9,74,5,2,19,289573,117668,3586,5,66,242,1445,10630,2024,6,15,1,False


In [None]:
numerical_columns

scaling

In [None]:
# scaler = StandardScaler()

# Select the continuous numerical columns to scale
# columns_to_scale = []

# Apply Standard Scaling
# train_data[columns_to_scale] = scaler.fit_transform(train_data[columns_to_scale])

# Show the scaled data
# print(train_data[columns_to_scale].head())

WE DONT APPLY ANY SCALING BECAUSE WE DONT HAVE ANY CONTINOUS VALUES LIKE PRICE,AGE INSTEAD IT HAS ID,CATEGORICAL COLUMNS

In [27]:
train_data.shape

(22293, 20)

In [28]:
train_data.describe()

Unnamed: 0,incidentid,alertid,detectorid,alerttitle,category,incidentgrade,entitytype,filename,folderpath,resourceidname,osfamily,osversion,countrycode,state,city,year,month,day,hour
count,22293.0,22293.0,22293.0,22293.0,22293.0,22293.0,22293.0,22293.0,22293.0,22293.0,22293.0,22293.0,22293.0,22293.0,22293.0,22293.0,22293.0,22293.0,22293.0
mean,69985.685103,401845.8,102.437088,2874.114206,7.991791,0.906697,10.653927,262750.454358,107807.664155,3583.000807,4.898085,64.653524,223.724667,1350.266586,9929.542771,2023.999955,5.909792,9.789216,12.184856
std,119388.551059,454648.8,399.278423,11025.613849,3.576956,0.88135,5.595328,80890.968639,31935.469709,97.635623,0.70566,9.320349,62.761817,353.361415,2617.67434,0.006698,0.295124,6.176927,6.748757
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2023.0,1.0,1.0,0.0
25%,505.0,23774.0,2.0,2.0,6.0,0.0,7.0,289573.0,117668.0,3586.0,5.0,66.0,242.0,1445.0,10630.0,2024.0,6.0,5.0,7.0
50%,10049.0,214253.0,9.0,11.0,9.0,1.0,9.0,289573.0,117668.0,3586.0,5.0,66.0,242.0,1445.0,10630.0,2024.0,6.0,9.0,13.0
75%,83990.0,662610.0,44.0,188.0,9.0,2.0,18.0,289573.0,117668.0,3586.0,5.0,66.0,242.0,1445.0,10630.0,2024.0,6.0,13.0,18.0
max,599449.0,1718609.0,8983.0,109424.0,16.0,2.0,19.0,289573.0,117668.0,3586.0,5.0,66.0,242.0,1445.0,10630.0,2024.0,12.0,31.0,23.0


In [29]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22293 entries, 0 to 22292
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   incidentid            22293 non-null  int64
 1   alertid               22293 non-null  int64
 2   detectorid            22293 non-null  int64
 3   alerttitle            22293 non-null  int64
 4   category              22293 non-null  int64
 5   incidentgrade         22293 non-null  int64
 6   entitytype            22293 non-null  int64
 7   filename              22293 non-null  int64
 8   folderpath            22293 non-null  int64
 9   resourceidname        22293 non-null  int64
 10  osfamily              22293 non-null  int64
 11  osversion             22293 non-null  int64
 12  countrycode           22293 non-null  int64
 13  state                 22293 non-null  int64
 14  city                  22293 non-null  int64
 15  year                  22293 non-null  int64
 16  mont

In [30]:
# Export Datasets
train_data.to_csv(r"Processed_datasets/train_preprocessed.csv", index=False)

TESTING DATASET PREPROCESSING

In [31]:
#import the previously cleaned dataset
test_data_path = r'Processed_datasets/cleaned_test_data.csv'
test_data = pd.read_csv(test_data_path)

In [32]:
test_data.head()

Unnamed: 0,incidentid,alertid,detectorid,alerttitle,category,incidentgrade,entitytype,evidencerole,filename,folderpath,resourceidname,osfamily,osversion,countrycode,state,city,year,month,day,hour
0,11767,87199,524,563,lateralmovement,benignpositive,user,impacted,289573,117668,3586,5,66,242,1445,10630,2024,6,4,22
1,91158,632273,2,2,commandandcontrol,benignpositive,machine,impacted,289573,117668,3586,0,0,242,1445,10630,2024,6,3,12
2,32247,131719,2932,10807,lateralmovement,benignpositive,process,related,14,22,3586,5,66,242,1445,10630,2024,6,8,3
3,15294,917686,0,0,initialaccess,falsepositive,cloudlogonsession,related,289573,117668,3586,5,66,242,1445,10630,2024,6,12,12
4,7615,5944,27,18,discovery,benignpositive,user,impacted,289573,117668,3586,5,66,242,1445,10630,2024,6,6,17


In [33]:
test_data.nunique()

incidentid        15957
alertid           25284
detectorid         1310
alerttitle         4145
category             18
incidentgrade         3
entitytype           22
evidencerole          2
filename           1963
folderpath         1124
resourceidname       25
osfamily              4
osversion            12
countrycode          84
state               227
city                403
year                  1
month                 5
day                  31
hour                 24
dtype: int64

In [34]:
# check for outliers in date manually
unique_year = test_data['year'].unique()
unique_month = test_data['month'].unique()
unique_day = test_data['day'].unique()
unique_hour = test_data['hour'].unique()

print("Unique values in 'year' column:", unique_year)
print("Unique values in 'month' column:", unique_month)
print("Unique values in 'day' column:", unique_day)
print("Unique values in 'hour' column:", unique_hour)

Unique values in 'year' column: [2024]
Unique values in 'month' column: [6 5 1 2 3]
Unique values in 'day' column: [ 4  3  8 12  6 15 26 25  2  5  7 10 13 31 14 11  9 17 16 23  1 21 27 22
 24 20 30 28 29 18 19]
Unique values in 'hour' column: [22 12  3 17  8  4 11  5 20 21 19  2 16 18 13 23 14  9 10 15  6  7  1  0]


No outlier treatment needed

ENCODING

In [35]:
test_data.select_dtypes(include=['object']).columns

Index(['category', 'incidentgrade', 'entitytype', 'evidencerole'], dtype='object')

In [36]:
test_data.head(2)

Unnamed: 0,incidentid,alertid,detectorid,alerttitle,category,incidentgrade,entitytype,evidencerole,filename,folderpath,resourceidname,osfamily,osversion,countrycode,state,city,year,month,day,hour
0,11767,87199,524,563,lateralmovement,benignpositive,user,impacted,289573,117668,3586,5,66,242,1445,10630,2024,6,4,22
1,91158,632273,2,2,commandandcontrol,benignpositive,machine,impacted,289573,117668,3586,0,0,242,1445,10630,2024,6,3,12


label encoding

In [37]:
# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Identify categorical columns
categorical_columns = ['category', 'incidentgrade', 'entitytype']

# Apply LabelEncoder to each categorical column and save it
for col in categorical_columns:
    # Fit and transform the column using LabelEncoder
    test_data[col] = label_encoder.fit_transform(test_data[col])
    
    # Save the LabelEncoder for this column
    joblib.dump(label_encoder, f'PKL_Files/TEST/{col}.pkl')

# Print confirmation message
print("The DataFrame `test_data` now has label encoded categorical columns.")
print("The LabelEncoders for each column are saved as .pkl files in the 'PKL' folder.")

The DataFrame `test_data` now has label encoded categorical columns.
The LabelEncoders for each column are saved as .pkl files in the 'PKL' folder.


In [38]:
test_data.head()

Unnamed: 0,incidentid,alertid,detectorid,alerttitle,category,incidentgrade,entitytype,evidencerole,filename,folderpath,resourceidname,osfamily,osversion,countrycode,state,city,year,month,day,hour
0,11767,87199,524,563,11,0,21,impacted,289573,117668,3586,5,66,242,1445,10630,2024,6,4,22
1,91158,632273,2,2,1,0,9,impacted,289573,117668,3586,0,0,242,1445,10630,2024,6,3,12
2,32247,131719,2932,10807,11,0,16,related,14,22,3586,5,66,242,1445,10630,2024,6,8,3
3,15294,917686,0,0,10,1,5,related,289573,117668,3586,5,66,242,1445,10630,2024,6,12,12
4,7615,5944,27,18,5,0,21,impacted,289573,117668,3586,5,66,242,1445,10630,2024,6,6,17


In [39]:
train_data.shape

(22293, 20)

In [40]:
train_data.describe()

Unnamed: 0,incidentid,alertid,detectorid,alerttitle,category,incidentgrade,entitytype,filename,folderpath,resourceidname,osfamily,osversion,countrycode,state,city,year,month,day,hour
count,22293.0,22293.0,22293.0,22293.0,22293.0,22293.0,22293.0,22293.0,22293.0,22293.0,22293.0,22293.0,22293.0,22293.0,22293.0,22293.0,22293.0,22293.0,22293.0
mean,69985.685103,401845.8,102.437088,2874.114206,7.991791,0.906697,10.653927,262750.454358,107807.664155,3583.000807,4.898085,64.653524,223.724667,1350.266586,9929.542771,2023.999955,5.909792,9.789216,12.184856
std,119388.551059,454648.8,399.278423,11025.613849,3.576956,0.88135,5.595328,80890.968639,31935.469709,97.635623,0.70566,9.320349,62.761817,353.361415,2617.67434,0.006698,0.295124,6.176927,6.748757
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2023.0,1.0,1.0,0.0
25%,505.0,23774.0,2.0,2.0,6.0,0.0,7.0,289573.0,117668.0,3586.0,5.0,66.0,242.0,1445.0,10630.0,2024.0,6.0,5.0,7.0
50%,10049.0,214253.0,9.0,11.0,9.0,1.0,9.0,289573.0,117668.0,3586.0,5.0,66.0,242.0,1445.0,10630.0,2024.0,6.0,9.0,13.0
75%,83990.0,662610.0,44.0,188.0,9.0,2.0,18.0,289573.0,117668.0,3586.0,5.0,66.0,242.0,1445.0,10630.0,2024.0,6.0,13.0,18.0
max,599449.0,1718609.0,8983.0,109424.0,16.0,2.0,19.0,289573.0,117668.0,3586.0,5.0,66.0,242.0,1445.0,10630.0,2024.0,12.0,31.0,23.0


In [41]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22293 entries, 0 to 22292
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   incidentid            22293 non-null  int64
 1   alertid               22293 non-null  int64
 2   detectorid            22293 non-null  int64
 3   alerttitle            22293 non-null  int64
 4   category              22293 non-null  int64
 5   incidentgrade         22293 non-null  int64
 6   entitytype            22293 non-null  int64
 7   filename              22293 non-null  int64
 8   folderpath            22293 non-null  int64
 9   resourceidname        22293 non-null  int64
 10  osfamily              22293 non-null  int64
 11  osversion             22293 non-null  int64
 12  countrycode           22293 non-null  int64
 13  state                 22293 non-null  int64
 14  city                  22293 non-null  int64
 15  year                  22293 non-null  int64
 16  mont

In [42]:
# Export Datasets
test_data.to_csv(r"Processed_datasets/test_preprocessed.csv", index=False)