# Microsoft: Classifying Cybersecurity Incidents with Machine Learning

## Part 2 - Model Training and Evaluation

In [2]:
import pandas as pd

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

In [4]:
# NOTE(review): the BigQuery cell magic below is commented out, and no other cell
# visible in this notebook assigns `df_train`, so on a fresh kernel the following
# cells fail with NameError. To reproduce the run, uncomment both lines and keep
# `%%bigquery df_train` as the very first line of the cell (cell magics must come first).
#%%bigquery df_train
#SELECT * FROM `data.mscyberdataset.train_table`

Query is running:   0%|          |



Downloading:   0%|          |

In [5]:
# Preview the first five rows of the training table.
df_train.head(n=5)

Unnamed: 0,Id,OrgId,IncidentId,AlertId,DetectorId,AlertTitle,Category,IncidentGrade,EntityType,EvidenceRole,...,RegistryKey,RegistryValueName,ApplicationId,OAuthApplicationId,ResourceIdName,OSFamily,CountryCode,Day,Month,Hour
0,137438953842,64,6661,4731,366,155,5,0,32,0,...,1631,635,2251,881,3586,5,242,3,6,17
1,824633721226,539,102061,1475076,880,1476,10,0,32,0,...,1631,635,2251,881,3586,5,242,8,6,18
2,996432413087,54,527816,1442982,132,9360,7,0,32,0,...,1631,635,2251,881,3586,5,242,6,6,12
3,1271310321504,795,41259,70207,161,131,16,0,32,0,...,1631,635,2251,881,3586,5,242,14,6,1
4,704374637112,576,30787,45958,163,134,10,0,32,0,...,1631,635,2251,881,3586,5,242,14,6,20


In [8]:
# Separate the target column from the predictor columns.
y = df_train['IncidentGrade']
X = df_train.drop(columns=['IncidentGrade'])

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Score every feature with the ANOVA F-test and keep the 15 highest-scoring ones.
from sklearn.feature_selection import SelectKBest, f_classif

selector = SelectKBest(f_classif, k=15)  # k is the number of features retained; tune as needed
selector.fit(X_train, y_train)
X_new = selector.transform(X_train)

# get_support() is a boolean mask over the input columns; use it to recover the names.
selected_features = X_train.columns[selector.get_support()]
print("Selected Features:", selected_features)

Selected Features: Index(['OrgId', 'IncidentId', 'AlertId', 'DetectorId', 'AlertTitle',
       'Category', 'EntityType', 'EvidenceRole', 'Sha256', 'IpAddress',
       'AccountSid', 'DeviceName', 'NetworkMessageId', 'CountryCode', 'Day'],
      dtype='object')


In [10]:
# Restrict the full feature frame to the columns picked by the ANOVA selector.
# Indexing with `selected_features` (computed in the feature-selection cell) instead of a
# hand-copied list keeps this cell correct if `k` or the input data changes.
X_new = X[selected_features]
X_new.head()

Unnamed: 0,OrgId,IncidentId,AlertId,DetectorId,AlertTitle,Category,EntityType,EvidenceRole,Sha256,IpAddress,AccountSid,DeviceName,NetworkMessageId,CountryCode,Day
0,64,6661,4731,366,155,5,32,0,138268,360606,441377,153085,529644,242,3
1,539,102061,1475076,880,1476,10,32,0,138268,360606,203,153085,529644,242,8
2,54,527816,1442982,132,9360,7,32,0,138268,360606,30741,153085,529644,242,6
3,795,41259,70207,161,131,16,32,0,138268,360606,3,153085,529644,242,14
4,576,30787,45958,163,134,10,32,0,138268,360606,25560,153085,529644,242,14


In [23]:
# Train RandomForest and XGBoost on the selected features and report validation metrics.

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=42)

# NOTE(review): the split is row-level and the features include IncidentId / AlertId, so rows
# belonging to the same incident can presumably end up on both sides of the split. The
# accuracy printed here is higher than the accuracy on the separate test table evaluated
# later in this notebook -- confirm whether a group-aware split would be more representative.


def _fit_and_report(label, model):
    """Fit ``model`` on the training split and print its validation metrics.

    Parameters
    ----------
    label : str
        Name shown in the "Evaluating ..." banner.
    model : estimator
        Unfitted classifier exposing ``fit`` and ``predict``; it is fitted in place.

    Returns
    -------
    The predictions of the fitted model for ``X_test``.
    """
    print(f"\nEvaluating {label}...")
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, predictions))
    print("Classification Report:")
    print(classification_report(y_test, predictions))
    return predictions


# Initialize the models (seeded so repeated runs give the same fitted models).
model_rf = RandomForestClassifier(random_state=42)
model_xgb = XGBClassifier(random_state=42)

# The fitted models and their predictions keep their original names for later cells.
y_pred_rf = _fit_and_report("RandomForest", model_rf)
y_pred_xgb = _fit_and_report("XGBoost", model_xgb)


Evaluating RandomForest...
Accuracy: 0.9877273651482162
Classification Report:
              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99    823049
         1.0       0.98      0.98      0.98    405664
         2.0       0.99      0.99      0.99    659879

    accuracy                           0.99   1888592
   macro avg       0.99      0.99      0.99   1888592
weighted avg       0.99      0.99      0.99   1888592


Evaluating XGBoost...
Accuracy: 0.917744012470666
Classification Report:
              precision    recall  f1-score   support

         0.0       0.89      0.96      0.92    823049
         1.0       0.93      0.85      0.89    405664
         2.0       0.94      0.91      0.93    659879

    accuracy                           0.92   1888592
   macro avg       0.92      0.91      0.91   1888592
weighted avg       0.92      0.92      0.92   1888592



In [12]:
import os

import joblib

# Save the trained RandomForest model to a file so it can be reloaded without retraining.
# joblib.dump does not create missing parent directories, so make sure the target
# folder exists first; otherwise the dump fails with FileNotFoundError on a clean checkout.
os.makedirs('model', exist_ok=True)
joblib.dump(model_rf, 'model/random_forest_model.pkl')

In [16]:
# NOTE(review): the BigQuery cell magic below is commented out, and no other cell
# visible in this notebook assigns `df_test`, so on a fresh kernel the following
# cells fail with NameError. To reproduce the run, uncomment both lines and keep
# `%%bigquery df_test` as the very first line of the cell (cell magics must come first).
#%%bigquery df_test
#SELECT * FROM `data.mscyberdataset.test_table`

Query is running:   0%|          |



Downloading:   0%|          |

In [17]:
# Preview the first five rows of the test table.
df_test.head(n=5)

Unnamed: 0,Id,OrgId,IncidentId,AlertId,DetectorId,AlertTitle,Category,IncidentGrade,EntityType,EvidenceRole,...,RegistryKey,RegistryValueName,ApplicationId,OAuthApplicationId,ResourceIdName,OSFamily,CountryCode,Day,Month,Hour
0,1640677510775,753,24182,289374,541,485,10,0,2,0,...,1631,635,2251,881,22,5,242,5,6,21
1,1382979474591,1678,9685,1106034,1083,1927,7,0,2,0,...,1631,635,2251,881,4,5,242,13,6,12
2,1580547969849,878,21542,1691132,4429,26351,16,1,2,0,...,1631,635,2251,881,1499,5,242,10,6,7
3,463856473055,46,52165,641241,154,126,10,2,2,0,...,1631,635,2251,881,0,5,242,7,6,17
4,34359738465,67,522340,640492,154,126,10,0,2,0,...,1631,635,2251,881,2,5,242,12,6,13


In [18]:
# Build the test feature matrix from the columns chosen by the selector rather than a
# hand-copied list, so the test data always has the same features, in the same order,
# as the frame the model was trained on.
X2 = df_test[selected_features]
y2 = df_test['IncidentGrade']

In [13]:
# Restore the persisted RandomForest from disk.
# joblib files are pickle-based: only load files that come from a trusted source.
loaded_model_rf = joblib.load(filename='model/random_forest_model.pkl')

In [31]:
# Score the test feature matrix with the reloaded model.
y_pred_new = loaded_model_rf.predict(X=X2)

In [32]:
# Report overall accuracy and per-class precision/recall/F1 on the test table.
print("Accuracy:", accuracy_score(y_true=y2, y_pred=y_pred_new))
print(
    "Classification Report:",
    classification_report(y_true=y2, y_pred=y_pred_new),
    sep="\n",
)

Accuracy: 0.9330143436852683
Classification Report:
              precision    recall  f1-score   support

         0.0       0.93      0.95      0.94   1752895
         1.0       0.91      0.89      0.90    902664
         2.0       0.95      0.94      0.95   1492329

    accuracy                           0.93   4147888
   macro avg       0.93      0.93      0.93   4147888
weighted avg       0.93      0.93      0.93   4147888

