In [2]:
pip install imbalanced-learn


Defaulting to user installation because normal site-packages is not writeable
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.12.3-py3-none-any.whl.metadata (8.3 kB)
Downloading imbalanced_learn-0.12.3-py3-none-any.whl (258 kB)
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.12.3
Note: you may need to restart the kernel to use updated packages.


In [7]:
from pathlib import Path

import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# End-to-end baseline: load the pre-split train/validation CSVs, encode the
# configured date and categorical columns, oversample the training set with
# SMOTE, fit a logistic regression, and report validation metrics.

# Step 1: Load the datasets
# Single place to edit when the CSV files move; every file is resolved against it.
DATA_DIR = Path(r"C:\Users\sonur\OneDrive\Desktop\Projects\Microsoft Cyber\VS Code")

X_train = pd.read_csv(DATA_DIR / "X_train.csv")
X_val = pd.read_csv(DATA_DIR / "X_val.csv")
y_train = pd.read_csv(DATA_DIR / "y_train.csv").squeeze()  # Convert to Series
y_val = pd.read_csv(DATA_DIR / "y_val.csv").squeeze()  # Convert to Series

# Inspect the column names and data types to identify categorical columns
print("Column names in X_train:", X_train.columns)
print("Data types in X_train:\n", X_train.dtypes)

# Step 2: Convert date columns to numeric format (seconds since the Unix epoch)
date_column = 'date_column'  # Replace 'date_column' with your actual date-time column name
if date_column in X_train.columns:
    for frame in (X_train, X_val):
        # Vectorised conversion instead of a per-row .timestamp() call: timezone-naive
        # values are interpreted as UTC, and missing dates become NaN instead of
        # raising part-way through the column.
        parsed_dates = pd.to_datetime(frame[date_column], utc=True)
        frame[date_column] = (parsed_dates - pd.Timestamp(0, tz='UTC')).dt.total_seconds()

# Step 3: Label encode categorical columns (replace with actual column names)
categorical_columns = ['bus_type', 'route_name']  # replace with your actual categorical columns
UNSEEN_LABEL_CODE = -1  # code given to validation labels that never occurred in training
label_encoders = {}  # column name -> fitted LabelEncoder, kept so codes can be decoded later
label_encoder = LabelEncoder()

for col in categorical_columns:
    if col not in X_train.columns:  # Check if the column exists
        print(f"Column '{col}' not found in X_train. Skipping...")
        continue

    # A fresh encoder per column so an earlier column's fit is not overwritten.
    label_encoder = LabelEncoder()
    # Cast explicitly so the encoded column has the same integer dtype on every platform.
    X_train[col] = label_encoder.fit_transform(X_train[col]).astype('int64')

    # LabelEncoder.transform raises ValueError on labels it was not fitted on, so the
    # validation column is mapped through the learned label -> code table instead and
    # anything unknown receives UNSEEN_LABEL_CODE.
    code_by_label = {label: code for code, label in enumerate(label_encoder.classes_)}
    val_codes = X_val[col].map(code_by_label)
    unseen_count = int(val_codes.isna().sum())
    if unseen_count:
        print(f"Column '{col}': {unseen_count} validation value(s) not seen in training; "
              f"encoded as {UNSEEN_LABEL_CODE}.")
    X_val[col] = val_codes.fillna(UNSEEN_LABEL_CODE).astype('int64')
    label_encoders[col] = label_encoder

# Step 4: Handle class imbalance using SMOTE
print("Handling class imbalance using SMOTE...")


def _numeric_features(frame, columns):
    """Return `frame[columns]` with any boolean columns cast to int64 (0/1)."""
    selected = frame[columns]
    bool_columns = selected.select_dtypes(include='bool').columns
    return selected.astype({name: 'int64' for name in bool_columns})


# Ensure only numeric columns are passed to SMOTE.
# 'number' + 'bool' keeps every numeric width (int32, uint8, float32, ...) and boolean
# flag columns; listing only 'float64'/'int64' would drop those without any notice.
numeric_columns = X_train.select_dtypes(include=['number', 'bool']).columns
dropped_columns = X_train.columns.difference(numeric_columns, sort=False)
if len(dropped_columns) > 0:
    # Make the exclusion visible instead of silently training on fewer features.
    print(f"Excluding {len(dropped_columns)} non-numeric column(s) from the model: "
          f"{list(dropped_columns)}")

X_train_numeric = _numeric_features(X_train, numeric_columns)
X_val_numeric = _numeric_features(X_val, numeric_columns)

# Oversampling is fitted on the training split only; the validation split is left
# untouched so the metrics below are computed on the original class distribution.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_numeric, y_train)

# Step 5: Train the Logistic Regression model
# NOTE(review): features are passed to the solver unscaled; if convergence warnings
# appear, standardising the features first is worth evaluating.
print("Training Logistic Regression...")
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train_resampled, y_train_resampled)

# Step 6: Evaluate the model on the validation set
y_pred_logreg = logreg.predict(X_val_numeric)

# Calculate performance metrics
# zero_division=0 scores a class that is never predicted as 0 (the value the default
# setting also uses) without emitting a warning for each such class.
print("Model Evaluation:")
accuracy = accuracy_score(y_val, y_pred_logreg)
precision, recall, f1, _ = precision_recall_fscore_support(
    y_val, y_pred_logreg, average='macro', zero_division=0
)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# Detailed classification report
print("Classification Report:")
print(classification_report(y_val, y_pred_logreg, zero_division=0))


Column names in X_train: Index(['Id', 'OrgId', 'IncidentId', 'AlertId', 'Timestamp', 'DetectorId',
       'AlertTitle', 'EntityType', 'EvidenceRole', 'DeviceId', 'Sha256',
       'IpAddress', 'Url', 'AccountSid', 'AccountUpn', 'AccountObjectId',
       'AccountName', 'DeviceName', 'NetworkMessageId', 'RegistryKey',
       'RegistryValueName', 'RegistryValueData', 'ApplicationId',
       'ApplicationName', 'OAuthApplicationId', 'FileName', 'FolderPath',
       'ResourceIdName', 'OSFamily', 'OSVersion', 'CountryCode', 'State',
       'City', 'Hour', 'DayOfWeek', 'Month', 'Category_CommandAndControl',
       'Category_CredentialAccess', 'Category_CredentialStealing',
       'Category_DefenseEvasion', 'Category_Discovery', 'Category_Execution',
       'Category_Exfiltration', 'Category_Exploit', 'Category_Impact',
       'Category_InitialAccess', 'Category_LateralMovement',
       'Category_Malware', 'Category_Persistence',
       'Category_PrivilegeEscalation', 'Category_Ransomware',
    

In [8]:
# Recompute the macro-averaged precision, recall and F1 for the validation
# predictions produced by the previous cell; the fourth returned element is discarded.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_true=y_val,
    y_pred=y_pred_logreg,
    average='macro',
)

# Report the macro-F1 value on its own.
print(f"Macro-F1 Score: {f1}")


Macro-F1 Score: 0.568016749232399
