In [206]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [207]:
# Random seed for the notebook; passed as random_state to train_test_split below.
seed = 42

# Data preparation

In [208]:
def clean_df(df):
    """Placeholder cleaning step: hand the input back unchanged.

    No cleaning is performed; the very same object that was passed in is
    returned (no copy is made).
    """
    cleaned = df
    return cleaned

def prep_features(df: pd.DataFrame, behavior_score_fill=None) -> pd.DataFrame:
    """Turn a raw data frame into the model feature matrix.

    Steps:
      1. Drop the "ID" and "Attack Type" columns when they are present.
      2. Fill missing "User_Behavior_Score" values.
      3. Parse "Timestamp" and replace it with a boolean "Is_AM" column
         (True when the hour is before 12).

    Parameters
    ----------
    df : pd.DataFrame
        Raw frame containing at least "User_Behavior_Score" and "Timestamp".
        It is not modified.
    behavior_score_fill : float, optional
        Value used to fill missing "User_Behavior_Score" entries. When None
        (the default) the mean of that column in ``df`` itself is used. Pass
        a statistic computed on the training data to impute other frames
        with the same value.

    Returns
    -------
    pd.DataFrame
        New frame with the remaining feature columns in their original order,
        followed by "Is_AM".
    """
    # drop() returns a new frame, so the assignments below never touch the caller's data.
    features = df.drop(columns=["ID", "Attack Type"], errors="ignore")

    if behavior_score_fill is None:
        behavior_score_fill = features["User_Behavior_Score"].mean()
    features["User_Behavior_Score"] = features["User_Behavior_Score"].fillna(
        behavior_score_fill
    )

    # datetime features
    timestamps = pd.to_datetime(features["Timestamp"])
    features["Is_AM"] = timestamps.dt.hour < 12

    return features.drop(columns=["Timestamp"])

In [209]:
# Read the raw training data and run it through the cleaning step.
# `df` keeps the "Attack Type" column, which is used as the target later.
df = clean_df(pd.read_csv("train_data.csv"))

# Feature matrix derived from the cleaned frame.
df_train = prep_features(df)

In [210]:
# Preview the first rows of the prepared training features.
df_train.head()

Unnamed: 0,Suspicious_Port_Activity,Traffic_Volume_Variation,Packet_Length_Anomaly,Malware_Score,Threat_Level_Index,User_Behavior_Score,Geo_Dispersion,Payload_Entropy,Login_Attempts,Device_Response_Time,Session_Duration,Packet_Retry_Rate,Anomaly_Tendency,Is_AM
0,51.454294,53.193234,50.854265,0.343061,-2.829257,-1.150952,96.510346,51.638318,14271350.0,825426.5,40.059209,21.344372,-27.224217,True
1,66.049091,60.86573,63.611018,5.242388,-5.684331,-0.996886,4.20341,9.657115,1300.129,0.0,-5.96837,30.608016,-13.656677,True
2,29.509985,29.38192,26.47766,13.095004,3.713273,-1.542933,56.986902,42.720874,653.7937,0.0,17.269407,11.348135,-6.043118,False
3,47.698458,47.038595,47.511664,-11.713488,-5.855172,-0.11492,62.225258,56.349504,3815.307,0.0,15.670151,20.733735,-7.827522,False
4,52.066922,51.621483,50.422555,18.49112,10.658195,1.540859,25.647005,13.831798,-25128380.0,4236788.0,12.687819,24.197432,16.831704,False


# Exploratory Data Analysis

In [211]:
# Count the missing values remaining in each feature column after preprocessing.
df_train.isna().sum()

Suspicious_Port_Activity    0
Traffic_Volume_Variation    0
Packet_Length_Anomaly       0
Malware_Score               0
Threat_Level_Index          0
User_Behavior_Score         0
Geo_Dispersion              0
Payload_Entropy             0
Login_Attempts              0
Device_Response_Time        0
Session_Duration            0
Packet_Retry_Rate           0
Anomaly_Tendency            0
Is_AM                       0
dtype: int64

# Models

In [212]:
# Hold out 20% of the rows for validation; the target is the "Attack Type"
# column of the cleaned frame, and the split is seeded.
X_train, X_val, y_train, y_val = train_test_split(
    df_train,
    df["Attack Type"],
    random_state=seed,
    test_size=0.2,
)

In [213]:
def evaluate(clf):
    """Score an estimator with 3-fold cross-validation on the training split.

    Reads the module-level ``X_train`` / ``y_train`` and returns the mean of
    the fold scores minus their standard deviation, so unstable models are
    penalised.
    """
    fold_scores = cross_val_score(clf, X_train, y_train, cv=3, n_jobs=-1)
    average = fold_scores.mean()
    spread = fold_scores.std()
    return average - spread

In [214]:
# Baseline: logistic regression constructed with no arguments, scored with evaluate().
lr = LogisticRegression()

evaluate(lr)

0.7457757565215196

In [215]:
# Random forest with 300 trees. random_state pins the forest's internal
# randomness so the CV score and the final predictions are reproducible
# across a fresh Restart & Run All.
rf = RandomForestClassifier(n_estimators=300, random_state=seed)

evaluate(rf)

0.8947428354809536

In [216]:
# Select the random forest as the final classifier. `clf` is the same object
# as `rf`, so the fit below trains that estimator on the training split.
# NOTE(review): X_val / y_val are not used in the visible cells — confirm
# whether a hold-out check was intended here.
clf = rf

clf.fit(X_train, y_train)

# Submission

In [217]:
# Load and clean the test data; `test_df` keeps the "ID" column needed for the submission.
test_df = clean_df(pd.read_csv("test_data.csv"))

# Same preprocessing as for training. NOTE(review): called this way,
# prep_features fills missing User_Behavior_Score values with the mean of the
# test frame itself, not the training mean — confirm this is intended.
features = prep_features(test_df)

In [218]:
# Subtask 1: translate the boolean Is_AM feature into "AM" / "PM" labels.
subtask1 = features["Is_AM"].map({True: "AM", False: "PM"})

# Subtask 2: predictions from the fitted classifier on the test features.
subtask2 = clf.predict(features)

In [219]:
def build_subtask_df(subtask_id, answer):
    """Build the submission rows for one subtask.

    The result has three columns: "subtaskID" (``subtask_id`` repeated on
    every row), "datapointID" (the "ID" column of the module-level
    ``test_df``) and "answer" (``answer``, one value per test row).
    """
    columns = {
        "subtaskID": subtask_id,
        "datapointID": test_df["ID"],
        "answer": answer,
    }
    return pd.DataFrame(columns)

# (subtask id, answers) pairs, in the order they appear in the submission.
subtasks = [(1, subtask1), (2, subtask2)]

# One frame per subtask, stacked vertically with a fresh 0..n-1 index.
submission = pd.concat(
    [build_subtask_df(sid, answer) for sid, answer in subtasks],
    ignore_index=True,
)

submission.head()

Unnamed: 0,subtaskID,datapointID,answer
0,1,12633,PM
1,1,24448,PM
2,1,14208,AM
3,1,39654,AM
4,1,337,AM


In [220]:
# Write the submission to disk without the DataFrame index column.
submission.to_csv("submission.csv", index=False)