In [11]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Read a small chunk of the CSV (first 20,000 rows only) and preview the
# top of the resulting frame.
df = pd.read_csv(
    filepath_or_buffer="labeled.csv",
    nrows=20_000,
)
df.head()

In [None]:
# Drop unnecessary columns
df = df.drop(["no", "extra", "time"], axis=1)

# Encode categorical features
categorical_cols = ["ip", "method", "url", "protocol", "referrer", "user_agent", "type"]
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

# Split features and target
X = df.drop("label", axis=1)
y = df["label"]

# Scale numeric features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)


In [None]:
# Random forest with 100 trees; n_jobs=-1 uses all available cores and the
# fixed random_state keeps the fitted model reproducible across runs.
clf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
)
# Left as the final expression so the notebook displays the fitted estimator.
clf.fit(X_train, y_train)

In [None]:
# Evaluate on the held-out split. Predict once and reuse the result for both
# metrics: clf.score() would run a second full inference pass over X_test and
# reports the same accuracy value as accuracy_score(y_test, y_pred).
y_pred = clf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))

In [None]:
# Spot check: predict the label for one row taken from the training split.
# predict() expects a 2-D input, so the single row is reshaped to (1, n_features).
sample_idx = 99
clf.predict(X_train[sample_idx].reshape(1, -1))

In [None]:
# Predict for a single hand-written row of 9 feature values. The values are
# passed to the model as-is (no encoding or scaling happens in this cell).
# NOTE(review): presumably these are already in the scaled feature space and
# in the same column order as the training matrix — confirm.
manual_row = [0.5, -0.3, 1.2, 0.0, -0.8, 0.4, -1.1, 0.7, 0.2]
clf.predict([manual_row])