In [14]:
import time
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [15]:
print("Reading data...", flush=True)
df = pd.read_csv("labeled.csv" , nrows=2000000)
print(f"Loaded {len(df):,} rows", flush=True)
df.head()

Reading data...
Loaded 2,000,000 rows


Unnamed: 0,ip,time,method,url,protocol,status,size,referrer,user_agent,extra,no,label,type
0,31.56.96.51,2019-01-22 00:26:16+00:00,GET,/image/60844/productModel/200x200,HTTP/1.1,200,5667,https://www.zanbil.ir/m/filter/b113,Mozilla/5.0 (Linux; Android 6.0; ALE-L21 Build...,-,2,0,benign
1,31.56.96.51,2019-01-22 00:26:16+00:00,GET,/image/61474/productModel/200x200,HTTP/1.1,200,5379,https://www.zanbil.ir/m/filter/b113,Mozilla/5.0 (Linux; Android 6.0; ALE-L21 Build...,-,3,0,benign
2,91.99.72.15,2019-01-22 00:26:17+00:00,GET,/product/31893/62100/سشوار-خانگی-پرنسلی-مدل-PR...,HTTP/1.1,200,41483,-,Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16...,-,5,0,benign
3,178.253.33.51,2019-01-22 00:26:19+00:00,GET,/m/product/32574/62991/ماشین-اصلاح-صورت-پرنسلی...,HTTP/1.1,200,20406,"https://www.zanbil.ir/m/filter/p5767,t156?name...",Mozilla/5.0 (Linux; Android 5.1; HTC Desire 72...,-,13,0,benign
4,91.99.72.15,2019-01-22 00:26:19+00:00,GET,/product/10075/13903/مایکروفر-رومیزی-سامسونگ-م...,HTTP/1.1,200,41725,-,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,-,15,0,benign


In [16]:
print("Encoding features...", flush=True)
df = df.drop(["no", "extra", "time"], axis=1)

categorical_cols = ["ip", "method", "url", "protocol", "referrer", "user_agent", "type"]
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

X = df.drop("label", axis=1)
y = df["label"]

print("Scaling...", flush=True)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print("Done!", flush=True)

Encoding features...
Scaling...
Done!


In [17]:
print("Splitting...", flush=True)
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)
print(f"Train: {len(X_train):,} rows | Test: {len(X_test):,} rows", flush=True)

Splitting...
Train: 1,600,000 rows | Test: 400,000 rows


In [18]:
print("Training... (this may take a while)", flush=True)
start = time.time()
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
clf.fit(X_train, y_train)
print(f"Training done in {time.time() - start:.1f}s", flush=True)

Training... (this may take a while)
Training done in 36.7s


In [19]:
print(f"Accuracy: {clf.score(X_test, y_test):.4f}")
print(classification_report(y_test, clf.predict(X_test)))

Accuracy: 1.0000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    399927
           1       1.00      1.00      1.00        73

    accuracy                           1.00    400000
   macro avg       1.00      1.00      1.00    400000
weighted avg       1.00      1.00      1.00    400000



In [20]:
clf.predict([X_train[99]])

array([0])

In [21]:
clf.predict([[0.5, -0.3, 1.2, 0.0, -0.8, 0.4, -1.1, 0.7, 0.2]])

array([0])

In [22]:
import joblib

joblib.dump(clf, "model.pkl")
joblib.dump(scaler, "scaler.pkl")
print("Model saved!", flush=True)

Model saved!
