In [23]:
import pandas as pd
import pickle
from xgboost import XGBClassifier 
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer  # Added for text processing
from sklearn.metrics import accuracy_score, classification_report  # Updated metrics
from math import sqrt

In [24]:
df = pd.read_csv("splits/train.csv")
y = df["Target"] 
X = df["Message"]

In [25]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [26]:
text_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=5000)),  
])

In [27]:
pipeline = Pipeline([
    ("text_processor", text_pipeline),
    ("xgb", XGBClassifier(random_state=42, n_jobs=-1, eval_metric='logloss'))
])

In [28]:
param_grid = {
    "xgb__n_estimators": [200, 400, 600],
    "xgb__max_depth": [3, 6, 9],
    "xgb__learning_rate": [0.01, 0.05, 0.1],
    "xgb__subsample": [0.6, 0.8, 1.0],
    "xgb__colsample_bytree": [0.4, 0.6, 0.8],
    "xgb__reg_alpha": [0, 0.1, 1],
    "xgb__reg_lambda": [1, 1.5, 2]
}


In [None]:
search = RandomizedSearchCV(pipeline, param_distributions=param_grid, n_iter=20,
                          cv=5, verbose=1, n_jobs=-1, scoring="f1") 

search.fit(X_train, y_train)
best_model = search.best_estimator_

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [30]:
y_pred = best_model.predict(X_val)
print(classification_report(y_val, y_pred))
print("Best parameters:", search.best_params_)

              precision    recall  f1-score   support

           0       0.98      0.99      0.99       772
           1       0.96      0.88      0.92       120

    accuracy                           0.98       892
   macro avg       0.97      0.94      0.96       892
weighted avg       0.98      0.98      0.98       892

Best parameters: {'xgb__subsample': 0.8, 'xgb__reg_lambda': 1.5, 'xgb__reg_alpha': 0, 'xgb__n_estimators': 400, 'xgb__max_depth': 6, 'xgb__learning_rate': 0.05, 'xgb__colsample_bytree': 0.6}


In [31]:
import os
os.makedirs("models", exist_ok=True)
pickle.dump(best_model, open("models/xgboost_model.pkl", "wb"))
print("✓ Best model saved as xgboost_model.pkl")

✓ Best model saved as xgboost_model.pkl
