In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import StackingClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from imblearn.over_sampling import SMOTE
import pickle


In [None]:
# Announce the run, then read the combined dataset from the working directory.
# The label column is expected to be binary (0 or 1).
print("Training the model...")

dataset_path = "Combined_dataset.csv"
df = pd.read_csv(dataset_path)

# Last expression of the cell: preview the first rows.
df.head()


In [None]:
# Separate the raw text column (features) from the target column (labels).
X, y = df['Query'], df['Label']

In [None]:
 # Hold out 20% of the rows for evaluation. The split is stratified on the
 # labels and seeded so that re-running the cell yields the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [None]:
# Set up the text featurizer and the oversampler (seeded for repeatability).
vectorizer = TfidfVectorizer()
smote = SMOTE(random_state=42)

# Fit the TF-IDF vectorizer on the training queries only, then use SMOTE on the
# resulting feature matrix to handle class imbalance in the training labels.
X_train_transformed = vectorizer.fit_transform(X_train)
X_resampled, y_resampled = smote.fit_resample(X_train_transformed, y_train)

In [None]:
 # Persist the fitted vectorizer so the same text-to-feature mapping can be
 # reloaded later alongside the trained model.
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

# Define the base learners for the stacking ensemble.
# Each learner is seeded with the same value (42) used for the train/test split
# and for SMOTE, so that a fresh "Restart & Run All" reproduces the same model.
gbm = GradientBoostingClassifier(n_estimators=100, random_state=42)
adaboost = AdaBoostClassifier(n_estimators=100, random_state=42)
# NOTE(review): `use_label_encoder` is deprecated in recent XGBoost releases and
# may only trigger a warning there; confirm against the installed version
# before removing it.
xgb = XGBClassifier(
    n_estimators=100,
    use_label_encoder=False,
    eval_metric="logloss",
    random_state=42,
)
lgbm = LGBMClassifier(n_estimators=100, random_state=42)


In [None]:
# Assemble the stacking ensemble: the four base learners feed their predicted
# class probabilities into a logistic-regression meta-learner.
base_learners = [
    ('gbm', gbm),
    ('adaboost', adaboost),
    ('xgb', xgb),
    ('lgbm', lgbm),
]
meta_learner = LogisticRegression(solver='liblinear')

stacking_clf = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner,
    stack_method='predict_proba',
)

# Train on the SMOTE-resampled training features and labels.
stacking_clf.fit(X_resampled, y_resampled)

In [None]:
# Serialize the fitted stacking classifier to disk.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(stacking_clf, model_file)

In [None]:
# Score the held-out queries, reusing the vectorizer fitted on the training
# split (transform only, no refitting).
X_test_transformed = vectorizer.transform(X_test)

# Keep the second probability column (index 1) as the score for ROC AUC.
class_probabilities = stacking_clf.predict_proba(X_test_transformed)
y_pred_proba = class_probabilities[:, 1]

# Hard class predictions for accuracy and F1.
y_pred = stacking_clf.predict(X_test_transformed)

In [None]:
  # Compute the evaluation metrics on the held-out split: accuracy and F1 use
  # the hard predictions, ROC AUC uses the predicted probabilities.
  accuracy = accuracy_score(y_test, y_pred)
  roc_auc = roc_auc_score(y_test, y_pred_proba)
  f1 = f1_score(y_test, y_pred)

  # Report each metric on its own line, rounded to four decimal places.
  print(
      f"Accuracy: {accuracy:.4f}",
      f"ROC AUC: {roc_auc:.4f}",
      f"F1 Score: {f1:.4f}",
      sep="\n",
  )