In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

# Load dataset
data = pd.read_excel('../data/training_data/AI_vs_human_train_dataset.xlsx')
data.dropna(inplace=True)  # drops every row that has a missing value in any column

# Preprocessing
X = data['essay']
y = data['label']  # 0 = Human, 1 = AI

# Train/test split
# Split the RAW text before fitting anything. Fitting TF-IDF on the full
# corpus first would let the held-out essays shape the vocabulary and IDF
# weights (data leakage) and make the test metrics optimistic.
# test_size and random_state are unchanged from the previous version.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# TF-IDF Vectorizer
# Vocabulary (top 3000 terms, English stop words removed) and IDF weights are
# learned from the training fold only; the test fold is merely transformed.
vectorizer = TfidfVectorizer(max_features=3000, stop_words='english')
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Full-corpus feature matrix, kept only so that any later cell that still
# refers to X_tfidf keeps working. It is produced with transform(), never
# fit, so it does not reintroduce the leak. Do not train or evaluate on it.
X_tfidf = vectorizer.transform(X)

# Save vectorizer
# The pickled vectorizer is the train-fitted one, i.e. exactly the feature
# mapping the models trained on X_train expect at inference time.
with open('../models/tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)


# Model training
# Each estimator is seeded with the same value the train/test split uses
# (random_state=42). Without a seed the fitted models, and therefore the
# pickled artifacts and the reported scores, differ from run to run:
#   - SVC(probability=True) shuffles data internally for its probability
#     calibration,
#   - DecisionTreeClassifier permutes candidate features at each split,
#   - AdaBoostClassifier passes randomness on to its base estimators.

# SVM (probability=True is required for predict_proba during evaluation)
svm = SVC(probability=True, random_state=42)

# Decision Tree
dt = DecisionTreeClassifier(random_state=42)

# AdaBoost
ab = AdaBoostClassifier(n_estimators=100, random_state=42)

# Fit every model on the training fold and persist it next to the vectorizer.
for estimator, model_path in (
    (svm, '../models/svm_model.pkl'),
    (dt, '../models/decision_tree_model.pkl'),
    (ab, '../models/adaboost_model.pkl'),
):
    estimator.fit(X_train, y_train)
    with open(model_path, 'wb') as f:
        pickle.dump(estimator, f)


# Evaluation
# For each fitted model, print a per-class precision/recall/F1 report on the
# held-out fold, followed by the ROC AUC computed from the second
# predict_proba column.
models = {
    'SVM': svm,
    'Decision Tree': dt,
    'AdaBoost': ab,
}
for name in models:
    model = models[name]
    print(f"\n{name} Performance:")

    # Hard label predictions feed the classification report.
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred)
    print(report)

    # ROC AUC needs a continuous score rather than hard labels.
    positive_scores = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, positive_scores)
    print("ROC AUC:", auc)



SVM Performance:
              precision    recall  f1-score   support

           0       0.96      0.93      0.95       393
           1       0.93      0.96      0.94       353

    accuracy                           0.95       746
   macro avg       0.95      0.95      0.95       746
weighted avg       0.95      0.95      0.95       746

ROC AUC: 0.9901570688176229

Decision Tree Performance:
              precision    recall  f1-score   support

           0       0.92      0.88      0.90       393
           1       0.87      0.91      0.89       353

    accuracy                           0.89       746
   macro avg       0.89      0.89      0.89       746
weighted avg       0.89      0.89      0.89       746

ROC AUC: 0.8948777833041397

AdaBoost Performance:
              precision    recall  f1-score   support

           0       0.97      0.94      0.96       393
           1       0.94      0.97      0.95       353

    accuracy                           0.96       746
   