In [17]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [18]:
# Load data
training_data = pd.read_csv("train_essays.csv")
test_data = pd.read_csv("test_essays.csv")

In [19]:
# Define a custom text preprocessing function
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenization, stopwords removal, and other steps are already covered by TfidfVectorizer
    
    return text

In [20]:
# Split the training data into features and labels
X_train, X_validation, y_train, y_validation = train_test_split(training_data['text'], training_data['generated'], test_size=0.2, random_state=42)

In [21]:
# Create a pipeline with TF-IDF vectorizer and XGBoost classifier
vectorizer_xgb = TfidfVectorizer(preprocessor=preprocess_text)
X_train_tfidf_xgb = vectorizer_xgb.fit_transform(X_train)
X_validation_tfidf_xgb = vectorizer_xgb.transform(X_validation)

In [22]:
# Create XGBoost classifier
xgb_model = XGBClassifier()

# Train the model
xgb_model.fit(X_train_tfidf_xgb, y_train)

In [23]:
# Hyperparameter tuning using GridSearchCV
param_grid_xgb = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'subsample': [0.6, 0.8, 1.0],
}

grid_search_xgb = GridSearchCV(xgb_model, param_grid_xgb, cv=5, scoring='accuracy')
grid_search_xgb.fit(X_train_tfidf_xgb, y_train)

# Get the best hyperparameters
best_hyperparameters_xgb = grid_search_xgb.best_params_
print("Best Hyperparameters for XGBoost:", best_hyperparameters_xgb)



Best Hyperparameters for XGBoost: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.6}


In [29]:
# Validate the model on the validation set
from sklearn.metrics import classification_report
y_validation_pred_xgb = grid_search_xgb.predict(X_validation_tfidf_xgb)
print(classification_report(y_validation, y_validation_pred_xgb))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       275
           1       0.00      0.00      0.00         1

    accuracy                           1.00       276
   macro avg       0.50      0.50      0.50       276
weighted avg       0.99      1.00      0.99       276



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
# Validate the model on the validation set
y_validation_pred_xgb = grid_search_xgb.predict(X_validation_tfidf_xgb)
accuracy_xgb = accuracy_score(y_validation, y_validation_pred_xgb)
print("XGBoost Accuracy on Validation Set:", accuracy_xgb)

XGBoost Accuracy on Validation Set: 0.9963768115942029


In [26]:
# Make predictions on the test set
X_test_tfidf_xgb = vectorizer_xgb.transform(test_data['text'])
test_predictions_xgb = grid_search_xgb.predict_proba(X_test_tfidf_xgb)[:, 1]

# Create a submission file for XGBoost
submission_data_xgb = pd.DataFrame({'id': test_data['id'], 'generated': test_predictions_xgb})
submission_data_xgb.to_csv("submission_xgboost.csv", index=False)