In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from hyperopt import fmin, hp, space_eval, tpe
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split


In [None]:

# Suppress warnings for cleaner output
# NOTE: with no category given this silences every warning, not just noisy ones.
warnings.filterwarnings('ignore')

# Mount Google Drive (for Google Colab usage).
# The google.colab package is only importable inside Colab, so the import is
# guarded: elsewhere the mount step is skipped instead of aborting the notebook.
try:
    from google.colab import drive
except ImportError:
    drive = None  # not running on Colab: nothing to mount

if drive is not None:
    drive.mount('/content/drive')


In [None]:
# Read the raw CSV into a DataFrame
equal_sample_df = pd.read_csv('path_to_data')

# Show how many rows carry each 'source' label before any rebalancing
print(
    "Initial class distribution:",
    equal_sample_df['source'].value_counts(),
    sep="\n",
)


In [None]:

# Create balanced dataset
# Separate the data by label (source == 0 -> human rows, source == 1 -> AI rows)
human_data = equal_sample_df[equal_sample_df["source"] == 0]
ai_data = equal_sample_df[equal_sample_df["source"] == 1]

# Sample equal amounts from each class: up to 20,000 rows each, capped at the
# size of the smaller class so .sample() cannot fail when a class has fewer
# than 20,000 rows.
samples_per_class = min(20000, len(human_data), len(ai_data))
if samples_per_class == 0:
    raise ValueError(
        "Cannot build a balanced dataset: need at least one row with "
        "source == 0 and one row with source == 1"
    )
human_sampled = human_data.sample(n=samples_per_class, random_state=42)
ai_data = ai_data.sample(n=samples_per_class, random_state=42)

# Combine the downsampled human data with the downsampled AI data
equal_sample_df = pd.concat([human_sampled, ai_data], ignore_index=True)

# Shuffle the dataset so the two classes are interleaved, then renumber the rows
equal_sample_df = equal_sample_df.sample(frac=1, random_state=42).reset_index(drop=True)


In [None]:
# Confirm that both classes now have the same number of rows
print(
    "Balanced class distribution:",
    equal_sample_df['source'].value_counts(),
    sep="\n",
)

# Stratified 70/30 train/test split.
# X is the whole DataFrame (it still contains the 'source' column);
# y is the 'source' label used both as target and for stratification.
y = equal_sample_df['source']
X = equal_sample_df
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=72,
    stratify=y,
)

# Print the dimensions of each partition
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


In [None]:

# Feature extraction methods
# Both vectorizers are fitted on the training texts only and then applied to
# the test texts, so no test vocabulary leaks into the features.

# 1. Bag of Words approach (raw token counts).
#    Stored under its own names so the TF-IDF step below does not overwrite it.
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train['text'])
X_test_counts = vectorizer.transform(X_test['text'])

# 2. TF-IDF approach
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['text'])
X_test_tfidf = tfidf_vectorizer.transform(X_test['text'])

# The models in this notebook read X_train_bow / X_test_bow. Despite the name,
# those are bound to the TF-IDF matrices (the representation the models are
# actually trained on), so the names are kept as aliases.
X_train_bow, X_test_bow = X_train_tfidf, X_test_tfidf

# Baseline Random Forest: 400 trees, fixed seed, every other hyperparameter
# left at its scikit-learn default
rf_model = RandomForestClassifier(n_estimators=400, random_state=42)
rf_model.fit(X_train_bow, y_train)
y_pred = rf_model.predict(X_test_bow)


In [None]:

# Report accuracy, per-class metrics and the confusion matrix of the
# baseline model on the held-out test set
print("Initial Random Forest Model Performance:")
print(
    "Accuracy:",
    accuracy_score(y_true=y_test, y_pred=y_pred),
)
print(
    "\nClassification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred),
)
print(
    "\nConfusion Matrix:\n",
    confusion_matrix(y_true=y_test, y_pred=y_pred),
)


In [None]:

# Hyperparameter optimization functions
def rf_objective(params):
    """Objective function for hyperparameter optimization.

    Trains a Random Forest with the candidate ``params`` and scores it on a
    validation split carved out of the training data. The test set is never
    touched here, so the final test metrics stay an unbiased estimate.

    Args:
        params: dict of RandomForestClassifier keyword arguments proposed by
            Hyperopt. A ``random_state`` entry, if present, overrides the
            default seed of 42.

    Returns:
        The negated validation accuracy (Hyperopt minimizes its objective).
    """
    # Same seeded, stratified 80/20 split on every call, so all candidates are
    # compared on identical data.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train_bow, y_train, test_size=0.2, random_state=42, stratify=y_train
    )

    # Seed the forest so differences in score come from the hyperparameters,
    # not from run-to-run randomness.
    candidate = RandomForestClassifier(**{"random_state": 42, **params})
    candidate.fit(X_fit, y_fit)

    val_accuracy = accuracy_score(y_val, candidate.predict(X_val))
    return -val_accuracy  # Negative because we want to maximize accuracy

# Hyperopt search space: every dimension is a categorical hp.choice
space = dict(
    # number of trees: any integer from 100 to 999
    n_estimators=hp.choice('n_estimators', range(100, 1000)),
    # maximum tree depth: any integer from 1 to 9
    max_depth=hp.choice('max_depth', range(1, 10)),
    # split quality measure
    criterion=hp.choice('criterion', ['gini', 'entropy', 'log_loss']),
    # how many features are considered at each split
    max_features=hp.choice('max_features', ['sqrt', 'log2']),
    # minimum number of samples required to split a node
    min_samples_split=hp.choice('min_samples_split', [2, 5, 10]),
)


In [None]:

# Use Hyperopt to find the best hyperparameters
best_indices = fmin(fn=rf_objective, space=space, algo=tpe.suggest, max_evals=50)

# For hp.choice dimensions fmin reports the *index* of the winning option, not
# the option itself. space_eval resolves every index against the search space,
# including n_estimators and max_depth: passing their raw indices to the model
# would give a wrong tree count and an invalid max_depth of 0 for index 0.
best_rf = space_eval(space, best_indices)

print("Best Random Forest Hyperparameters:", best_rf)


In [None]:

# Train the final Random Forest model with the optimized hyperparameters.
# The seed matches the baseline model (random_state=42) so the run is
# reproducible and the two models are compared under the same randomness;
# a random_state entry in best_rf, if any, takes precedence.
rf_model_final = RandomForestClassifier(**{"random_state": 42, **best_rf})
rf_model_final.fit(X_train_bow, y_train)
rf_model_final_predictions = rf_model_final.predict(X_test_bow)


In [None]:

# Report accuracy, per-class metrics and the confusion matrix of the
# tuned model on the held-out test set
print("Optimized Random Forest Model Performance:")
print(
    "Accuracy:",
    accuracy_score(y_true=y_test, y_pred=rf_model_final_predictions),
)
print(
    "\nClassification Report:\n",
    classification_report(y_true=y_test, y_pred=rf_model_final_predictions),
)
print(
    "\nConfusion Matrix:\n",
    confusion_matrix(y_true=y_test, y_pred=rf_model_final_predictions),
)