In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Data Preparation
data = pd.read_csv('AI_Human.csv')

df = pd.DataFrame(data)

# Step 2: Feature Extraction using TF-IDF (Bigrams)
vectorizer = TfidfVectorizer(ngram_range=(2, 2), max_features=1000)
X = vectorizer.fit_transform(df['text']).toarray()
y = df['generated']

# Step 3: Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 4: Model Training
# Train RandomForest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Train LightGBM
lgb_model = lgb.LGBMClassifier(n_estimators=100, random_state=42)
lgb_model.fit(X_train, y_train)

# Step 5: Model Evaluation
# Predict with RandomForest
rf_probs = rf_model.predict_proba(X_test)[:, 1]
rf_predictions = rf_model.predict(X_test)

# Predict with LightGBM
lgb_probs = lgb_model.predict_proba(X_test)[:, 1]
lgb_predictions = lgb_model.predict(X_test)

# Combine predictions (e.g., averaging probabilities)
combined_probs = (rf_probs + lgb_probs) / 2
combined_predictions = (combined_probs > 0.5).astype(int)

# Print Evaluation Metrics
print("RandomForest Accuracy:", accuracy_score(y_test, rf_predictions))
print("RandomForest Classification Report:\n", classification_report(y_test, rf_predictions))

print("LightGBM Accuracy:", accuracy_score(y_test, lgb_predictions))
print("LightGBM Classification Report:\n", classification_report(y_test, lgb_predictions))

print("Combined Model Accuracy:", accuracy_score(y_test, combined_predictions))
print("Combined Model Classification Report:\n", classification_report(y_test, combined_predictions))

# Step 6: Make Prediction
# Predict with RandomForest
new_text_vector = vectorizer.transform(["Artificial Intelligence is going to be the supreme concern in few decades."]).toarray()
rf_new_text_probs = rf_model.predict_proba(new_text_vector)[:, 1]
rf_new_text_prediction = rf_model.predict(new_text_vector)

# Predict with LightGBM
lgb_new_text_probs = lgb_model.predict_proba(new_text_vector)[:, 1]
lgb_new_text_prediction = lgb_model.predict(new_text_vector)

# Combine predictions
combined_new_text_probs = (rf_new_text_probs + lgb_new_text_probs) / 2
combined_new_text_prediction = (combined_new_text_probs > 0.5).astype(int)

print("RandomForest New Text Probabilities:", rf_new_text_probs)
print("RandomForest New Text Prediction (0 = Human, 1 = AI):", rf_new_text_prediction[0])

print("LightGBM New Text Probabilities:", lgb_new_text_probs)
print("LightGBM New Text Prediction (0 = Human, 1 = AI):", lgb_new_text_prediction[0])

print("Combined New Text Probabilities:", combined_new_text_probs)
print("Combined New Text Prediction (0 = Human, 1 = AI):", combined_new_text_prediction[0])


[LightGBM] [Info] Number of positive: 126864, number of negative: 214200
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.666554 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 255000
[LightGBM] [Info] Number of data points in the train set: 341064, number of used features: 1000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.371965 -> initscore=-0.523795
[LightGBM] [Info] Start training from score -0.523795
RandomForest Accuracy: 0.9962920141478132
RandomForest Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     91597
         1.0       1.00      0.99      1.00     54574

    accuracy                           1.00    146171
   macro avg       1.00      1.00      1.00    146171
weighted avg       1.00      1.00      1.00    146171

LightGBM Accuracy: 0.98051597