In [3]:
# Load full dataset
import pandas as pd
df = pd.read_csv("../data/processed_dataset.csv")


In [4]:
# Extract stylometric features and scale them
from sklearn.preprocessing import StandardScaler

stylometric_features = df[["word_count", "sentence_count", "avg_word_length", "punctuation_count"]]
stylometric_scaled = StandardScaler().fit_transform(stylometric_features)

In [21]:
# Combine TF IDF and stylometic features
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

vectorizer = TfidfVectorizer(max_features=300, stop_words="english")
X_tfidf = vectorizer.fit_transform(df["Text"])

# Combine dense stylometric features with sparse TF-IDF matrix
from scipy.sparse import csr_matrix
X_combined = hstack([X_tfidf, csr_matrix(stylometric_scaled)])


In [22]:
# Train a random forest model
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

X_train, X_test, y_train, y_test = train_test_split(X_combined, df["Label"], test_size=0.2, stratify=df["Label"], random_state=42)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[10  0]
 [ 1  9]]
              precision    recall  f1-score   support

          AI       0.91      1.00      0.95        10
       Human       1.00      0.90      0.95        10

    accuracy                           0.95        20
   macro avg       0.95      0.95      0.95        20
weighted avg       0.95      0.95      0.95        20



Validating the Results

In [23]:
# 5 fold cross validation
from sklearn.model_selection import cross_val_score

scores = cross_val_score(model, X_combined, df["Label"], cv=5)
print(f"CV Accuracy scores: {scores}")
print(f"Mean CV accuracy: {scores.mean():.2f}")


CV Accuracy scores: [0.9 1.  1.  1.  1. ]
Mean CV accuracy: 0.98


In [24]:
# Observe feature importances
import numpy as np

importances = model.feature_importances_
top_idx = np.argsort(importances)[-10:]

for i in top_idx:
    print(f"{i}: {importances[i]:.4f}")

# This will tell us what exactly the model is relying on
# It may be overly depending on word count or a specific TF-IDF token


280: 0.0108
66: 0.0127
95: 0.0129
298: 0.0159
81: 0.0160
144: 0.0351
127: 0.0448
301: 0.1153
303: 0.1515
300: 0.1705


## TF-IDF + Stylometric Feature Fusion Model (Random Forest)

To improve baseline performance, we combined semantic (TF-IDF) and stylometric (text structure) features, then trained a Random Forest classifier.

---

### Model Details
- **TF-IDF**: 300 max features, stopwords removed
- **Stylometric Features**: Word count, sentence count, punctuation count, avg word length
- **Classifier**: Random Forest (100 trees)
- **Evaluation**: 80/20 split + 5-fold cross-validation

---

### Performance

| Metric      | Value  |
|-------------|--------|
| Accuracy    | 95%    |
| Macro F1    | 0.95   |
| CV Accuracy | 98% (avg across 5 folds) |

The fusion model performed significantly better than TF-IDF alone, showing that combining structural cues with semantic features improves classifier generalizability.

---

### Insights
- Human responses show greater stylistic variety (length, punctuation)
- AI responses tend to be shorter, more templated, and structurally balanced
- The model may be picking up on both word usage and layout/structure patterns

