### Import necessary libraries

In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score

In [2]:
df_clean = pd.read_csv('df_typos.csv',index_col = 0)

In [3]:
df_clean.head()

Unnamed: 0,text,generated,text_length,mean_word_length,sentences,sentence_length,mean_sentence,max_word_length,unique_word_count,proper_noun_count,number_count,text_with_typos_replaced
0,Car-free cities have become a subject of incre...,1,4091,6.083478,['Car-free cities have become a subject of inc...,28,144.607143,19,268,29,0,Car-free cities have become a subject of incre...
1,"Car Free Cities Car-free cities, a concept ga...",1,3757,6.298246,"['Car Free Cities Car-free cities, a concept ...",26,143.115385,15,266,9,0,"Car Free Cities Car-free cities , a concept ga..."
2,A Sustainable Urban Future Car-free cities ...,1,3828,6.445312,[' A Sustainable Urban Future Car-free citie...,27,140.37037,15,252,12,0,A Sustainable Urban Future Car-free cities are...
3,Pioneering Sustainable Urban Living In an e...,1,3739,6.175337,[' Pioneering Sustainable Urban Living In an...,23,161.130435,15,257,10,0,Pioneering Sustainable Urban Living In an era ...
4,The Path to Sustainable Urban Living In an ...,1,3698,6.207436,[' The Path to Sustainable Urban Living In a...,22,166.636364,15,246,16,0,The Path to Sustainable Urban Living In an age...


In [4]:
df_clean.columns

Index(['text', 'generated', 'text_length', 'mean_word_length', 'sentences',
       'sentence_length', 'mean_sentence', 'max_word_length',
       'unique_word_count', 'proper_noun_count', 'number_count',
       'text_with_typos_replaced'],
      dtype='object')

In [12]:
# Define features
text_col = 'text_with_typos_replaced'
numeric_cols = ['text_length', 'mean_word_length',
                'sentence_length', 'mean_sentence', 'unique_word_count',
                'proper_noun_count', 'number_count']

In [13]:
# Features and target
X = df_clean[[text_col] + numeric_cols]
y = df_clean['generated']

In [14]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### MultinomialNB Classifier

In [15]:
# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(), text_col),
        ('numeric', 'passthrough', numeric_cols)  # keep numeric columns as-is
    ]
)

In [16]:
# Full pipeline
model = Pipeline([
    ('preprocess', preprocessor),
    ('clf', MultinomialNB())
])

In [17]:
# Train
model.fit(X_train, y_train)

In [18]:
# Predict
y_pred = model.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7665122662549322
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.78      0.80      3539
           1       0.68      0.75      0.72      2290

    accuracy                           0.77      5829
   macro avg       0.76      0.76      0.76      5829
weighted avg       0.77      0.77      0.77      5829



In [21]:
# Run 3-fold cross-validation
cv_scores = cross_val_score(model, X, y, cv=3, scoring='accuracy')

# Output results
print("Cross-Validation Accuracy Scores:", cv_scores)
print("Mean Accuracy:", np.mean(cv_scores))
print("Standard Deviation:", np.std(cv_scores))

Cross-Validation Accuracy Scores: [0.70272774 0.65414308 0.7641791 ]
Mean Accuracy: 0.7070166409332647
Standard Deviation: 0.045024273423247246


### Random Forest

In [25]:
from sklearn.ensemble import RandomForestClassifier

rf_model = Pipeline([
    ('preprocess', preprocessor),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
])

In [27]:
# Run 3-fold cross-validation
cv_scores = cross_val_score(rf_model, X, y, cv=3, scoring='accuracy')

# Output results
print("Cross-Validation Accuracy Scores:", cv_scores)
print("Mean Accuracy:", np.mean(cv_scores))
print("Standard Deviation:", np.std(cv_scores))

Cross-Validation Accuracy Scores: [0.82933608 0.92784354 0.89799279]
Mean Accuracy: 0.8850574712643677
Standard Deviation: 0.04124255079813475


### XGBoost Classifier

In [28]:
from xgboost import XGBClassifier

xgb_model = Pipeline([
    ('preprocess', preprocessor),
    ('clf', XGBClassifier(use_label_encoder=False, eval_metric='logloss'))
])

In [29]:
# Run 3-fold cross-validation
cv_scores = cross_val_score(xgb_model, X, y, cv=3, scoring='accuracy')

# Output results
print("Cross-Validation Accuracy Scores:", cv_scores)
print("Mean Accuracy:", np.mean(cv_scores))
print("Standard Deviation:", np.std(cv_scores))

Cross-Validation Accuracy Scores: [0.84940813 0.90602162 0.80195574]
Mean Accuracy: 0.852461828787099
Standard Deviation: 0.042539554257206354
