# Generating a Ensemble Model with a TF-IDF Feature Set

In [5]:
import os
import re
import nltk
import string
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from nltk.stem import WordNetLemmatizer
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shaemckenna/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/shaemckenna/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Import Training Data

In [7]:
training_data = "../data/train_long_df.csv"
train_df = pd.read_csv(training_data, dtype={'folder': int, 'text': str, 'is_real': int})
train_df.head()

Unnamed: 0,folder,text,is_real
0,0,China\nThe goal of this project involves achie...,1
1,0,The project aims to achieve an accuracy level ...,0
2,1,Scientists can learn about how galaxies form a...,0
3,1,Dinosaur eggshells offer clues about what dino...,1
4,2,China\nThe study suggests that multiple star s...,1


### Clean Training Data

In [8]:
clean_df = train_df.copy()
print(f"Number of samples before data cleaning: {len(clean_df)}")

# Removing NA rows altogether 
clean_df.dropna(subset=["text"], inplace=True)
print(f"Number of samples before data cleaning: {len(clean_df)}")

Number of samples before data cleaning: 186
Number of samples before data cleaning: 184


### Feature Engineering

In [13]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
punct_table = str.maketrans('', '', string.punctuation)

def preprocess(text):
    text = text.lower()
    text = text.translate(punct_table)
    tokens = [lemmatizer.lemmatize(word)
                for word in text.split()
                if word not in stop_words]

    return ' '.join(tokens)

clean_df['clean_text'] = clean_df['text'].apply(preprocess)
clean_df['text_length'] = clean_df['text'].apply(len)
clean_df['word_count'] = clean_df['text'].apply(lambda x: len(x.split()))
clean_df['avg_word_length'] = clean_df['text'].apply(
    lambda x: np.mean([len(w) for w in x.split()]) if len(x.split()) > 0 else 0)

vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    max_features=10000,
    min_df=2,
    max_df=0.95,
    sublinear_tf=True
)

X_tfidf = vectorizer.fit_transform(clean_df['clean_text'])

### Model Training

In [14]:
X = X_tfidf
y = clean_df["is_real"].astype(int)

X_train, X_val, y_train, y_val = train_test_split(X_tfidf, y, test_size=0.2, random_state=42, stratify=y)

smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

models = {
    'LogisticRegression': LogisticRegression(
        max_iter=1000,
        class_weight='balanced',
        C=0.1,
        solver='saga',
        penalty='elasticnet',
        l1_ratio=0.5
    ),
    'RandomForest': RandomForestClassifier(
        n_estimators=200,
        max_depth=15,
        min_samples_split=5,
        class_weight='balanced_subsample',
        random_state=42
    ),
    'GradientBoosting': GradientBoostingClassifier(
        n_estimators=150,
        learning_rate=0.05,
        max_depth=5,
        subsample=0.8,
        random_state=42
    ),
    'SVM': CalibratedClassifierCV(
        SVC(
            kernel='rbf',
            C=1.0,
            gamma='scale',
            class_weight='balanced',
            probability=True
        ),
        cv=3
    )
}

trained_models = {}
val_scores = {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train_res, y_train_res)
    trained_models[name] = model


    val_preds = model.predict(X_val)
    acc = accuracy_score(y_val, val_preds)
    f1 = f1_score(y_val, val_preds)
    val_scores[name] = {'accuracy': acc, 'f1': f1}

    print(f"{name} Validation Accuracy: {acc:.4f}, F1 Score: {f1:.4f}")
    print(classification_report(y_val, val_preds))


voting_clf = VotingClassifier(
    estimators=[(name, model) for name, model in trained_models.items()],
    voting='soft',
    n_jobs=-1
)
voting_clf.fit(X_train_res, y_train_res)
trained_models['Ensemble'] = voting_clf


val_preds = voting_clf.predict(X_val)
acc = accuracy_score(y_val, val_preds)
f1 = f1_score(y_val, val_preds)
val_scores['Ensemble'] = {'accuracy': acc, 'f1': f1}
print(f"\nEnsemble Validation Accuracy: {acc:.4f}, F1 Score: {f1:.4f}")
print(classification_report(y_val, val_preds))


Training LogisticRegression...
LogisticRegression Validation Accuracy: 0.5135, F1 Score: 0.0000
              precision    recall  f1-score   support

           0       0.51      1.00      0.68        19
           1       0.00      0.00      0.00        18

    accuracy                           0.51        37
   macro avg       0.26      0.50      0.34        37
weighted avg       0.26      0.51      0.35        37


Training RandomForest...
RandomForest Validation Accuracy: 0.2703, F1 Score: 0.1290
              precision    recall  f1-score   support

           0       0.33      0.42      0.37        19
           1       0.15      0.11      0.13        18

    accuracy                           0.27        37
   macro avg       0.24      0.27      0.25        37
weighted avg       0.25      0.27      0.25        37


Training GradientBoosting...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


GradientBoosting Validation Accuracy: 0.3784, F1 Score: 0.3030
              precision    recall  f1-score   support

           0       0.41      0.47      0.44        19
           1       0.33      0.28      0.30        18

    accuracy                           0.38        37
   macro avg       0.37      0.38      0.37        37
weighted avg       0.37      0.38      0.37        37


Training SVM...
SVM Validation Accuracy: 0.7297, F1 Score: 0.7727
              precision    recall  f1-score   support

           0       0.91      0.53      0.67        19
           1       0.65      0.94      0.77        18

    accuracy                           0.73        37
   macro avg       0.78      0.74      0.72        37
weighted avg       0.78      0.73      0.72        37


Ensemble Validation Accuracy: 0.3243, F1 Score: 0.2424
              precision    recall  f1-score   support

           0       0.36      0.42      0.39        19
           1       0.27      0.22      0.24        