# Generating an Ensemble Model with a TF-IDF Feature Set

In [5]:
import os
import re
import nltk
import string
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from nltk.stem import WordNetLemmatizer
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier

# Fetch the NLTK resources this notebook relies on: the stopword corpus
# (read via `stopwords.words`) and WordNet (backing `WordNetLemmatizer`).
# When a package is already present, NLTK reports it as up to date.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shaemckenna/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/shaemckenna/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Import Training Data

In [7]:
# Path to the training CSV, relative to this notebook.
training_data = "../data/train_long_df.csv"

# Pin dtypes on read: `folder` and `is_real` as integers, `text` as strings.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# row index saved into the CSV -- consider `index_col=0` if nothing downstream
# relies on that column.
train_df = pd.read_csv(
    training_data,
    dtype={"folder": int, "text": str, "is_real": int},
)
train_df.head()

Unnamed: 0,folder,text,is_real
0,0,China\nThe goal of this project involves achie...,1
1,0,The project aims to achieve an accuracy level ...,0
2,1,Scientists can learn about how galaxies form a...,0
3,1,Dinosaur eggshells offer clues about what dino...,1
4,2,China\nThe study suggests that multiple star s...,1


### Clean Training Data

In [8]:
# Work on a copy so the raw `train_df` stays available for comparison.
clean_df = train_df.copy()
print(f"Number of samples before data cleaning: {len(clean_df)}")

# Drop rows with missing text altogether: the preprocessing below calls
# string methods on every value of the `text` column.
clean_df.dropna(subset=["text"], inplace=True)
# This count is taken after the drop, so it is labelled "after".
print(f"Number of samples after data cleaning: {len(clean_df)}")

Number of samples before data cleaning: 186
Number of samples before data cleaning: 184


### Feature Engineering

In [13]:
# Shared, build-once helpers for text preprocessing.

# Translation table that deletes every ASCII punctuation character.
punct_table = str.maketrans("", "", string.punctuation)

# English stopwords held in a set for fast membership tests.
stop_words = set(stopwords.words("english"))

# WordNet-based lemmatizer reused for every token.
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    """Normalize a raw document into a space-joined string of lemmas.

    Steps: lowercase, split on whitespace, drop stopwords, strip ASCII
    punctuation from each remaining token, and lemmatize it.

    Stopwords are matched against both the fully punctuation-stripped token
    and the token with only its surrounding punctuation trimmed. The second
    check is needed because the NLTK English list contains contractions such
    as "don't" and "isn't"; once the apostrophe is deleted they become
    "dont" / "isnt", which are not in the list and would otherwise survive.

    Args:
        text: The document as a ``str``.

    Returns:
        The cleaned tokens joined by single spaces (empty string if nothing
        is left).
    """
    tokens = []
    for raw_word in text.lower().split():
        bare_word = raw_word.translate(punct_table)
        if not bare_word:
            # Token consisted solely of punctuation.
            continue
        # Trim only the outer punctuation so inner apostrophes are kept for
        # the contraction lookup (e.g. '"don\'t,' -> "don't").
        trimmed_word = raw_word.strip(string.punctuation)
        if bare_word in stop_words or trimmed_word in stop_words:
            continue
        tokens.append(lemmatizer.lemmatize(bare_word))

    return ' '.join(tokens)

def _mean_word_length(text):
    """Return the mean length of the whitespace-separated words in ``text``, or 0 if it has none."""
    lengths = [len(word) for word in text.split()]
    return np.mean(lengths) if len(lengths) > 0 else 0


# All derived columns are computed from the original (uncleaned) text,
# except `clean_text`, which is the preprocessed version fed to TF-IDF.
raw_text = clean_df['text']

clean_df['clean_text'] = raw_text.apply(preprocess)
clean_df['text_length'] = raw_text.map(len)                      # characters per document
clean_df['word_count'] = raw_text.map(lambda doc: len(doc.split()))  # whitespace-separated words
clean_df['avg_word_length'] = raw_text.map(_mean_word_length)

# TF-IDF configuration:
#   ngram_range  - unigrams, bigrams and trigrams
#   min_df       - ignore terms found in fewer than 2 documents
#   max_df       - ignore terms found in more than 95% of documents
#   max_features - keep at most 10,000 terms
#   sublinear_tf - use 1 + log(tf) instead of raw term frequency
vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    min_df=2,
    max_df=0.95,
    max_features=10000,
    ngram_range=(1, 3),
)

# NOTE(review): the vocabulary and IDF weights are fitted on every row of
# `clean_df`. If a train/test split happens later, fitting here lets held-out
# documents influence the features -- confirm whether that is intended.
X_tfidf = vectorizer.fit_transform(clean_df['clean_text'])

### Model Training