# Project | Natural Language Processing Challenge
### Fake news classifier

Environment

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

"""
import nltk
nltk.download("wordnet")
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
"""


### Load train and test data


In [None]:
# Both files are parsed the same way: tab-separated, no header row, with the
# two columns named explicitly. The options are shared so the reads cannot
# drift apart.
_tsv_options = dict(sep="\t", header=None, names=["label", "text"])

train_data_raw = pd.read_csv("training_data_lowercase.csv", **_tsv_options)
test_data_raw = pd.read_csv("testing_data_lowercase_nolabels.csv", **_tsv_options)

### Quick EDA

In [None]:
# Peek at the first rows of the training frame (rich table output).
display(train_data_raw.head())

# Dimensions of both frames.
print(
    f"Training data shape (rows, columns): {train_data_raw.shape}\n"
    f"Test data shape (rows, columns): {test_data_raw.shape}"
)


# Class balance: share of each label value in the training data.
label_share = train_data_raw["label"].value_counts(normalize=True)
print("\nFake news / real news balance:")
print(label_share)

# Number of missing (NaN) cells in each training column.
missing_per_column = train_data_raw.isna().sum()
print("\nMissing values per column:")
print(missing_per_column)

# Rows whose text is empty once surrounding whitespace is stripped.
# NaN texts compare unequal to "" and are therefore not counted here.
is_blank = train_data_raw["text"].str.strip().eq("")
empty_texts = is_blank.sum()
print(f"\nNumber of empty text entries: {empty_texts}")


### Training - validation split

In [None]:
# Features are the raw article text; the target is the label column.
X, y = train_data_raw["text"], train_data_raw["label"]

# Hold out 20% of the labelled rows, stratified on the label; the fixed seed
# makes the split repeatable.
# Note: X_test / y_test are this held-out slice of the training file, not
# test_data_raw.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

### Data cleaning

In [None]:
# Light-touch cleaning only: keep letters a-z and single spaces.
# No stemming or lemmatization is applied here.
_NON_LETTER_RE = re.compile(r"[^a-z\s]")
_WHITESPACE_RE = re.compile(r"\s+")


def clean_text(text):
    """Normalise one document to lowercase letters separated by single spaces.

    Parameters
    ----------
    text : str or any
        Raw document text. Anything that is not a ``str`` (for example the
        float NaN pandas uses for a missing cell, or ``None``) is treated as
        an empty document instead of raising ``TypeError``.

    Returns
    -------
    str
        The text lowercased, with every character other than ``a``-``z`` or
        whitespace replaced by a space, whitespace runs collapsed to a single
        space, and leading/trailing whitespace removed. Returns ``""`` for
        non-string input.
    """
    if not isinstance(text, str):
        return ""
    # Lowercase first: the character class below only keeps a-z, so without
    # this step uppercase letters would be silently deleted.
    text = _NON_LETTER_RE.sub(" ", text.lower())
    return _WHITESPACE_RE.sub(" ", text).strip()

# Run the same cleaner over both splits so they are normalised identically.
X_train_clean, X_test_clean = (
    split.apply(clean_text) for split in (X_train, X_test)
)

We skip lemmatization for this model; it is a candidate to try in later iterations.
We chose not to apply lemmatization because:
- We anticipated a low impact given the nature of the dataset, so it was not worth the extra preprocessing cost.
- We could lose nuances in the text that are relevant to the writing style of fake news.
- Lemmatization tends to have a lower impact on TF-IDF features.

### Vectorization

In [None]:
# TF-IDF features: intuitively better suited to fake news detection, because
# they emphasise the presence of certain significant words rather than the
# raw accumulation of certain words.
tfidf_vectorizer = TfidfVectorizer(
    max_features=20000,    # cap on the vocabulary size
    ngram_range=(1, 2),    # unigrams and bigrams
    stop_words="english",  # scikit-learn's built-in English stop-word list
)

# Fit on the training split only, then reuse the fitted vectorizer on the
# held-out split so no information from it leaks into the features.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_clean)
X_test_tfidf = tfidf_vectorizer.transform(X_test_clean)

# Sanity check: both matrices should have the same number of columns.
print(f"TF-IDF train shape: {X_train_tfidf.shape}")
print(f"TF-IDF test shape: {X_test_tfidf.shape}")

### Model training

In [None]:
# Random forest with 100 trees; the fixed seed keeps training repeatable.
model_rf = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)

# fit() stays the last expression of the cell, so the notebook still displays
# its return value.
model_rf.fit(X_train_tfidf, y_train)


### Evaluation

In [None]:
# Predictions on the training split and on the held-out split.
y_train_pred = model_rf.predict(X_train_tfidf)
y_pred = model_rf.predict(X_test_tfidf)

# Comparing the two accuracies shows how much the model overfits the
# training data.
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_pred)
print("Train accuracy:", train_accuracy)
print("Test accuracy:", test_accuracy)

# Per-class precision / recall / F1 on the held-out split.
print("Classification report:")
print(classification_report(y_test, y_pred))

In [None]:
# Test accuracy of the logistic-regression baseline. It is not computed in
# this notebook (presumably it comes from a separate experiment - TODO
# confirm), so it has to stay a hand-entered constant.
LOGISTIC_REGRESSION_TEST_ACCURACY = 0.9335

# The random-forest accuracy is computed from the predictions made in the
# evaluation cell instead of a hand-copied number, so the chart cannot drift
# out of sync with the model after a re-run.
models = ["Logistic Regression", "Random Forest"]
test_accuracies = [
    LOGISTIC_REGRESSION_TEST_ACCURACY,
    float(accuracy_score(y_test, y_pred)),
]

fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(models, test_accuracies)
ax.set_ylim(0.85, 1.0)  # zoomed y-range so the gap between the bars is visible
ax.set_ylabel("Test Accuracy")
ax.set_title("Model Comparison (TF-IDF Features)")

# Annotate each bar with its value, placed slightly above the bar top.
for i, acc in enumerate(test_accuracies):
    ax.text(i, acc + 0.005, f"{acc:.3f}", ha="center")

plt.show()