# Project | Natural Language Processing Challenge
### Fake news classifier

Environment

In [31]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

import nltk
nltk.download("wordnet")
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rache\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Load train and test data


In [None]:
# Both challenge files are read with the same parser settings: tab-delimited,
# no header row, and two columns that we name ourselves ("label", "text").
_TSV_READ_OPTIONS = dict(sep="\t", header=None, names=["label", "text"])

# 1. Labeled training set.
train_data_raw = pd.read_csv("../data/training_data_lowercase.csv", **_TSV_READ_OPTIONS)

# 2. Challenge test set (file name indicates it carries no real labels); it is
#    loaded with the same column layout as the training set.
test_data_raw = pd.read_csv("../data/testing_data_lowercase_nolabels.csv", **_TSV_READ_OPTIONS)

Quick EDA (exploratory data analysis)

In [None]:
# 1. Data preview: first rows of the training frame, to eyeball the structure.
display(train_data_raw.head())

# 2. Dataset dimensions for both the training and the test frame.
print(
    f"Training dataset dimensions (rows, columns): {train_data_raw.shape}\n"
    f"Test dataset dimensions (rows, columns): {test_data_raw.shape}"
)

# 3. Class distribution: share of each label value, to spot class imbalance.
label_proportions = train_data_raw["label"].value_counts(normalize=True)
print("\nTarget class distribution (Normalized):")
print(label_proportions)

# 4. Missing values: count of nulls in every column.
missing_per_column = train_data_raw.isna().sum()
print("\nMissing values per feature:")
print(missing_per_column)

# 5. Blank texts: rows whose text is empty once surrounding whitespace is removed
#    (these would produce no tokens later on).
empty_texts = train_data_raw["text"].str.strip().eq("").sum()
print(f"\nNumber of empty text entries detected: {empty_texts}")


Unnamed: 0,label,text
0,0,donald trump sends out embarrassing new yearâ€šs...
1,0,drunk bragging trump staffer started russian c...
2,0,sheriff david clarke becomes an internet joke ...
3,0,trump is so obsessed he even has obamaâ€šs name ...
4,0,pope francis just called out donald trump duri...


Training data shape (rows, columns): (34152, 2)
Test data shape (rows, columns): (9984, 2)

Fake news / real news balance:
label
0    0.514523
1    0.485477
Name: proportion, dtype: float64

Missing values per column:
label    0
text     0
dtype: int64

Number of empty text entries: 0


Training - validation split

In [None]:
# 1. Features (raw text) and target (label) taken from the training frame.
X = train_data_raw["text"]
y = train_data_raw["label"]

# 2. Hold out 20% of the labeled data for evaluation.
#    - random_state=42 makes the split repeatable across runs.
#    - stratify=y keeps the label proportions the same in both parts.
#    Note: the held-out part is named X_test / y_test here, but it comes from the
#    labeled training file; it is not the unlabeled challenge test file.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

Data cleaning

In [None]:
def clean_text(text):
    """Normalize a text sample to lowercase-letter words separated by single spaces.

    Every character that is not a lowercase ASCII letter (a-z) or whitespace is
    replaced by a space, runs of whitespace are collapsed to one space, and
    leading/trailing whitespace is removed.

    Parameters
    ----------
    text : str or any
        The raw text. ``None`` and float NaN (how pandas represents a missing
        cell) are treated as "no text"; any other non-string value is converted
        with ``str()`` before cleaning.

    Returns
    -------
    str
        The cleaned text; ``""`` when the input is missing or contains no
        lowercase letters.
    """
    # Missing cells would otherwise make re.sub raise TypeError (it only accepts
    # str/bytes). NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text)

    # Keep only lowercase letters and whitespace; everything else becomes a space
    # so that words separated by punctuation do not get glued together.
    text = re.sub(r"[^a-z\s]", " ", text)
    # Collapse whitespace runs and trim both ends.
    return re.sub(r"\s+", " ", text).strip()

# Run the same cleaning function over both parts of the split
# (the training part and the held-out part), element by element.
X_train_clean, X_test_clean = (
    texts.apply(clean_text) for texts in (X_train, X_test)
)

Vectorization with Word2Vec
Word-embedding vectorization

In [36]:
!pip install gensim




[notice] A new release of pip is available: 25.3 -> 26.0
[notice] To update, run: C:\Users\rache\AppData\Local\Programs\Python\Python313\python.exe -m pip install --upgrade pip


In [37]:
from gensim.models import Word2Vec
import numpy as np

def _to_token_lists(texts):
    """Split every text on whitespace, returning one list of tokens per text.

    Each item is passed through ``str()`` first so that a non-string entry is
    tokenized as its string form instead of raising.
    """
    return [str(doc).split() for doc in texts]


# 1. Word2Vec consumes lists of tokens, not whole strings.
train_tokens = _to_token_lists(X_train_clean)
test_tokens = _to_token_lists(X_test_clean)

# 2. Fit the embedding model on the training tokens only.
#    vector_size=100 -> every word is represented by a 100-dimensional vector.
#    min_count=1     -> no word is dropped for being rare.
#    NOTE(review): training uses 4 worker threads; if run-to-run identical
#    embeddings are required, check gensim's reproducibility notes.
w2v_model = Word2Vec(
    sentences=train_tokens,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

print("Word2Vec vectorization model trained successfully!")

# 3. Sentence embedding = average of the embeddings of its known words.
def get_sentence_vector(tokens, model):
    """Return one fixed-size vector for a tokenized sentence.

    The vector is the element-wise mean of the embeddings of all tokens that
    exist in the model's vocabulary; tokens missing from the vocabulary are
    skipped.

    Parameters
    ----------
    tokens : iterable of str
        The words of the sentence.
    model : object
        A trained embedding model exposing ``model.wv`` with membership test
        (``word in model.wv``), lookup (``model.wv[word]``) and a
        ``vector_size`` attribute, as gensim's Word2Vec does.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``model.wv.vector_size``. All zeros when none of
        the tokens is in the vocabulary (including an empty token list).
    """
    word_vectors = model.wv
    known = [word_vectors[tok] for tok in tokens if tok in word_vectors]
    if not known:
        # Take the width from the model instead of hard-coding 100, so the
        # fallback always has the same length as real sentence vectors.
        # float32 matches the dtype gensim stores its embeddings in, so a
        # fallback row does not upcast the stacked feature matrix.
        return np.zeros(word_vectors.vector_size, dtype=np.float32)
    return np.mean(known, axis=0)

def _embed_all(token_lists):
    """Stack the sentence vector of every token list into one 2-D array (one row per sentence)."""
    return np.array([get_sentence_vector(toks, w2v_model) for toks in token_lists])


# 4. Turn both token collections into numeric feature matrices for the classifier.
X_train_w2v = _embed_all(train_tokens)
X_test_w2v = _embed_all(test_tokens)

print("Data successfully vectorized and ready for model training (CPU or GPU)!")

Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


Word2Vec vectorization model trained successfully!
Data successfully vectorized and ready for model training (CPU or GPU)!


In [38]:
!pip install xgboost




[notice] A new release of pip is available: 25.3 -> 26.0
[notice] To update, run: C:\Users\rache\AppData\Local\Programs\Python\Python313\python.exe -m pip install --upgrade pip


Model training: XGBoost

In [None]:
from xgboost import XGBClassifier
import torch

# 1. Hardware acceleration detection
# XGBoost's `device` parameter selects WHERE training runs ('cpu' or 'cuda').
# 'hist' is a tree-construction algorithm (`tree_method`), not a device, so it
# must not be passed as `device`; on a machine without a GPU we fall back to
# device='cpu' and request the 'hist' algorithm explicitly below.
if torch.cuda.is_available():
    device_method = 'cuda'
    print(" GPU detected! Using 'cuda' for high-speed training.")
else:
    device_method = 'cpu'
    print(" No GPU found. Using 'hist' on CPU.")

# 2. XGBoost configuration
model = XGBClassifier(
    device=device_method,  # 'cuda' or 'cpu', decided above
    tree_method='hist',    # Histogram-based tree construction (works on CPU and GPU)
    n_estimators=100,      # Number of boosting rounds (trees to build)
    learning_rate=0.02,    # Small step size per boosting round
    max_depth=6,           # Maximum depth of each tree
    random_state=42        # Fixed seed for repeatable results
)

# 3. Model training
# Fit on the averaged Word2Vec sentence embeddings of the training split.
model.fit(X_train_w2v, y_train)

# 4. Evaluation
# Accuracy on the held-out 20% split (named X_test_w2v / y_test in this notebook).
accuracy = model.score(X_test_w2v, y_test)
print(f" Training complete! My estimated accuracy: {accuracy * 100:.2f}%")

ðŸš€ GPU detected! Using 'cuda' for high-speed training.
âœ… Training complete! My estimated accuracy: 90.47%


Final validation results for XGBoost

In [40]:
import pandas as pd
import numpy as np

# 1. Read the challenge file again, this time without naming the columns, so they
#    are addressed by position: column 0 = label slot, column 1 = text.
#    (Per the original notes, column 0 holds a placeholder label of 2 — confirm
#    against the data file.)
final_val_df = pd.read_csv(
    "../data/testing_data_lowercase_nolabels.csv",
    sep='\t',
    header=None,
)

# 2. Same preprocessing as for training: clean -> tokenize -> average embeddings.
print("Step 1: Cleaning validation text")
val_texts_cleaned = final_val_df[1].apply(clean_text)

# One list of whitespace-separated tokens per cleaned text.
val_tokens = [str(doc).split() for doc in val_texts_cleaned]

# Sentence vectors from the Word2Vec model that was fitted on the training split.
print("Step 2: Vectorizing with Word2Vec")
X_final_val_w2v = np.array(
    [get_sentence_vector(toks, w2v_model) for toks in val_tokens]
)

# 3. Predict a label for every row with the trained classifier.
print("Step 3: Running XGBoost prediction")
final_predictions = model.predict(X_final_val_w2v)

# 4. Overwrite the label column with the predicted labels.
final_val_df[0] = final_predictions

# 5. Write the result in the same layout as the input: tab-separated,
#    without a header row and without the index column.
final_val_df.to_csv(
    "validation_results_xgboost.csv",
    sep='\t',
    index=False,
    header=False,
)

print("The file 'validation_results_xgboost.csv' has been generated, that's the final file.")

Step 1: Cleaning validation text
Step 2: Vectorizing with Word2Vec
Step 3: Running XGBoost prediction
The file 'validation_results_xgboost.csv' has been generated, that's the final file.


In [41]:
# Score the classifier twice: on the data it was fitted on and on the held-out
# split. The gap between the two numbers indicates how much the model overfits.
xgb_train_acc = model.score(X_train_w2v, y_train)
xgb_test_acc = model.score(X_test_w2v, y_test)

print(
    "--- XGBoost (Word2Vec) ---",
    f"Training Accuracy: {xgb_train_acc * 100:.2f}%",
    f"Testing Accuracy: {xgb_test_acc * 100:.2f}%",
    sep="\n",
)

--- XGBoost (Word2Vec) ---
Training Accuracy: 94.39%
Testing Accuracy: 90.47%


In [None]:
from xgboost import XGBClassifier
import torch

# NOTE: this cell repeats the XGBoost training cell found earlier in the notebook
# (same hyperparameters); re-running it re-creates and re-fits `model`.

# 1. GPU detection for XGBoost 2.0+
# Newer XGBoost versions select the GPU with device='cuda' (instead of the old
# tree_method='gpu_hist'). `device` only accepts a device name, so the CPU
# fallback must be 'cpu'; 'hist' is a tree-construction algorithm and is passed
# through `tree_method` below.
if torch.cuda.is_available():
    device_method = 'cuda'
    print("ðŸš€ GPU detected! Using 'cuda' for high-speed training.")
else:
    device_method = 'cpu'
    print("ðŸ’» No GPU found. Using 'hist' on CPU.")

# 2. Initializing XGBoost
model = XGBClassifier(
    device=device_method,   # 'cuda' or 'cpu', decided above
    tree_method='hist',     # Histogram-based tree construction (works on CPU and GPU)
    n_estimators=100,       # Number of boosting iterations
    learning_rate=0.02,     # Low learning rate for finer-grained updates
    max_depth=6,            # Lets the model capture more complex relationships
    random_state=42
)

# 3. Training on the Word2Vec sentence embeddings of the training split
model.fit(X_train_w2v, y_train)

# 4. Accuracy on the held-out split
accuracy = model.score(X_test_w2v, y_test)
print(f"âœ… Training complete! My estimated accuracy: {accuracy * 100:.2f}%")