# Project | Natural Language Processing Challenge
### Fake news classifier

Environment

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

import nltk
nltk.download("wordnet")
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()


Load train and test data


In [None]:
# Both files are read with the same layout: tab-separated, no header row,
# and the two columns named "label" and "text" in that order.
csv_layout = {"sep": "\t", "header": None, "names": ["label", "text"]}

train_data_raw = pd.read_csv("../data/training_data_lowercase.csv", **csv_layout)
test_data_raw = pd.read_csv(
    "../data/testing_data_lowercase_nolabels.csv", **csv_layout
)

Quick EDA (exploratory data analysis)

In [None]:
# First rows of the training frame
display(train_data_raw.head())

# Dimensions of both frames
train_shape, test_shape = train_data_raw.shape, test_data_raw.shape
print(
    f"Training data shape (rows, columns): {train_shape}\n"
    f"Test data shape (rows, columns): {test_shape}"
)


# Relative frequency of each label value
label_share = train_data_raw["label"].value_counts(normalize=True)
print("\nFake news / real news balance:")
print(label_share)

# Count of null cells in each column
null_counts = train_data_raw.isnull().sum()
print("\nMissing values per column:")
print(null_counts)

# Rows whose text is empty once surrounding whitespace is stripped
stripped_text = train_data_raw["text"].str.strip()
empty_texts = (stripped_text == "").sum()
print(f"\nNumber of empty text entries: {empty_texts}")


Training - validation split

In [None]:
# Features are the raw text, targets are the labels.
X, y = train_data_raw["text"], train_data_raw["label"]

# Hold out 20% of the rows; stratify=y keeps the label proportions equal in
# both parts, and the fixed random_state makes the split repeatable.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

Data cleaning

In [None]:
# Keep only letters and single spaces; everything else is dropped.
def clean_text(text):
    """Return *text* reduced to lowercase ASCII letters separated by single spaces.

    Args:
        text: The raw text. ``None`` and float NaN (how pandas represents a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str()``.

    Returns:
        The cleaned string, with no leading or trailing whitespace. It is
        empty when the input contains no letters.
    """
    # Missing cells would otherwise make re.sub raise TypeError.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    # Lowercase first: the character class below only keeps a-z, so uppercase
    # letters would be deleted instead of preserved.
    text = str(text).lower()
    text = re.sub(r"[^a-z\s]", " ", text)
    return re.sub(r"\s+", " ", text).strip()

# Run the same cleaning function over both parts of the split.
X_train_clean, X_test_clean = (
    split.apply(clean_text) for split in (X_train, X_test)
)

Vectorization with Word2Vec (word embeddings)

In [None]:
!pip install gensim

In [None]:
from gensim.models import Word2Vec
import numpy as np

# Step 1 - tokenise.
# Word2Vec is fed lists of words rather than whole sentences, so each cleaned
# text is converted with str() and split on whitespace.
train_tokens = list(map(str.split, map(str, X_train_clean)))
test_tokens = list(map(str.split, map(str, X_test_clean)))

# Step 2 - fit the embedding model on the training tokens only.
# vector_size=100 gives every word a 100-dimensional vector.
w2v_model = Word2Vec(
    sentences=train_tokens,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

print("Word2Vec vectorization model trained successfully!")

# 3. Build one fixed-size vector per sentence by averaging its word vectors.
def get_sentence_vector(tokens, model):
    """Return the mean of the word vectors of *tokens*.

    Args:
        tokens: Iterable of word strings.
        model: Object exposing ``model.wv``, which must support ``word in wv``,
            ``wv[word]`` and a ``vector_size`` attribute (as a trained gensim
            ``Word2Vec`` model does).

    Returns:
        A 1-D array of length ``model.wv.vector_size``. It is all zeros when
        none of the tokens is in the vocabulary.
    """
    word_vectors = model.wv
    vectors = [word_vectors[word] for word in tokens if word in word_vectors]
    if not vectors:
        # Size the fallback from the model instead of a hard-coded 100, so the
        # rows stay the same length whatever vector_size the model was built with.
        return np.zeros(word_vectors.vector_size)
    return np.mean(vectors, axis=0)

# 4. Convert each list of token lists into a 2-D array with one averaged
# sentence vector per row.
X_train_w2v, X_test_w2v = (
    np.array([get_sentence_vector(tokens, w2v_model) for tokens in token_lists])
    for token_lists in (train_tokens, test_tokens)
)

print("Data successfully vectorized and ready for model training (CPU or GPU)!")

In [None]:
!pip install xgboost

Model training with XGBoost

In [None]:
from xgboost import XGBClassifier
import torch

# 1. Choose the compute device for XGBoost 2.0+.
# In these versions the `device` parameter selects the hardware ('cpu' or
# 'cuda'), while 'hist' is a `tree_method` value. The two are therefore set
# separately below instead of passing 'hist' as a device.
if torch.cuda.is_available():
    device_method = 'cuda'
    print("🚀 GPU detected! Using 'cuda' for high-speed training.")
else:
    device_method = 'cpu'
    print("💻 No GPU found. Using 'hist' on CPU.")

# 2. Initializing XGBoost
model = XGBClassifier(
    device=device_method,  # recent versions use 'device' rather than a GPU-specific 'tree_method'
    tree_method='hist',    # histogram-based tree construction on either device
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    random_state=42,
)

# 3. Training on the averaged Word2Vec sentence vectors
model.fit(X_train_w2v, y_train)

# 4. Accuracy on the held-out part of the training data
accuracy = model.score(X_test_w2v, y_test)
print(f"✅ Training complete! My estimated accuracy: {accuracy * 100:.2f}%")

Final validation results for XGBoost

In [None]:
import pandas as pd
import numpy as np

# 1. Load the "unlabeled" dataset provided for the challenge.
# Column 0 holds the placeholder label to be replaced and column 1 the text.
# The path matches the one used when the data was first loaded at the top of
# the notebook, so both cells read the same file.
final_val_df = pd.read_csv(
    "../data/testing_data_lowercase_nolabels.csv", sep='\t', header=None
)

# 2. Text Preprocessing & Vectorization
# Apply the same cleaning function used during training for consistency.
# Missing cells are replaced by empty strings first so that cleaning never
# receives a non-string value.
print("Step 1: Cleaning validation text")
val_texts_cleaned = final_val_df[1].fillna("").astype(str).apply(clean_text)

# Convert sentences into lists of tokens (words)
val_tokens = [str(t).split() for t in val_texts_cleaned]

# Convert tokens into numerical vectors using the trained Word2Vec model
print("Step 2: Vectorizing with Word2Vec")
X_final_val_w2v = np.array([get_sentence_vector(t, w2v_model) for t in val_tokens])

# 3. Model Prediction
print("Step 3: Running XGBoost prediction")
final_predictions = model.predict(X_final_val_w2v)

# 4. Final Formatting
# Overwrite the placeholder label column with the model's predictions.
final_val_df[0] = final_predictions

# 5. Export results
# The output is tab-separated, with no header row and no index column.
final_val_df.to_csv("validation_results_xgboost.csv", sep='\t', index=False, header=False)

print("The file 'validation_results_xgboost.csv' has been generated, that's the final file.")