In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

import nltk
# Fetch the WordNet corpus when this cell runs (presumably for the
# WordNetLemmatizer instantiated below — confirm no other resource is needed).
nltk.download("wordnet")
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
# Module-level lemmatizer and stemmer instances, created once for the notebook.
# NOTE(review): neither object is referenced in the cells visible here —
# confirm they are still needed.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()


In [None]:
# Both files are read as headerless, tab-separated tables and given the same
# two column names, so the parsing options are declared once and shared.
_csv_layout = {"sep": "\t", "header": None, "names": ["label", "text"]}

train_data_raw = pd.read_csv("../data/training_data_lowercase.csv", **_csv_layout)
test_data_raw = pd.read_csv("../data/testing_data_lowercase_nolabels.csv", **_csv_layout)

In [None]:
# Quick sanity checks on the raw frames before any modelling.

# First rows of the training frame (rich display)
display(train_data_raw.head())

# Dimensions of the training and test frames
print(
    f"Training data shape (rows, columns): {train_data_raw.shape}\n"
    f"Test data shape (rows, columns): {test_data_raw.shape}"
)


# Relative frequency of each label value
print("\nFake news / real news balance:")
print(train_data_raw["label"].value_counts(normalize=True))

# Null count for every column
print("\nMissing values per column:")
print(train_data_raw.isna().sum())

# Rows whose text is empty once surrounding whitespace is stripped
empty_texts = train_data_raw["text"].str.strip().eq("").sum()
print(f"\nNumber of empty text entries: {empty_texts}")


In [None]:
# Features are the text column; the target is the label column.
X, y = train_data_raw["text"], train_data_raw["label"]

# Hold out 20% of the rows, stratified on the label, with a fixed seed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
def clean_text(text):
    """Normalise one raw text entry for tokenisation.

    Every character that is not a lowercase ASCII letter or whitespace is
    replaced by a space (this drops punctuation, digits and any uppercase
    letters), then runs of whitespace are collapsed to a single space and the
    result is trimmed.

    Parameters
    ----------
    text : str or missing value
        The raw text. Non-string inputs (e.g. the ``None``/``NaN`` that pandas
        uses for an empty cell) are treated as missing.

    Returns
    -------
    str
        The cleaned text; ``""`` when the input is missing or not a string.
    """
    # A missing cell reaches us as None/NaN; re.sub would raise TypeError on it.
    if not isinstance(text, str):
        return ""
    text = re.sub(r"[^a-z\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Clean the training and hold-out texts with the same function so both splits
# go through identical preprocessing.
X_train_clean, X_test_clean = (
    split.apply(clean_text) for split in (X_train, X_test)
)

In [None]:
!pip install gensim

In [None]:
import pandas as pd
import numpy as np

# Challenge submission: predict a label for every row of the unlabeled file
# and write the result back out in the same layout.
# NOTE(review): `get_sentence_vector`, `w2v_model` and `model` are not assigned
# in any cell above this one in the visible notebook (`model` is first assigned
# in the XGBoost training cell further down) — confirm the execution order.

# 1. Load the unlabeled dataset: no header row, tab-separated. Column 0 holds
#    the placeholder label '2' that has to be predicted, column 1 the text.
# NOTE(review): an earlier cell reads this same file name from "../data/" —
# confirm which location is intended.
final_val_df = pd.read_csv(
    "testing_data_lowercase_nolabels.csv",
    sep="\t",
    header=None,
)

# 2. Preprocess exactly as for training, then vectorize.
print("Step 1: Cleaning validation text")
val_texts_cleaned = final_val_df[1].apply(clean_text)

# Whitespace-split each cleaned sentence into its list of word tokens
val_tokens = [str(cleaned).split() for cleaned in val_texts_cleaned]

# One vector per sentence, built from the Word2Vec model
print("Step 2: Vectorizing with Word2Vec")
X_final_val_w2v = np.array(
    [get_sentence_vector(tokens, w2v_model) for tokens in val_tokens]
)

# 3. Predict with the fitted classifier
print("Step 3: Running XGBoost prediction")
final_predictions = model.predict(X_final_val_w2v)

# 4. Overwrite the placeholder label column with the predictions
final_val_df[0] = final_predictions

# 5. Export: tab-separated, without header row and without index column
final_val_df.to_csv(
    "validation_results_xgboost.csv",
    sep="\t",
    index=False,
    header=False,
)

print("The file 'validation_results_xgboost.csv' has been generated, that's the final file.")

In [None]:
!pip install xgboost

In [None]:
from xgboost import XGBClassifier
import torch

# 1. Pick the compute device (XGBoost 2.0+).
# In newer versions the GPU is requested with device="cuda" instead of the old
# tree_method="gpu_hist". `device` only accepts device names ("cpu", "cuda",
# ...); the histogram algorithm is chosen separately through `tree_method`.
if torch.cuda.is_available():
    device_method = "cuda"
    print("🚀 GPU detected! Using 'cuda' for high-speed training.")
else:
    # "hist" is a tree_method, not a device: the CPU device name is "cpu".
    device_method = "cpu"
    print("💻 No GPU found. Using 'hist' on CPU.")

# 2. Initializing XGBoost
model = XGBClassifier(
    device=device_method,
    tree_method="hist",     # histogram-based tree construction on either device
    n_estimators=1000,      # use a large number of boosting rounds
    learning_rate=0.02,     # small learning rate for finer-grained updates
    max_depth=9,            # deeper trees can capture more complex relationships
    random_state=42
)

# 3. Training
# NOTE(review): X_train_w2v and X_test_w2v are not assigned in any cell visible
# above — confirm the cell that builds the Word2Vec features runs first.
model.fit(X_train_w2v, y_train)

# 4. Accuracy on the held-out split
accuracy = model.score(X_test_w2v, y_test)
print(f"✅ Training complete! My estimated accuracy: {accuracy * 100:.2f}%")

In [None]:
import pandas as pd
import numpy as np

# Challenge submission: predict a label for every row of the unlabeled file
# and write the result back out in the same layout.
# NOTE(review): `get_sentence_vector` and `w2v_model` are not assigned in any
# cell visible above this one — confirm where they are defined.

# 1. Load the unlabeled dataset: no header row, tab-separated. Column 0 holds
#    the placeholder label '2' that has to be predicted, column 1 the text.
# NOTE(review): an earlier cell reads this same file name from "../data/" —
# confirm which location is intended.
final_val_df = pd.read_csv(
    "testing_data_lowercase_nolabels.csv",
    sep="\t",
    header=None,
)

# 2. Preprocess exactly as for training, then vectorize.
print("Step 1: Cleaning validation text")
val_texts_cleaned = final_val_df[1].apply(clean_text)

# Whitespace-split each cleaned sentence into its list of word tokens
val_tokens = [str(cleaned).split() for cleaned in val_texts_cleaned]

# One vector per sentence, built from the Word2Vec model
print("Step 2: Vectorizing with Word2Vec")
X_final_val_w2v = np.array(
    [get_sentence_vector(tokens, w2v_model) for tokens in val_tokens]
)

# 3. Predict with the fitted classifier
print("Step 3: Running XGBoost prediction")
final_predictions = model.predict(X_final_val_w2v)

# 4. Overwrite the placeholder label column with the predictions
final_val_df[0] = final_predictions

# 5. Export: tab-separated, without header row and without index column
final_val_df.to_csv(
    "validation_results_xgboost2.csv",
    sep="\t",
    index=False,
    header=False,
)

print("The file 'validation_results_xgboost2.csv' has been generated, that's the final file.")