In [None]:
# === 1. Import required libraries ===
# Core data manipulation & visualization libraries, NLP, ML models and evaluation metrics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score, average_precision_score,
    precision_recall_curve
)


In [4]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
# === 2. Load and prepare data ===
# Download spaCy's medium English model (en_core_web_md); the next cell loads it by name.
# NOTE(review): `!python` runs whichever `python` the shell resolves first, which is
# presumably the kernel's interpreter here — confirm when running outside this environment.
!python -m spacy download en_core_web_md

In [6]:
# Shared spaCy pipeline used to embed the articles further down
nlp = spacy.load(name='en_core_web_md')

In [None]:
# Load Fake and True news datasets
# The dataset folder is named once so it can be repointed in a single place
# when the CSVs live somewhere other than this hard-coded /content/drive path.
DATA_DIR = '/content/drive/MyDrive/DATASET'
df_fake = pd.read_csv(f'{DATA_DIR}/Fake.csv')
df_true = pd.read_csv(f'{DATA_DIR}/True.csv')

In [None]:
# Tag each source frame with its class in the 'lable' column: 0 = fake, 1 = true.
# The column name's spelling is what the later cells read, so it is kept as-is.
df_fake = df_fake.assign(lable=0)  # fake
df_true = df_true.assign(lable=1)  # true

In [None]:
# Stack the fake and true frames into one dataset with a fresh 0..n-1 index.
# Rows are still in source order here (all fake, then all true); no shuffling happens in this cell.
df = pd.concat([df_fake, df_true], ignore_index=True)
print(df.shape)
df.head()

(44898, 5)


Unnamed: 0,title,text,subject,date,lable
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [10]:
# df.info() writes its report to stdout itself and returns None, so it is called bare;
# wrapping it in print() emitted a stray "None" line after the report.
df.info()
# Class balance: number of rows per label (0 = fake, 1 = true)
print(df['lable'].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   date     44898 non-null  object
 4   lable    44898 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 1.7+ MB
None
lable
0    23481
1    21417
Name: count, dtype: int64


In [11]:
# Discard the 'subject' and 'date' columns (ignored if already gone), then shuffle
# every row with a fixed seed (42) and renumber the index from 0.
df = (
    df.drop(columns=['subject', 'date'], errors='ignore')
      .sample(frac=1, random_state=42)
      .reset_index(drop=True)
)

In [None]:
# === 3. Text preprocessing ===
# Build 'full_text' as "<title> <text>" and discard the two source columns
df = (
    df.assign(full_text=df['title'] + " " + df['text'])
      .drop(columns=['title', 'text'], errors='ignore')
)
df.head()

Unnamed: 0,lable,full_text
0,0,Ben Stein Calls Out 9th Circuit Court: Committ...
1,1,Trump drops Steve Bannon from National Securit...
2,1,Puerto Rico expects U.S. to lift Jones Act shi...
3,0,OOPS: Trump Just Accidentally Confirmed He Le...
4,1,Donald Trump heads for Scotland to reopen a go...


In [None]:
# Function to get sentence/document vector using spaCy
def get_vector(text, model=None):
  """Return the document vector of ``text`` as produced by a spaCy pipeline.

  Args:
    text: Raw text to embed. Anything that is not a ``str`` (for example
      ``None`` or the float NaN pandas uses for a missing field) is treated
      as empty text instead of being passed to the pipeline.
    model: Optional pipeline object to use. It must be callable on a string
      and expose ``vocab.vectors_length``. Defaults to the notebook-level
      ``nlp``.

  Returns:
    ``doc.vector`` for non-empty documents; for documents with no tokens, a
    float32 zero vector of length ``model.vocab.vectors_length``.
  """
  pipeline = nlp if model is None else model
  if not isinstance(text, str):
    # Missing values would otherwise make the pipeline call raise
    text = ""
  doc = pipeline(text)
  if len(doc) == 0:
    # float32 keeps the fallback consistent with the float32 matrix built downstream
    # (np.zeros alone would silently produce float64)
    return np.zeros(pipeline.vocab.vectors_length, dtype=np.float32)
  return doc.vector

# Embed every article: get_vector runs once per row and its result is stored in 'vector'
df['vector'] = df['full_text'].map(get_vector)

In [14]:
# Preview the first five rows, now including the 'vector' column
df.head(n=5)

Unnamed: 0,lable,full_text,vector
0,0,Ben Stein Calls Out 9th Circuit Court: Committ...,"[-0.6819091, 0.179817, -0.047273327, 0.0067938..."
1,1,Trump drops Steve Bannon from National Securit...,"[-0.71277297, 0.17076388, -0.08131767, -0.0148..."
2,1,Puerto Rico expects U.S. to lift Jones Act shi...,"[-0.7009828, 0.20720093, -0.09281823, -0.08263..."
3,0,OOPS: Trump Just Accidentally Confirmed He Le...,"[-0.65475076, 0.13825427, -0.13649306, -0.0085..."
4,1,Donald Trump heads for Scotland to reopen a go...,"[-0.71052384, 0.20840245, -0.070762776, -0.043..."


In [None]:
# Save embeddings for later use
# to_csv writes each array in the 'vector' column as its printed text, so the CSV
# alone cannot be read back as numeric vectors. The CSV is still written unchanged
# (labels + text), and the numeric matrix is additionally stored losslessly as .npy,
# with row i of the matrix matching row i of the CSV.
df.to_csv("embedding.csv", index=False)
np.save("embedding_vectors.npy", np.vstack(df["vector"].tolist()))

In [None]:
# === 4. Train-test split ===
# Stack the per-row embedding arrays into one 2-D float32 matrix BEFORE splitting.
# Splitting df['vector'] directly yields a 1-D Series of arrays, which the models
# below cannot be fitted on when the notebook is run top to bottom.
X = np.vstack(df['vector'].tolist()).astype(np.float32)
y = df['lable']
# Stratified 85/15 split with a fixed seed so both classes keep their proportions
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.15, random_state=42)

Unnamed: 0,vector
0,"[-0.6819091, 0.179817, -0.047273327, 0.0067938..."
1,"[-0.71277297, 0.17076388, -0.08131767, -0.0148..."
2,"[-0.7009828, 0.20720093, -0.09281823, -0.08263..."
3,"[-0.65475076, 0.13825427, -0.13649306, -0.0085..."
4,"[-0.71052384, 0.20840245, -0.070762776, -0.043..."
...,...
44893,"[-0.659338, 0.19236958, 0.038105644, -0.069554..."
44894,"[-0.6765815, 0.20781848, -0.07786325, -0.02437..."
44895,"[-0.6690377, 0.22079496, -0.1649775, -0.025795..."
44896,"[-0.6356763, 0.18357103, -0.11556566, -0.06194..."


In [None]:
# Coerce every stored embedding to float32, verify they all share one length,
# then stack them row-wise into the feature matrix X
import numpy as np

vecs = list(map(lambda raw: np.array(raw, dtype=np.float32), df["vector"].tolist()))

# The first vector sets the reference length; every other vector must match it
dim = len(vecs[0])
assert {len(v) for v in vecs} == {dim}, " The lengths of the vectors are not equal "

X = np.vstack(vecs)
print("X shape:", X.shape, "dtype:", X.dtype)

X shape: (44898, 300) dtype: float32


In [None]:
# === 5. LightGBM model training and evaluation ===
import lightgbm as lgb

# Classifier settings gathered in one mapping so they are easy to scan and tweak
lgbm_params = dict(
    objective="binary",
    class_weight="balanced",
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=31,
    random_state=42,
    n_jobs=-1,
)
lgbm = lgb.LGBMClassifier(**lgbm_params)
lgbm.fit(X_train, y_train)

# Class-1 probabilities feed ROC-AUC; hard labels feed the report and confusion matrix
y_proba = lgbm.predict_proba(X_test)[:, 1]
y_pred = lgbm.predict(X_test)

print("=== LightGBM ===")
print("ROC-AUC:", roc_auc_score(y_test, y_proba))
print(classification_report(y_test, y_pred, digits=4))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

[LightGBM] [Info] Number of positive: 18204, number of negative: 19959
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.300960 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 76500
[LightGBM] [Info] Number of data points in the train set: 38163, number of used features: 300
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
=== LightGBM ===
ROC-AUC: 0.9977628504869044
              precision    recall  f1-score   support

           0     0.9818    0.9804    0.9811      3522
           1     0.9786    0.9801    0.9793      3213

    accuracy                         0.9803      6735
   macro avg     0.9802    0.9802    0.9802      6735
weighted avg     0.9803    0.9803    0.9803      6735

Confusion matrix:
 [[3453   69]
 [  64 3149]]


In [None]:
# === 6. MLP model training and evaluation ===
from sklearn.neural_network import MLPClassifier

# Two hidden layers (256 then 128 units), Adam solver, early stopping enabled
mlp = MLPClassifier(
    hidden_layer_sizes=(256, 128),
    activation="relu",
    solver="adam",
    alpha=1e-4,
    max_iter=50,
    early_stopping=True,
    random_state=42,
)
mlp.fit(X_train, y_train)

# Class-1 probabilities feed ROC-AUC; hard labels feed the report and confusion matrix
y_proba = mlp.predict_proba(X_test)[:, 1]
y_pred = mlp.predict(X_test)

print("=== MLP ===")
print("ROC-AUC:", roc_auc_score(y_test, y_proba))
print(classification_report(y_test, y_pred, digits=4))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

=== MLP ===
ROC-AUC: 0.9983190007658058
              precision    recall  f1-score   support

           0     0.9800    0.9872    0.9836      3522
           1     0.9859    0.9779    0.9819      3213

    accuracy                         0.9828      6735
   macro avg     0.9829    0.9826    0.9827      6735
weighted avg     0.9828    0.9828    0.9828      6735

Confusion matrix:
 [[3477   45]
 [  71 3142]]


In [None]:
# Searching parameters for better output
import lightgbm as lgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from scipy.stats import loguniform

# Stratified 85/15 split (seed 42) taken from the current X and y
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, stratify=y, random_state=42
)

# Base estimator; the tuned hyper-parameters come from the search space below
lgbm = lgb.LGBMClassifier(objective="binary", class_weight="balanced", random_state=42, n_jobs=-1)

# List entries are sampled as discrete choices; learning_rate is drawn
# log-uniformly between 1e-3 and 0.3
param_grid = dict(
    num_leaves=[31, 63, 127],
    max_depth=[-1, 6, 10],
    learning_rate=loguniform(1e-3, 0.3),
    n_estimators=[200, 500, 1000],
    min_child_samples=[10, 20, 50],
)

# 20 sampled configurations, each scored by F1 with 3-fold cross-validation
search = RandomizedSearchCV(
    estimator=lgbm,
    param_distributions=param_grid,
    n_iter=20,
    scoring="f1",
    cv=3,
    n_jobs=-1,
    random_state=42,
)
search.fit(X_train, y_train)
print("Best params:", search.best_params_)
# search.score uses the search's own scoring metric (here "f1") on the held-out split
print("Test score:", search.score(X_test, y_test))