In [None]:
# STEP 1: Install dependencies
!pip install pandas tqdm wget

import pandas as pd
import os
import wget
from zipfile import ZipFile

# Create a data directory
os.makedirs("data", exist_ok=True)


Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... done
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9655 sha256=9e96365111a20adaaa2a8a18238f77f619fce12613d9124ed0b127b72cc4feb9
  Stored in directory: /root/.cache/pip/wheels/40/b3/0f/a40dbd1c6861731779f62cc4babcb234387e11d697df70ee97
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [None]:
import pandas as pd

def _titles_with_label(frame, label):
    """Build a (text, label) frame from the 'title' column of ``frame``.

    Args:
        frame: DataFrame that has a 'title' column.
        label: Constant value written to every row of the new 'label' column.

    Returns:
        A new two-column DataFrame: 'text' (the former 'title') and 'label'.
        ``frame`` itself is not modified.
    """
    return frame[['title']].rename(columns={'title': 'text'}).assign(label=label)


# Load datasets
df_news = pd.read_csv("/content/news_dataset.csv")
df_pol_real = pd.read_csv("/content/politifact_real.csv")
df_pol_fake = pd.read_csv("/content/politifact_fake.csv")
df_gos_real = pd.read_csv("/content/gossipcop_real.csv")
df_gos_fake = pd.read_csv("/content/gossipcop_fake.csv")

# news_dataset.csv is already in correct format
df_news_clean = df_news[['text', 'label']]

# The Politifact / Gossipcop files carry no label column here: the headline
# ('title') is used as the text and the label comes from which file it is in.
df_pol_real_clean = _titles_with_label(df_pol_real, 'REAL')
df_pol_fake_clean = _titles_with_label(df_pol_fake, 'FAKE')
df_gos_real_clean = _titles_with_label(df_gos_real, 'REAL')
df_gos_fake_clean = _titles_with_label(df_gos_fake, 'FAKE')

# Combine all
df_final = pd.concat([
    df_news_clean,
    df_pol_real_clean,
    df_pol_fake_clean,
    df_gos_real_clean,
    df_gos_fake_clean
], ignore_index=True)

# Remove missing or empty texts (reassignment instead of inplace=True so the
# cell gives the same result when re-run)
df_final = df_final.dropna(subset=['text'])
df_final = df_final[df_final['text'].str.strip() != '']

# Shuffle with a fixed seed so the row order is reproducible
df_final = df_final.sample(frac=1, random_state=42).reset_index(drop=True)

# Save
df_final.to_csv("/content/fake_news_combined.csv", index=False)

# Summary
print("✅ Combined dataset created and saved as 'fake_news_combined.csv'")
print("📊 Total samples:", len(df_final))
print("🔢 Label distribution:\n", df_final['label'].value_counts())


✅ Combined dataset created and saved as 'fake_news_combined.csv'
📊 Total samples: 26917
🔢 Label distribution:
 label
REAL    19291
FAKE     7626
Name: count, dtype: int64


In [None]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
# Fetch the NLTK data used by the text-cleaning step further down: the
# stop-word lists (read via stopwords.words) and the WordNet data behind
# WordNetLemmatizer. omw-1.4 is presumably required alongside wordnet —
# TODO confirm. Per the recorded log, packages already present are reported
# as up to date.
# The last call is the cell's final expression, so its return value is what
# the notebook displays for this cell.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
# Read back the combined dataset saved as fake_news_combined.csv
df = pd.read_csv('/content/fake_news_combined.csv')

# Force every entry of the text column to str before it is cleaned
df = df.assign(text=df['text'].astype(str))

# Names needed by the cleaning function (imported again in this cell)
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Make sure the NLTK data packages are available
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

from functools import lru_cache


@lru_cache(maxsize=1)
def _cleaning_resources():
    """Return ``(stop_words, lemmatizer)``, building them on first use only.

    Neither object depends on the text being cleaned, so they are created
    once and shared by every clean_text call instead of being rebuilt for
    each row.
    """
    return frozenset(stopwords.words('english')), WordNetLemmatizer()


def clean_text(text):
    """Normalise one raw text string into space-joined, lemmatised tokens.

    Steps, in order: remove URLs, remove @mentions and '#' characters, remove
    every character that is not an ASCII letter or whitespace, lower-case,
    drop English stop words, lemmatise each remaining word.

    Args:
        text: The raw string to clean. It is passed straight to ``re.sub``,
            so it must be a ``str``.

    Returns:
        The surviving words joined by single spaces; an empty string when no
        word survives.
    """
    stop_words, lemmatizer = _cleaning_resources()

    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r'\@\w+|\#', '', text)
    # Characters are deleted rather than replaced by a space, so a token such
    # as "don't" collapses to "dont" before the stop-word filter runs.
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()

    words = [word for word in text.split() if word not in stop_words]
    return " ".join(lemmatizer.lemmatize(word) for word in words)

# Run each entry of the text column through clean_text and store the result
# next to the raw text in a new column
df['clean_text'] = df['text'].map(clean_text)

# Write the frame (without its index) for the later steps.
# Rows whose cleaned text is empty are read back as missing values; the
# modelling cell drops them with dropna(subset=['clean_text']).
preprocessed_path = '/content/fake_news_preprocessed.csv'
df.to_csv(preprocessed_path, index=False)
print("✅ Preprocessed file saved as 'fake_news_preprocessed.csv'")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


✅ Preprocessed file saved as 'fake_news_preprocessed.csv'


In [None]:
# Reload preprocessed data
df = pd.read_csv('/content/fake_news_preprocessed.csv')

# Normalize labels: FAKE -> 0, REAL -> 1.
# Series.map is used rather than Series.replace: replacing strings with ints
# makes replace() downcast the object column, which recent pandas versions
# warn about (the recorded output shows a warning pointing at that statement).
# A label outside the mapping becomes NaN, so astype(int) still fails loudly
# instead of letting an unexpected value through.
df['label'] = df['label'].map({
    'FAKE': 0,
    'REAL': 1
}).astype(int)

print(df['label'].value_counts())


label
1    19291
0     7626
Name: count, dtype: int64


  df['label'] = df['label'].replace({


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

# Optional: install xgboost if not installed
# !pip install xgboost
from xgboost import XGBClassifier

# Load dataset
df = pd.read_csv('/content/fake_news_preprocessed.csv')

# Encode labels as integers (FAKE -> 0, REAL -> 1).
# Series.map avoids the downcasting warning that Series.replace raises for a
# str -> int mapping on recent pandas versions; an unmapped label becomes NaN
# and makes astype(int) fail rather than slipping through.
df['label'] = df['label'].map({'FAKE': 0, 'REAL': 1}).astype(int)

# Handle missing values in text column
df = df.dropna(subset=['clean_text'])

# Split dataset: 80% train / 20% test, fixed seed for a reproducible split
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'], df['label'], test_size=0.2, random_state=42)

# TF-IDF vectorization (unigrams and bigrams, at most 5000 features).
# The vectorizer is fitted on the training texts only and then applied to the
# test texts, so the test split does not influence the learned vocabulary.
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Calculate scale_pos_weight for XGBoost:
# number of class-0 training rows divided by number of class-1 training rows
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

def evaluate_model(name, model):
    """Train ``model`` and print how it scores on the held-out split.

    Reads the module-level ``X_train_tfidf`` / ``y_train`` for fitting and
    ``X_test_tfidf`` / ``y_test`` for scoring, so those must exist before
    this function is called.

    Args:
        name: Label shown in the banner and on the accuracy line.
        model: Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted
            in place.

    Returns:
        None. A banner, the accuracy as a percentage with two decimals, and
        the classification report are printed.
    """
    print(f"\n{'='*20} {name} {'='*20}")

    # Fit on the training matrix, then predict the test matrix
    model.fit(X_train_tfidf, y_train)
    predictions = model.predict(X_test_tfidf)

    acc = accuracy_score(y_test, predictions)
    print(f"{name} Accuracy: {acc * 100:.2f}%")

    print(classification_report(y_test, predictions))

# Each model is fitted and scored by evaluate_model on the same TF-IDF split.

# Logistic Regression
lr = LogisticRegression(class_weight='balanced', max_iter=1000)
evaluate_model("Logistic Regression", lr)

# Linear SVM
svm = LinearSVC(class_weight='balanced', max_iter=5000)
evaluate_model("Linear SVM", svm)

# Random Forest
rf = RandomForestClassifier(class_weight='balanced', n_estimators=100, random_state=42)
evaluate_model("Random Forest", rf)

# XGBoost
# `use_label_encoder` is no longer passed: the recorded run reports it as a
# parameter that is "not used", so it only produced a warning.
# Class imbalance is handled through scale_pos_weight instead of class_weight.
xgb = XGBClassifier(
    eval_metric='logloss',
    scale_pos_weight=scale_pos_weight,
    n_estimators=100, random_state=42)
evaluate_model("XGBoost", xgb)

# Decision Tree
# NOTE(review): unlike the models above, no class weighting is applied here —
# confirm this is an intentional unweighted baseline.
dt = DecisionTreeClassifier(random_state=42)
evaluate_model("Decision Tree", dt)


  df['label'] = df['label'].replace({'FAKE': 0, 'REAL': 1}).astype(int)



Logistic Regression Accuracy: 82.31%
              precision    recall  f1-score   support

           0       0.66      0.77      0.71      1531
           1       0.90      0.84      0.87      3852

    accuracy                           0.82      5383
   macro avg       0.78      0.81      0.79      5383
weighted avg       0.83      0.82      0.83      5383


Linear SVM Accuracy: 81.76%
              precision    recall  f1-score   support

           0       0.65      0.77      0.71      1531
           1       0.90      0.84      0.87      3852

    accuracy                           0.82      5383
   macro avg       0.78      0.80      0.79      5383
weighted avg       0.83      0.82      0.82      5383


Random Forest Accuracy: 84.56%
              precision    recall  f1-score   support

           0       0.77      0.65      0.71      1531
           1       0.87      0.92      0.90      3852

    accuracy                           0.85      5383
   macro avg       0.82      

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost Accuracy: 83.76%
              precision    recall  f1-score   support

           0       0.72      0.71      0.71      1531
           1       0.89      0.89      0.89      3852

    accuracy                           0.84      5383
   macro avg       0.80      0.80      0.80      5383
weighted avg       0.84      0.84      0.84      5383


Decision Tree Accuracy: 81.42%
              precision    recall  f1-score   support

           0       0.68      0.66      0.67      1531
           1       0.86      0.88      0.87      3852

    accuracy                           0.81      5383
   macro avg       0.77      0.77      0.77      5383
weighted avg       0.81      0.81      0.81      5383

