In [None]:
# Attach Google Drive to the Colab filesystem so later cells can read files from it.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
# List the top-level contents of the mounted Drive folder.
!ls "/content/drive/My Drive"

'Colab Notebooks'		     'Newspaper websites link.gsheet'
'DepartmentalProject WorkStation'    'TextualDataAnalysis WorkStation'
 Important_Department_Project_Files  'Workspace '


In [None]:
# List the "Colab Notebooks" folder, which holds the CSV that is loaded below.
!ls "/content/drive/My Drive/Colab Notebooks"

Fake_News_Analysis.ipynb  movement_new.ipynb  My_movement_2.ipynb
June25_dataset.xlsx	  my_data.csv


In [None]:
# 1. Install dependencies (only once)
# !pip install torch torchvision torchaudio transformers datasets scikit-learn

import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# =========================
# 1. Load the dataset
# =========================
# Read the CSV stored on the mounted Drive into a DataFrame.
DATA_PATH = "/content/drive/My Drive/Colab Notebooks/my_data.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,title,text,subject,date,label
0,Jennifer Lawrence: When You Attack Planned Pa...,Jennifer Lawrence has been speaking her mind l...,News,"January 6, 2016",0
1,Reince Priebus Says We ARE Going Forward With...,"On CNN s State of the Union yesterday, Reinc...",News,"November 21, 2016",0
2,Boiler Room EP #118,Tune in to the Alternate Current Radio Network...,US_News,"July 20, 2017",0
3,Factbox: Trump turnover - Tillerson would be l...,(Reuters) - The revolving door at the Trump Wh...,politicsNews,"November 30, 2017",1
4,WHY PAUL MANAFORT Indictment Is Bad News For D...,"This summarization by Peter Flaherty, Presiden...",left-news,"Oct 31, 2017",0


In [None]:
# Count rows that exactly repeat an earlier row; every column takes part in the comparison.
df.duplicated(keep='first').sum()

np.int64(209)

In [None]:
# Keep only the first occurrence of each fully identical row.
df = df.drop_duplicates()

In [None]:
# Number of missing values in each column.
df.isnull().sum(axis=0)

Unnamed: 0,0
title,0
text,0
subject,0
date,0
label,0


In [None]:
# Remove the columns that are not used for modelling; 'text', 'label' and
# 'Unnamed: 0' are left untouched.
# Reassigning (instead of inplace=True) together with errors='ignore' makes the
# cell safe to re-run: the in-place version raised KeyError on a second run
# because the columns were already gone.
df = df.drop(columns=['title', 'subject', 'date'], errors='ignore')

In [None]:
# Preview the first five rows of the reduced frame (head() defaults to n=5).
df.head()

Unnamed: 0,text,label
0,Jennifer Lawrence has been speaking her mind l...,0
1,"On CNN s State of the Union yesterday, Reinc...",0
2,Tune in to the Alternate Current Radio Network...,0
3,(Reuters) - The revolving door at the Trump Wh...,1
4,"This summarization by Peter Flaherty, Presiden...",0


### **BERT Model**

In [None]:
# 1️⃣ Import required packages
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW   # ✅ FIXED
import nltk
import string
from nltk.corpus import stopwords
from tqdm import tqdm

# -------------------------------
# 2️⃣ Load dataset
# -------------------------------
#df = pd.read_csv("news.csv")

# -------------------------------
# 3️⃣ Text preprocessing
# -------------------------------
# Download NLTK's stopword corpus, then hold the English list in a set so the
# per-word membership test in clean_text is a hash lookup.
nltk.download('stopwords')
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

def clean_text(text):
    """Normalize one document before tokenization.

    The input is coerced to ``str`` and lowercased, every character listed in
    ``string.punctuation`` is deleted, and whitespace-separated tokens that
    appear in the module-level ``stop_words`` collection are dropped.

    Returns the remaining tokens joined by single spaces.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    normalized = str(text).lower().translate(punctuation_table)
    kept_tokens = (token for token in normalized.split() if token not in stop_words)
    return ' '.join(kept_tokens)

# Run the cleaner over every article body and store the result next to the raw text.
df['text_clean'] = [clean_text(raw_text) for raw_text in df['text']]

# -------------------------------
# 4️⃣ Train-test split
# -------------------------------
# Hold out 20% of the rows for validation; random_state pins the shuffle so the
# split is the same on every run.
# NOTE(review): drop_duplicates() above only removed fully identical rows —
# confirm that no article text ends up on both sides of the split.
all_texts = df['text_clean'].tolist()
all_labels = df['label'].tolist()
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts, all_labels, test_size=0.2, random_state=42
)

# -------------------------------
# 5️⃣ Tokenization
# -------------------------------
# Tokenizer matching the uncased BERT checkpoint that is loaded below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Same settings for both splits: truncate at 512 tokens and pad each split to
# the length of its longest sequence.
tokenize_kwargs = dict(truncation=True, padding=True, max_length=512)
train_encodings = tokenizer(train_texts, **tokenize_kwargs)
val_encodings = tokenizer(val_texts, **tokenize_kwargs)

# -------------------------------
# 6️⃣ PyTorch Dataset
# -------------------------------
class NewsDataset(Dataset):
    """Map-style dataset that pairs tokenizer output with class labels.

    ``encodings`` is a mapping from field name to a per-sample indexable
    sequence (such as the tokenizer's return value); ``labels`` is an indexable
    sequence holding one entry per sample.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The label sequence defines how many samples there are.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, with the label under ``'labels'``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels so a DataLoader can batch them.
train_dataset = NewsDataset(encodings=train_encodings, labels=train_labels)
val_dataset = NewsDataset(encodings=val_encodings, labels=val_labels)

# -------------------------------
# 7️⃣ Load BERT model
# -------------------------------
# Seed PyTorch before the model is built. The classification head is not part
# of the pretrained checkpoint and is initialised randomly, dropout is active
# during training, and the shuffling DataLoader draws from the global
# generator — without a seed every run starts from different numbers. The value
# matches the random_state used for the train/validation split.
torch.manual_seed(42)

# Pretrained BERT encoder with a fresh two-class classification head.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# -------------------------------
# 8️⃣ Training setup
# -------------------------------
BATCH_SIZE = 8
LEARNING_RATE = 5e-5

# Training batches are reshuffled every epoch; validation keeps dataset order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# AdamW updates every parameter of the model (encoder and classification head).
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# -------------------------------
# 9️⃣ Training loop with evaluation
# -------------------------------
epochs = 3

# Label encoding of this dataset: 0 = fake, 1 = real. In the df.head() preview
# the Reuters wire story carries label 1 while the other rows carry label 0.
# classification_report pairs target_names with the labels in the order given,
# so the names are listed as [name of class 0, name of class 1]. Passing the
# label ids explicitly also keeps the matrix 2x2 and the report valid when a
# class happens to be absent from the predictions.
class_ids = [0, 1]
class_names = ['Fake', 'Real']

for epoch in range(epochs):
    # --- Training ---
    model.train()
    train_loss = 0
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
        # Move every tensor of the batch to the device the model lives on.
        batch = {k: v.to(device) for k, v in batch.items()}
        # The batch includes 'labels', so the loss is read from the model output.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()  # clear gradients before the next batch
        train_loss += loss.item()
    # Mean of the per-batch losses over the epoch.
    print(f"Epoch {epoch+1} Training Loss: {train_loss/len(train_loader):.4f}")

    # --- Validation ---
    model.eval()
    val_preds = []
    val_labels_list = []

    with torch.no_grad():  # evaluation only: no gradient tracking
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            logits = model(**batch).logits
            # Predicted class = index of the largest logit.
            val_preds.extend(torch.argmax(logits, dim=1).cpu().tolist())
            val_labels_list.extend(batch['labels'].cpu().tolist())

    # Confusion matrix: rows are true classes, columns are predictions (order: class_ids)
    cm = confusion_matrix(val_labels_list, val_preds, labels=class_ids)
    print(f"\nEpoch {epoch+1} Confusion Matrix:\n{cm}")

    # Classification report with the corrected class names
    report = classification_report(
        val_labels_list, val_preds, labels=class_ids, target_names=class_names
    )
    print(f"\nEpoch {epoch+1} Classification Report:\n{report}")

    # Overall accuracy
    acc = accuracy_score(val_labels_list, val_preds)
    print(f"Epoch {epoch+1} Validation Accuracy: {acc:.4f}\n{'-'*60}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|██████████| 4469/4469 [54:03<00:00,  1.38it/s]


Epoch 1 Training Loss: 0.0542

Epoch 1 Confusion Matrix:
[[4594   60]
 [   1 4283]]

Epoch 1 Classification Report:
              precision    recall  f1-score   support

        Real       1.00      0.99      0.99      4654
        Fake       0.99      1.00      0.99      4284

    accuracy                           0.99      8938
   macro avg       0.99      0.99      0.99      8938
weighted avg       0.99      0.99      0.99      8938

Epoch 1 Validation Accuracy: 0.9932
------------------------------------------------------------


Training Epoch 2: 100%|██████████| 4469/4469 [54:09<00:00,  1.38it/s]


Epoch 2 Training Loss: 0.0354

Epoch 2 Confusion Matrix:
[[4594   60]
 [   1 4283]]

Epoch 2 Classification Report:
              precision    recall  f1-score   support

        Real       1.00      0.99      0.99      4654
        Fake       0.99      1.00      0.99      4284

    accuracy                           0.99      8938
   macro avg       0.99      0.99      0.99      8938
weighted avg       0.99      0.99      0.99      8938

Epoch 2 Validation Accuracy: 0.9932
------------------------------------------------------------


Training Epoch 3:   0%|          | 10/4469 [00:07<59:25,  1.25it/s]


KeyboardInterrupt: 

### **DistilBERT**

In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW
import nltk
import string
from nltk.corpus import stopwords
from tqdm import tqdm

# -------------------------------
# 1️⃣ Text preprocessing
# -------------------------------
# This cell reads df['text_clean']. The preprocessing used to be disabled by
# wrapping it in a string literal, so the cell only worked when the BERT cell
# had been run first. The column is now built here when it is missing and
# reused unchanged when it already exists.
if 'text_clean' not in df.columns:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

    def clean_text(text):
        """Lowercase ``text``, delete ``string.punctuation`` characters and drop English stopwords."""
        text = str(text).lower()  # Lowercase
        text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
        words = [word for word in text.split() if word not in stop_words]  # Remove stopwords
        return ' '.join(words)

    df['text_clean'] = df['text'].apply(clean_text)

# -------------------------------
# 2️⃣ Train-test split
# -------------------------------
# Hold out 20% of the rows for validation; random_state pins the shuffle so the
# split is the same on every run.
# NOTE(review): only fully identical rows were removed when the data was
# loaded — confirm that no article text ends up on both sides of the split.
all_texts = df['text_clean'].tolist()
all_labels = df['label'].tolist()
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts, all_labels, test_size=0.2, random_state=42
)

# -------------------------------
# 3️⃣ Load DistilBERT tokenizer and model
# -------------------------------
# Seed PyTorch before the model is built. The classification layers are not
# part of the pretrained checkpoint and are initialised randomly, dropout is
# active during training, and the shuffling DataLoader draws from the global
# generator — without a seed every run starts from different numbers. The value
# matches the random_state used for the train/validation split.
torch.manual_seed(42)

# Tokenizer and model are loaded from the same checkpoint so vocabulary and weights match.
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# -------------------------------
# 4️⃣ Tokenize the text
# -------------------------------
# Same settings for both splits: truncate at 512 tokens and pad each split to
# the length of its longest sequence.
tokenize_kwargs = dict(truncation=True, padding=True, max_length=512)
train_encodings = tokenizer(train_texts, **tokenize_kwargs)
val_encodings = tokenizer(val_texts, **tokenize_kwargs)

# -------------------------------
# 5️⃣ PyTorch Dataset
# -------------------------------
class NewsDataset(Dataset):
    """Map-style dataset that pairs tokenizer output with class labels.

    ``encodings`` is a mapping from field name to a per-sample indexable
    sequence (such as the tokenizer's return value); ``labels`` is an indexable
    sequence holding one entry per sample.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The label sequence defines how many samples there are.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, with the label under ``'labels'``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels so a DataLoader can batch them.
train_dataset = NewsDataset(encodings=train_encodings, labels=train_labels)
val_dataset = NewsDataset(encodings=val_encodings, labels=val_labels)

# -------------------------------
# 6️⃣ Setup device and data loaders
# -------------------------------
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BATCH_SIZE = 8
LEARNING_RATE = 5e-5

# Training batches are reshuffled every epoch; validation keeps dataset order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# AdamW updates every parameter of the model.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# -------------------------------
# 7️⃣ Training loop
# -------------------------------
epochs = 1

# Label encoding of this dataset: 0 = fake, 1 = real. In the df.head() preview
# the Reuters wire story carries label 1 while the other rows carry label 0.
# classification_report pairs target_names with the labels in the order given,
# so the names are listed as [name of class 0, name of class 1]. Passing the
# label ids explicitly also keeps the matrix 2x2 and the report valid when a
# class happens to be absent from the predictions.
class_ids = [0, 1]
class_names = ['Fake', 'Real']

for epoch in range(epochs):
    # --- Training ---
    model.train()
    train_loss = 0
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
        # Move every tensor of the batch to the device the model lives on.
        batch = {k: v.to(device) for k, v in batch.items()}
        # The batch includes 'labels', so the loss is read from the model output.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()  # clear gradients before the next batch
        train_loss += loss.item()
    # Mean of the per-batch losses over the epoch.
    print(f"Epoch {epoch+1} Training Loss: {train_loss/len(train_loader):.4f}")

    # --- Validation ---
    model.eval()
    val_preds = []
    val_labels_list = []

    with torch.no_grad():  # evaluation only: no gradient tracking
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            logits = model(**batch).logits
            # Predicted class = index of the largest logit.
            val_preds.extend(torch.argmax(logits, dim=1).cpu().tolist())
            val_labels_list.extend(batch['labels'].cpu().tolist())

    # Confusion matrix: rows are true classes, columns are predictions (order: class_ids)
    cm = confusion_matrix(val_labels_list, val_preds, labels=class_ids)
    print(f"\nEpoch {epoch+1} Confusion Matrix:\n{cm}")

    # Classification report with the corrected class names
    report = classification_report(
        val_labels_list, val_preds, labels=class_ids, target_names=class_names
    )
    print(f"\nEpoch {epoch+1} Classification Report:\n{report}")

    # Overall accuracy
    acc = accuracy_score(val_labels_list, val_preds)
    print(f"Epoch {epoch+1} Validation Accuracy: {acc:.4f}\n{'-'*60}")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|██████████| 4469/4469 [27:17<00:00,  2.73it/s]


Epoch 1 Training Loss: 0.0148

Epoch 1 Confusion Matrix:
[[4653    1]
 [   2 4282]]

Epoch 1 Classification Report:
              precision    recall  f1-score   support

        Real       1.00      1.00      1.00      4654
        Fake       1.00      1.00      1.00      4284

    accuracy                           1.00      8938
   macro avg       1.00      1.00      1.00      8938
weighted avg       1.00      1.00      1.00      8938

Epoch 1 Validation Accuracy: 0.9997
------------------------------------------------------------


### **RoBERTa**

In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW
import nltk
import string
from nltk.corpus import stopwords
from tqdm import tqdm

# -------------------------------
# 1️⃣ Text preprocessing
# -------------------------------
# This cell reads df['text_clean']. The preprocessing used to be disabled by
# wrapping it in a string literal, so the cell only worked when an earlier
# model cell had already created the column. It is now built here when it is
# missing and reused unchanged when it already exists.
if 'text_clean' not in df.columns:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

    def clean_text(text):
        """Lowercase ``text``, delete ``string.punctuation`` characters and drop English stopwords."""
        text = str(text).lower()
        text = text.translate(str.maketrans('', '', string.punctuation))
        words = [word for word in text.split() if word not in stop_words]
        return ' '.join(words)

    df['text_clean'] = df['text'].apply(clean_text)

# -------------------------------
# 2️⃣ Train-test split
# -------------------------------
# Hold out 20% of the rows for validation; random_state pins the shuffle so the
# split is the same on every run.
# NOTE(review): only fully identical rows were removed when the data was
# loaded — confirm that no article text ends up on both sides of the split.
all_texts = df['text_clean'].tolist()
all_labels = df['label'].tolist()
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts, all_labels, test_size=0.2, random_state=42
)

# -------------------------------
# 3️⃣ Load RoBERTa tokenizer and model
# -------------------------------
# Seed PyTorch before the model is built. The classification layers are not
# part of the pretrained checkpoint and are initialised randomly, dropout is
# active during training, and the shuffling DataLoader draws from the global
# generator — without a seed every run starts from different numbers. The value
# matches the random_state used for the train/validation split.
torch.manual_seed(42)

# Tokenizer and model are loaded from the same checkpoint so vocabulary and weights match.
# NOTE(review): roberta-base is a case-sensitive checkpoint, while text_clean
# has been lowercased — confirm that feeding lowercased text is intended here.
checkpoint = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# -------------------------------
# 4️⃣ Tokenize the text
# -------------------------------
# Same settings for both splits: truncate at 512 tokens and pad each split to
# the length of its longest sequence.
tokenize_kwargs = dict(truncation=True, padding=True, max_length=512)
train_encodings = tokenizer(train_texts, **tokenize_kwargs)
val_encodings = tokenizer(val_texts, **tokenize_kwargs)

# -------------------------------
# 5️⃣ PyTorch Dataset
# -------------------------------
class NewsDataset(Dataset):
    """Map-style dataset that pairs tokenizer output with class labels.

    ``encodings`` is a mapping from field name to a per-sample indexable
    sequence (such as the tokenizer's return value); ``labels`` is an indexable
    sequence holding one entry per sample.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The label sequence defines how many samples there are.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, with the label under ``'labels'``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels so a DataLoader can batch them.
train_dataset = NewsDataset(encodings=train_encodings, labels=train_labels)
val_dataset = NewsDataset(encodings=val_encodings, labels=val_labels)

# -------------------------------
# 6️⃣ Setup device and data loaders
# -------------------------------
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BATCH_SIZE = 8
LEARNING_RATE = 5e-5

# Training batches are reshuffled every epoch; validation keeps dataset order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# AdamW updates every parameter of the model.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# -------------------------------
# 7️⃣ Training loop
# -------------------------------
epochs = 1

# Label encoding of this dataset: 0 = fake, 1 = real. In the df.head() preview
# the Reuters wire story carries label 1 while the other rows carry label 0.
# classification_report pairs target_names with the labels in the order given,
# so the names are listed as [name of class 0, name of class 1]. Passing the
# label ids explicitly also keeps the matrix 2x2 and the report valid when a
# class happens to be absent from the predictions.
class_ids = [0, 1]
class_names = ['Fake', 'Real']

for epoch in range(epochs):
    # --- Training ---
    model.train()
    train_loss = 0
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
        # Move every tensor of the batch to the device the model lives on.
        batch = {k: v.to(device) for k, v in batch.items()}
        # The batch includes 'labels', so the loss is read from the model output.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()  # clear gradients before the next batch
        train_loss += loss.item()
    # Mean of the per-batch losses over the epoch.
    print(f"Epoch {epoch+1} Training Loss: {train_loss/len(train_loader):.4f}")

    # --- Validation ---
    model.eval()
    val_preds = []
    val_labels_list = []

    with torch.no_grad():  # evaluation only: no gradient tracking
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            logits = model(**batch).logits
            # Predicted class = index of the largest logit.
            val_preds.extend(torch.argmax(logits, dim=1).cpu().tolist())
            val_labels_list.extend(batch['labels'].cpu().tolist())

    # Confusion matrix: rows are true classes, columns are predictions (order: class_ids)
    cm = confusion_matrix(val_labels_list, val_preds, labels=class_ids)
    print(f"\nEpoch {epoch+1} Confusion Matrix:\n{cm}")

    # Classification report with the corrected class names
    report = classification_report(
        val_labels_list, val_preds, labels=class_ids, target_names=class_names
    )
    print(f"\nEpoch {epoch+1} Classification Report:\n{report}")

    # Overall accuracy
    acc = accuracy_score(val_labels_list, val_preds)
    print(f"Epoch {epoch+1} Validation Accuracy: {acc:.4f}\n{'-'*60}")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|██████████| 4469/4469 [54:39<00:00,  1.36it/s]


Epoch 1 Training Loss: 0.0708

Epoch 1 Confusion Matrix:
[[4593   61]
 [   9 4275]]

Epoch 1 Classification Report:
              precision    recall  f1-score   support

        Real       1.00      0.99      0.99      4654
        Fake       0.99      1.00      0.99      4284

    accuracy                           0.99      8938
   macro avg       0.99      0.99      0.99      8938
weighted avg       0.99      0.99      0.99      8938

Epoch 1 Validation Accuracy: 0.9922
------------------------------------------------------------
