In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_scheduler

In [None]:
class NewsDataset(Dataset):
    """Map-style dataset that tokenizes one (text, label) pair per lookup.

    Args:
        texts: Indexable collection of raw text samples.
        labels: Indexable collection of integer class labels, aligned with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., truncation=True,
            padding='max_length', return_tensors='pt')`` whose result exposes
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of samples, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors plus the label.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (tokenizer
            outputs with dimension 0 squeezed away) and ``'label'`` (a
            ``torch.long`` scalar tensor).
        """
        sample_text, sample_label = self.texts[idx], self.labels[idx]
        tokenized = self.tokenizer(
            sample_text,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
        )
        # The tokenizer output carries a leading dimension of size 1 when
        # called on a single text; drop it so the DataLoader can stack samples.
        item = {
            field: tokenized[field].squeeze(0)
            for field in ('input_ids', 'attention_mask')
        }
        item['label'] = torch.tensor(sample_label, dtype=torch.long)
        return item


In [None]:
# Load dataset: one CSV per class, read relative to the working directory.
fake_data, true_data = (pd.read_csv(csv_path) for csv_path in ('Fake.csv', 'True.csv'))

In [None]:
# Tag each source frame with its class id (0 = fake, 1 = true), then stack
# them into a single frame with a fresh 0..n-1 index.
for source_frame, class_id in ((fake_data, 0), (true_data, 1)):
    source_frame['label'] = class_id
data = pd.concat([fake_data, true_data], ignore_index=True)


In [None]:
# Hold out 20% of the rows as a test set; stratifying on the label keeps the
# class ratio the same in both partitions, and the fixed seed makes the split
# repeatable.
all_texts, all_labels = data['text'], data['label']
X_train, X_test, y_train, y_test = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=42,
    stratify=all_labels,
)

In [None]:
# Tokenizer
# Every sample is padded/truncated to this many tokens by NewsDataset.
max_len = 256  # Reduced from 512 to speed up training
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [None]:
# Wrap each split in a NewsDataset; the pandas Series are converted to plain
# lists so that positional indexing (0..n-1) works inside __getitem__.
train_dataset, test_dataset = (
    NewsDataset(split_texts.tolist(), split_labels.tolist(), tokenizer, max_len)
    for split_texts, split_labels in ((X_train, y_train), (X_test, y_test))
)

In [None]:
# DataLoaders with optimizations
# Settings shared by both loaders: batch size, worker processes, pinned memory.
loader_options = dict(batch_size=16, num_workers=2, pin_memory=True)
# Only the training loader shuffles; the test loader keeps dataset order.
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

In [None]:
# Custom BERT Model with Increased Dropout
class CustomBERTModel(torch.nn.Module):
    """Two-label ``bert-base-uncased`` sequence classifier with a custom dropout.

    The pretrained ``BertForSequenceClassification`` is stored as ``self.bert``
    and its ``dropout`` attribute is replaced with ``Dropout(p=dropout_rate)``.
    NOTE(review): presumably this is the dropout applied just before the
    classification head; confirm against the installed transformers version.

    Args:
        dropout_rate: Probability used for the replacement dropout layer.
    """

    def __init__(self, dropout_rate=0.3):  # Increased dropout
        super().__init__()
        classifier = BertForSequenceClassification.from_pretrained(
            "bert-base-uncased", num_labels=2
        )
        classifier.dropout = torch.nn.Dropout(p=dropout_rate)  # Adjusted dropout
        self.bert = classifier

    def forward(self, input_ids, attention_mask, labels=None):
        """Delegate to the wrapped classifier and return its output unchanged.

        ``labels`` is forwarded as-is; the training loop reads ``.loss`` from
        the result when labels are given and ``.logits`` otherwise.
        """
        return self.bert(
            input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )



In [None]:
# Model
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
model = CustomBERTModel(dropout_rate=0.3)
model = model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Freeze every parameter of the inner `bert` submodule of the wrapped
# classifier so that it receives no gradient updates; parameters outside that
# submodule are left trainable.
encoder = model.bert.bert
for param in encoder.parameters():
    param.requires_grad = False

In [None]:
# Optimizer & Scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

# The linear schedule warms up for 100 steps and then decays over a horizon
# of three passes over `train_loader`.
# NOTE(review): any training loop that calls `scheduler.step()` more often
# than this horizon will spend the remaining steps at a learning rate of 0;
# verify the horizon against the loop that actually consumes this scheduler.
total_training_steps = len(train_loader) * 3
scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=100,
    num_training_steps=total_training_steps,
)

In [None]:
# Gradient scaler used by the training loop: the loss is scaled before
# backward(), and the optimizer step and scale update go through this object.
scaler = torch.amp.GradScaler()  # Mixed Precision Training

In [None]:
# K-Fold Cross Validation
#
# Every fold starts from the same initial weights with a fresh optimizer and a
# learning-rate schedule sized to that fold. Carrying one model/optimizer/
# scheduler across folds would (a) let later folds validate on samples the
# model has already been trained on, and (b) exhaust a schedule sized for
# `len(train_loader) * 3` steps long before the last fold, leaving the
# learning rate at 0 for the rest of the run.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

epochs = 3  # Set maximum number of epochs (per fold)
patience = 3  # Stop after 3 epochs of no improvement
# NOTE(review): with patience >= epochs the early-stopping branch below can
# never trigger; lower `patience` or raise `epochs` to make it effective.

# CPU snapshot of the weights the model holds when this cell starts; it is
# restored at the beginning of every fold. If this cell is re-run after a
# previous run, re-create the model first, otherwise the snapshot captures
# already-trained weights.
initial_state = {
    name: tensor.detach().cpu().clone()
    for name, tensor in model.state_dict().items()
}

fold_accuracies = []  # best validation accuracy reached in each fold

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train, y_train)):
    print(f'\n==== Fold {fold + 1} ====')
    X_fold_train, X_fold_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_fold_train, y_fold_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    fold_train_dataset = NewsDataset(X_fold_train.tolist(), y_fold_train.tolist(), tokenizer, max_len)
    fold_val_dataset = NewsDataset(X_fold_val.tolist(), y_fold_val.tolist(), tokenizer, max_len)

    fold_train_loader = DataLoader(fold_train_dataset, batch_size=16, shuffle=True, num_workers=2, pin_memory=True)
    fold_val_loader = DataLoader(fold_val_dataset, batch_size=16, shuffle=False, num_workers=2, pin_memory=True)

    # Reset the model and rebuild the optimizer/scheduler so no weights,
    # optimizer statistics or LR-schedule position leak in from earlier folds.
    # Hyperparameters mirror the ones in the optimizer/scheduler cell.
    model.load_state_dict(initial_state)
    optimizer = torch.optim.AdamW(
        [p for p in model.parameters() if p.requires_grad],
        lr=2e-5,
        weight_decay=0.01,
    )
    scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=100,
        num_training_steps=len(fold_train_loader) * epochs,
    )

    # Training with Early Stopping
    best_val_accuracy = 0.0
    epochs_without_improvement = 0

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in fold_train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            with torch.amp.autocast(device_type=device.type):
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss

            # Scaled backward pass; the scaler performs the optimizer step and
            # then updates its scale factor.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
            total_loss += loss.item()

        print(f"Epoch {epoch+1} Training Loss: {total_loss / len(fold_train_loader):.4f}")

        # Validation Evaluation
        model.eval()
        val_predictions, val_true_labels = [], []
        with torch.no_grad():
            for batch in fold_val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask)
                preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
                val_predictions.extend(preds)
                val_true_labels.extend(labels.cpu().numpy())

        val_accuracy = accuracy_score(val_true_labels, val_predictions)
        print(f'Fold {fold + 1} Validation Accuracy: {val_accuracy:.4f}')

        # Early Stopping
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            epochs_without_improvement = 0  # Reset
        else:
            epochs_without_improvement += 1

        if epochs_without_improvement >= patience:
            print(f'Early stopping at epoch {epoch+1}')
            break

    fold_accuracies.append(best_val_accuracy)

# Cross-validation summary. After the loop `model` holds the weights from the
# end of the last fold, i.e. it was trained on that fold's training part only.
mean_cv_accuracy = sum(fold_accuracies) / len(fold_accuracies)
print(f'\nMean best validation accuracy over {len(fold_accuracies)} folds: {mean_cv_accuracy:.4f}')


==== Fold 1 ====
Epoch 1 Training Loss: 0.4898
Fold 1 Validation Accuracy: 0.8604
Epoch 2 Training Loss: 0.4484
Fold 1 Validation Accuracy: 0.8653
Epoch 3 Training Loss: 0.4264
Fold 1 Validation Accuracy: 0.8718

==== Fold 2 ====
Epoch 1 Training Loss: 0.4232
Fold 2 Validation Accuracy: 0.8767
Epoch 2 Training Loss: 0.4205
Fold 2 Validation Accuracy: 0.8767
Epoch 3 Training Loss: 0.4191
Fold 2 Validation Accuracy: 0.8767

==== Fold 3 ====
Epoch 1 Training Loss: 0.4202
Fold 3 Validation Accuracy: 0.8800
Epoch 2 Training Loss: 0.4206
Fold 3 Validation Accuracy: 0.8800
Epoch 3 Training Loss: 0.4205
Fold 3 Validation Accuracy: 0.8800

==== Fold 4 ====
Epoch 1 Training Loss: 0.4198
Fold 4 Validation Accuracy: 0.8801
Epoch 2 Training Loss: 0.4192
Fold 4 Validation Accuracy: 0.8801
Epoch 3 Training Loss: 0.4210
Fold 4 Validation Accuracy: 0.8801

==== Fold 5 ====
Epoch 1 Training Loss: 0.4217
Fold 5 Validation Accuracy: 0.8814
Epoch 2 Training Loss: 0.4200
Fold 5 Validation Accuracy: 0.8814


In [None]:
# Final Test Evaluation
# Run the held-out test set through the model in eval mode with gradients
# disabled, collecting the arg-max class per sample next to its true label.
model.eval()
predictions, true_labels = [], []
with torch.no_grad():
    for test_batch in test_loader:
        batch_output = model(
            test_batch['input_ids'].to(device),
            attention_mask=test_batch['attention_mask'].to(device),
        )
        predicted_classes = torch.argmax(batch_output.logits, dim=1)
        predictions.extend(predicted_classes.cpu().numpy())
        true_labels.extend(test_batch['label'].cpu().numpy())

test_accuracy = accuracy_score(true_labels, predictions)
report = classification_report(true_labels, predictions)

print(f'\nTest Accuracy: {test_accuracy:.4f}')
print(f'Classification Report:\n{report}')



Test Accuracy: 0.8790
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.83      0.88      4696
           1       0.83      0.94      0.88      4284

    accuracy                           0.88      8980
   macro avg       0.88      0.88      0.88      8980
weighted avg       0.89      0.88      0.88      8980



In [None]:
# Persist the trained model in two forms, both written to the working directory.
full_model_path = 'textual_news_model.pth'
state_dict_path = 'textual_news_model_state_dict.pth'

# Save the entire model (the whole module object is serialized, so loading it
# back requires the CustomBERTModel class to be importable).
torch.save(model, full_model_path)

# Save the model's state_dict (weights only)
torch.save(model.state_dict(), state_dict_path)