In [1]:
import pandas as pd
import torch
import joblib
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, classification_report
import torch.nn.functional as F

# ✅ Load Dataset
df = pd.read_csv("/content/WELFake_Dataset.csv")

# Drop missing values
df = df.dropna(subset=['text'])
df['title'] = df['title'].fillna("")

# Combine title + text
df['combined_text'] = df['title'] + " " + df['text']

# Convert labels to binary (FAKE = 1, REAL = 0)
df['label'] = df['label'].astype(int)

print(f"✅ Data Loaded! Total samples: {len(df)}")

# ✅ Tokenization
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
tokens = tokenizer(df['combined_text'].tolist(), padding=True, truncation=True, max_length=512, return_tensors="pt")
labels = torch.tensor(df['label'].values)

# ✅ Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(tokens['input_ids'], labels, test_size=0.2, random_state=42)

# ✅ Create Data Loaders
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)  # Faster training with batch size 16
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

print("✅ Train/Test Split Done!")

# ✅ Load DistilBERT Model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
model.to(device)

# ✅ Define Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

print("✅ Model Loaded and Moved to", device)

# ✅ Train Model
model.train()
epochs = 3
for epoch in range(epochs):
    total_loss, total_correct = 0, 0

    for batch in train_loader:
        input_ids, labels = [b.to(device) for b in batch]
        optimizer.zero_grad()

        outputs = model(input_ids, labels=labels)
        loss, logits = outputs.loss, outputs.logits

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_correct += (logits.argmax(dim=1) == labels).sum().item()

    avg_loss = total_loss / len(train_loader)
    avg_acc = total_correct / len(train_loader.dataset)

    print(f"Epoch {epoch+1}: Loss = {avg_loss:.4f}, Accuracy = {avg_acc:.4f}")

# ✅ Evaluate Model
model.eval()
y_true, y_pred = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids, labels = [b.to(device) for b in batch]

        outputs = model(input_ids)
        logits = outputs.logits
        predictions = logits.argmax(dim=1).cpu().numpy()

        y_true.extend(labels.cpu().numpy())
        y_pred.extend(predictions)

acc = accuracy_score(y_true, y_pred)
report = classification_report(y_true, y_pred, target_names=['REAL', 'FAKE'])

print(f"✅ Model Accuracy: {acc:.4f}")
print("🔍 Classification Report:\n", report)

# ✅ Save Trained Model
torch.save(model.state_dict(), "fake_news_distilbert_model.pth")
joblib.dump(tokenizer, "distilbert_tokenizer.pkl")

print("✅ Model & Tokenizer Saved!")


✅ Data Loaded! Total samples: 72095


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


✅ Train/Test Split Done!


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


✅ Model Loaded and Moved to cuda
Epoch 1: Loss = 0.0392, Accuracy = 0.9852
Epoch 2: Loss = 0.0124, Accuracy = 0.9959
Epoch 3: Loss = 0.0061, Accuracy = 0.9980
✅ Model Accuracy: 0.9949
🔍 Classification Report:
               precision    recall  f1-score   support

        REAL       0.99      1.00      0.99      7010
        FAKE       1.00      0.99      0.99      7409

    accuracy                           0.99     14419
   macro avg       0.99      0.99      0.99     14419
weighted avg       0.99      0.99      0.99     14419

✅ Model & Tokenizer Saved!
