In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [10]:
# Location of the preprocessed WELFake CSV on the Colab filesystem
DATA_PATH = "/content/WELFake_Dataset_Preprocessed.csv"
df = pd.read_csv(DATA_PATH)

In [11]:
# Replace missing article bodies with '' and force every entry to str so the
# tokenizer only ever receives strings
text_filled = df['text'].fillna('')
df['text'] = text_filled.astype(str)

In [12]:
# Train-test split: hold out 20% of the rows, with a fixed seed for repeatability
all_texts = df['text'].tolist()
all_labels = df['label'].tolist()
train_texts, test_texts, train_labels, test_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=42,
)


In [13]:
# Tokenizer matching the pretrained checkpoint that is fine-tuned below
PRETRAINED_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [14]:
# Tokenization with max_length=128: both splits share the same settings
# (truncate long articles, pad the batch to a common length)
tokenize_kwargs = dict(truncation=True, padding=True, max_length=128)
train_encodings = tokenizer(train_texts, **tokenize_kwargs)
test_encodings = tokenizer(test_texts, **tokenize_kwargs)

In [15]:
# Dataset class
class NewsDataset(Dataset):
    """Torch dataset pairing tokenizer encodings with their labels.

    `encodings` is a mapping from field name to a per-sample sequence and
    `labels` is an indexable sequence with one entry per sample.
    """

    def __init__(self, encodings, labels):
        """Store the encodings mapping and the label sequence as given."""
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors, including a 'labels' entry."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [16]:
# Prepare datasets: wrap each split's encodings and labels in a NewsDataset
train_dataset, test_dataset = (
    NewsDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (test_encodings, test_labels),
    )
)

In [17]:
# Model: pretrained BERT encoder with a two-class classification head
PRETRAINED_CHECKPOINT = 'bert-base-uncased'
model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_CHECKPOINT,
    num_labels=2,
)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# Training arguments optimized for Colab (fast training)
# Mixed precision is only requested when a CUDA device is present: with a
# hard-coded fp16=True, TrainingArguments may refuse to initialise on a
# CPU-only runtime, which would stop the notebook before training starts.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; confirm the keyword against the installed version.
training_args = TrainingArguments(
    output_dir='./results',          # Output directory for checkpoints
    evaluation_strategy="epoch",    # Evaluate each epoch
    learning_rate=3e-5,             # Slightly higher learning rate for faster convergence
    per_device_train_batch_size=8,  # Small batch size to avoid GPU memory issues
    per_device_eval_batch_size=8,   # Match batch size for evaluation
    num_train_epochs=1,             # Single epoch for quick training
    weight_decay=0.01,
    logging_dir='./logs',           # Directory for logs
    logging_steps=10,               # Log every 10 steps
    save_steps=500,                 # Save every 500 steps
    save_total_limit=1,             # Keep at most one checkpoint on disk
    fp16=torch.cuda.is_available(),  # Mixed precision only when a GPU is available
    report_to="none"                # Disable W&B logging
)




In [23]:
# Metric function
def compute_metrics(pred):
    """Turn a prediction object into a dict of classification scores.

    Reads `pred.label_ids` as the reference labels and takes the argmax over
    the last axis of `pred.predictions` as the predicted class. Returns a dict
    with the keys 'accuracy', 'precision', 'recall' and 'f1'; the last three
    come from precision_recall_fscore_support with average='binary'.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    # The fourth returned value (support) is not reported
    prf_scores = precision_recall_fscore_support(y_true, y_hat, average='binary')[:3]
    scores = {'accuracy': accuracy_score(y_true, y_hat)}
    scores.update(zip(('precision', 'recall', 'f1'), prf_scores))
    return scores

In [24]:
# Trainer: collect the pieces first, then build the Trainer from them
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

  trainer = Trainer(


In [25]:
# Train
# Runs fine-tuning with the configured `trainer`. As the last expression in the
# cell, the returned TrainOutput (step count, training loss, runtime metrics)
# is displayed as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1327,0.106261,0.974169,0.97591,0.966457,0.971161


TrainOutput(global_step=6194, training_loss=0.1595013274366597, metrics={'train_runtime': 755.9483, 'train_samples_per_second': 65.545, 'train_steps_per_second': 8.194, 'total_flos': 3259222420508160.0, 'train_loss': 0.1595013274366597, 'epoch': 1.0})

In [26]:
# Evaluate
# Scores the model on the trainer's eval_dataset and prints the metric dict
# (loss plus the compute_metrics values, reported with an 'eval_' prefix).
results = trainer.evaluate()
print("Evaluation Results:", results)

Evaluation Results: {'eval_loss': 0.10626117885112762, 'eval_accuracy': 0.9741685502098806, 'eval_precision': 0.9759101612026807, 'eval_recall': 0.966457399103139, 'eval_f1': 0.9711607786589762, 'eval_runtime': 35.8668, 'eval_samples_per_second': 345.389, 'eval_steps_per_second': 43.188, 'epoch': 1.0}


In [27]:
# Save the model weights and the tokenizer files into one directory so both
# can later be restored from the same path
SAVE_DIR = './saved_model'
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.txt',
 './saved_model/added_tokens.json')

Loading the Model and Tokenizer for Future Use
When you want to load the model and tokenizer again, you can use the following code:

In [28]:
from transformers import BertForSequenceClassification, BertTokenizer

# Restore the fine-tuned model and its tokenizer from the saved directory
CHECKPOINT_DIR = './saved_model'
model = BertForSequenceClassification.from_pretrained(CHECKPOINT_DIR)
tokenizer = BertTokenizer.from_pretrained(CHECKPOINT_DIR)


In [29]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from transformers import BertForSequenceClassification, BertTokenizer
import torch

# Fetch the NLTK resources needed for stop-word removal and lemmatization
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Preprocessing function, intended to mirror the cleaning applied to the
# training dataframe.
# NOTE(review): that original preprocessing is not visible in this notebook;
# confirm both stay in sync, otherwise inference inputs will not match training.

# Stop-word set and lemmatizer are created on first use and then reused, so
# repeated calls do not rebuild them every time.
_CLEAN_TEXT_RESOURCES = {}


def _get_clean_text_resources():
    """Return the (stop_words, lemmatizer) pair, building it on the first call."""
    if not _CLEAN_TEXT_RESOURCES:
        _CLEAN_TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _CLEAN_TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _CLEAN_TEXT_RESOURCES['stop_words'], _CLEAN_TEXT_RESOURCES['lemmatizer']


def clean_text(text):
    """Normalize raw news text before it is tokenized.

    Steps: lowercase, strip HTML-like tags, drop every character that is not
    an ASCII letter or whitespace, collapse whitespace, remove English stop
    words, and lemmatize the remaining words.

    Args:
        text: The raw text. None and float NaN are treated as missing and
            yield ''; any other non-string value is converted with str().

    Returns:
        The cleaned text as a single space-separated string (possibly '').
    """
    # Missing values become '' instead of raising on .lower()
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # 1. Convert text to lowercase
    text = str(text).lower()

    # 2. Remove HTML tags (if any)
    text = re.sub(r'<.*?>', '', text)

    # 3. Remove special characters and numbers; keep only letters and whitespace
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # 4. Collapse runs of whitespace and trim the ends
    text = re.sub(r'\s+', ' ', text).strip()

    # 5 + 6. Drop stop words, then lemmatize the surviving words. Doing both in
    # one pass gives the same result as filtering first and lemmatizing after,
    # because the stop-word check is made on the word before it is lemmatized.
    stop_words, lemmatizer = _get_clean_text_resources()
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )

# Load the fine-tuned tokenizer and model from disk, then switch the model to
# evaluation mode for inference
saved_dir = './saved_model'
tokenizer = BertTokenizer.from_pretrained(saved_dir)
model = BertForSequenceClassification.from_pretrained(saved_dir)
model.eval()

# Function to predict
def predict(text):
    """Classify one piece of news text and return a human-readable label.

    The text is passed through clean_text, tokenized (truncated to 128 tokens,
    returned as PyTorch tensors) and fed to the model without gradient
    tracking. The index of the largest logit decides the label: 0 yields
    "Fake News", anything else yields "Real News".

    NOTE(review): the 0 -> fake / 1 -> real mapping is assumed here; confirm it
    against the label encoding of the training CSV.
    """
    encoded = tokenizer(
        clean_text(text),
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )

    # Inference only, so skip autograd bookkeeping
    with torch.no_grad():
        scores = model(**encoded).logits

    label_id = torch.argmax(scores, dim=1).item()
    return "Fake News" if label_id == 0 else "Real News"

# Take user input
# Prompts on stdin and waits for one line of text from the user.
user_input = input("Enter the news text: ")

# Predict
# predict() returns the label string ("Fake News" or "Real News"), which is
# printed after the "Prediction: " prefix.
result = predict(user_input)
print(f"Prediction: {result}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Prediction: Real News
