In [92]:
!pip install readability-lxml
!pip install lxml[html_clean]



In [93]:
!pip install readability




In [94]:
!pip install nrclex




In [95]:
from nrclex import NRCLex


In [96]:
import pandas as pd
import numpy as np
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset, random_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
from textblob import TextBlob
from readability import Document # Changed import from Readability to Document
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [96]:
import pandas as pd
import numpy as np
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from textblob import TextBlob
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
from readability import Document
from torch.optim import AdamW
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm

In [None]:
# Download punkt_tab instead of punkt
nltk.download('punkt_tab')

In [None]:

# Load and Concatenate BuzzFeed Dataset
def load_buzzfeed_data():
    # Replace with actual paths to true and fake files
    true_data = pd.read_csv("/content/BuzzFeed_real_news_content.csv")
    fake_data = pd.read_csv("/content/BuzzFeed_fake_news_content.csv")

    true_data['label'] = 1
    fake_data['label'] = 0

    data = pd.concat([true_data, fake_data], ignore_index=True)
    return data

In [None]:
# Preprocessing Text
def preprocess_text(text):
    text = text.lower()
    text = ''.join([char for char in text if char.isalnum() or char.isspace()])
    return text

In [None]:
def extract_features(data):
    data['word_count'] = data['text'].apply(lambda x: len(word_tokenize(x)))
    data['sentence_count'] = data['text'].apply(lambda x: len(sent_tokenize(x)))
    data['sentiment'] = data['text'].apply(lambda x: TextBlob(x).sentiment.polarity)

    # Readability
    def get_readability_score(text):
        try:
            r = Document(text)
            return r.flesch_kincaid().score
        except:
            return np.nan

    data['readability'] = data['text'].apply(get_readability_score)

    return data

In [None]:

# Dataset Preparation
class BuzzFeedDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, features, max_length=128):
        self.texts = texts
        self.labels = labels
        self.features = features
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        features = self.features[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'features': torch.tensor(features, dtype=torch.float32),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [None]:
# Model with fixed classifier input size
class RoBERTaWithFeatures(torch.nn.Module):
    def __init__(self, base_model, num_features, num_labels=2):
        super(RoBERTaWithFeatures, self).__init__()
        self.roberta = base_model
        # The classifier needs to handle both the RoBERTa output and the additional features
        self.classifier = torch.nn.Linear(self.roberta.config.hidden_size + num_features, num_labels)

    def forward(self, input_ids, attention_mask, features):
        # Get the hidden states from RoBERTa (with output_hidden_states=True)
        outputs = self.roberta(input_ids, attention_mask=attention_mask, output_hidden_states=True)
        hidden_states = outputs.hidden_states  # Hidden states include all layers

        # The first token ([CLS]) is the pooled output for classification
        pooled_output = hidden_states[-1][:, 0, :]  # Take the [CLS] token from the last layer

        # Concatenate RoBERTa output with additional features
        combined_input = torch.cat((pooled_output, features), dim=1)

        # Pass through classifier layer
        logits = self.classifier(combined_input)
        return logits

In [None]:

# Training Function with Mixed Precision and Gradient Accumulation
def train_model(model, train_loader, val_loader, optimizer, epochs, device):
    model.to(device)
    scaler = GradScaler()  # For mixed precision training
    gradient_accumulation_steps = 4  # Adjust this value based on your memory limitations

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        optimizer.zero_grad()  # Zero the gradients at the start of each epoch
        for step, batch in enumerate(tqdm(train_loader)):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            features = batch['features'].to(device)
            labels = batch['label'].to(device)

            with autocast():  # Mixed precision
                outputs = model(input_ids, attention_mask=attention_mask, features=features)
                loss = torch.nn.CrossEntropyLoss()(outputs, labels)

            scaler.scale(loss).backward()

            # Accumulate gradients over multiple steps
            if (step + 1) % gradient_accumulation_steps == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()

            total_loss += loss.item()

        print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}")

        # Clear GPU cache after each epoch
        torch.cuda.empty_cache()

        evaluate_model(model, val_loader, device)

In [None]:
# Evaluation Function
def evaluate_model(model, val_loader, device):
    model.eval()
    preds, true_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            features = batch['features'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, features=features)
            preds.extend(torch.argmax(outputs, dim=1).cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    # Compute evaluation metrics (Accuracy, F1, Precision, Recall)
    acc = accuracy_score(true_labels, preds)
    f1 = f1_score(true_labels, preds)
    precision = precision_score(true_labels, preds)
    recall = recall_score(true_labels, preds)
    print(f"Accuracy: {acc}, F1-Score: {f1}, Precision: {precision}, Recall: {recall}")

In [110]:

# Main Execution
def main():
    data = load_buzzfeed_data()
    data['text'] = data['text'].apply(preprocess_text)
    data = extract_features(data)

    # Specify feature columns explicitly, excluding 'text'
    feature_columns = ['word_count', 'sentence_count', 'sentiment', 'readability']
    features = data[feature_columns].values

    # Fill NaN values with 0 to ensure numeric conversion works
    features = np.nan_to_num(features, nan=0.0).astype(np.float32)
    labels = data['label'].values
    texts = data['text']

    # Train-test split
    train_texts, val_texts, train_features, val_features, train_labels, val_labels = train_test_split(
        texts, features, labels, test_size=0.2, random_state=42
    )

    # Load tokenizer and model
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    base_model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

    # Load the custom model with additional features
    model = RoBERTaWithFeatures(base_model, num_features=len(feature_columns))

    # Prepare dataset and dataloaders
    train_dataset = BuzzFeedDataset(train_texts.tolist(), train_labels.tolist(), tokenizer, train_features)
    val_dataset = BuzzFeedDataset(val_texts.tolist(), val_labels.tolist(), tokenizer, val_features)
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=8)

    # Optimizer
    optimizer = AdamW(model.parameters(), lr=1e-5)

    # Train and evaluate
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_model(model, train_loader, val_loader, optimizer, epochs=30, device=device)

if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()  # For mixed precision training
  with autocast():  # Mixed precision
100%|██████████| 19/19 [00:05<00:00,  3.45it/s]


Epoch 1, Loss: 4.651791396893953
Accuracy: 0.5135135135135135, F1-Score: 0.6785714285714286, Precision: 0.5135135135135135, Recall: 1.0


  with autocast():  # Mixed precision
100%|██████████| 19/19 [00:02<00:00,  7.91it/s]


Epoch 2, Loss: 5.13928764352673
Accuracy: 0.5135135135135135, F1-Score: 0.6785714285714286, Precision: 0.5135135135135135, Recall: 1.0


  with autocast():  # Mixed precision
100%|██████████| 19/19 [00:02<00:00,  6.38it/s]


Epoch 3, Loss: 4.6013162692910745
Accuracy: 0.5675675675675675, F1-Score: 0.7037037037037037, Precision: 0.5428571428571428, Recall: 1.0


  with autocast():  # Mixed precision
100%|██████████| 19/19 [00:02<00:00,  6.67it/s]


Epoch 4, Loss: 4.473723025698411
Accuracy: 0.5675675675675675, F1-Score: 0.7037037037037037, Precision: 0.5428571428571428, Recall: 1.0


  with autocast():  # Mixed precision
100%|██████████| 19/19 [00:04<00:00,  4.38it/s]


Epoch 5, Loss: 3.744517618104031
Accuracy: 0.5675675675675675, F1-Score: 0.7037037037037037, Precision: 0.5428571428571428, Recall: 1.0


  with autocast():  # Mixed precision
100%|██████████| 19/19 [00:03<00:00,  5.24it/s]


Epoch 6, Loss: 3.101152954917205
Accuracy: 0.5135135135135135, F1-Score: 0.5909090909090909, Precision: 0.52, Recall: 0.6842105263157895


  with autocast():  # Mixed precision
100%|██████████| 19/19 [00:02<00:00,  6.64it/s]


Epoch 7, Loss: 2.675400658657676
Accuracy: 0.5675675675675675, F1-Score: 0.5789473684210527, Precision: 0.5789473684210527, Recall: 0.5789473684210527


  with autocast():  # Mixed precision
100%|██████████| 19/19 [00:02<00:00,  9.28it/s]


Epoch 8, Loss: 2.187959671020508
Accuracy: 0.5135135135135135, F1-Score: 0.47058823529411764, Precision: 0.5333333333333333, Recall: 0.42105263157894735


  with autocast():  # Mixed precision
100%|██████████| 19/19 [00:02<00:00,  8.26it/s]


Epoch 9, Loss: 2.099791363665932
Accuracy: 0.4864864864864865, F1-Score: 0.42424242424242425, Precision: 0.5, Recall: 0.3684210526315789


  with autocast():  # Mixed precision
100%|██████████| 19/19 [00:02<00:00,  7.96it/s]


Epoch 10, Loss: 2.1202600378739205
Accuracy: 0.5405405405405406, F1-Score: 0.5405405405405406, Precision: 0.5555555555555556, Recall: 0.5263157894736842


  with autocast():  # Mixed precision
100%|██████████| 19/19 [00:01<00:00, 10.08it/s]


Epoch 11, Loss: 2.0980003821222404
Accuracy: 0.5405405405405406, F1-Score: 0.5142857142857142, Precision: 0.5625, Recall: 0.47368421052631576


  with autocast():  # Mixed precision
100%|██████████| 19/19 [00:01<00:00, 10.05it/s]


Epoch 12, Loss: 1.891969509814915
Accuracy: 0.5675675675675675, F1-Score: 0.5555555555555556, Precision: 0.5882352941176471, Recall: 0.5263157894736842


  with autocast():  # Mixed precision
100%|██████████| 19/19 [00:01<00:00,  9.90it/s]


Epoch 13, Loss: 1.7649323814793636
Accuracy: 0.5945945945945946, F1-Score: 0.6153846153846154, Precision: 0.6, Recall: 0.631578947368421


  with autocast():  # Mixed precision
100%|██████████| 19/19 [00:01<00:00,  9.84it/s]


Epoch 14, Loss: 1.4959173751504797
Accuracy: 0.6486486486486487, F1-Score: 0.6666666666666666, Precision: 0.65, Recall: 0.6842105263157895


  with autocast():  # Mixed precision
100%|██████████| 19/19 [00:02<00:00,  7.25it/s]


Epoch 15, Loss: 1.1662890628764504
Accuracy: 0.6756756756756757, F1-Score: 0.7391304347826086, Precision: 0.6296296296296297, Recall: 0.8947368421052632


  with autocast():  # Mixed precision
100%|██████████| 19/19 [00:01<00:00, 10.19it/s]


Epoch 16, Loss: 1.1878675975297626
Accuracy: 0.6756756756756757, F1-Score: 0.6666666666666666, Precision: 0.7058823529411765, Recall: 0.631578947368421


  with autocast():  # Mixed precision
100%|██████████| 19/19 [00:01<00:00, 10.15it/s]


Epoch 17, Loss: 0.9940542194404101
Accuracy: 0.7027027027027027, F1-Score: 0.7555555555555555, Precision: 0.6538461538461539, Recall: 0.8947368421052632


  with autocast():  # Mixed precision
100%|██████████| 19/19 [00:01<00:00,  9.80it/s]


Epoch 18, Loss: 1.0007388152574237
Accuracy: 0.7297297297297297, F1-Score: 0.7619047619047619, Precision: 0.6956521739130435, Recall: 0.8421052631578947


  with autocast():  # Mixed precision
100%|██████████| 19/19 [00:01<00:00, 10.03it/s]


Epoch 19, Loss: 0.730717786048588
Accuracy: 0.7297297297297297, F1-Score: 0.7619047619047619, Precision: 0.6956521739130435, Recall: 0.8421052631578947


  with autocast():  # Mixed precision
100%|██████████| 19/19 [00:02<00:00,  7.55it/s]


Epoch 20, Loss: 0.6952916666081077
Accuracy: 0.7297297297297297, F1-Score: 0.7727272727272727, Precision: 0.68, Recall: 0.8947368421052632


  with autocast():  # Mixed precision
100%|██████████| 19/19 [00:02<00:00,  9.09it/s]


Epoch 21, Loss: 0.7302259810661015
Accuracy: 0.7567567567567568, F1-Score: 0.7804878048780488, Precision: 0.7272727272727273, Recall: 0.8421052631578947


  with autocast():  # Mixed precision
100%|██████████| 19/19 [00:01<00:00, 10.11it/s]


Epoch 22, Loss: 0.6086948662996292
Accuracy: 0.7567567567567568, F1-Score: 0.8, Precision: 0.6923076923076923, Recall: 0.9473684210526315


  with autocast():  # Mixed precision
100%|██████████| 19/19 [00:01<00:00, 10.04it/s]


Epoch 23, Loss: 0.57399398951154
Accuracy: 0.7567567567567568, F1-Score: 0.8, Precision: 0.6923076923076923, Recall: 0.9473684210526315


  with autocast():  # Mixed precision
100%|██████████| 19/19 [00:01<00:00, 10.12it/s]


Epoch 24, Loss: 0.7994742895427503
Accuracy: 0.7567567567567568, F1-Score: 0.7906976744186046, Precision: 0.7083333333333334, Recall: 0.8947368421052632


  with autocast():  # Mixed precision
100%|██████████| 19/19 [00:02<00:00,  8.93it/s]


Epoch 25, Loss: 0.5142795294523239
Accuracy: 0.7567567567567568, F1-Score: 0.8, Precision: 0.6923076923076923, Recall: 0.9473684210526315


  with autocast():  # Mixed precision
100%|██████████| 19/19 [00:02<00:00,  6.61it/s]


Epoch 26, Loss: 0.5067117276944613
Accuracy: 0.7567567567567568, F1-Score: 0.8, Precision: 0.6923076923076923, Recall: 0.9473684210526315


  with autocast():  # Mixed precision
100%|██████████| 19/19 [00:02<00:00,  6.59it/s]


Epoch 27, Loss: 0.6594159971726569
Accuracy: 0.7567567567567568, F1-Score: 0.7804878048780488, Precision: 0.7272727272727273, Recall: 0.8421052631578947


  with autocast():  # Mixed precision
100%|██████████| 19/19 [00:02<00:00,  6.38it/s]


Epoch 28, Loss: 0.4615225517436078
Accuracy: 0.8108108108108109, F1-Score: 0.8372093023255814, Precision: 0.75, Recall: 0.9473684210526315


  with autocast():  # Mixed precision
100%|██████████| 19/19 [00:03<00:00,  5.16it/s]


Epoch 29, Loss: 0.45494740730837774
Accuracy: 0.7837837837837838, F1-Score: 0.8181818181818182, Precision: 0.72, Recall: 0.9473684210526315


  with autocast():  # Mixed precision
100%|██████████| 19/19 [00:03<00:00,  5.35it/s]


Epoch 30, Loss: 0.44904352567697825
Accuracy: 0.7567567567567568, F1-Score: 0.8, Precision: 0.6923076923076923, Recall: 0.9473684210526315
