In [111]:
!pip install readability-lxml
!pip install lxml[html_clean]



In [112]:
!pip install readability




In [113]:
!pip install nrclex




In [114]:
from nrclex import NRCLex


In [115]:
import pandas as pd
import numpy as np
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset, random_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
from textblob import TextBlob
from readability import Document # Changed import from Readability to Document
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [116]:
import pandas as pd
import numpy as np
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from textblob import TextBlob
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
from readability import Document
from torch.optim import AdamW
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm

In [117]:
# Fetch the 'punkt_tab' tokenizer data (rather than the older 'punkt' package).
# Presumably this is the resource word_tokenize / sent_tokenize load in the
# installed NLTK version -- confirm against the NLTK release in use.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [118]:

# Load and Concatenate BuzzFeed Dataset
def load_buzzfeed_data(true_path="/content/True.csv", fake_path="/content/Fake.csv"):
    """Load the real and fake news CSV files and return them as one labelled frame.

    Args:
        true_path: Path (or any source accepted by ``pd.read_csv``) of the CSV
            holding genuine articles. Defaults to the original Colab location.
        fake_path: Path of the CSV holding fabricated articles. Defaults to the
            original Colab location.

    Returns:
        A DataFrame with the rows of the "true" file followed by the rows of the
        "fake" file, a fresh 0..n-1 index, and an added ``label`` column
        (1 = true, 0 = fake).
    """
    # assign() returns new frames, so the freshly read inputs are not mutated.
    true_data = pd.read_csv(true_path).assign(label=1)
    fake_data = pd.read_csv(fake_path).assign(label=0)

    return pd.concat([true_data, fake_data], ignore_index=True)

In [119]:
# Preprocessing Text
def preprocess_text(text):
    """Lowercase ``text`` and drop every character that is neither alphanumeric nor whitespace.

    Args:
        text: The string to normalise.

    Returns:
        The lowercased string with punctuation and other symbols removed;
        whitespace characters are kept as they are.
    """
    lowered = text.lower()
    kept_chars = filter(lambda ch: ch.isalnum() or ch.isspace(), lowered)
    return ''.join(kept_chars)

In [120]:
def _count_syllables(word):
    """Estimate the number of syllables in ``word`` by counting runs of vowels."""
    lowered = word.lower()
    syllables = 0
    previous_was_vowel = False
    for ch in lowered:
        is_vowel = ch in "aeiouy"
        if is_vowel and not previous_was_vowel:
            syllables += 1
        previous_was_vowel = is_vowel
    # A trailing silent "e" ("make", "home") usually does not add a syllable,
    # whereas a trailing "le" ("table") usually does.
    if syllables > 1 and lowered.endswith("e") and not lowered.endswith("le"):
        syllables -= 1
    return max(syllables, 1)


def _flesch_kincaid_grade(text):
    """Return the Flesch-Kincaid grade level of ``text`` (NaN when it has no words)."""
    sentences = sent_tokenize(text)
    # Keep only tokens containing a letter so punctuation tokens are not counted as words.
    words = [tok for tok in word_tokenize(text) if any(ch.isalpha() for ch in tok)]
    if not sentences or not words:
        return np.nan
    syllable_total = sum(_count_syllables(word) for word in words)
    return (
        0.39 * (len(words) / len(sentences))
        + 11.8 * (syllable_total / len(words))
        - 15.59
    )


def extract_features(data):
    """Add hand-crafted text statistics to ``data`` and return it.

    The frame is modified in place (and also returned) with four new columns,
    all derived from the ``text`` column:

    * ``word_count``     - number of tokens produced by ``word_tokenize``
    * ``sentence_count`` - number of sentences produced by ``sent_tokenize``
    * ``sentiment``      - TextBlob polarity score
    * ``readability``    - Flesch-Kincaid grade level; NaN for texts without words

    The readability score is computed directly from the tokenised text. The
    previous implementation called ``Document(text).flesch_kincaid()``, but the
    ``Document`` class imported from ``readability`` (readability-lxml) is an
    HTML article extractor and has no such method. The resulting AttributeError
    was swallowed by a bare ``except``, so every row silently became NaN.

    Args:
        data: DataFrame with a string ``text`` column.

    Returns:
        The same DataFrame object with the feature columns added.
    """
    data['word_count'] = data['text'].apply(lambda x: len(word_tokenize(x)))
    data['sentence_count'] = data['text'].apply(lambda x: len(sent_tokenize(x)))
    data['sentiment'] = data['text'].apply(lambda x: TextBlob(x).sentiment.polarity)
    data['readability'] = data['text'].apply(_flesch_kincaid_grade)

    return data

In [121]:

# Dataset Preparation
class BuzzFeedDataset(Dataset):
    """Map-style dataset pairing tokenised article text with numeric features and a label.

    Args:
        texts: Indexable collection of article texts.
        labels: Indexable collection of integer class labels, aligned with ``texts``.
        tokenizer: Tokenizer exposing ``encode_plus``; it is asked for fixed-length,
            truncated PyTorch tensors.
        features: Indexable collection of per-article numeric feature vectors,
            aligned with ``texts``.
        max_length: Token length every encoded text is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, features, max_length=128):
        self.texts, self.labels, self.features = texts, labels, features
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of samples, taken from the length of ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``idx``-th sample as a dict of tensors."""
        raw_text = str(self.texts[idx])
        target = self.labels[idx]
        extra = self.features[idx]

        encoded = self.tokenizer.encode_plus(
            raw_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # The tokenizer returns tensors with a leading batch axis of size 1; drop it
        # so the DataLoader can stack samples into its own batch dimension.
        sample = {key: encoded[key].squeeze(0) for key in ('input_ids', 'attention_mask')}
        sample['features'] = torch.tensor(extra, dtype=torch.float32)
        sample['label'] = torch.tensor(target, dtype=torch.long)
        return sample

In [122]:
# Model with fixed classifier input size
class RoBERTaWithFeatures(torch.nn.Module):
    """Classifier that joins a transformer's first-token embedding with extra numeric features.

    Args:
        base_model: Transformer module exposing ``config.hidden_size`` and returning
            an object with ``hidden_states`` when called with
            ``output_hidden_states=True``.
        num_features: Width of the additional feature vector supplied per sample.
        num_labels: Number of output classes.
    """

    def __init__(self, base_model, num_features, num_labels=2):
        super().__init__()
        self.roberta = base_model
        # The linear head sees the encoder embedding and the hand-crafted features side by side.
        combined_width = self.roberta.config.hidden_size + num_features
        self.classifier = torch.nn.Linear(combined_width, num_labels)

    def forward(self, input_ids, attention_mask, features):
        """Return class logits for a batch of token ids plus their feature vectors."""
        encoder_out = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
        )

        # hidden_states holds one tensor per layer; take the last layer and keep only
        # the first token of every sequence (RoBERTa's <s>, the counterpart of BERT's [CLS]).
        last_layer = encoder_out.hidden_states[-1]
        first_token = last_layer[:, 0, :]

        joined = torch.cat((first_token, features), dim=1)
        return self.classifier(joined)

In [123]:

# Training Function with Mixed Precision and Gradient Accumulation
def train_model(model, train_loader, val_loader, optimizer, epochs, device,
                gradient_accumulation_steps=4):
    """Train ``model`` with gradient accumulation and, on CUDA, mixed precision.

    After every epoch the mean training loss is printed and ``evaluate_model``
    is run on ``val_loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask=..., features=...)``
            and returning class logits.
        train_loader: Iterable with a length, yielding dict batches with the keys
            ``'input_ids'``, ``'attention_mask'``, ``'features'`` and ``'label'``.
        val_loader: Loader handed to ``evaluate_model`` after each epoch.
        optimizer: Optimizer over the model's parameters.
        epochs: Number of passes over ``train_loader``.
        device: Device (``torch.device`` or string) to train on.
        gradient_accumulation_steps: Number of batches whose gradients are
            accumulated before each optimizer step. Lower it if memory allows.

    Raises:
        ValueError: If ``gradient_accumulation_steps`` is smaller than 1.
    """
    if gradient_accumulation_steps < 1:
        raise ValueError("gradient_accumulation_steps must be >= 1")

    model.to(device)
    # Mixed precision only makes sense on CUDA; elsewhere autocast and the scaler are
    # explicitly disabled and behave as pass-throughs.
    use_amp = torch.device(device).type == "cuda"
    scaler = GradScaler(enabled=use_amp)
    criterion = torch.nn.CrossEntropyLoss()  # built once instead of once per batch
    num_batches = len(train_loader)

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        optimizer.zero_grad()  # Zero the gradients at the start of each epoch
        for step, batch in enumerate(tqdm(train_loader)):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            features = batch['features'].to(device)
            labels = batch['label'].to(device)

            with autocast(enabled=use_amp):  # Mixed precision
                outputs = model(input_ids, attention_mask=attention_mask, features=features)
                loss = criterion(outputs, labels)

            # Divide by the accumulation count so the summed gradients correspond to the
            # mean loss over the accumulated batches rather than to their sum.
            scaler.scale(loss / gradient_accumulation_steps).backward()

            # Step after every full accumulation window, and also after the final batch
            # of the epoch. Otherwise the gradients of a trailing partial window would be
            # thrown away by the next epoch's zero_grad() without ever being applied.
            # (A partial window is still divided by the full count, so it is weighted
            # slightly lower.)
            is_last_batch = (step + 1) == num_batches
            if (step + 1) % gradient_accumulation_steps == 0 or is_last_batch:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()

            # Report the unscaled per-batch loss.
            total_loss += loss.item()

        mean_loss = total_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch+1}, Loss: {mean_loss}")

        # Clear GPU cache after each epoch
        torch.cuda.empty_cache()

        evaluate_model(model, val_loader, device)

In [124]:
# Evaluation Function
def evaluate_model(model, val_loader, device):
    """Run ``model`` over ``val_loader`` and print accuracy, F1, precision and recall.

    Args:
        model: Module called as ``model(input_ids, attention_mask=..., features=...)``
            and returning class logits.
        val_loader: Iterable of dict batches with the keys ``'input_ids'``,
            ``'attention_mask'``, ``'features'`` and ``'label'``.
        device: Device the batches are moved to before the forward pass.
    """
    model.eval()
    true_labels = []
    preds = []

    with torch.no_grad():
        for batch in val_loader:
            # Move every tensor of the batch to the target device in one pass.
            moved = {
                key: batch[key].to(device)
                for key in ('input_ids', 'attention_mask', 'features', 'label')
            }
            logits = model(
                moved['input_ids'],
                attention_mask=moved['attention_mask'],
                features=moved['features'],
            )
            # The predicted class is the index of the largest logit in each row.
            preds.extend(logits.argmax(dim=1).cpu().numpy())
            true_labels.extend(moved['label'].cpu().numpy())

    # Accuracy, F1, precision and recall, computed in that order.
    acc, f1, precision, recall = (
        metric(true_labels, preds)
        for metric in (accuracy_score, f1_score, precision_score, recall_score)
    )
    print(f"Accuracy: {acc}, F1-Score: {f1}, Precision: {precision}, Recall: {recall}")

In [None]:

# Main Execution
def main():
    """Load the data, build features, fine-tune RoBERTa with them and report metrics.

    Steps: load the labelled articles, extract text statistics, normalise the text,
    split into train/validation sets, standardise the numeric features, then train
    ``RoBERTaWithFeatures`` and evaluate it after every epoch.
    """
    # Seed torch so weight initialisation and batch shuffling are repeatable.
    torch.manual_seed(42)

    data = load_buzzfeed_data()
    # Missing articles come back from read_csv as NaN floats, which would crash the
    # string-based steps below; treat them as empty text.
    data['text'] = data['text'].fillna('').astype(str)

    # Extract features from the RAW text. preprocess_text strips all punctuation,
    # so running it first would leave the sentence tokenizer without any sentence
    # boundaries and make the sentence-based features meaningless.
    data = extract_features(data)
    data['text'] = data['text'].apply(preprocess_text)

    # Specify feature columns explicitly, excluding 'text'
    feature_columns = ['word_count', 'sentence_count', 'sentiment', 'readability']
    features = data[feature_columns].values

    # Fill NaN values with 0 to ensure numeric conversion works
    features = np.nan_to_num(features, nan=0.0).astype(np.float32)
    labels = data['label'].values
    texts = data['text']

    # Train-test split
    train_texts, val_texts, train_features, val_features, train_labels, val_labels = train_test_split(
        texts, features, labels, test_size=0.2, random_state=42
    )

    # Standardise the features with statistics from the training split only (no
    # leakage from validation data). Raw values such as word counts are orders of
    # magnitude larger than the encoder embeddings they are concatenated with.
    feature_mean = train_features.mean(axis=0)
    feature_std = train_features.std(axis=0)
    feature_std[feature_std == 0] = 1.0  # constant columns: avoid division by zero
    train_features = (train_features - feature_mean) / feature_std
    val_features = (val_features - feature_mean) / feature_std

    # Load tokenizer and model
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    base_model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

    # Load the custom model with additional features
    model = RoBERTaWithFeatures(base_model, num_features=len(feature_columns))

    # Prepare dataset and dataloaders
    train_dataset = BuzzFeedDataset(train_texts.tolist(), train_labels.tolist(), tokenizer, train_features)
    val_dataset = BuzzFeedDataset(val_texts.tolist(), val_labels.tolist(), tokenizer, val_features)
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=8)

    # Optimizer
    optimizer = AdamW(model.parameters(), lr=1e-5)

    # Train and evaluate
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_model(model, train_loader, val_loader, optimizer, epochs=30, device=device)

if __name__ == "__main__":
    main()
