In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

In [5]:
# Custom dataset that tokenizes one news article per item
class NewsDataset(Dataset):
    """Map-style dataset pairing raw article texts with integer class labels.

    Each item is tokenized on access with the supplied tokenizer and padded or
    truncated to ``max_len`` tokens.

    Args:
        texts: Sized, integer-indexable collection of article texts. Entries
            that are ``None`` or float NaN are treated as the empty string;
            any other non-string entry is converted with ``str()``.
        labels: Sized, integer-indexable collection of integer class labels,
            aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus(text, max_length=...,
            truncation=..., padding=..., return_tensors=...)`` and returning a
            mapping with ``'input_ids'`` and ``'attention_mask'`` tensors that
            carry a leading batch dimension of size 1.
        max_len: Token length every encoded item is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Fail fast: a mismatch would otherwise surface as an IndexError
        # somewhere in the middle of an epoch.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    @staticmethod
    def _as_text(value):
        """Coerce a raw cell value to a string the tokenizer can accept."""
        # Empty CSV fields are read as float NaN (NaN is the only float that
        # is not equal to itself); map those and None to an empty document.
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return value if isinstance(value, str) else str(value)

    def __getitem__(self, idx):
        """Return the encoded sample at position ``idx``.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (batch dimension
            squeezed away) and ``'label'`` as a ``torch.long`` scalar tensor.
        """
        text = self._as_text(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )
        return {
            # return_tensors='pt' yields a leading batch dim of 1; drop it so
            # the DataLoader can stack items into a batch itself.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long)
        }


In [6]:
# Read the two source CSV files into DataFrames (fake articles first, then true)
fake_data, true_data = (
    pd.read_csv(csv_path) for csv_path in ('Fake.csv', 'True.csv')
)

In [7]:
# Tag every row with its class (0 for the fake frame, 1 for the true frame),
# then stack both frames into a single one with a fresh 0..n-1 index
true_data['label'] = 1
fake_data['label'] = 0
data = pd.concat((fake_data, true_data), ignore_index=True)

In [None]:
# Pull the 'text' and 'label' columns out of the combined frame as arrays
texts, labels = (data[column].values for column in ('text', 'label'))

In [9]:
# Hold out 20% of the samples for validation; the fixed seed keeps the split repeatable
(train_texts, val_texts,
 train_labels, val_labels) = train_test_split(texts, labels, random_state=42, test_size=0.2)

In [10]:
# Token length used for padding/truncation, and the pretrained BERT tokenizer
max_len = 512
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Wrap each split in a NewsDataset
train_dataset = NewsDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer, max_len=max_len)
val_dataset = NewsDataset(texts=val_texts, labels=val_labels, tokenizer=tokenizer, max_len=max_len)

# Batch both splits; only the training loader shuffles
train_dataloader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=16)

# Sanity check on the first training batch only: tensor shapes, then the batch itself
for batch in train_dataloader:
    print(*(batch[field].shape for field in ('input_ids', 'attention_mask', 'label')), sep='\n')
    print(batch)
    break

In [None]:
# BERT sequence classifier loaded from the 'bert-base-uncased' checkpoint, configured for two labels
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)

In [None]:
# Select the CUDA device when one is available, otherwise the CPU; show the choice
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

In [None]:
# Move the model onto the selected device (the returned module is shown as the cell output)
model.to(device=device)

In [12]:
# Cross-entropy criterion and an AdamW optimizer over all model parameters
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(params=model.parameters(), lr=2e-5)

In [13]:
# model training
def train_model(model, dataloader, optimizer, loss_fn, device, num_epochs=2):
    """Fine-tune ``model`` on ``dataloader`` and print the mean loss per epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...,
            labels=...)`` whose output exposes a ``.loss`` tensor.
        dataloader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.
            It is iterated once per epoch and does not need ``__len__``.
        optimizer: Optimizer bound to ``model``'s parameters.
        loss_fn: Accepted for interface compatibility but not used; the loss
            is taken from the model output, which computes it from ``labels``.
        device: Device the batch tensors are moved to before the forward pass.
        num_epochs: Number of passes over ``dataloader`` (default 2).

    Returns:
        None. The model is updated in place and left in training mode.
    """
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0.0
        num_batches = 0
        for batch in dataloader:
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        # Count batches instead of calling len(dataloader): this works for
        # loaders without __len__ and avoids dividing by zero on an empty one.
        mean_loss = total_loss / num_batches if num_batches else float('nan')
        print(f'Epoch: {epoch+1}, Loss: {mean_loss}')

In [14]:
# model evaluation function
def evaluate_model(model, dataloader, device):
    """Run ``model`` over ``dataloader`` and print accuracy plus a classification report.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits`` with the class scores along dim 1.
        dataloader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.
        device: Device the input tensors are moved to before the forward pass.

    Returns:
        None. Results are printed; the model is left in evaluation mode.
    """
    model.eval()
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)

            predictions.extend(preds.cpu().tolist())
            # Labels are only compared on the host, so they are never moved
            # to `device`; .cpu() is a no-op unless the batch already lives
            # on an accelerator.
            true_labels.extend(batch['label'].cpu().tolist())

    accuracy = accuracy_score(true_labels, predictions)
    print(f"Accuracy: {accuracy}")
    print("Classification Report:")
    print(classification_report(true_labels, predictions))

In [None]:
# Fine-tune the model on the training split
train_model(
    model=model,
    dataloader=train_dataloader,
    optimizer=optimizer,
    loss_fn=loss_fn,
    device=device,
)

In [None]:
# Serialize the whole model object to disk
torch.save(obj=model, f='fake_news_model.pth')

# Serialize only the model's state_dict to a second file
torch.save(obj=model.state_dict(), f='fake_news_model_state_dict.pth')

In [None]:
# Report accuracy and per-class metrics on the validation split
evaluate_model(model=model, dataloader=val_dataloader, device=device)

In [None]:
def predict_news(text, model, tokenizer, device, max_len):
    """Classify a single piece of text as "true" or "fake".

    Args:
        text: Text to classify.
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``; it is switched to evaluation mode.
        tokenizer: Object exposing ``encode_plus`` that returns
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        device: Device the encoded tensors are moved to.
        max_len: Token length the text is padded/truncated to.

    Returns:
        "true" when the highest-scoring class index is 1, otherwise "fake".
    """
    model.eval()

    encoded = tokenizer.encode_plus(
        text,
        max_length=max_len,
        truncation=True,
        padding='max_length',
        return_tensors='pt',
    )
    token_ids, mask = (
        encoded[key].to(device) for key in ('input_ids', 'attention_mask')
    )

    # Inference only: no gradients are needed for the forward pass
    with torch.no_grad():
        scores = model(token_ids, attention_mask=mask).logits

    predicted_class = torch.argmax(scores, dim=1).item()
    if predicted_class == 1:
        return "true"
    return "fake"