# Fake News Detection with RoBERTa

In [None]:
# Install transformers and PyTorch Lightning libraries

!pip install transformers
!pip install pytorch-lightning

In [None]:
# Import required libraries

import pandas as pd
import re
import sklearn
import nltk
from sklearn.model_selection import train_test_split
from google.colab import drive


from transformers import RobertaTokenizer
import torch
from torch.utils.data import DataLoader, TensorDataset
import pytorch_lightning as pl

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

In [None]:
# Mount Google Drive at /content/gdrive; later cells read and write files under gdrive/MyDrive.

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Read fake news and real news datasets

# Anchor the paths to the absolute mount point used by drive.mount so the reads
# do not depend on the notebook's current working directory.
DRIVE_DIR = "/content/gdrive/MyDrive"
fake_news = pd.read_csv(f"{DRIVE_DIR}/fake_news.csv")
real_news = pd.read_csv(f"{DRIVE_DIR}/real_news.csv")
fake_news.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [None]:
# Fake news data gets a label of 1 and real news data gets a label of 0

# Tag each source frame with its class (1 = fake, 0 = real), then stack them row-wise.
for frame, label in ((fake_news, 1), (real_news, 0)):
    frame["label"] = label
data = pd.concat((fake_news, real_news))

In [None]:
# Randomly shuffle the concatenated dataframe

# Fixed seed so the row order (and everything derived from it) is reproducible across runs.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Take only the text and label columns

# .copy() makes `data` an independent frame instead of a slice of the concatenated
# one, so the later assignment to data["text"] does not raise SettingWithCopyWarning.
data = data[["text", "label"]].copy()
data.head()

Unnamed: 0,text,label
0,Via: TMZ,1
1,BUCHAREST (Reuters) - Romania s leftist govern...,0
2,HAVANA (Reuters) - Cuba on Tuesday slammed U.S...,0
3,WASHINGTON (Reuters) - U.S. House Speaker Paul...,0
4,BEIRUT (Reuters) - Islamic State took control ...,0


In [None]:
# Clean the text

nltk.download("stopwords")
def clean_text(text, stopwords=None):
  """Lower-case `text`, strip punctuation and drop stopwords.

  Args:
    text: The raw article string.
    stopwords: Optional collection of words to remove. When omitted, NLTK's
      English stopword list is used; it is built once and reused on later calls.

  Returns:
    The remaining words joined by single spaces.
  """
  if stopwords is None:
    # Cache the list on the function object as a frozenset: this avoids rebuilding
    # it for every row and turns each membership test into a hash lookup.
    cached = getattr(clean_text, "_english_stopwords", None)
    if cached is None:
      cached = frozenset(nltk.corpus.stopwords.words('english'))
      clean_text._english_stopwords = cached
    stopwords = cached
  text = text.lower() # Convert to lower case
  text = re.sub(r'[^\w\s]', '', text) # Drop every character that is neither a word character nor whitespace
  # NOTE(review): punctuation is removed before the stopword filter, so a stopword
  # that contains an apostrophe can no longer match — confirm this is intended.
  words = [word for word in text.split() if word not in stopwords] # Remove stopwords
  return " ".join(words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Build a new frame with the cleaned column instead of assigning into `data`,
# which may be a slice of another frame (the source of SettingWithCopyWarning).
data = data.assign(text=data["text"].apply(clean_text))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
data.head()

Unnamed: 0,text,label
0,via tmz,1
1,bucharest reuters romania leftist government n...,0
2,havana reuters cuba tuesday slammed us preside...,0
3,washington reuters us house speaker paul ryan ...,0
4,beirut reuters islamic state took control syri...,0


In [None]:
X = data["text"].values
y = data["label"].values

# Split into training and validation sets.
# random_state makes the split reproducible; stratify=y keeps the fake/real
# proportion the same in both sets.

train_data, val_data, train_labels, val_labels = train_test_split(
    X, y, random_state=42, stratify=y
)

## Tokenizer

In [None]:
# Write the list of sentences into a text file

# One cleaned article per line. The encoding is explicit so the file contents do
# not depend on the platform's default (the articles contain non-ASCII characters).
with open("gdrive/MyDrive/data.txt", "w", encoding="utf-8") as f:
  f.write("\n".join(data["text"].tolist()))

In [None]:
# Train a BPE tokenizer from scratch on the data

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

# Byte-pair-encoding model; anything it cannot encode is mapped to "[UNK]".
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
# Input text is pre-split with the Whitespace pre-tokenizer.
tokenizer.pre_tokenizer = Whitespace()

# Reserve the special tokens in the learned vocabulary (the padded positions in
# the sample batch printed further down carry id 3).
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

# Learn the vocabulary from the cleaned articles written to data.txt.
# NOTE(review): this vocabulary is learned from scratch, so its token ids are
# unrelated to the roberta-base vocabulary whose pretrained weights are loaded
# later with RobertaModel.from_pretrained('roberta-base') — confirm that feeding
# these ids to the pretrained encoder is intended. No post-processor is set
# either, so no [CLS] token is prepended even though the classifier reads
# position 0 of the encoder output.
files = ["gdrive/MyDrive/data.txt"]
tokenizer.train(files, trainer)

In [None]:
# Wrap the BPE tokenizer trained on this dataset (not a pretrained one) in the
# Transformers fast-tokenizer interface and register "[PAD]" as its padding token.

from transformers import PreTrainedTokenizerFast
trained_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)
trained_tokenizer.add_special_tokens({'pad_token': '[PAD]'})

0

## Using Pretrained Tokenizer

In [None]:
# Tokenize the articles

# Same encoding settings for both splits: PyTorch tensors, padded, truncated to 64 tokens.
encode_kwargs = dict(return_tensors="pt", padding=True, truncation=True, max_length=64)
train_tokens = trained_tokenizer(list(train_data), **encode_kwargs)
val_tokens = trained_tokenizer(list(val_data), **encode_kwargs)

In [None]:
# Create lists of tokens

# Fall back to the CPU when no GPU is attached instead of failing on a hard-coded "cuda".
device = "cuda" if torch.cuda.is_available() else "cpu"

def _as_device_tensors(tokens, labels, device):
    """Return [input_ids, attention_mask, token_type_ids, labels] as tensors on `device`."""
    fields = ("input_ids", "attention_mask", "token_type_ids")
    tensors = [tokens[name].to(device) for name in fields]
    tensors.append(torch.tensor(labels).to(device))
    return tensors

trn = _as_device_tensors(train_tokens, train_labels, device)
val = _as_device_tensors(val_tokens, val_labels, device)

In [None]:
# Dataloader class

BATCH_SIZE = 32
class ClassificationData(pl.LightningDataModule):
    """Lightning data module wrapping the pre-tokenized train/validation tensors.

    Args:
        trn: Sequence of equally long tensors for training; they are passed
            positionally to TensorDataset.
        val: Same layout as `trn`, for validation.
        batch_size: Number of examples per batch (defaults to BATCH_SIZE).
    """

    def __init__(self, trn, val, batch_size=BATCH_SIZE):
        super().__init__()

        # Reshuffle the training examples every epoch; validation order stays fixed.
        self.trn = DataLoader(TensorDataset(*trn), batch_size=batch_size, shuffle=True)
        self.val = DataLoader(TensorDataset(*val), batch_size=batch_size)

    def train_dataloader(self):
        """Return the training DataLoader."""
        return self.trn

    def val_dataloader(self):
        """Return the validation DataLoader."""
        return self.val

dls = ClassificationData(trn, val)

In [None]:
# This should return a list of 4 tensors - input_ids, attention_masks, token_type_ids, and labels
next(iter(dls.trn))

[tensor([[18238,  2541,  2344,  ...,  3670,  2785,  6976],
         [ 6218,   327,   222,  ...,   530,  2423,  3831],
         [ 1841,   327,   686,  ...,  4797,  9975,  6091],
         ...,
         [14668,   327,   826,  ...,  1004,  1808,   420],
         [ 3634,  1490,   565,  ...,  7494,   528,  6359],
         [14165,   374,   160,  ...,     3,     3,     3]], device='cuda:0'),
 tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0'),
 tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0'),
 tensor([1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0,
         1, 0, 1, 1, 1, 0, 1, 1], device='cuda

*   input_ids of shape: 32x64
*   attention_mask of shape: 32x64
*   token_type_ids of shape: 32x64
*   labels of shape: 32 (a 1-D tensor with one label per example)

## Training the Model

In [None]:
from transformers import RobertaModel
# Pretrained roberta-base encoder without a task head (the lm_head weights are
# reported as unused when it loads); RobertaClassifier stores it as its `roberta` submodule.
roberta_model = RobertaModel.from_pretrained('roberta-base')

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# The RobertaClassifier class

#import torch
#import pytorch_lightning as pl

class RobertaClassifier(pl.LightningModule):
    """RoBERTa encoder followed by a two-layer classification head.

    Args:
        dropout_p: Dropout probability applied after the hidden layer.
        hid_dim: Width of the encoder output and of the hidden layer.
        output_dim: Number of classes.
    """

    def __init__(self, dropout_p, hid_dim, output_dim):
        super().__init__()
        # Shares the module-level pretrained encoder; it is fine-tuned together with the head.
        self.roberta = roberta_model
        self.dropout = torch.nn.Dropout(dropout_p)
        self.linear_1 = torch.nn.Linear(hid_dim, hid_dim)
        self.linear_2 = torch.nn.Linear(hid_dim, output_dim)
        # NLLLoss expects log-probabilities, which forward() produces via log_softmax.
        self.loss = torch.nn.NLLLoss()

    def forward(self, input_ids, attention_mask, token_ids):
        """Return per-class log-probabilities for a batch of token sequences."""
        hidden_states = self.roberta(
            input_ids, attention_mask=attention_mask, token_type_ids=token_ids
        )[0]
        # Use the hidden state at position 0 as the representation of the whole sequence.
        first_token = hidden_states[:, 0]
        features = self.dropout(torch.relu(self.linear_1(first_token)))
        return torch.log_softmax(self.linear_2(features), dim=1)

    def _shared_step(self, batch, stage):
        """Compute the loss for one batch and log loss/accuracy under `stage`."""
        log_probs = self(batch[0], batch[1], batch[2])
        labels = batch[3].view(-1)
        loss = self.loss(log_probs, labels)
        accuracy = (log_probs.argmax(dim=1) == labels).float().mean()
        # Without logging, the validation pass computes a loss nobody ever sees.
        self.log(f"{stage}_loss", loss, prog_bar=True)
        self.log(f"{stage}_acc", accuracy, prog_bar=True)
        return loss

    def training_step(self, batch, ix):
        """Return the training loss for `batch` (index `ix` is unused)."""
        return self._shared_step(batch, "train")

    def validation_step(self, batch, ix):
        """Return the validation loss for `batch` (index `ix` is unused)."""
        return self._shared_step(batch, "val")

    def configure_optimizers(self):
        """Adam over all parameters (encoder and head) with a small fine-tuning learning rate."""
        return torch.optim.Adam(self.parameters(), lr=1e-5)

# dropout_p=0.5, hid_dim=768 (the width of the roberta-base layers), output_dim=2 (real vs. fake)
m = RobertaClassifier(0.5, 768, 2)

In [None]:
# Train the model

# dls is the object of the dataloader class created previously
device = "cuda" if torch.cuda.is_available() else "cpu"
# `gpus=` was deprecated and then removed from Trainer; `accelerator`/`devices`
# is the supported way to select hardware and also works on a CPU-only runtime.
t = pl.Trainer(
    max_epochs=1,
    accelerator="gpu" if device == "cuda" else "cpu",
    devices=1,
)
# The Trainer places the model on the selected accelerator itself.
t.fit(m, dls)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name     | Type         | Params
------------------------------------------
0 | roberta  | RobertaModel | 124 M 
1 | dropout  | Dropout      | 0     
2 | linear_1 | Linear       | 590 K 
3 | linear_2 | Linear       | 1.5 K 
4 | loss     | NLLLoss      | 0     
------------------------------------------
125 M     Trainable params
0         Non-trainable params
125 M     Total params
500.951   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=1` reached.


In [None]:
# This should print the model architecture
print(m)

RobertaClassifier(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

## Evaluating the model


In [None]:
# Calculate predictions on the validation data

device = "cuda" if torch.cuda.is_available() else "cpu"

pred = []
truth = []

# Move the model once, and switch to eval mode so dropout is disabled while predicting.
m.to(device)
m.eval()

# no_grad: inference only, so skip building the autograd graph for every batch.
with torch.no_grad():
  for val_batch in dls.val:
    # .to(device) is a no-op when the batch already lives on the model's device.
    input_ids, attention_mask, token_type_ids = (
        tensor.to(device) for tensor in val_batch[:3]
    )

    val_pred = m(input_ids, attention_mask, token_type_ids) # log-probabilities per class
    val_label = val_pred.argmax(dim=1).cpu().numpy() # index of the most likely class

    val_true = val_batch[3].cpu().numpy()

    pred.extend(val_label)
    truth.extend(val_true)

In [None]:
# Calculate recall

# Recall of the positive class (label 1 = fake news) over the validation predictions.
recall_score = sklearn.metrics.recall_score(y_true=truth, y_pred=pred)

print(recall_score)

0.9982938065176591


In [None]:
#Calculate classification report

from sklearn.metrics import classification_report
# Per-class precision / recall / F1 on the validation set.
print(classification_report(y_true=truth, y_pred=pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5364
           1       1.00      1.00      1.00      5861

    accuracy                           1.00     11225
   macro avg       1.00      1.00      1.00     11225
weighted avg       1.00      1.00      1.00     11225

