<a href="https://colab.research.google.com/github/roshan-d21/Fake-News-Detector/blob/master/BERT/BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
import torch.utils.data as data_utils
import torch.optim as optim
import gc #garbage collector for gpu memory 
from tqdm import tqdm

In [2]:
%%capture
!pip install transformers

In [3]:
%%capture
from transformers import BertForSequenceClassification, BertTokenizer
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
import pandas as pd
news_data = pd.read_csv('fake_news.csv')
news_data.dropna(inplace=True)
news_data['target'] = news_data.apply(lambda row: 1 if row.Label else 0, axis=1)
news_data.head(10)

Unnamed: 0,Statement,Label,target
0,Says he paid one dollar for a sweater at Kohls.,True,1
1,"A DWI costs $17,000.",False,0
2,"When you have 8,000 veterans a year committing...",True,1
3,"When Disney was looking to build Disney World,...",False,0
4,Fifty percent of kids coming out of school can...,True,1
5,I dont understand what Republicans are doing a...,True,1
6,Gov. Scott Walkers plan to freeze pay for stat...,False,0
7,"As district attorney, Susan Happ took $180,000...",False,0
8,"In Wisconsin, 300,000 voters were turned away ...",False,0
9,78702 is the second-most gentrified ZIP code i...,False,0


In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [6]:
tokenized_df = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:510] + ['[SEP]'], news_data['Statement']))

In [7]:
totalpadlength = 512

In [8]:
indexed_tokens = list(map(tokenizer.convert_tokens_to_ids, tokenized_df))

In [9]:
index_padded = np.array([xi + [0] * (totalpadlength - len(xi)) for xi in indexed_tokens])

In [10]:
target_variable = news_data['target'].values

In [11]:
all_words = []
for l in tokenized_df:
  all_words.extend(l)
all_indices = []
for i in indexed_tokens:
  all_indices.extend(i)

word_to_ix = dict(zip(all_words, all_indices))
ix_to_word = dict(zip(all_indices, all_words))

In [12]:
mask_variable = [[float(i>0) for i in j] for j in index_padded]

In [13]:
BATCH_SIZE = 8
def format_tensors(text_data, mask, labels, batch_size):
    X = torch.from_numpy(text_data)
    X = X.long()
    mask = torch.tensor(mask)
    y = torch.from_numpy(labels)
    y = y.long()
    tensordata = data_utils.TensorDataset(X, mask, y)
    loader = data_utils.DataLoader(tensordata, batch_size=batch_size, shuffle=False)
    return loader

X_train, X_test, y_train, y_test = train_test_split(index_padded, target_variable, 
                                                    test_size=0.1, random_state=42)

train_masks, test_masks, _, _ = train_test_split(mask_variable, index_padded, 
                                                       test_size=0.1, random_state=42)

trainloader = format_tensors(X_train, train_masks, y_train,BATCH_SIZE)
testloader = format_tensors(X_test, test_masks, y_test, BATCH_SIZE)

In [14]:
model = BertForSequenceClassification.from_pretrained('bert-base-cased')
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [15]:
def compute_accuracy(model, dataloader, device):
    tqdm()
    model.eval()
    correct_preds, num_samples = 0,0
    with torch.no_grad():
        for i, batch in enumerate(tqdm(dataloader)):
            token_ids, masks, labels = tuple(t.to(device) for t in batch)
            output = model(input_ids=token_ids, attention_mask=masks, labels=labels)
            yhat = output.logits
            prediction = (torch.sigmoid(yhat[:,1]) > 0.5).long()
            num_samples += labels.size(0)
            correct_preds += (prediction==labels.long()).sum()
            del token_ids, masks, labels #memory
        torch.cuda.empty_cache() #memory
        gc.collect() # memory
        return correct_preds.float()/num_samples *100

In [16]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
torch.cuda.empty_cache() #memory
gc.collect() #memory
NUM_EPOCHS = 1
loss_function = nn.BCEWithLogitsLoss()
losses = []
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=3e-6)
for epoch in range(NUM_EPOCHS):
    model.train()
    running_loss = 0.0
    iteration = 0
    for i, batch in enumerate(trainloader):
        iteration += 1
        token_ids, masks, labels = tuple(t.to(device) for t in batch)
        optimizer.zero_grad()
        output = model(input_ids=token_ids, attention_mask=masks, labels=labels)
        loss = output.loss
        loss.backward()
        optimizer.step()
        running_loss += float(loss.item())
        del token_ids, masks, labels #memory
    
        if not i%25:
            print(f'Epoch: {epoch+1:03d}/{NUM_EPOCHS:03d} | '
                  f'Batch {i+1:03d}/{len(trainloader):03d} | '
                  f'Average Loss in last {iteration} iteration(s): {(running_loss/iteration:.4f}')
            running_loss = 0.0
            iteration = 0
        torch.cuda.empty_cache() #memory
        gc.collect() #memory
        losses.append(float(loss.item()))
    # with torch.set_grad_enabled(False):
    #     print(f'\nTraining Accuracy: '
    #           f'{compute_accuracy(model, trainloader, device):.2f}%')

Epoch: 001/001 | Batch 001/249 | Average Loss in last 1 iteration(s): 0.7017
Epoch: 001/001 | Batch 026/249 | Average Loss in last 25 iteration(s): 0.6783
Epoch: 001/001 | Batch 051/249 | Average Loss in last 25 iteration(s): 0.6465
Epoch: 001/001 | Batch 076/249 | Average Loss in last 25 iteration(s): 0.6089
Epoch: 001/001 | Batch 101/249 | Average Loss in last 25 iteration(s): 0.5898
Epoch: 001/001 | Batch 126/249 | Average Loss in last 25 iteration(s): 0.5505
Epoch: 001/001 | Batch 151/249 | Average Loss in last 25 iteration(s): 0.5341
Epoch: 001/001 | Batch 176/249 | Average Loss in last 25 iteration(s): 0.5113
Epoch: 001/001 | Batch 201/249 | Average Loss in last 25 iteration(s): 0.4908
Epoch: 001/001 | Batch 226/249 | Average Loss in last 25 iteration(s): 0.4605


In [17]:
with torch.set_grad_enabled(False):
    print(f'\nTraining Accuracy: 'f'{compute_accuracy(model, trainloader, device):.2f}%')

0it [00:00, ?it/s]
100%|██████████| 249/249 [02:14<00:00,  1.85it/s]



Training Accuracy: 67.81%


In [19]:
with torch.set_grad_enabled(False):
  print(f'\n\nTest Accuracy:'
  f'{compute_accuracy(model, testloader, device):.2f}%')

0it [00:00, ?it/s]
100%|██████████| 28/28 [00:14<00:00,  1.90it/s]




Test Accuracy:65.39%
