**Set up the Kaggle Environment**
The environment needs to be set up to ensure that kaggle is connected with Google Colab notebook, and ready to download the dataset

In [1]:
!pip install kaggle transformers
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 5.1 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 57.0 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.7 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 62.1 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 55.0 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Fo

In [2]:
#Imports
import os
import sys
import shutil
import zipfile
import torch
import torch.nn as nn
from torch.utils.data import Dataset, IterableDataset, DataLoader
import pandas as pd
from transformers import AutoModelForSequenceClassification, AdamW,AutoTokenizer
from torch.utils.data import DataLoader

**Download the Dataset**
The dataset is downloaded from kaggle.com for 'Fake and Real News Dataset'

1. *Move the DataSet to Working Directory*
2. *Unzip the file*
3. *Build the Train/Test dataset*


In [3]:
!kaggle datasets download -d 'clmentbisaillon/fake-and-real-news-dataset';

Downloading fake-and-real-news-dataset.zip to /content
100% 41.0M/41.0M [00:00<00:00, 224MB/s]
100% 41.0M/41.0M [00:00<00:00, 212MB/s]


In [4]:
def data_load():
        shutil.move('/content/fake-and-real-news-dataset.zip','/root/.kaggle/' )
        with zipfile.ZipFile('/root/.kaggle/fake-and-real-news-dataset.zip') as zip_data:
                zip_data.extractall('/root/.kaggle/')
        os.remove('/root/.kaggle/fake-and-real-news-dataset.zip')
        del zip_data
        
class DatasetBuilder:
    def __init__(self, data_dir, data):
        self.truth = pd.read_csv(os.path.join(data_dir, data[0]), delimiter=',', error_bad_lines=False)
        self.false = pd.read_csv(os.path.join(data_dir, data[1]), delimiter=',', error_bad_lines=False)
        self.__blend(data_dir)
        self.__write()
    def __blend(self,data_dir):
        self.truth['Label'] = 1
        self.false['Label'] = 0
        truth_train = self.truth[['text', 'Label']].sample(frac=0.8, random_state=1)
        false_train = self.false[['text', 'Label']].sample(frac=0.8, random_state=1)
        truth_test = self.truth.drop(truth_train.index)[['text', 'Label']]
        false_test = self.false.drop(false_train.index)[['text', 'Label']]
        self.train = truth_train.append(false_train, ignore_index=True).sample(frac=1, random_state=1).reset_index(
            drop=True)
        self.test = truth_test.append(false_test, ignore_index=True).sample(frac=1, random_state=1).reset_index(
            drop=True)
        del self.truth
        del self.false
        os.remove(os.path.join(data_dir, 'True.csv'))
        os.remove(os.path.join(data_dir, 'Fake.csv'))
        return

    def __write(self):
        f= open('/root/.kaggle/Train.csv','w+')
        self.train.to_csv(f, sep=',', header=True, index=False)
        f.close()
        f= open('/root/.kaggle/Test.csv', 'w+')
        self.test.to_csv(f, sep=',', header=True, index=False)
        f.close()


In [5]:
!touch ~/.kaggle/Train.csv
!chmod 600 ~/.kaggle/Train.csv
!touch ~/.kaggle/Test.csv
!chmod 600 ~/.kaggle/Test.csv


In [6]:
data_load()

In [7]:
DatasetBuilder('/root/.kaggle/',['True.csv','Fake.csv'])



  """Entry point for launching an IPython kernel.


<__main__.DatasetBuilder at 0x7fcf4a7f2150>

**Streaming the Data **

As the train and test dataset is created, we need to build another piece of 'glue logic' to ensure training data is passed to the model coherently

In [8]:
class NewsProducer(Dataset):
    def __init__(self, path, filename):
        self.data = pd.read_csv(os.path.join(path, filename), delimiter=',', error_bad_lines=False)

    def __len__(self):
        return (len(self.data))

    def __getitem__(self, item):
        if torch.is_tensor(item):
            item = item.tolist()
        news = self.data.loc[item, 'text']
        label = self.data.loc[item, 'Label']
        return (news, label)

In [9]:
class Tokenizer:
    def __init__(self):
        super(Tokenizer).__init__()
        self.tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [10]:
class SequenceModel(nn.Module):
    def __init__(self):
        super(SequenceModel, self).__init__()
        self.model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')
        self.model.classifier = nn.Linear(self.model.bert.pooler.dense.out_features, 1, True)
        self.model.config.num_labels = 1
        self.optimizer = AdamW(self.model.parameters())
        self.loss_fn = torch.nn.BCEWithLogitsLoss()
        self.loss_statistic = None

    def forward(self,input, train=True):
        if not train:
            self.model.eval()
        else:
            self.model.train()
        return self.model.forward(input)

    def backward(self):
        self.optimizer.zero_grad()
        self.loss_statistic.backward()
        self.optimizer.step()

    def loss(self, X, Y):
        y_pred = self.forward(X)
        self.loss_statistic = self.loss_fn(y_pred.logits.view(-1, self.model.config.num_labels), Y.float().view(-1, self.model.config.num_labels))
        return self.loss_statistic


In [11]:
training_dataset = NewsProducer('/root/.kaggle/', 'Train.csv')
train_loader = DataLoader(training_dataset, batch_size=5, shuffle=True, num_workers=0)
tk_zer = Tokenizer()
seq_model = SequenceModel()
loss_history=[]
avg_loss=0
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
if device.type=='cuda':
  seq_model.to(device)
TRAINING_EPOCHS = 3
news_counter = 0
for training_iter in range(TRAINING_EPOCHS):
    news_counter = 0
    for batch in train_loader:
        news = list(batch[0])
        labels = list(batch[1])
        news_counter+=5
        assert len(news) == len(labels)
        tokens_batch = tk_zer.tokenizer(news, padding=True, truncation=True, return_tensors="pt")
        tokens_batch["labels"] = torch.tensor(labels)
        if device.type == 'cuda':
          tokens_batch.to(device)        
        pred = seq_model(tokens_batch['input_ids'])
        loss = seq_model.loss(tokens_batch['input_ids'], tokens_batch['labels'])
        seq_model.backward()
        avg_loss += loss.item()
     
    loss_history.append(avg_loss/news_counter)
    print(f"Training_Epoch: {training_iter},Average Loss: {avg_loss},News Counter: {news_counter}, loss: {loss_history[-1]:>7f}")
    avg_loss=0



  """Entry point for launching an IPython kernel.


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Training_Epoch: 0,Average Loss: 5262.1072483584285,News Counter: 35920, loss: 0.029299
Training_Epoch: 1,Average Loss: 5274.442987695336,News Counter: 35920, loss: 0.029368
Training_Epoch: 2,Average Loss: 5294.987706407905,News Counter: 35920, loss: 0.029482


In [15]:
torch.save(seq_model, '/root/.kaggle/Trained_Bert.pth')

In [34]:
with torch.no_grad():
  test_dataset = NewsProducer('/root/.kaggle/', 'Test.csv')
  test_loader = DataLoader(test_dataset, batch_size=100, shuffle=False, num_workers=2)
  seq_model.eval()
  total_acc, total_count = 0, 0
  for batch in test_loader:
    news,labels = list(batch[0]),list(batch[1])
    tokens_batch = tk_zer.tokenizer(news, padding=True, truncation=True, return_tensors="pt")
    tokens_batch['labels'] = torch.tensor(labels)
    tokens_batch.to(device)
    pred = seq_model(tokens_batch['input_ids'])
    loss = seq_model.loss(tokens_batch['input_ids'],tokens_batch['labels'])
    total_acc += (torch.round(pred.logits) == tokens_batch['labels']).sum().item()
    total_count += tokens_batch['labels'].size(0)
  accuracy = total_acc/total_count
  print(f"The Accuracy of the Model is : {accuracy:2f}%")


    




  


47.617663436908344
