In [2]:
import pandas as pd
from sklearn.utils import shuffle

# prevent warnings of bert from showing up
import logging
logging.getLogger("transformers.tokenization_utils").setLevel(logging.ERROR)


fake = pd.read_csv("../data/Fake.csv")
fake["label"] = 0

true = pd.read_csv("../data/True.csv")
true["label"] = 1

data = pd.concat([fake, true], ignore_index=True)
data = shuffle(data)

In [3]:
train = pd.read_csv("../data/train.csv")

train.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
train["X"] = train["author"] + "[SEP]" + train["title"] + "[SEP]" + train["text"]


In [5]:
# remove row if a value is nan
train = train.dropna()

In [6]:
train["X"].isnull().sum()


0

In [7]:
len(train)

18285

In [3]:
data["X"] = data["subject"] + "[SEP]" + data["title"] + "[SEP]" + data["text"]

data.head(5)

Unnamed: 0,title,text,subject,date,label,X
2151,Paul Ryan Exposed As SPINELESS As Leaked Audi...,It s no secret that Speaker of the House Paul ...,News,"March 14, 2017",0,News[SEP] Paul Ryan Exposed As SPINELESS As Le...
3533,‘CLOWN SHOW’: Former CIA Official Takes Shots...,"Philip Mudd, a former CIA official, totally to...",News,"December 6, 2016",0,News[SEP] ‘CLOWN SHOW’: Former CIA Official Ta...
11755,#Berkeley IRONY ALERT! ANARCHISTS LOOT STARBUC...,Smashing windows How progressive!Protests aga...,politics,"Feb 2, 2017",0,politics[SEP]#Berkeley IRONY ALERT! ANARCHISTS...
33950,Pennsylvania governor raises minimum wage for ...,"HARRISBURG, Pa. (Reuters) - Pennsylvania Gover...",politicsNews,"March 7, 2016",1,politicsNews[SEP]Pennsylvania governor raises ...
7927,LEAKED AUDIO: MSNBC Worked With Trump During ...,For those of us who watched the shameful MSNBC...,News,"February 22, 2016",0,News[SEP] LEAKED AUDIO: MSNBC Worked With Trum...


In [4]:
data["subject"].value_counts()

politicsNews       11272
worldnews          10145
News                9050
politics            6841
left-news           4459
Government News     1570
US_News              783
Middle-east          778
Name: subject, dtype: int64

In [6]:
len(data)

44898

In [2]:
import numpy as np
import torch
import time

from transformers import BertModel, BertTokenizer
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, RandomSampler


# Get pytorch device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load tokenizer and model   

class CustomBERTModel(torch.nn.Module):
    def __init__(self, bert_model, BERT_MODEL = "bert-base-uncased"):
        super(CustomBERTModel, self).__init__()

        self.bert_model = BertModel.from_pretrained(BERT_MODEL, output_hidden_states=True)

        # set a linear layer to map the hidden states to 64 dimensions
        self.linear = torch.nn.Linear(768, 64)
        # set another linear layer to map the 64 dimensions to 2 dimensions
        self.linear2 = torch.nn.Linear(64, 2)
        # set a dropout layer
        self.dropout = torch.nn.Dropout(0.1)
        # set a relu activation function
        self.relu = torch.nn.ReLU()


    def forward(self, input_ids):
        outputs = self.bert_model(input_ids)

        # pass the last hidden state of the token `[CLS]` to the linear layer
        x = self.linear(outputs[0][:,0,:])

        # pass the output of the linear layer to the relu activation function
        x = self.relu(x)

        # pass the output of the relu activation function to the dropout layer
        x = self.dropout(x)
        
        # pass the output of the dropout layer to the second linear layer
        x = self.linear2(x)

        # set a softmax activation function
        x = torch.nn.functional.softmax(x, dim=1)

        return x

In [3]:
# define flat_accuracy function
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = np.argmax(labels, axis=1).flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

# define format_time function
def format_time(elapsed):
    import datetime
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

def train(sentences, labels, lower = False):

    # set epochs
    EPOCHS = 10

    BATCH_SIZE = 8

    input_ids = []
    targets = []

    # define bert tokenizer
    tokenizer = BertTokenizer.# Get pytorch device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
e_plus(
                sentences[i],
                add_special_tokens=True,
                truncation=True,
                padding="max_length",
                max_length=64,
                return_tensors="pt",
            ).to(device)

        input_ids.append(inputs["input_ids"].squeeze(0))

        # convert labels to one-hot encoding
        target = torch.zeros(2)
        target[labels[i]] = 1
        targets.append(target)
    
    # convert to tensors
    input_ids = torch.stack(input_ids, dim=0)
    targets = torch.stack(targets, dim=0)

    # create dataset
    dataset = TensorDataset(input_ids, targets)
    # create dataloader
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)


    # define model
    model = CustomBERTModel(BertModel, BERT_MODEL = "bert-base-uncased").to(device)

    # set adamw optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
    # set loss function
    loss_fn = torch.nn.CrossEntropyLoss()

    # set number of training steps
    total_steps = len(dataloader) * EPOCHS
    # set scheduler
    scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=1e-5, steps_per_epoch=len(dataloader), epochs=EPOCHS)

    # start training clock
    start_time = time.time()

    # train model
    for epoch in range(EPOCHS):
        print(f"Epoch {epoch + 1}/{EPOCHS}")
        print("-" * 10)

        model.train()

        total_loss = 0
        
        for step, batch in enumerate(dataloader):
            input_ids = batch[0].to(device)
            labels = batch[1].to(device)

            logits = model(input_ids)

            loss = loss_fn(logits, labels)

            total_loss += loss.item()

            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()
            model.zero_grad()
        
        avg_train_loss = total_loss / len(dataloader)
        print("  Average training loss: {0:.2f}".format(avg_train_loss))

        # save model
        torch.save(model.state_dict(), f"../models/bert_{epoch}.pt")

        # # evaluate model
        model.eval()

        total_eval_accuracy = 0
        total_eval_loss = 0
        nb_eval_steps = 0

        for batch in dataloader:
            input_ids = batch[0].to(device)
            labels = batch[1].to(device)
            logits = model(input_ids)

            loss = loss_fn(logits, labels)

            total_eval_loss += loss.item()

            logits = logits.detach().cpu().numpy()
            label_ids = labels.to('cpu').numpy()


            total_eval_accuracy += flat_accuracy(logits, label_ids)
            
        avg_val_accuracy = total_eval_accuracy / len(dataloader)
        print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

        avg_val_loss = total_eval_loss / len(dataloader)
        print("  Validation Loss: {0:.2f}".format(avg_val_loss))

        print("  Training epcoh took: {:}".format(format_time(time.time() - start_time)))

sentences = data["title"].values[:1000]
labels = data["label"].values[:1000]
train(sentences, labels)



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch 1/10
----------
  Average training loss: 0.68
  Accuracy: 0.81
  Validation Loss: 0.61
  Training epcoh took: 0:00:28
Epoch 2/10
----------


KeyboardInterrupt: 

In [None]:
import torch

from transformers import BertModel, BertTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# define predict function
def predict(sentence, lower = False):
    # define bert tokenizer
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    if lower:
        sentence = sentence.lower()
    
    inputs = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            truncation=True,
            padding="max_length",
            max_length=64,
            return_tensors="pt",
        ).to(device)

    input_ids = inputs["input_ids"]

    # define model
    model = CustomBERTModel(BertModel, BERT_MODEL = "bert-base-uncased").to(device)

    # load model
    model.load_state_dict(torch.load("../models/bert_9.pt"))

    # set model to evaluation mode
    model.eval()

    logits = model(input_ids)

    logits = logits.detach().cpu().numpy()

    return np.argmax(logits, axis=1)

# predict
predict(data["title"][0], lower = True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


array([1])

: 