# All the important imports

In [1]:
import os
import torch
import pandas as pd
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
from torch.optim import lr_scheduler
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn import metrics
import transformers
import tokenizers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from tqdm.autonotebook import tqdm
import utils

In [2]:
total_df=pd.read_csv("../input/factcheckv1/fake_covid.csv")
mapping={'true':1,'false':0,'neutral':1}

In [3]:
total_df['label']=total_df['label'].map(mapping)

In [4]:
np.unique(total_df['label'],return_counts="True")

(array([0, 1]), array([601, 476]))

In [5]:
class config:
    MAX_LEN = 128
    TRAIN_BATCH_SIZE = 32
    VALID_BATCH_SIZE = 16
    EPOCHS = 2
    BERT_PATH = "../input/bert-base-uncased/"
    MODEL_PATH = "model.bin"
    TRAINING_FILE = "../input/tweet-train-folds/train_folds.csv"
    TOKENIZER = tokenizers.BertWordPieceTokenizer(
        f"{BERT_PATH}/vocab.txt", 
        lowercase=True
    )

# Data Processing

In [6]:
def process_data(tweet, sentiment, tokenizer, max_len):
  
    tok_tweet = tokenizer.encode(tweet)
    input_ids_orig = tok_tweet.ids[1:-1]
    tweet_offsets = tok_tweet.offsets[1:-1]
    
    
    input_ids = [102] + input_ids_orig + [102]
    token_type_ids = [0] + [1] * (len(input_ids_orig) + 1)
    mask = [1] * len(token_type_ids)
    tweet_offsets = [(0, 0)] * 1 + tweet_offsets + [(0, 0)]
    
    target=sentiment

    padding_length = max_len - len(input_ids)
    if padding_length > 0:
        input_ids = input_ids + ([0] * padding_length)
        mask = mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        tweet_offsets = tweet_offsets + ([(0, 0)] * padding_length)
    
    if padding_length < 0:
        
        input_ids = input_ids[:max_len]
        mask = mask[:max_len]
        token_type_ids = token_type_ids[:max_len]
        tweet_offsets = tweet_offsets[:max_len]
        
    
    return {
        'ids': input_ids,
        'mask': mask,
        'token_type_ids': token_type_ids,
        'target': target,
        'orig_tweet': tweet,
        'offsets': tweet_offsets
    }

# Data loader

In [7]:
class TweetDataset:
    def __init__(self, tweet, sentiment):
        self.tweet = tweet
        self.sentiment = sentiment
        self.tokenizer = config.TOKENIZER
        self.max_len = config.MAX_LEN
    
    def __len__(self):
        return len(self.tweet)

    def __getitem__(self, item):
        data = process_data(
            self.tweet[item],  
            self.sentiment[item],
            self.tokenizer,
            self.max_len
        )

        return {
            'ids': torch.tensor(data["ids"], dtype=torch.long),
            'mask': torch.tensor(data["mask"], dtype=torch.long),
            'token_type_ids': torch.tensor(data["token_type_ids"], dtype=torch.long),
            'target': torch.tensor(data["target"], dtype=torch.long),
            'orig_tweet': data["orig_tweet"],
            'offsets': torch.tensor(data["offsets"], dtype=torch.long)
        }

# The Model

In [8]:
class TweetModel(transformers.BertPreTrainedModel):
    """
    Model class that combines a pretrained bert model with a linear later
    """
    def __init__(self, conf):
        super(TweetModel, self).__init__(conf)
        # Load the pretrained BERT model
        self.bert = transformers.BertModel.from_pretrained(config.BERT_PATH, config=conf)
        # Set 10% dropout to be applied to the BERT backbone's output
        self.drop_out = nn.Dropout(0.1)
        self.l0 = nn.Linear(768, 2)
        torch.nn.init.normal_(self.l0.weight, std=0.02)
    
    def forward(self, ids, mask, token_type_ids):
        # Return the hidden states from the BERT backbone
        _, out = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids
        ) # bert_layers x bs x SL x (768)
        
        
        out = self.drop_out(out) # bs x SL x (768 * 2)
        # The "dropped out" hidden vectors are now fed into the linear layer to output two scores
        logits = self.l0(out) # bs x SL x 2
        
        

        return logits

# Loss Function

In [9]:
def loss_fn(logits,label):
    loss_fct = nn.CrossEntropyLoss()
    return loss_fct(logits,label)

# Training Function

In [10]:
def train_fn(data_loader, model, optimizer, device, scheduler=None):
    model.train()
    losses = utils.AverageMeter()

    tk0 = tqdm(data_loader, total=len(data_loader))
    
    for bi, d in enumerate(tk0):

        ids = d["ids"]
        token_type_ids = d["token_type_ids"]
        mask = d["mask"]
        target = d["target"]
        orig_tweet = d["orig_tweet"]
        offsets = d["offsets"]

        ids = ids.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        target = target.to(device, dtype=torch.long)

        model.zero_grad()
        logit = model(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids,
        )
        
        loss = loss_fn(logit,target)
        loss.backward()
        optimizer.step()
        scheduler.step()

        losses.update(loss.item(), ids.size(0))
        
        tk0.set_postfix(loss=losses.avg)

In [11]:


def eval_fn(data_loader, model, device):
    model.eval()
    losses = utils.AverageMeter()
    total=0
    correct=0
    
    
    with torch.no_grad():
        tk0 = tqdm(data_loader, total=len(data_loader))
        for bi, d in enumerate(tk0):

            ids = d["ids"]
            token_type_ids = d["token_type_ids"]
            mask = d["mask"]
            target = d["target"]
            
            orig_tweet = d["orig_tweet"]
            offsets = d["offsets"]
            ids = ids.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            target = target.to(device, dtype=torch.long)

            model.zero_grad()
            logit = model(
                ids=ids,
                mask=mask,
                token_type_ids=token_type_ids,
            )
            _, predicted = torch.max(logit.data, 1)
            
            total += target.size(0)
            correct += (predicted == target).sum().item()

            loss = loss_fn(logit,target)
            losses.update(loss.item(), ids.size(0))
            tk0.set_postfix(loss=losses.avg)
            
            

    
    return (100 * correct / total)

In [12]:
def run(df_train,df_valid):
    
    train_dataset = TweetDataset(
        tweet=df_train.text.values,
        sentiment=df_train.label.values
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4
    )

    valid_dataset = TweetDataset(
        tweet=df_valid.text.values,
        sentiment=df_valid.label.values
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=2
    )

    device = torch.device("cuda")
    model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH)
    model_config.output_hidden_states = False
    model = TweetModel(conf=model_config)
    model.to(device)

    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]
    
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, 
        num_warmup_steps=0, 
        num_training_steps=num_train_steps
    )
    
    # I'm training only for 3 epochs even though I specified 5!!!
    for epoch in range(3):
        train_fn(train_data_loader, model, optimizer, device, scheduler=scheduler)
        print(eval_fn(valid_data_loader, model, device))
    
    torch.save(model.state_dict(),"modelv1")


In [13]:

train, valid = train_test_split(total_df, test_size=0.05,stratify=total_df['label'])

In [14]:
run(train,valid)

HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


53.7037037037037


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


70.37037037037037


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


70.37037037037037


In [15]:
device = torch.device("cpu")
model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH)    
model_config.output_hidden_states = False

In [16]:
model1 = TweetModel(conf=model_config)

In [17]:

model1.to(device)
model1.load_state_dict(torch.load("../input/bert-base-uncased-using-pytorch/modelv1",map_location='cpu'))
model1.eval()


TweetModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
 

In [18]:
print(np.unique(train['label'],return_counts=True))
train

(array([0, 1]), array([571, 452]))


Unnamed: 0.1,Unnamed: 0,label,text
951,15565,0,Says wearing face masks is more harmful to you...
786,12604,0,;Not one illness has been reported from raw mi...
945,15413,1,;It used to be that the only children at schoo...
706,11711,0,";For too long, Obamacare has caused high premi..."
146,3341,1,Technology coming to Virginia allows COVID-19 ...
...,...,...,...
167,3764,1,Wisconsin bill ;grants drug companies and medi...
600,10497,1,Says Mike DeWine ;flip-flopped on the gun issu...
903,14778,0,;We don&rsquo;t have a (military) reserve forc...
725,11907,0,Video shows President Donald Trump saying COVI...


In [19]:
data=process_data(train['text'].values[-1],0,config.TOKENIZER,config.MAX_LEN)

ids=torch.tensor([data["ids"]], dtype=torch.long)
mask=torch.tensor([data["mask"]], dtype=torch.long)
token_type_ids=torch.tensor([data["token_type_ids"]], dtype=torch.long)
target=torch.tensor(data["target"], dtype=torch.long)
orig_tweet=data["orig_tweet"]
offsets=torch.tensor(data["offsets"], dtype=torch.long)

ids = ids.to(device, dtype=torch.long)
token_type_ids = token_type_ids.to(device, dtype=torch.long)
mask = mask.to(device, dtype=torch.long)
target = target.to(device, dtype=torch.long)

model1.zero_grad()
logit = model1(
    ids=ids,
    mask=mask,
    token_type_ids=token_type_ids,
)

logit= torch.softmax(logit, dim=1).cpu().detach().numpy()

print(logit)

[[0.4482183 0.5517817]]


In [20]:
for g in logit[0]:
    print("{0:.0}".format(g))

0.4
0.6
