In [9]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import numpy as np
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math
import pickle
import statistics

from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import tqdm.notebook as tqdm
import nltk
# from google.colab import files

In [10]:
import pandas as pd
import numpy as np
import sys
from functools import partial
import time

In [11]:
# General util functions
def make_dir_if_not_exists(directory):
	"""Create ``directory`` (and any missing parents) if it does not exist yet.

	Args:
		directory: Path of the directory to create.
	"""
	# The notebook's import cells never import ``logging``, so the original
	# ``logging.info`` call raised NameError the first time a directory was
	# actually missing. Import it locally so this helper is self-contained.
	import logging

	if not os.path.exists(directory):
		logging.info("Creating new directory: {}".format(directory))
		# exist_ok=True: tolerate the directory being created between the
		# existence check above and this call.
		os.makedirs(directory, exist_ok=True)

def print_list(l, K=None):
	"""Print the elements of ``l`` one per line, followed by a blank line.

	Args:
		l: Iterable of items to print.
		K: If given, printing stops once the element index equals ``K``,
			so only the first ``K`` items are shown.
	"""
	# Keep consuming (index, item) pairs until the index reaches K.
	leading_pairs = itertools.takewhile(lambda pair: not (pair[0] == K), enumerate(l))
	for _, item in leading_pairs:
		print(item)
	print()

def remove_multiple_spaces(string):
	"""Collapse every run of whitespace in ``string`` to one space and trim both ends."""
	collapsed = re.sub(r'\s+', ' ', string)
	return collapsed.strip()

def save_in_pickle(save_object, save_file):
	"""Pickle ``save_object`` and write the bytes to ``save_file`` (opened in binary mode)."""
	serialized = pickle.dumps(save_object)
	with open(save_file, "wb") as sink:
		sink.write(serialized)

def load_from_pickle(pickle_file):
	"""Read ``pickle_file`` in binary mode and return the unpickled object.

	Unpickling can execute arbitrary code, so only use this on files you trust.
	"""
	with open(pickle_file, "rb") as source:
		raw_bytes = source.read()
	return pickle.loads(raw_bytes)

def save_in_txt(list_of_strings, save_file):
	"""Write each string to ``save_file`` on its own line.

	Every entry is stripped of surrounding whitespace before a single
	newline is appended.
	"""
	with open(save_file, "w") as writer:
		writer.writelines(f"{entry.strip()}\n" for entry in list_of_strings)

def load_from_txt(txt_file):
	"""Return the lines of ``txt_file`` as a list, each stripped of surrounding whitespace."""
	with open(txt_file, "r") as reader:
		return [raw_line.strip() for raw_line in reader]

## Load Dataset

In [62]:
# Name of the corpus to load; the CSVs are expected at data/<dataset>_<Split>.csv.
dataset = 'Constraint' # 'Constraint' # 'Rumor' # 'Political' # 'Liar'
# Read the three splits in one pass: train, validation, test.
train_data, val_data, test_data = (
    pd.read_csv(f'data/{dataset}_{split_name}.csv')
    for split_name in ('Train', 'Validation', 'Test')
)

In [63]:
# Number of rows in the train, validation and test splits.
len(train_data), len(val_data), len(test_data)

(6420, 2140, 2140)

In [64]:
# Stack all three splits to compute corpus-wide statistics.
tmp_df = pd.concat([train_data, val_data, test_data], ignore_index=True)
# Length of each example = number of space-separated pieces in its text.
tmp_df['length'] = [len(text.split(' ')) for text in tmp_df['content']]
# Total number of examples and their mean length.
(len(tmp_df), tmp_df['length'].mean())

(10700, 26.983831775700935)

In [65]:
# Distinct labels in order of first appearance in the training split.
labels = train_data['label'].unique()

# Two-way lookup between label strings and integer class ids.
mapping_l2i = {label: index for index, label in enumerate(labels)}
mapping_i2l = dict(enumerate(labels))
mapping_l2i

{'real': 0, 'fake': 1}

In [66]:
# Share of the most frequent label in the validation split (majority-class baseline).
val_data['label'].value_counts().iloc[0] / len(val_data)

0.5233644859813084

In [67]:
# Share of the most frequent label in the test split (majority-class baseline).
test_data['label'].value_counts().iloc[0] / len(test_data)

0.5233644859813084

In [36]:
# Display the training split for a quick visual check.
# NOTE(review): consider train_data.head() here so the preview stays small.
train_data

Unnamed: 0.1,Unnamed: 0,label,content
0,0,T,2020 Indy 500 postponed from May to August due...
1,1,T,With fears over the spread of Coronavirus moun...
2,2,U,"""It's Winter-Time For Capital Flow"" - Outbreak..."
3,3,F,Cristiano Ronaldo will convert his hotels into...
4,4,T,Australia now has drive-thru coronavirus testi...
...,...,...,...
5020,5020,F,Anybody in U.S. that wants a test (for the cor...
5021,5021,U,@HuffPost Result of China's long-run policy: #...
5022,5022,F,Dawoodi Bohra youth were licking used utensils...
5023,5023,U,Wilbur Ross: Coronavirus Could ‘Accelerate The...


In [38]:
# Show the full text of a single training example, selected by its positional index.
train_data.iloc[[5024]].content.values[0]

'More than 6,000 people are trapped on a cruise ship in Italy after a woman was suspected of having the coronavirus'

In [6]:
# Torch dataset wrapping a labelled text DataFrame
class TweetDataset(Dataset):
    """Map-style dataset over a DataFrame with ``label`` and ``content`` columns.

    On construction a private copy of the frame is extended with:
      * ``target``  - integer class id looked up from the label,
      * ``id``      - positional row number (0 .. len-1),
      * ``content`` - the text cut down to its first ``MAX_WORDS``
                      space-separated pieces.

    Args:
        df: Source DataFrame. It is copied, so the caller's frame is not modified
            (the original implementation added/overwrote columns on it in place).
        label_to_index: Optional mapping from label value to class id. When omitted,
            the notebook-level ``mapping_l2i`` is used, as before.

    Raises:
        KeyError: if a label is missing from the mapping.
    """

    # Upper bound on space-separated pieces kept per text.
    MAX_WORDS = 510

    def __init__(self, df, label_to_index=None):
        if label_to_index is None:
            # Backward-compatible default: the mapping built in the label cell.
            label_to_index = mapping_l2i

        # Work on a copy so re-running cells never sees a half-processed input frame.
        frame = df.copy()
        frame['target'] = frame.label.apply(lambda x: label_to_index[x])
        frame['id'] = list(range(len(frame)))
        frame['content'] = frame.content.apply(
            lambda x: ' '.join(x.split(' ')[:self.MAX_WORDS]))
        self.df = frame

    def __len__(self):
        """Number of examples."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the row at positional index ``idx`` (a pandas Series)."""
        return self.df.iloc[idx]

In [7]:
# Wrap each split's DataFrame in a TweetDataset (train, validation, test in that order).
train_dataset, val_dataset, test_dataset = map(
    TweetDataset, (train_data, val_data, test_data))

In [8]:
def transformer_collate_fn(batch, tokenizer, max_length=512):
    """Tokenize a batch of examples and pad them into rectangular tensors.

    Args:
        batch: Iterable of items supporting ``item['content']`` (text) and
            ``item['target']`` (integer class id).
        tokenizer: Callable that takes a list of texts and returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` lists. Its ``pad_token_id``
            is used for padding; if that attribute is missing or ``None`` the id
            of ``'[PAD]'`` is looked up in ``tokenizer.get_vocab()``.
        max_length: Maximum number of token ids kept per example (default 512).
            Longer sequences keep their first ``max_length - 1`` ids plus their
            final id.

    Returns:
        Tuple ``(sentences, labels, masks)``: padded token ids, stacked labels,
        and attention masks padded with 0.
    """
    # Read the pad id directly instead of rebuilding the whole vocabulary dict
    # on every batch (the previous per-batch get_vocab() call was pure overhead).
    pad_token_id = getattr(tokenizer, 'pad_token_id', None)
    if pad_token_id is None:
        pad_token_id = tokenizer.get_vocab()['[PAD]']

    sentences, labels, masks = [], [], []
    for data in batch:
        tokenizer_output = tokenizer([data['content']])
        token_ids = tokenizer_output['input_ids'][0]
        mask = tokenizer_output['attention_mask'][0]

        # Manual truncation: keep the head of the sequence and re-attach the
        # last id so the sequence still ends with its closing token.
        if len(token_ids) > max_length:
            token_ids = token_ids[:max_length - 1] + token_ids[-1:]
            mask = mask[:max_length - 1] + mask[-1:]

        sentences.append(torch.tensor(token_ids))
        labels.append(torch.tensor(data['target']))
        masks.append(torch.tensor(mask))

    sentences = pad_sequence(sentences, batch_first=True, padding_value=pad_token_id)
    labels = torch.stack(labels, dim=0)
    masks = pad_sequence(masks, batch_first=True, padding_value=0)
    return sentences, labels, masks

In [9]:
def epoch_time(start_time: int,
               end_time: int):
    """Split the interval between two timestamps (in seconds) into minutes and seconds.

    Returns:
        ``(minutes, seconds)``, both truncated to integers.
    """
    elapsed = end_time - start_time
    whole_minutes = int(elapsed / 60)
    leftover_seconds = int(elapsed - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [10]:
def count_parameters(model: nn.Module):
    """Return the total number of elements across parameters that require gradients."""
    trainable_total = 0
    for parameter in model.parameters():
        if parameter.requires_grad:
            trainable_total += parameter.numel()
    return trainable_total

In [11]:
def train(model,
          dataloader,
          optimizer,
          device,
          clip: float,
          scheduler = None):
    """Run one training pass over ``dataloader`` and return the mean batch loss.

    For every batch: forward pass, cross-entropy loss, backward pass, gradient-norm
    clipping to ``clip``, optimizer step and, when a scheduler is given, one
    scheduler step.

    Args:
        model: Module called as ``model(sentences, masks)`` that returns class logits.
        dataloader: Iterable of batches whose first three entries are
            ``(sentences, labels, masks)``.
        optimizer: Optimizer updating ``model``'s parameters.
        device: Device the batch tensors are moved to.
        clip: Maximum gradient norm.
        scheduler: Optional learning-rate scheduler stepped once per batch.

    Returns:
        Sum of per-batch losses divided by ``len(dataloader)``.
    """
    model.train()
    batch_losses = []

    for batch in dataloader:
        sentences, labels, masks = batch[:3]

        optimizer.zero_grad()

        logits = model(sentences.to(device), masks.to(device))
        batch_loss = F.cross_entropy(logits, labels.to(device))
        batch_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()
        if scheduler is not None:
            scheduler.step()

        batch_losses.append(batch_loss.item())

    return sum(batch_losses) / len(dataloader)

In [12]:
def evaluate(model,
             dataloader,
             device):
    """Return the mean cross-entropy batch loss of ``model`` over ``dataloader``.

    The model is switched to eval mode and no gradients are tracked.

    Args:
        model: Module called as ``model(sentences, masks)`` that returns class logits.
        dataloader: Iterable of batches whose first three entries are
            ``(sentences, labels, masks)``.
        device: Device the batch tensors are moved to.

    Returns:
        Sum of per-batch losses divided by ``len(dataloader)``.
    """
    model.eval()
    batch_losses = []

    with torch.no_grad():
        for batch in dataloader:
            sentences, labels, masks = batch[:3]
            logits = model(sentences.to(device), masks.to(device))
            batch_losses.append(F.cross_entropy(logits, labels.to(device)).item())

    return sum(batch_losses) / len(dataloader)

In [13]:
def evaluate_acc(model,
                 dataloader,
                 device):
    """Return the classification accuracy of ``model`` over ``dataloader``.

    The model is switched to eval mode and no gradients are tracked.

    Args:
        model: Module called as ``model(sentences, masks)`` that returns class logits.
        dataloader: Iterable of batches whose first three entries are
            ``(sentences, labels, masks)``.
        device: Device the batch tensors are moved to.

    Returns:
        0-dim tensor holding ``correct / total`` (it formats like a float in f-strings).
    """
    model.eval()

    total_correct = 0
    total = 0
    with torch.no_grad():
        for batch in dataloader:
            sentences, labels, masks = batch[0], batch[1], batch[2]
            logits = model(sentences.to(device), masks.to(device))
            # argmax over raw logits: softmax is monotonic, so applying it first
            # cannot change which class wins and only adds work.
            predictions = torch.argmax(logits, dim=1)
            total_correct += (predictions == labels.to(device)).sum()
            total += sentences.size(0)

    return total_correct / total

# Model Setup and Fine-tuning

In [14]:
#first, install the hugging face transformer package in your colab
# !pip install transformers
from transformers import get_linear_schedule_with_warmup
from tokenizers.processors import BertProcessing

In [15]:
# Do not change this line, as it sets the model that Hugging Face will load
# If you are interested in what other models are available, you can find the list of model names here:
# https://huggingface.co/transformers/pretrained_models.html
bert_model_name = 'distilbert-base-uncased' 
# Load the pretrained encoder and the tokenizer that matches it.
from transformers import AutoModel
from transformers import AutoTokenizer
bert_model = AutoModel.from_pretrained(bert_model_name)
# NOTE(review): truncation/max_length are passed at load time here, yet a later cell's
# output still warns about a 683-token sequence (> 512), so these kwargs do not appear to
# truncate when the tokenizer is called. The collate function truncates manually instead.
# Confirm against the tokenizer docs before relying on them.
tokenizer = AutoTokenizer.from_pretrained(bert_model_name, truncation=True, max_length=512)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
class TweetClassifier(nn.Module):
    """Encoder plus a two-layer classification head.

    The first position of the encoder's first output is taken as the sequence
    embedding and passed through ``Linear -> ReLU -> Dropout -> Linear``.

    Args:
        bert_encoder: Module called as ``bert_encoder(src, mask)`` whose result is
            indexable, with element ``[0]`` being indexable as ``[:, 0]``
            (assumed to be (batch, seq, hidden) hidden states - TODO confirm
            for encoders other than the one loaded in this notebook).
        enc_hid_dim: Size of the encoder embedding (default 768).
        outputs: Number of output classes.
        dropout: Dropout probability used inside the head.
    """

    def __init__(self,
                 bert_encoder: nn.Module,
                 enc_hid_dim=768, #default embedding size
                 outputs=2,
                 dropout=0.1):
        super().__init__()

        self.bert_encoder = bert_encoder

        self.enc_hid_dim = enc_hid_dim

        # Attribute names are part of the state_dict keys; keep them stable so
        # previously saved checkpoints still load.
        self.pre_classifier = nn.Linear(enc_hid_dim, enc_hid_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout)
        self.out = nn.Linear(enc_hid_dim, outputs)

    def _first_token_embedding(self, src, mask):
        """Run the encoder and return the representation at position 0 of each sequence."""
        bert_output = self.bert_encoder(src, mask)
        return bert_output[0][:, 0]

    def _classify(self, embed):
        """Apply the classification head to a batch of sequence embeddings."""
        return self.out(self.dropout(self.relu(self.pre_classifier(embed))))

    def forward(self,
                src,
                mask):
        """Return class logits for a batch of token ids ``src`` with attention ``mask``."""
        return self._classify(self._first_token_embedding(src, mask))

    def inference(self,
                src,
                mask):
        """Return ``(logits, embedding)``: the logits from ``forward`` plus the encoder
        embedding they were computed from."""
        embed = self._first_token_embedding(src, mask)
        return self._classify(embed), embed


In [17]:
def init_classification_head_weights(m: nn.Module, hidden_size=768):
    """Re-initialise the classification head of ``m`` in place.

    Weights of ``pre_classifier`` and ``out`` are drawn from
    ``U(-sqrt(1/hidden_size), sqrt(1/hidden_size))``; their biases are set to zero.
    All other parameters are left untouched. The name of each re-initialised
    parameter is printed.

    Args:
        m: Module exposing parameters named ``pre_classifier.*`` and ``out.*``.
        hidden_size: Fan-in used to compute the uniform bound (default 768).
    """
    bound = (1 / hidden_size) ** 0.5
    for name, param in m.named_parameters():
        if name in ('out.weight', 'pre_classifier.weight'):
            print(name)
            nn.init.uniform_(param.data, a=-bound, b=bound)
        elif name in ('out.bias', 'pre_classifier.bias'):
            print(name)
            # The previous call, nn.init.uniform_(param, 0), only set the lower
            # bound and therefore sampled biases from U(0, 1). The literal 0
            # indicates zero-initialisation was intended.
            nn.init.zeros_(param.data)

In [18]:
#define hyperparameters
BATCH_SIZE = 10
LR = 1e-5 # 1e-4 # 1e-5
WEIGHT_DECAY = 0
N_EPOCHS = 5 #3
CLIP = 1.0
SEED = 42

# Seed every RNG in use so that head initialisation, shuffling and dropout are
# repeatable across Restart & Run All.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

#define models, move to device, and initialize weights
# Prefer GPU index 3 (the original hard-coded choice), but fall back to the first
# GPU, or to the CPU, when that device is not present instead of crashing.
PREFERRED_GPU = 3
if torch.cuda.is_available():
    gpu_index = PREFERRED_GPU if torch.cuda.device_count() > PREFERRED_GPU else 0
    device = torch.device(f'cuda:{gpu_index}')
else:
    device = torch.device('cpu')

model = TweetClassifier(bert_model, outputs=len(mapping_l2i)).to(device)
print('Model Initialized')

Model Initialized


In [19]:
# Create PyTorch dataloaders for train_dataset, val_dataset and test_dataset.
# Only the training loader shuffles; all three share the same batch size and collate function.
train_dataloader, val_dataloader, test_dataloader = (
    DataLoader(
        split_dataset,
        batch_size=BATCH_SIZE,
        collate_fn=partial(transformer_collate_fn, tokenizer=tokenizer),
        shuffle=should_shuffle,
    )
    for split_dataset, should_shuffle in (
        (train_dataset, True),
        (val_dataset, False),
        (test_dataset, False),
    )
)

In [20]:
def save_embedding(dataloader, split='train', mode='pretrain'):
    """Run the notebook-level ``model`` over ``dataloader`` and save embeddings, logits and labels.

    Three tensors are written under ``emb/`` as
    ``{dataset}_{split}_{mode}_emb.pt``, ``..._pred.pt`` and ``..._gt.pt``. Rows are
    stored in the order the dataloader yields them, so a shuffling loader produces a
    shuffled (but mutually aligned) set of files.

    Relies on the notebook globals ``model``, ``device`` and ``dataset``.

    Args:
        dataloader: Iterable of batches whose first three entries are
            ``(sentences, labels, masks)``.
        split: Split name used in the output file names.
        mode: Tag used in the output file names (e.g. before/after fine-tuning).
    """
    embeddings, pred, gtlabels = [], [], []

    model.eval()

    with torch.no_grad():
        for batch in dataloader:
            sentences, labels, masks = batch[0], batch[1], batch[2]
            output, emb = model.inference(sentences.to(device), masks.to(device))
            embeddings.append(emb.detach().cpu())
            pred.append(output.detach().cpu())
            gtlabels.append(labels.detach().cpu())

    embeddings = torch.cat(embeddings, dim=0)
    pred = torch.cat(pred, dim=0)
    gtlabels = torch.cat(gtlabels)

    print(embeddings.shape, pred.shape, gtlabels.shape)

    # torch.save does not create missing directories; make sure the target exists.
    os.makedirs('emb', exist_ok=True)
    torch.save(embeddings, f'emb/{dataset}_{split}_{mode}_emb.pt')
    torch.save(pred, f'emb/{dataset}_{split}_{mode}_pred.pt')
    torch.save(gtlabels, f'emb/{dataset}_{split}_{mode}_gt.pt')

In [21]:
# Dump embeddings for every split (default mode tag), in the order test, validation, train.
for split_name, loader in (
    ('Test', test_dataloader),
    ('Validation', val_dataloader),
    ('Train', train_dataloader),
):
    save_embedding(loader, split_name)

Token indices sequence length is longer than the specified maximum sequence length for this model (683 > 512). Running this sequence through the model will result in indexing errors


torch.Size([1267, 768]) torch.Size([1267, 6]) torch.Size([1267])
torch.Size([1284, 768]) torch.Size([1284, 6]) torch.Size([1284])
torch.Size([10240, 768]) torch.Size([10240, 6]) torch.Size([10240])


In [22]:
# WEIGHT_DECAY is defined with the other hyperparameters but was never handed to the
# optimizer, so editing it had no effect. Pass it through explicitly (0 keeps the
# previous behaviour).
optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)

# train() steps the scheduler once per batch, hence the total step count below.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=10, num_training_steps=N_EPOCHS*len(train_dataloader))

print(f'The model has {count_parameters(model):,} trainable parameters')

# Baseline metrics before any fine-tuning step.
train_loss = evaluate(model, train_dataloader, device)
train_acc = evaluate_acc(model, train_dataloader, device)

valid_loss = evaluate(model, val_dataloader, device)
valid_acc = evaluate_acc(model, val_dataloader, device)

print(f'Initial Train Loss: {train_loss:.3f}')
print(f'Initial Train Acc: {train_acc:.3f}')
print(f'Initial Valid Loss: {valid_loss:.3f}')
print(f'Initial Valid Acc: {valid_acc:.3f}')

for epoch in range(N_EPOCHS):
    # Only the training pass is timed; the evaluation passes below are not included.
    start_time = time.time()
    train_loss = train(model, train_dataloader, optimizer, device, CLIP, scheduler)
    end_time = time.time()
    train_acc = evaluate_acc(model, train_dataloader, device)
    valid_loss = evaluate(model, val_dataloader, device)
    valid_acc = evaluate_acc(model, val_dataloader, device)
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\tTrain Acc: {train_acc:.3f}')
    print(f'\tValid Loss: {valid_loss:.3f}')
    print(f'\tValid Acc: {valid_acc:.3f}')

The model has 66,955,010 trainable parameters
Initial Train Loss: 0.702
Initial Train Acc: 0.476
Initial Valid Loss: 0.701
Initial Valid Acc: 0.477
Epoch: 01 | Time: 0m 48s
	Train Loss: 0.235
	Train Acc: 0.972
	Valid Loss: 0.152
	Valid Acc: 0.958
Epoch: 02 | Time: 0m 49s
	Train Loss: 0.101
	Train Acc: 0.989
	Valid Loss: 0.120
	Valid Acc: 0.970
Epoch: 03 | Time: 0m 48s
	Train Loss: 0.054
	Train Acc: 0.994
	Valid Loss: 0.124
	Valid Acc: 0.974
Epoch: 04 | Time: 0m 49s
	Train Loss: 0.030
	Train Acc: 0.998
	Valid Loss: 0.135
	Valid Acc: 0.971
Epoch: 05 | Time: 0m 45s
	Train Loss: 0.019
	Train Acc: 0.998
	Valid Loss: 0.143
	Valid Acc: 0.972


In [22]:
# NOTE(review): this cell loads model/{dataset}.pt, but that file is only written by a
# later cell. On a fresh Restart & Run All it either fails (file missing) or replaces
# the weights that were just fine-tuned with whatever an earlier session saved. Run the
# save cell first, or skip this cell when evaluating the model trained above.
# map_location lets a checkpoint saved on one device load onto the current `device`.
model.load_state_dict(torch.load(f'model/{dataset}.pt', map_location=device))

<All keys matched successfully>

In [23]:
#run this cell and save its outputs to receive full credit for this implementation
# Final held-out metrics: mean batch loss and accuracy on the test split.
test_loss = evaluate(model, test_dataloader, device)
test_acc = evaluate_acc(model, test_dataloader, device)
print(f'Test Loss: {test_loss:.3f}')
print(f'Test Acc: {test_acc:.3f}')

Test Loss: 1.721
Test Acc: 0.275


In [24]:
# Persist the current weights; torch.save does not create missing directories,
# so make sure model/ exists first.
os.makedirs('model', exist_ok=True)
torch.save(model.state_dict(), f'model/{dataset}.pt')

In [24]:
# Dump embeddings for every split again, tagged 'finetune', in the order test, validation, train.
for split_name, loader in (
    ('Test', test_dataloader),
    ('Validation', val_dataloader),
    ('Train', train_dataloader),
):
    save_embedding(loader, split_name, 'finetune')

torch.Size([1267, 768]) torch.Size([1267, 6]) torch.Size([1267])
torch.Size([1284, 768]) torch.Size([1284, 6]) torch.Size([1284])
torch.Size([10240, 768]) torch.Size([10240, 6]) torch.Size([10240])
