In [5]:
import ijson
import itertools
import random
import numpy as np
import sys, os
import pandas as pd 
import torch
from torchsummary import summary
from torchtext import data
import torch.nn as nn
import torch.utils.data
from torch.utils.data import Dataset, TensorDataset,DataLoader
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score
from tqdm import tqdm, tqdm_notebook
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings(action='once')
import pickle
# from apex import amp
import shutil

%load_ext autoreload
%autoreload 2
%matplotlib inline


SEED = 7219
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


<torch._C.Generator at 0x7faf7098ea70>

In [6]:
# Import transformers specific packages
from transformers import BertTokenizer, BertModel, BertConfig
from transformers import  BertForSequenceClassification, BertForTokenClassification
from transformers import AdamW,get_linear_schedule_with_warmup

In [7]:
# Set the device and empty cache 
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# cudnn.benchmark = True
torch.cuda.empty_cache()

In [19]:
# Class for model training and inference
class Bert_Model():
    def __init__(self,train_df,test_df,bert_model_name,bert_model_path,
                tokenizer,
                max_seq_length=128,seed=1234):
        
        if max_seq_length > tokenizer.max_model_input_sizes[bert_model_name]:
            print("Max sequence length specified > 512!!... resetting to 128")
            print("If you don't want this then set max_seq_length to <= 512")
            self._MAX_SEQUENCE_LENGTH = 128
        else:
            self._MAX_SEQUENCE_LENGTH = max_seq_length
        self._SEED = seed
        self._WORK_DIR = "/Users/suhasgupta/w251/mids-w251-final-project/models/Tranformer_based/workingdir/"
        self._bert_model_path=bert_model_path
        self._bert_model_name=bert_model_name
        self._train_data=train_df
        self._test_size=test_df
        self._tokenizer = tokenizer

    def tokenize(self,text_array):
        ''' Returns tokenized IDs'''
#         all_tokens=torch.zeros((len(text_array),self._MAX_SEQUENCE_LENGTH))
        all_tokens=[]
        for i,text in enumerate(tqdm_notebook(text_array)):
            tokens = torch.tensor(
                            tokenizer.encode(
                                   text, 
                                   add_special_tokens=True,
                                   max_length=self._MAX_SEQUENCE_LENGTH,
                                   pad_to_max_length=True))
#             all_tokens[i]=tokens
            all_tokens.append(tokens)
        return all_tokens

    def initialize_model_for_training(self,dataset_len,EPOCHS=1,model_seed=21000,lr=2e-5,batch_size=32,
                                      accumulation_steps=2):
        # Setup model parameters
        np.random.seed(model_seed)
        torch.manual_seed(model_seed)
        torch.cuda.manual_seed(model_seed)
        torch.backends.cudnn.deterministic = True

        # Empty cache
        torch.cuda.empty_cache()
        model = BertForSequenceClassification.from_pretrained(self._bert_model_path,cache_dir=None,num_labels=2)
        model.zero_grad()
        model = model.to(device)
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
            ]
        num_train_optimization_steps = int(EPOCHS*dataset_len/batch_size/accumulation_steps)
        optimizer = AdamW(optimizer_grouped_parameters, lr=lr, correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
        scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps=1000,num_training_steps=num_train_optimization_steps)  # PyTorch scheduler
#         model, optimizer = amp.initialize(model, optimizer, opt_level="O1",verbosity=0)
#         torch.nn.utils.clip_grad_norm_(optimizer_grouped_parameters,1.0)
        model=model.train()
        return model,optimizer,scheduler,EPOCHS

    def run_training(self,model,train_dataLoader,optimizer,scheduler,EPOCHS=1,batch_size=32,accumulation_steps=2):
        tq = tqdm_notebook(range(EPOCHS))
        for epoch in tq:
#             train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
            avg_loss = 0.
            avg_accuracy = 0.
            lossf=None
            tk0 = tqdm_notebook(enumerate(train_dataLoader),total=len(train_dataLoader),leave=False)
            optimizer.zero_grad()   # Bug fix - thanks to @chinhuic
            for i,(x_batch, y_batch) in tk0:
#                 y_pred = model(x_batch.to(device), attention_mask=(x_batch>0).to(device), labels=None)  
                outputs = model(x_batch.to(device), labels=y_batch.to(device))
                loss, y_pred = outputs
#                 loss =  F.binary_cross_entropy_with_logits(y_pred,y_batch.to(device))
#                 with amp.scale_loss(loss, optimizer) as scaled_loss:
#                     scaled_loss.backward()
                loss.backward()
                if (i+1) % accumulation_steps == 0:             # Wait for several backward steps
                    optimizer.step()                            # Now we can do an optimizer step
                    scheduler.step()
                    optimizer.zero_grad()
                if lossf:
                    lossf = 0.98*lossf+0.02*loss.item()
                else:
                    lossf = loss.item()
                tk0.set_postfix(loss = lossf)
                avg_loss += loss.item() / len(train_dataLoader)
#                 avg_accuracy += torch.mean(((torch.sigmoid(y_pred[:])>0.5) == (y_batch[:]>0.5).to(device)).to(torch.float) ).item()/len(train_loader)
            tq.set_postfix(avg_loss=avg_loss,avg_accuracy=avg_accuracy)
            return model


In [8]:
class CreateDataset(Dataset):
    def __init__(self,data,labels):
        self._dataset = [[train_data_tokenized[i],train_labels.values[i]] for i in range(0,len(train_data_tokenized))]
    
    def __len__(self):
        return len(self._dataset)

    def __getitem__(self,idx):
        return self._dataset[idx]


In [9]:
# Define constants and paths

seed = 7843
bert_model_name = "bert-base-uncased"
# Convert TF checkpoint to pytorch chckpoint and then use as input to class object
bert_model_path = "/Users/suhasgupta/w251/mids-w251-final-project/data/BERT/uncased_L-12_H-768_A-12-pytorch/"
# data_path = "/Users/suhasgupta/w251/mids-w251-final-project/data/nlp.cs.princeton.edu/SARC/2.0/pol/"
data_path = "/Users/suhasgupta/w251/mids-w251-final-project/data/nlp.cs.princeton.edu/SARC/2.0/files/"
work_dir = "/Users/suhasgupta/w251/mids-w251-final-project/models/Tranformer_based/workingdir/"

tokenizer = BertTokenizer.from_pretrained(bert_model_name, cache_dir=None,do_lower_case=True)
max_seq_len = 64

# Load and check the dataset from files on disk 
train_file_name = data_path+'balanced_train.csv'
test_file_name  = data_path+'balanced_test.csv'

all_train_df = pd.read_csv(train_file_name)
test_df = pd.read_csv(test_file_name)

# Create a train, valid split
from sklearn.model_selection import train_test_split
train_df, valid_df = train_test_split(all_train_df, test_size=0.2,random_state=seed)

train_data   = train_df.text.fillna("DUMMY_VALUE")
train_labels = train_df.label
valid_data  = valid_df.text.fillna("DUMMY_VALUE")
valid_labels = valid_df.label
test_data  = test_df.text.fillna("DUMMY_VALUE")
test_labels = test_df.label

train_size,valid_size,test_size = len(train_df),len(valid_df),len(test_df)
print(train_size,valid_size,test_size)

205665 51417 64666


In [11]:
# Create a model object 
bert_model1=Bert_Model(train_df=train_df,
                      test_df=test_df,
                      bert_model_name=bert_model_name,
                      bert_model_path=bert_model_path,
                      tokenizer=tokenizer,
                      max_seq_length=max_seq_len)

In [12]:
%timeit
train_data_tokenized = bert_model1.tokenize(train_data)
valid_data_tokenized = bert_model1.tokenize(valid_data)
test_data_tokenized = bert_model1.tokenize(test_data)


HBox(children=(IntProgress(value=0, max=205665), HTML(value='')))




HBox(children=(IntProgress(value=0, max=51417), HTML(value='')))




HBox(children=(IntProgress(value=0, max=64666), HTML(value='')))




In [13]:
train_dataset = CreateDataset(train_data_tokenized,train_labels)
valid_dataset = CreateDataset(valid_data_tokenized,valid_labels)

In [14]:
train_dataset[0]
valid_dataset[0]

[tensor([  101,  2106,  2017,  9530,  8873, 27390,  2063,  9779, 11140,  3512,
         19528,  3370,  1029,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0]), 0]

[tensor([  101,  2106,  2017,  9530,  8873, 27390,  2063,  9779, 11140,  3512,
         19528,  3370,  1029,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0]), 0]

In [16]:
params = {'batch_size': 32,
          'shuffle': False,
          'num_workers': 16}
max_epochs = 1
print("Expected number of batches:", int(len(train_data_tokenized)/params['batch_size']))
train_dataLoader = torch.utils.data.DataLoader(train_dataset,batch_size=params['batch_size'],
                                         shuffle=params['shuffle'],
                                         num_workers=params['num_workers'],
                                         pin_memory=False,drop_last=True)
print("Generated number of batches:%d" %len(train_dataLoader))

Expected number of batches: 6427
Generated number of batches:6427


#### Model Initialization and Training 

In [18]:
# Initialize the model
model,optimizer,scheduler,EPOCHS = bert_model1.initialize_model_for_training(len(train_dataLoader),EPOCHS=max_epochs)

In [None]:
# Train 
bert_model1.run_training(model,train_dataLoader,
                         optimizer=optimizer,scheduler=scheduler,EPOCHS=max_epochs)

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=6427), HTML(value='')))