In [22]:
import ijson
import itertools
import random
import numpy as np
import sys, os
import pandas as pd 
import torch
from torchsummary import summary
from torchtext import data
import torch.nn as nn
import torch.utils.data
from torch.utils.data import Dataset, TensorDataset,DataLoader
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score
from tqdm import tqdm, tqdm_notebook
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings(action='once')
import pickle
from docopt import docopt
import time
from torch.autograd import Variable
# from apex import amp
import shutil

%load_ext autoreload
%autoreload 2
%matplotlib inline


SEED = 7219
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


<torch._C.Generator at 0x12fa81ed0>

In [23]:
# Import transformers specific packages
from transformers import BertTokenizer, BertModel, BertConfig
from transformers import  BertForSequenceClassification, BertForTokenClassification
from transformers import AdamW, get_linear_schedule_with_warmup
#from transformers import WarmupLinearSchedule as get_linear_schedule_with_warmup
#

In [24]:
# Set the device and empty cache 
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# cudnn.benchmark = True
torch.cuda.empty_cache()
device
#model.to(device)

device(type='cpu')

In [25]:
# Class for model training and inference
class Bert_Model():
    def __init__(self,train_df,test_df,bert_model_name,bert_model_path,
                tokenizer,
                max_seq_length=128,seed=1234):
        
        if max_seq_length > tokenizer.max_model_input_sizes[bert_model_name]:
            print("Max sequence length specified > 512!!... resetting to 128")
            print("If you don't want this then set max_seq_length to <= 512")
            self._MAX_SEQUENCE_LENGTH = 128
        else:
            self._MAX_SEQUENCE_LENGTH = max_seq_length
        self._SEED = seed
        self._WORK_DIR = "/Users/debalinamaiti/Documents/GraduateStudy/MIDS/W251/FinalProject/mids-w251-final-project/models/Tranformer_based/workingdir/"
        self._bert_model_path=bert_model_path
        self._bert_model_name=bert_model_name
        self._train_data=train_df
        self._test_size=test_df
        self._tokenizer = tokenizer

    def tokenize(self,text_array):
        ''' Returns tokenized IDs'''
#         all_tokens=torch.zeros((len(text_array),self._MAX_SEQUENCE_LENGTH))
        all_tokens=[]
        for i,text in enumerate(tqdm_notebook(text_array)):
            tokens = torch.tensor(
                            tokenizer.encode(
                                   text, 
                                   add_special_tokens=True,
                                   max_length=self._MAX_SEQUENCE_LENGTH,
                                   pad_to_max_length=True)) # commented to get rid of pad_to_max_length

            
#             all_tokens[i]=tokens
            all_tokens.append(tokens)
        return all_tokens

    def initialize_model_for_training(self,dataset_len,EPOCHS=1,model_seed=21000,lr=2e-5,batch_size=32,
                                      accumulation_steps=2):
        # Setup model parameters
        np.random.seed(model_seed)
        torch.manual_seed(model_seed)
        torch.cuda.manual_seed(model_seed)
        torch.backends.cudnn.deterministic = True

        # Empty cache
        torch.cuda.empty_cache()
        model = BertForSequenceClassification.from_pretrained(self._bert_model_path,cache_dir=None,num_labels=2)
        #"bert-base-uncased"
        model.zero_grad()
        model = model.to(device)
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
            ]
        num_train_optimization_steps = int(EPOCHS*dataset_len/batch_size/accumulation_steps)
        optimizer = AdamW(optimizer_grouped_parameters, lr=lr, correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
        scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps=1000,num_training_steps=num_train_optimization_steps)  # PyTorch scheduler
#         model, optimizer = amp.initialize(model, optimizer, opt_level="O1",verbosity=0)
#         torch.nn.utils.clip_grad_norm_(optimizer_grouped_parameters,1.0)
        model=model.train()
        return model,optimizer,scheduler,EPOCHS

    def run_training(self,model,train_dataLoader,optimizer,scheduler,EPOCHS=1,batch_size=32,accumulation_steps=2):
        tq = tqdm_notebook(range(EPOCHS))
        for epoch in tq:
#             train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
            avg_loss = 0.
            avg_accuracy = 0.
            lossf=None
            tk0 = tqdm_notebook(enumerate(train_dataLoader),total=len(train_dataLoader),leave=False)
            optimizer.zero_grad()   # Bug fix - thanks to @chinhuic
            for i,(x_batch, y_batch) in tk0:
#                 y_pred = model(x_batch.to(device), attention_mask=(x_batch>0).to(device), labels=None)  
                print(x_batch)
                print(i)
                print(y_batch)
                outputs = model(x_batch.to(device), labels=y_batch.to(device))
                loss, y_pred = outputs
#                 loss =  F.binary_cross_entropy_with_logits(y_pred,y_batch.to(device))
#                 with amp.scale_loss(loss, optimizer) as scaled_loss:
#                     scaled_loss.backward()
                loss.backward()
                if (i+1) % accumulation_steps == 0:             # Wait for several backward steps
                    optimizer.step()                            # Now we can do an optimizer step
                    scheduler.step()
                    optimizer.zero_grad()
                if lossf:
                    lossf = 0.98*lossf+0.02*loss.item()
                else:
                    lossf = loss.item()
                tk0.set_postfix(loss = lossf)
                avg_loss += loss.item() / len(train_dataLoader)
#                 avg_accuracy += torch.mean(((torch.sigmoid(y_pred[:])>0.5) == (y_batch[:]>0.5).to(device)).to(torch.float) ).item()/len(train_loader)
            tq.set_postfix(avg_loss=avg_loss,avg_accuracy=avg_accuracy)
            return model


In [26]:
class CustomBertConvModel(nn.Module):

    def __init__(self, bert_config, device, dropout_rate, n_class, out_channel=16):
    #def __init__(self, embed_num, embed_dim, class_num, kernel_num, kernel_sizes, bert_config, device, dropout_rate, n_class, out_channel=16):

#     def __init__(self, bert_model_name, bert_model_path, tokenizer,
#                  device, dropout_rate, n_class, out_channel=16):
        """
        :param bert_config: str, BERT configuration description
        :param device: torch.device
        :param dropout_rate: float
        :param n_class: int
        :param out_channel: int, NOTE: out_channel per layer of BERT
        """

        super(CustomBertConvModel, self).__init__()
        
        self.bert_config = bert_config
        self.dropout_rate = dropout_rate
        self.n_class = n_class
        self.out_channel = out_channel
        self.bert = BertModel.from_pretrained(self.bert_config)
        self.out_channels = self.bert.config.num_hidden_layers*self.out_channel
        self.tokenizer = BertTokenizer.from_pretrained(self.bert_config)
        self.conv = nn.Conv2d(in_channels=self.bert.config.num_hidden_layers,
                              out_channels=self.out_channels,
                              kernel_size=(3, self.bert.config.hidden_size),
                              groups=self.bert.config.num_hidden_layers)
        self.hidden_to_softmax = nn.Linear(self.out_channels, self.n_class, bias=True)
        self.dropout = nn.Dropout(p=self.dropout_rate)
        self.device = device

    def forward(self, tokens):
        """
        :param sents:
        :return:
        """
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        encoded_layers, pooled_output = self.bert(input_ids=input_ids, attention_mask=None,
                                                  output_all_encoded_layers=True)
        encoded_stack_layer = torch.stack(encoded_layers, 1)  # (batch_size, channel, max_sent_length, hidden_size)

        conv_out = self.conv(encoded_stack_layer)  # (batch_size, channel_out, some_length, 1)
        conv_out = torch.squeeze(conv_out, dim=3)  # (batch_size, channel_out, some_length)
        conv_out, _ = torch.max(conv_out, dim=2)  # (batch_size, channel_out)
        pre_softmax = self.hidden_to_softmax(conv_out)

        return pre_softmax


In [27]:
class CreateDataset(Dataset):
    def __init__(self,data,labels):
        self._dataset = [[train_data_tokenized[i],train_labels.values[i]] for i in range(0,len(train_data_tokenized))]
    
    def __len__(self):
        return len(self._dataset)

    def __getitem__(self,idx):
        return self._dataset[idx]


In [28]:
# Define constants and paths

seed = 7843
bert_model_name = "bert-base-uncased"
# Convert TF checkpoint to pytorch chckpoint and then use as input to class object
bert_model_path = "/Users/debalinamaiti/Documents/GraduateStudy/MIDS/W251/FinalProject/mids-w251-final-project/models/Tranformer_based/uncased_L-12_H-768_A-12/"
# data_path = "/Users/suhasgupta/w251/mids-w251-final-project/data/nlp.cs.princeton.edu/SARC/2.0/pol/"
data_path = "/Users/debalinamaiti/Documents/GraduateStudy/MIDS/W251/FinalProject/mids-w251-final-project/models/Tranformer_based/data/"
work_dir = "/Users/debalinamaiti/Documents/GraduateStudy/MIDS/W251/FinalProject/mids-w251-final-project/models/Tranformer_based/workingdir/"

tokenizer = BertTokenizer.from_pretrained(bert_model_name, cache_dir=None,do_lower_case=True)
max_seq_len = 64

# Load and check the dataset from files on disk 
train_file_name = data_path+'small_train.csv'
test_file_name  = data_path+'small_test.csv'
# train_file_name = data_path+'train_balanced_with_label.csv'
# test_file_name = data_path+'test_balanced_with_label.csv'

all_train_df = pd.read_csv(train_file_name)
test_df = pd.read_csv(test_file_name)

# Create a train, valid split
from sklearn.model_selection import train_test_split
train_df, valid_df = train_test_split(all_train_df, test_size=0.2,random_state=seed)

train_data   = train_df.text.fillna("DUMMY_VALUE")
train_labels = train_df.label
valid_data  = valid_df.text.fillna("DUMMY_VALUE")
valid_labels = valid_df.label
test_data  = test_df.text.fillna("DUMMY_VALUE")
test_labels = test_df.label

train_size,valid_size,test_size = len(train_df),len(valid_df),len(test_df)
print(train_size,valid_size,test_size)
#print(train_df.text)

799 200 999


In [29]:
# Create a model object 
bert_model1=Bert_Model(train_df=train_df,
                      test_df=test_df,
                      bert_model_name=bert_model_name,
                      bert_model_path=bert_model_path,
                      tokenizer=tokenizer,
                      max_seq_length=max_seq_len)

In [30]:
%timeit
train_data_tokenized = bert_model1.tokenize(train_data)
valid_data_tokenized = bert_model1.tokenize(valid_data)
test_data_tokenized = bert_model1.tokenize(test_data)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=799.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=999.0), HTML(value='')))




In [31]:
train_data_tokenized[0]
train_labels[0]

tensor([  101, 12008,  1010,  1996, 21321,  1997,  2025,  2108,  2583,  2000,
         2022,  1037,  6319,  1997,  3674,  2163,   102,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0])

1

In [32]:
train_dataset = CreateDataset(train_data_tokenized,train_labels)
valid_dataset = CreateDataset(valid_data_tokenized,valid_labels)
test_dateset = CreateDataset(valid_data_tokenized,valid_labels) # Added by D

In [33]:
x_train_dataset = [row[0] for row in train_dataset]
y_train_dataset = [row[1] for row in train_dataset]

In [34]:
train_dataset[0]
valid_dataset[0]
test_dateset[0]

[tensor([  101, 12008,  1010,  1996, 21321,  1997,  2025,  2108,  2583,  2000,
          2022,  1037,  6319,  1997,  3674,  2163,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0]),
 1]

[tensor([  101, 12008,  1010,  1996, 21321,  1997,  2025,  2108,  2583,  2000,
          2022,  1037,  6319,  1997,  3674,  2163,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0]),
 1]

[tensor([  101, 12008,  1010,  1996, 21321,  1997,  2025,  2108,  2583,  2000,
          2022,  1037,  6319,  1997,  3674,  2163,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0]),
 1]

In [35]:
len(train_dataset[0][0])

64

In [36]:
len(train_dataset)

799

In [37]:
params = {'batch_size': 32,
          'shuffle': False,
          'num_workers': 16}
max_epochs = 1
print("Expected number of batches:", int(len(train_data_tokenized)/params['batch_size']))
train_dataLoader = torch.utils.data.DataLoader(train_dataset,batch_size=params['batch_size'],
                                         shuffle=params['shuffle'],
                                         num_workers=params['num_workers'],
                                         pin_memory=False,drop_last=True)
print("Generated number of batches:%d" %len(train_dataLoader))

Expected number of batches: 24
Generated number of batches:24


#### Model Initialization and Training 

In [42]:
# Initialize the model
model,optimizer,scheduler,EPOCHS = bert_model1.initialize_model_for_training(len(train_dataLoader),EPOCHS=max_epochs)

In [43]:
# Train 
bert_model1.run_training(model,train_dataLoader,
                         optimizer=optimizer,scheduler=scheduler,EPOCHS=1)#max_epochs

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=24.0), HTML(value='')))

tensor([[  101, 12008,  1010,  ...,     0,     0,     0],
        [  101,  2065,  2417,  ...,     0,     0,     0],
        [  101,  2079,  2027,  ...,     0,     0,     0],
        ...,
        [  101, 10047,  2025,  ...,     0,     0,     0],
        [  101, 28117,  7377,  ...,     0,     0,     0],
        [  101,  2053,  2008,  ...,     0,     0,     0]])
0
tensor([1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1,
        0, 1, 0, 0, 0, 1, 0, 1])
tensor([[  101,  2034,  3893,  ...,     0,     0,     0],
        [  101,  1996, 28293,  ...,     0,     0,     0],
        [  101,  8840,  2140,  ...,     0,     0,     0],
        ...,
        [  101,  2292,  1005,  ...,     0,     0,     0],
        [  101,  3398,  1010,  ...,     0,     0,     0],
        [  101,  4283,  2005,  ...,     0,     0,     0]])
1
tensor([0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1,
        1, 1, 1, 0, 0, 1, 0, 1])
tensor([[  101,  1045,  2228,  ...,     0,    

tensor([[ 101, 2002, 1005,  ...,    0,    0,    0],
        [ 101, 2012, 2560,  ...,    0,    0,    0],
        [ 101, 2821, 2157,  ...,    0,    0,    0],
        ...,
        [ 101, 2002, 2323,  ...,    0,    0,    0],
        [ 101, 2204, 2447,  ...,    0,    0,    0],
        [ 101, 2009, 1005,  ...,    0,    0,    0]])
18
tensor([1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,
        1, 1, 1, 0, 1, 1, 0, 1])
tensor([[ 101, 2821, 3398,  ...,    0,    0,    0],
        [ 101, 2074, 3651,  ...,    0,    0,    0],
        [ 101, 2821, 1010,  ...,    0,    0,    0],
        ...,
        [ 101, 2092, 5580,  ...,    0,    0,    0],
        [ 101, 3398, 2040,  ...,    0,    0,    0],
        [ 101, 4931, 1010,  ...,    0,    0,    0]])
19
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
        1, 0, 1, 1, 0, 1, 1, 0])
tensor([[  101,  5635,  2240,  ...,     0,     0,     0],
        [  101,  2009,  1005,  ...,     0,     0,     0],


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

## Custom Bert CNN Model

In [39]:
BertCnnModel1 = CustomBertConvModel("bert-base-uncased", 'cpu', 0.3, 2)


BertCnnModel1.train()




CustomBertConvModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affin

In [38]:
def generate_batch_data(x, y, batch_size):
    i, batch = 0, 0
    for batch, i in enumerate(range(0, len(x) - batch_size, batch_size), 1):
        x_batch = x[i : i + batch_size]
        y_batch = y[i : i + batch_size]
        yield x_batch, y_batch, batch
    if i + batch_size < len(x):
        yield x[i + batch_size :], y[i + batch_size :], batch + 1
    if batch == 0:
        yield x, y, 1

In [21]:
##### Hyper Tuning
# embed_num = len(train_dataset)
# embed_dim = len(train_dataset[0][0])
# class_num = 2
# kernel_num = 3
# kernel_sizes = [2, 3, 4]
# dropout = 0.5
# static = True

# lr = 0.001
# optimizer = torch.optim.Adam(BertCnnModel1.parameters(), lr=lr)
# loss_fn = nn.BCELoss()

## The following Training Part is not Working

In [155]:
train_losses, val_losses = [], []

for epoch in range(max_epochs):
    start_time = time.time()
    train_loss = 0

    BertCnnModel1.train(True)
#     for sample in train_data_tokenized:
#         y_pred = BertCnnModel1.forward(sample)
    for x_batch, y_batch, batch in generate_batch_data(train_data_tokenized, train_labels, params['batch_size']):
        print(x_batch)
        y_pred = BertCnnModel1(x_batch)
        optimizer.zero_grad()
        loss = loss_fn(y_pred, y_batch)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    train_loss /= batch
    train_losses.append(train_loss)
    elapsed = time.time() - start_time

    BertCnnModel1.eval() # disable dropout for deterministic output
    with torch.no_grad(): # deactivate autograd engine to reduce memory usage and speed up computations
        val_loss, batch = 0, 1
        for x_batch, y_batch, batch in generate_batch_data(x_val, y_val, batch_size):
            y_pred = BertCnnModel1(x_batch)
            loss = loss_fn(y_pred, y_batch)
            val_loss += loss.item()
        val_loss /= batch
        val_losses.append(val_loss)

    print(
        "Epoch %d Train loss: %.2f. Validation loss: %.2f. Elapsed time: %.2fs."
        % (epoch + 1, train_losses[-1], val_losses[-1], elapsed)
    )

CustomBertConvModel(
  (bert): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bia

ValueError: Wrong shape for input_ids (shape torch.Size([64])) or attention_mask (shape torch.Size([64]))