In [0]:
!nvidia-smi
!pip install pytorch-nlp
!pip install pytorch-transformers




Tue Aug 13 11:57:28 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.67       Driver Version: 410.79       CUDA Version: 10.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   27C    P8    27W / 149W |      0MiB / 11441MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [0]:


import numpy as np
import torch.nn as nn

from torchnlp.datasets import smt_dataset
#from torchnlp.datasets import imdb_dataset
#from torchnlp.datasets import trec_dataset
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import torch
from pytorch_transformers import XLNetForSequenceClassification, XLNetTokenizer, AdamW, WarmupLinearSchedule
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

torch.set_default_tensor_type('torch.cuda.FloatTensor')



In [0]:
#DATA PREPROCESSING FUNCTIONS

#FUNCTION TO PAD DATA TO UNIFORM LENGTH
def padding(sequences, maxlen=None, dtype=np.int, padding='post', value=0.):
     
    # padding = "pre" pads from the front, "post" pads at the end
    # Function pads to convert lists of strings (of uneven length) into numpy arrays
  
    lengths = [len(s) for s in sequences]

    nb_samples = len(sequences)
    
    if maxlen is None:
        maxlen = np.max(lengths)

    # take the sample shape from the first non empty sequence
    # checking for consistency in the main loop below.
    sample_shape = tuple()
    for s in sequences:
        if len(s) > 0:
            sample_shape = np.asarray(s).shape[1:]
            break

    padded = (np.ones((nb_samples, maxlen) + sample_shape) * value).astype(dtype)
    for idx, s in enumerate(sequences):
        if padding == 'post':
            padded[idx, :len(s)] = s
        elif padding == 'pre':
            padded[idx, -len(s):] = s
        else:
            raise ValueError('Padding type "%s" not understood' % padding)
    return padded
  
#Function to pre-process data for BERT
def process_bert(data,num_train):
  
  #Import Bert tokenizer
  tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
  
  #Create arrays to store our tokenised training data
  tokens = []
  tokens_ids = []
  segment_ids = []
  input_masks = []
  
  #Tokenise the training data, add CLS/SEP tokens and then convert to IDs
  for i in range(num_train):

    #Tokenize text and add CLS/SEP tokens
    text = data[i]["text"]
    token = tokenizer.tokenize(text)
    token.insert(0,'[CLS]')
    token.append('[SEP]')

    #Create segment IDs and inputs masks
    segment_id = [0] * len(token)
    input_mask = [1] * len(token)
    segment_ids.append(segment_id)
    input_masks.append(input_mask)

    #Convert tokens to IDs
    token_id = tokenizer.convert_tokens_to_ids(token)
    tokens_ids.append(token_id)
  

  #Pad the training data so everything is of uniform length  
  tokens_ids = padding(tokens_ids)
  segment_ids = padding(segment_ids)
  input_masks = padding(input_masks)

  max_length = len(tokens_ids[0])


  #Convert training data and labels to tensors
  ids_tensor = torch.tensor([tokens_ids], dtype=torch.long).resize_((num_train,max_length))
  segment_ids_tensor = torch.tensor([segment_ids], dtype=torch.long).resize_((num_train,max_length))
  input_masks_tensor = torch.tensor([input_masks], dtype=torch.long).resize_((num_train,max_length))
  
  return ids_tensor, segment_ids_tensor, input_masks_tensor

#FUNCTION TO CREATE LABELS FOR SMT
def create_SMT_labels(data,num_train):
  labels_tensor = torch.empty(num_train, dtype=torch.long).random_(5)
  labels = data.__getitem__('label')
  labels = labels[0:num_train]
  
  very_pos_indices = [i for i, x in enumerate(labels) if x == "very positive"]
  pos_indices = [i for i, x in enumerate(labels) if x == "positive"]
  neut_indices = [i for i, x in enumerate(labels) if x == "neutral"]
  neg_indices = [i for i, x in enumerate(labels) if x == "negative"]
  very_neg_indices = [i for i, x in enumerate(labels) if x == "very negative"]

  labels_tensor[very_pos_indices] = 0
  labels_tensor[pos_indices] = 1
  labels_tensor[neut_indices] = 2
  labels_tensor[neg_indices] = 3
  labels_tensor[very_neg_indices] = 4
  
  return labels_tensor

#FUNCTION TO RESTRICT TRAINING DATASET
def restrict_data_func(train_ids_tensor_, train_segment_ids_tensor_, train_input_masks_tensor_, num_each_label, num_labels, train_labels_tensor_):
  train_ids_tensor = torch.empty((num_each_label*num_labels,max_length), dtype=torch.long).random_(num_labels)
  train_segment_ids_tensor = torch.empty((num_each_label*num_labels,max_length), dtype=torch.long).random_(num_labels)
  train_input_masks_tensor = torch.empty((num_each_label*num_labels,max_length), dtype=torch.long).random_(num_labels)
  train_labels_tensor = torch.empty(num_each_label*num_labels, dtype=torch.long).random_(num_labels)
  for i in range(num_labels):
  
    train_ids_tensor[i*num_each_label:(i+1)*num_each_label] = train_ids_tensor_[train_labels_tensor_==i][:num_each_label]
    train_segment_ids_tensor[i*num_each_label:(i+1)*num_each_label] = train_segment_ids_tensor_[train_labels_tensor_==i][:num_each_label]
    train_input_masks_tensor[i*num_each_label:(i+1)*num_each_label] = train_input_masks_tensor_[train_labels_tensor_==i][:num_each_label] 
 
  for i in range(num_labels):
    train_labels_tensor[i*num_each_label:(i+1)*num_each_label]=train_labels_tensor_[train_labels_tensor_==i][:num_each_label]

  return train_ids_tensor, train_segment_ids_tensor, train_input_masks_tensor, train_labels_tensor

def shuffle_tensors(tensor1,tensor2,tensor3,tensor4):
  torch.manual_seed(25082019)
  idx = torch.randperm(tensor1.shape[0])
  tensor1 = tensor1[idx]
  tensor2 = tensor2[idx]
  tensor3 = tensor3[idx]
  tensor4 = tensor4[idx]

  return tensor1, tensor2, tensor3, tensor4


#Import dataset
train = smt_dataset(train=True, fine_grained=True)
valid = smt_dataset(dev=True, fine_grained=True)
test = smt_dataset(test=True, fine_grained=True)

num_labels = 5
print(len(valid))
print(len(test))


trainDevTestTrees_PTB.zip: 795kB [00:02, 302kB/s]                           


1101
2210


In [0]:
#DATA PREP
num_train = 8540
num_valid = 1100
num_test = 2210
train_ids_tensor_, train_segment_ids_tensor_, train_input_masks_tensor_ = process_bert(train,num_train)
valid_ids_tensor, valid_segment_ids_tensor, valid_input_masks_tensor = process_bert(valid,num_valid)
test_ids_tensor, test_segment_ids_tensor, test_input_masks_tensor = process_bert(test,num_test)

train_labels_tensor_ = create_SMT_labels(train,num_train)
valid_labels_tensor = create_SMT_labels(valid,num_valid)
test_labels_tensor = create_SMT_labels(test,num_test)

max_length = train_ids_tensor_.shape[1]

print(max_length)
print(valid_ids_tensor.shape[1])
print(test_ids_tensor.shape[1])
#DECIDE HOW MANY OF EACH CLASS TO USE IN TRAINING DATA
restrict_training_data = 1
num_each_label=100

#Randomly shuffle training data
train_ids_tensor_, train_segment_ids_tensor_, train_input_masks_tensor_,train_labels_tensor_  = shuffle_tensors(train_ids_tensor_,train_segment_ids_tensor_, train_input_masks_tensor_, train_labels_tensor_)

if restrict_training_data:
  train_ids_tensor, train_segment_ids_tensor, train_input_masks_tensor, train_labels_tensor = restrict_data_func(train_ids_tensor_, train_segment_ids_tensor_, train_input_masks_tensor_, num_each_label, num_labels,train_labels_tensor_ )
else:
  train_ids_tensor, train_segment_ids_tensor, train_input_masks_tensor, train_labels_tensor = train_ids_tensor_, train_segment_ids_tensor_, train_input_masks_tensor_, train_labels_tensor_


train_max_batch = 20
valid_max_batch = 20
test_max_batch=17

#Load data into dataloader
train_data = TensorDataset(train_ids_tensor, train_segment_ids_tensor, train_input_masks_tensor, train_labels_tensor)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_max_batch)

valid_data = TensorDataset(valid_ids_tensor, valid_segment_ids_tensor, valid_input_masks_tensor, valid_labels_tensor)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=valid_max_batch)

test_data = TensorDataset(test_ids_tensor, test_segment_ids_tensor, test_input_masks_tensor, test_labels_tensor)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=test_max_batch)



100%|██████████| 798011/798011 [00:00<00:00, 859129.96B/s]


94
65
76


In [0]:
# import model for fine-tuning
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=num_labels)
model.cuda()

100%|██████████| 641/641 [00:00<00:00, 103457.45B/s]
100%|██████████| 467042463/467042463 [00:41<00:00, 11143234.77B/s]


XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): XLNetLayerNorm()
          (dropout): Dropout(p=0.1)
        )
        (ff): XLNetFeedForward(
          (layer_norm): XLNetLayerNorm()
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1)
        )
        (dropout): Dropout(p=0.1)
      )
      (1): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): XLNetLayerNorm()
          (dropout): Dropout(p=0.1)
        )
        (ff): XLNetFeedForward(
          (layer_norm): XLNetLayerNorm()
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout

In [0]:
# Prepare optimizer
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    
optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5)

In [0]:
#Training

#Set Loss Function
cross_entropy = nn.CrossEntropyLoss()
max_grad_norm = 1.0
# TRAIN loop
model.train()
tr_loss = 0
nb_tr_steps = 0
epochs = 15


for j in range(epochs):
  print("epoch: ", j+1)
  
  print("Begin Training")
  
  for step, batch in enumerate(train_dataloader):
      
      if(step%25==0):
        print("batch: ", step+1)

      
      b_train_ids, b_segment_ids, b_input_masks, b_train_labels = batch
      
   
      
      # forward pass
      outputs = model(input_ids=b_train_ids,token_type_ids=b_segment_ids,attention_mask=b_input_masks,labels=None)
      logits = outputs[0]
      loss = cross_entropy(logits,b_train_labels)
      
      # backward pass
      loss.backward()

      # track train loss
      tr_loss += loss.item()
      nb_tr_steps += 1

      # gradient clipping
      torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)

      # update parameters
      optimizer.step()
      model.zero_grad()

      
  # print train loss per epoch
  print("Train loss: {}".format(tr_loss/nb_tr_steps))
  
  print("Begin Validation")
  model.eval()
  total_correct = 0
  for step, batch in enumerate(valid_dataloader):
    
    if(step%25==0):
      print("batch: ", step+1)
    
    b_valid_ids, b_segment_ids, b_input_masks, b_valid_labels = batch
    
    with torch.no_grad():
      outputs = model(input_ids=b_valid_ids, token_type_ids=b_segment_ids, attention_mask=b_input_masks)
      logits = outputs[0]
      
    a = logits.argmax(1)
    b = b_valid_labels

    temp_correct = torch.sum(a==b).cpu().numpy()
    total_correct+=temp_correct
  print("Validation Acccuracy:")
  print(total_correct/num_valid)
    
            
  

epoch:  1
Begin Training
batch:  1
Train loss: 1.642071442604065
Begin Validation
batch:  1
batch:  26
batch:  51
Validation Acccuracy:
0.27636363636363637
epoch:  2
Begin Training
batch:  1
Train loss: 1.5587976360321045
Begin Validation
batch:  1
batch:  26
batch:  51
Validation Acccuracy:
0.3845454545454545
epoch:  3
Begin Training
batch:  1
Train loss: 1.4499003267288209
Begin Validation
batch:  1
batch:  26
batch:  51
Validation Acccuracy:
0.44545454545454544
epoch:  4
Begin Training
batch:  1
Train loss: 1.3142207926511764
Begin Validation
batch:  1
batch:  26
batch:  51
Validation Acccuracy:
0.49454545454545457
epoch:  5
Begin Training
batch:  1
Train loss: 1.1628361217975616
Begin Validation
batch:  1
batch:  26
batch:  51
Validation Acccuracy:
0.46090909090909093
epoch:  6
Begin Training
batch:  1
Train loss: 1.0229063977797825
Begin Validation
batch:  1
batch:  26
batch:  51
Validation Acccuracy:
0.46636363636363637
epoch:  7
Begin Training
batch:  1
Train loss: 0.89265192016

In [0]:
model.eval()
total_correct = 0
for step, batch in enumerate(test_dataloader):

  if(step%25==0):
    print("batch: ", step+1)

  b_valid_ids, b_segment_ids, b_input_masks, b_valid_labels = batch

  with torch.no_grad():
    outputs = model(input_ids=b_valid_ids, token_type_ids=b_segment_ids, attention_mask=b_input_masks)
    logits = outputs[0]

  a = logits.argmax(1)
  b = b_valid_labels

  temp_correct = torch.sum(a==b).cpu().numpy()
  total_correct+=temp_correct
print("Test Acccuracy:")
print(total_correct/num_test)

batch:  1
batch:  26
batch:  51
batch:  76
batch:  101
batch:  126
Test Acccuracy:
0.4665158371040724
