In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
%cd drive/My Drive/NLP
!pip install transformers

/content/drive/My Drive/NLP
Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/13/33/ffb67897a6985a7b7d8e5e7878c3628678f553634bd3836404fef06ef19b/transformers-2.5.1-py3-none-any.whl (499kB)
[K     |████████████████████████████████| 501kB 2.8MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/a6/b4/7a41d630547a4afd58143597d5a49e07bfd4c42914d8335b2a5657efc14b/sacremoses-0.0.38.tar.gz (860kB)
[K     |████████████████████████████████| 870kB 13.9MB/s 
Collecting tokenizers==0.5.2
[?25l  Downloading https://files.pythonhosted.org/packages/d1/3f/73c881ea4723e43c1e9acf317cf407fab3a278daab3a69c98dcac511c04f/tokenizers-0.5.2-cp36-cp36m-manylinux1_x86_64.whl (3.7MB)
[K     |████████████████████████████████| 3.7MB 19.1MB/s 
[?25hCollecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/74/f4/2d5214cbf13d06e7cb2c20d84115ca25b53ea76fa1f0ade0e3c9749de214/sentencepiece-0.1.85-cp36-cp36m-manylinux

In [3]:
import gc
import numpy as np
import pandas as pd
import random
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import *

# If there's a GPU available...
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")
    print('There is/are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

# Set the seed value all over the place to make this reproducible. Somehow this isn't working!
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

There is/are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


In [0]:
TRAIN_DATAPATH = 'data/processed_train.npy'
MODELS = [#(BertModel,                           BertTokenizer,       'bert-base-uncased'),
          #(BertForSequenceClassification,       BertTokenizer,       'bert-base-uncased'),
          #(OpenAIGPTModel,                      OpenAIGPTTokenizer,  'openai-gpt'),
          #(GPT2Model,                           GPT2Tokenizer,       'gpt2'),
          #(CTRLModel,                           CTRLTokenizer,       'ctrl'),
          #(TransfoXLModel,                      TransfoXLTokenizer,  'transfo-xl-wt103'),
          (XLNetModel,                          XLNetTokenizer,      'xlnet-large-cased'),
          #(XLNetForSequenceClassification,      XLNetTokenizer,      'xlnet-base-cased'),
          #(XLMModel,                            XLMTokenizer,        'xlm-mlm-enfr-1024'),
          #(XLMForSequenceClassification,        XLMTokenizer,        'xlm-mlm-enfr-1024'),
          #(RobertaModel,                        RobertaTokenizer,    'roberta-base'),
          #(RobertaForSequenceClassification,    RobertaTokenizer,    'roberta-base'),
          #(XLMRobertaModel,                     XLMRobertaTokenizer, 'xlm-roberta-base'),
          #(XLMRobertaForSequenceClassification, XLMRobertaTokenizer, 'xlm-roberta-base'),
         ]

In [0]:
class DiscourseDataset(Dataset):
  def __init__(self, corpus, tokenizer_class, pretrained_weights):
    self.corpus = corpus.reset_index()
    self.corpus['label'] = self.corpus['label'].astype(int)
    self.corpus['sentence'].dropna(inplace=True)
    self.tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
    self.weights = torch.tensor(self.corpus['label'].value_counts(normalize=True).tolist()).to(device)

  def __len__(self):
    return(len(self.corpus))

  def __getitem__(self, idx):
    if torch.is_tensor(idx):
      idx = idx.tolist()
    sent = self.corpus['sentence'][idx]
    args = self.corpus['args'][idx]
    enc_sent = torch.tensor(self.tokenizer.encode(sent, add_special_tokens=True, max_length=128)).to(device)
    enc_sent = F.pad(enc_sent, (0, 128 - enc_sent.shape[0])).type(torch.LongTensor).to(device)
    enc_args = [self.tokenizer.encode(a, add_special_tokens=False, max_length=32) for a in args]
    for enc_arg in enc_args:
      while len(enc_arg) < 32:
        enc_arg.append(0)
    # print(enc_args)
    enc_args = torch.tensor(enc_args, dtype=torch.long).to(device)
    # print(enc_args)
    enc_a = torch.zeros((8, 32), dtype=torch.long).to(device)
    if enc_args.shape[0] <= 8:
      enc_a[0:enc_args.shape[0], :] = enc_args
    else:
      enc_a = enc_args[0:8, :]
    label = torch.tensor(self.corpus['label'][idx], dtype=torch.long).to(device)
    return (enc_sent, enc_a, label)

In [0]:
class CustomModel(nn.Module):
  def __init__(self, model_class, pretrained_weights):
    super(CustomModel, self).__init__()
    self.transformer = model_class.from_pretrained(pretrained_weights, output_hidden_states=False, output_attentions=False)
    self.gru = nn.GRU(1024, 1024, batch_first=True, bidirectional=True)
    self.dropout = nn.Dropout(0.3)
    # self.lin_1 = nn.Linear(in_features=768, out_features=64)
    self.lin = nn.Linear(in_features=1024, out_features=2)

  def forward(self, x, args):
    h = self.transformer(x)[0]
    h = torch.sum(h, dim=1) / 128 #index_select(h, dim=1, index=torch.tensor(127).to(device))
    h = h.repeat(2, 1, 1)
    arg_seq = torch.empty(x.shape[0], 8, 1024).to(device)
    for i in range(args.shape[0]):
      arg = torch.index_select(args, dim=0, index=torch.tensor(i).to(device)).squeeze()
      a = self.transformer(arg)[0]
      a = torch.sum(a, dim=1) / 32
      arg_seq[i, :, :] = a
    _, h = self.gru(arg_seq, h)
    print(h.shape)
    h = torch.sum(h, dim=1)
    x = self.dropout(h)
    x = self.lin(x)
    return x

In [0]:
def set_worker_seed(worker_id):
  random.seed(seed_val)
  np.random.seed(seed_val)
  torch.manual_seed(seed_val)
  torch.cuda.manual_seed(seed_val)
  torch.cuda.manual_seed_all(seed_val)

In [0]:
model_class, tokenizer_class, pretrained_weights = MODELS[0]

# Loading the data and splitting it

master_corpus = np.load(TRAIN_DATAPATH, allow_pickle=True)
master_corpus = pd.DataFrame(list(master_corpus))
# master_corpus['arg_len'] = [len(arg) for arg in master_corpus['args']]
# print(lsorted(master_corpus['arg_len'], reverse=True))

train_corpus, test_corpus = train_test_split(master_corpus, random_state=seed_val, stratify=master_corpus['label'])

train_dataset = DiscourseDataset(train_corpus, tokenizer_class, pretrained_weights)
test_dataset = DiscourseDataset(test_corpus, tokenizer_class, pretrained_weights)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=4, shuffle=True, worker_init_fn=set_worker_seed)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=4, shuffle=True, worker_init_fn=set_worker_seed)

In [0]:
for obj in gc.get_objects():
  try:
    if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)):
      del obj
  except:
    pass

  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until


In [0]:
print('Memory Usage:')
print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
print('Cached:   ', round(torch.cuda.memory_cached(0)/1024**3,1), 'GB')
model = CustomModel(model_class, pretrained_weights).to(device)
criterion = nn.CrossEntropyLoss(weight=train_dataset.weights)

""" For XLNet 
  param_optimizer = list(model.named_parameters())
  no_decay = ['bias', 'gamma', 'beta']
  optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
  ]
  # This variable contains all of the hyperparemeter information our training loop needs
  optimizer = AdamW(optimizer_grouped_parameters,
                    lr=2e-5)
"""

""" For BERT """
optimizer = AdamW(model.parameters(),
                    lr = 1e-5, # args.learning_rate - default is 5e-5, best is 1e-5 so far
                    eps = 1e-8) # args.adam_epsilon  - default is 1e-8.


  # Number of training epochs (authors recommend between 2 and 4)
epochs = 20
# Total number of training steps is number of batches * number of epochs.
total_train_steps = len(train_loader) * epochs
# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_train_steps)

for epoch in range(epochs):
  running_loss = 0.0
  total_loss = 0.0
  model.train()

  train_preds = None
  train_labels = None

  for i, data in enumerate(train_loader):
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_cached(0)/1024**3,1), 'GB')
    optimizer.zero_grad()
    
    enc_sents, enc_args, labels = data
    outputs = model(enc_sents, enc_args)
    loss = criterion(outputs, labels)
      
    running_loss += loss.item()
    total_loss += loss.item()


    if train_preds is None or train_labels is None:
      train_preds = np.argmax(outputs.detach().cpu().numpy(), axis=1).flatten()
      train_labels = labels.cpu().numpy().flatten()
    else:
      train_preds = np.concatenate((train_preds, np.argmax(outputs.detach().cpu().numpy(), axis=1).flatten()))
      train_labels = np.concatenate((train_labels, labels.cpu().numpy().flatten()))

    # Clip the norm of the gradients to 1.0.
    # This is to help prevent the "exploding gradients" problem.
    nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    loss.backward()
    optimizer.step()
    scheduler.step()

    if i % 100 == 99:    # print every 100 mini-batches
      print('[%d, %5d] loss: %.5f' % (epoch + 1, i + 1, running_loss / 100))
      running_loss = 0.0
    
  print("Training loss in epoch %d is %.5f" % (epoch + 1, total_loss / len(train_loader)))
  print("Training accuracy in epoch %d is %.5f" % (epoch + 1, accuracy_score(train_labels, train_preds) * 100))
  print("Training precision in epoch %d is %.5f" % (epoch + 1, precision_score(train_labels, train_preds) * 100))
  print("Training recall in epoch %d is %.5f" % (epoch + 1, recall_score(train_labels, train_preds) * 100))
  print("Training F1-score in epoch %d is %.5f" % (epoch + 1, f1_score(train_labels, train_preds) * 100))

  # Put the model in evaluation mode--the dropout layers behave differently
  # during evaluation.
  model.eval()
  # Tracking variables 
  test_loss = 0.0

  test_preds = None
  test_labels = None

  with torch.no_grad():
    for data in test_loader:
      enc_sents, enc_args, labels = data
      outputs = model(enc_sents, enc_args)
      loss = criterion(outputs, labels)
      test_loss += loss.item()
      if test_preds is None or test_labels is None:
        test_preds = np.argmax(outputs.detach().cpu().numpy(), axis=1).flatten()
        test_labels = labels.cpu().numpy().flatten()
      else:
        test_preds = np.concatenate((test_preds, np.argmax(outputs.detach().cpu().numpy(), axis=1).flatten()))
        test_labels = np.concatenate((test_labels, labels.cpu().numpy().flatten()))

  print("Test loss in epoch %d is %.5f" % (epoch + 1, test_loss / len(test_loader)))
  print("Test accuracy in epoch %d is %.5f" % (epoch + 1, accuracy_score(test_labels, test_preds) * 100))
  print("Test precision in epoch %d is %.5f" % (epoch + 1, precision_score(test_labels, test_preds) * 100))
  print("Test recall in epoch %d is %.5f" % (epoch + 1, recall_score(test_labels, test_preds) * 100))
  print("Test F1-score in epoch %d is %.5f" % (epoch + 1, f1_score(test_labels, test_preds) * 100))

Memory Usage:
Allocated: 15.2 GB
Cached:    15.2 GB


RuntimeError: ignored