In [0]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# Install the Hugging Face `transformers` package into the runtime.
# NOTE(review): the version is not pinned; the notebook's star import from
# `transformers` only works with a release that still exports every name it uses — confirm.
!pip install transformers
# Change the working directory to the project folder on the mounted Drive;
# the relative CSV paths used by this notebook are resolved against it.
%cd drive/My\ Drive/NLP

In [0]:
from keras.preprocessing.sequence import pad_sequences
import nltk
from nltk import word_tokenize
from nltk.tag import pos_tag
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
import numpy as np
import pandas as pd
import random
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.preprocessing import OneHotEncoder
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import *

# Select the compute device: CUDA when PyTorch can see a GPU, the CPU otherwise.
gpu_available = torch.cuda.is_available()
device = torch.device("cuda" if gpu_available else "cpu")
if gpu_available:
    print('There is/are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

# Seed every random number generator used by the notebook with the same value
# and ask cuDNN for deterministic kernels.
# NOTE: the original author observed that runs are still not fully reproducible.
seed_val = 42

for seed_fn in (random.seed,
                np.random.seed,
                torch.manual_seed,
                torch.cuda.manual_seed,
                torch.cuda.manual_seed_all):
    seed_fn(seed_val)

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [0]:
# Each entry is (model class, tokenizer class, pretrained checkpoint name).
# Only the RoBERTa-large entry is enabled. Disabled alternatives, kept for reference:
#   (BertModel,                           BertTokenizer,       'bert-large-uncased')
#   (BertForSequenceClassification,       BertTokenizer,       'bert-base-uncased')
#   (OpenAIGPTModel,                      OpenAIGPTTokenizer,  'openai-gpt')
#   (GPT2Model,                           GPT2Tokenizer,       'gpt2')
#   (CTRLModel,                           CTRLTokenizer,       'ctrl')
#   (TransfoXLModel,                      TransfoXLTokenizer,  'transfo-xl-wt103')
#   (XLNetModel,                          XLNetTokenizer,      'xlnet-base-cased')
#   (XLNetForSequenceClassification,      XLNetTokenizer,      'xlnet-base-cased')
#   (XLMModel,                            XLMTokenizer,        'xlm-mlm-enfr-1024')
#   (XLMForSequenceClassification,        XLMTokenizer,        'xlm-mlm-enfr-1024')
#   (RobertaForSequenceClassification,    RobertaTokenizer,    'roberta-base')
#   (XLMRobertaModel,                     XLMRobertaTokenizer, 'xlm-roberta-base')
#   (XLMRobertaForSequenceClassification, XLMRobertaTokenizer, 'xlm-roberta-base')
MODELS = [
    (RobertaModel, RobertaTokenizer, 'roberta-large'),
]

# Relative paths of the two training CSV files.
FIRST_DATAPATH, SECOND_DATAPATH = "data/train_1.csv", "data/train_2.csv"

# For the first sub-task

In [0]:
class ClassificationDataset(Dataset):
  """Dataset pairing transformer token ids with POS and word n-gram count features.

  Args:
    corpus: DataFrame providing the `sentence`, `gold_label` and `pos_string` columns.
    tokenizer_class: tokenizer class exposing `from_pretrained`.
    pretrained_weights: checkpoint name handed to `tokenizer_class.from_pretrained`.
    pos_vectorizer: fitted vectorizer applied to the `pos_string` column.
    ngram_vectorizer: fitted vectorizer applied to the raw sentence text.
    max_len: value passed as `max_length` to the tokenizer's `encode`.

  Attributes:
    corpus: cleaned, re-indexed copy of the input frame with an added `enc_sentence` column.
    weights: per-class relative frequencies of `gold_label`, ordered by class id.
  """

  def __init__(self, corpus, tokenizer_class, pretrained_weights, pos_vectorizer, ngram_vectorizer, max_len):
    # Drop rows without a sentence on the frame itself (calling dropna on the
    # column Series would leave the rows in place), then re-index so that the
    # positional lookups in __getitem__ match the index labels.
    self.corpus = corpus.dropna(subset=['sentence']).reset_index()
    self.tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
    # NOTE(review): recent `transformers` releases only truncate to `max_length`
    # when `truncation=True` is also given — confirm against the installed version.
    encoded = [self.tokenizer.encode(sent, add_special_tokens=True, max_length=max_len)
               for sent in self.corpus['sentence']]
    # Pad with the tokenizer's own padding id; 0 is not the pad token for every
    # vocabulary (e.g. RoBERTa uses 1 and reserves 0 for `<s>`).
    pad_id = self.tokenizer.pad_token_id
    if pad_id is None:
      pad_id = 0
    self.corpus['enc_sentence'] = pad_sequences(encoded, padding='post', value=pad_id).tolist()
    # value_counts() orders by frequency; sort by label so that weights[c]
    # belongs to class c, which is what nn.CrossEntropyLoss(weight=...) expects.
    # NOTE(review): these are class frequencies, so the more frequent class gets
    # the larger weight; inverse frequencies are the usual choice — confirm intent.
    label_freqs = self.corpus['gold_label'].value_counts(normalize=True).sort_index()
    self.weights = torch.tensor(label_freqs.tolist()).to(device)
    self.pos_vectorizer = pos_vectorizer
    self.ngram_vectorizer = ngram_vectorizer

  def __len__(self):
    """Return the number of rows kept in the corpus."""
    return len(self.corpus)

  def __getitem__(self, idx):
    """Return `(token_ids, label, pos_features, ngram_features, sentence)` for row `idx`.

    The four tensors are moved to the module-level `device`; the two feature
    vectors are flattened float32 outputs of the respective vectorizers.
    """
    if torch.is_tensor(idx):
      idx = idx.tolist()
    S = self.corpus['sentence'][idx]
    X = torch.tensor(self.corpus['enc_sentence'][idx]).to(device)
    y = torch.tensor(self.corpus['gold_label'][idx]).to(device)
    pos_counts = self.pos_vectorizer.transform([self.corpus['pos_string'][idx]]).toarray()
    P = torch.tensor(pos_counts, dtype=torch.float32).flatten().to(device)
    ngram_counts = self.ngram_vectorizer.transform([S]).toarray()
    N = torch.tensor(ngram_counts, dtype=torch.float32).flatten().to(device)
    return (X, y, P, N, S)

In [0]:
class CustomModel(nn.Module):
  """Transformer encoder followed by a three-layer MLP classification head.

  The head consumes the concatenation of the POS feature vector, the n-gram
  feature vector and element 1 of the transformer output, and emits 2 logits.

  Args:
    model: transformer module; element 1 of its output must be a per-sequence vector.
    pos_dim: width of the POS feature vector `p` (default 1000).
    ngram_dim: width of the n-gram feature vector `n` (default 1000).
  """

  def __init__(self, model, pos_dim=1000, ngram_dim=1000):
    super(CustomModel, self).__init__()
    self.transformer = model
    # Size the first layer from the encoder's hidden size instead of a fixed
    # 3024 (= 1000 + 1000 + 1024); 1024 is the fallback when no config is exposed.
    hidden_size = getattr(getattr(model, 'config', None), 'hidden_size', 1024)
    self.dropout_1 = nn.Dropout(0.3)
    self.dropout_2 = nn.Dropout(0.3)
    self.lin_1 = nn.Linear(in_features=pos_dim + ngram_dim + hidden_size, out_features=512)
    self.lin_2 = nn.Linear(in_features=512, out_features=64)
    self.lin_3 = nn.Linear(in_features=64, out_features=2)

  def forward(self, x, p, n):
    """Return the 2-class logits.

    Args:
      x: batch of padded token ids fed to the transformer.
      p: batch of POS feature vectors.
      n: batch of n-gram feature vectors.
    """
    # Mask padding positions so the encoder does not attend to them. The pad id
    # is read from the encoder config on every call (not cached on `self`).
    pad_id = getattr(getattr(self.transformer, 'config', None), 'pad_token_id', None)
    if pad_id is None:
      encoded = self.transformer(x)
    else:
      encoded = self.transformer(x, attention_mask=(x != pad_id).long())
    features = torch.cat((p, n, encoded[1]), dim=-1)
    hidden = self.dropout_1(F.relu(self.lin_1(features)))
    hidden = self.dropout_2(F.relu(self.lin_2(hidden)))
    return self.lin_3(hidden)

In [0]:
def set_worker_seed(worker_id):
  """Seed the RNGs of one DataLoader worker; intended as `worker_init_fn`.

  Each worker is seeded with `seed_val + worker_id`: still a fixed value per
  worker from run to run, but workers no longer all share one random stream.

  Args:
    worker_id: integer id the DataLoader passes to its `worker_init_fn`.
  """
  worker_seed = seed_val + worker_id
  random.seed(worker_seed)
  np.random.seed(worker_seed)
  torch.manual_seed(worker_seed)
  torch.cuda.manual_seed(worker_seed)
  torch.cuda.manual_seed_all(worker_seed)

In [0]:
model_class, tokenizer_class, pretrained_weights = MODELS[0]

# Load the full labelled corpus; in this cell it is only used to fit the two count vectorizers.
master_corpus = pd.read_csv(FIRST_DATAPATH, encoding='utf-8')

# POS-tag every sentence and keep the tag sequence as one space-separated string,
# so that CountVectorizer can treat the tags as tokens.
master_corpus['pos_list'] = [[tag for _, tag in pos_tag(word_tokenize(sent))] for sent in master_corpus['sentence']]
master_corpus['pos_string'] = [" ".join(tags) for tags in master_corpus['pos_list']]

# Trigram count features (1000 most frequent) over POS tags and over words.
pos_vectorizer = CountVectorizer(ngram_range=(3, 3), max_features=1000)
pos_vectorizer.fit(master_corpus['pos_string'])
ngram_vectorizer = CountVectorizer(ngram_range=(3, 3), max_features=1000)
ngram_vectorizer.fit(master_corpus['sentence'])

# Pre-computed train / validation splits.
# An in-memory alternative would be:
#   train_test_split(master_corpus, random_state=seed_val, stratify=master_corpus['gold_label'])
# NOTE(review): ClassificationDataset reads a `pos_string` column, so both CSVs must
# already contain it — confirm.
# NOTE(review): 'train_va;_1.csv' looks like a typo for 'train_val_1.csv'; the path is
# left untouched because the file on disk may carry that exact name — confirm.
train_corpus = pd.read_csv('train_train_1.csv', encoding='utf-8')
test_corpus = pd.read_csv('train_va;_1.csv', encoding='utf-8')

train_dataset = ClassificationDataset(train_corpus, tokenizer_class, pretrained_weights, pos_vectorizer, ngram_vectorizer, max_len=128)
test_dataset = ClassificationDataset(test_corpus, tokenizer_class, pretrained_weights, pos_vectorizer, ngram_vectorizer, max_len=128)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, worker_init_fn=set_worker_seed)
# Evaluation metrics do not depend on batch order, so the held-out loader is not
# shuffled; this also keeps it from consuming the global RNG state every epoch.
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, worker_init_fn=set_worker_seed)

base_model = model_class.from_pretrained(pretrained_weights, output_hidden_states=False, output_attentions=False)

In [0]:
model = CustomModel(base_model)
# Move the model to the device selected during set-up instead of hard-coding
# CUDA, so that the CPU fallback chosen there actually works.
model.to(device)
criterion = nn.CrossEntropyLoss(weight=train_dataset.weights)

# Alternative optimizer set-up with per-group weight decay (for XLNet), kept for reference:
#   param_optimizer = list(model.named_parameters())
#   no_decay = ['bias', 'gamma', 'beta']
#   optimizer_grouped_parameters = [
#     {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
#      'weight_decay_rate': 0.01},
#     {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
#      'weight_decay_rate': 0.0}
#   ]
#   optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)

# Optimizer set-up for BERT-style models.
optimizer = AdamW(model.parameters(),
                  lr = 1e-5, # args.learning_rate - default is 5e-5, best is 1e-5 so far
                  eps = 1e-8) # args.adam_epsilon  - default is 1e-8.

# Number of training epochs (authors recommend between 2 and 4)
epochs = 20
# Total number of training steps is number of batches * number of epochs.
total_train_steps = len(train_loader) * epochs
# Linear learning-rate decay over the whole run, without warm-up.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_train_steps)

for epoch in range(epochs):
  # ---- training pass ----
  running_loss = 0.0   # loss accumulated since the last progress print
  total_loss = 0.0     # loss accumulated over the whole epoch
  model.train()

  # Per-batch predictions / labels, concatenated once after the epoch.
  train_pred_batches = []
  train_label_batches = []

  for i, data in enumerate(train_loader):
    optimizer.zero_grad()

    inputs, labels, pos_feats, ngram_feats, sentences = data
    outputs = model(inputs, pos_feats, ngram_feats)
    loss = criterion(outputs, labels)

    running_loss += loss.item()
    total_loss += loss.item()

    train_pred_batches.append(np.argmax(outputs.detach().cpu().numpy(), axis=1).flatten())
    train_label_batches.append(labels.cpu().numpy().flatten())

    loss.backward()
    # Clip the norm of the gradients to 1.0 to help prevent the "exploding
    # gradients" problem. This has to run after backward(): before it, the
    # gradients for this step do not exist yet and clipping has no effect.
    nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()
    scheduler.step()

    if i % 100 == 99:    # print every 100 mini-batches
      print('[%d, %5d] loss: %.5f' % (epoch + 1, i + 1, running_loss / 100))
      running_loss = 0.0

  train_preds = np.concatenate(train_pred_batches)
  train_labels = np.concatenate(train_label_batches)

  print("Training loss in epoch %d is %.5f" % (epoch + 1, total_loss / len(train_loader)))
  print("Training accuracy in epoch %d is %.5f" % (epoch + 1, accuracy_score(train_labels, train_preds) * 100))
  print("Training precision in epoch %d is %.5f" % (epoch + 1, precision_score(train_labels, train_preds) * 100))
  print("Training recall in epoch %d is %.5f" % (epoch + 1, recall_score(train_labels, train_preds) * 100))
  print("Training F1-score in epoch %d is %.5f" % (epoch + 1, f1_score(train_labels, train_preds) * 100))

  # ---- evaluation pass ----
  # Put the model in evaluation mode--the dropout layers behave differently
  # during evaluation.
  model.eval()
  test_loss = 0.0

  test_pred_batches = []
  test_label_batches = []

  with torch.no_grad():
    for data in test_loader:
      inputs, labels, pos_feats, ngram_feats, sentences = data
      outputs = model(inputs, pos_feats, ngram_feats)
      loss = criterion(outputs, labels)
      test_loss += loss.item()
      test_pred_batches.append(np.argmax(outputs.detach().cpu().numpy(), axis=1).flatten())
      test_label_batches.append(labels.cpu().numpy().flatten())

  test_preds = np.concatenate(test_pred_batches)
  test_labels = np.concatenate(test_label_batches)

  print("Test loss in epoch %d is %.5f" % (epoch + 1, test_loss / len(test_loader)))
  print("Test accuracy in epoch %d is %.5f" % (epoch + 1, accuracy_score(test_labels, test_preds) * 100))
  print("Test precision in epoch %d is %.5f" % (epoch + 1, precision_score(test_labels, test_preds) * 100))
  print("Test recall in epoch %d is %.5f" % (epoch + 1, recall_score(test_labels, test_preds) * 100))
  print("Test F1-score in epoch %d is %.5f" % (epoch + 1, f1_score(test_labels, test_preds) * 100))

In [0]:
# Write the trained model object to disk in the current working directory.
# NOTE(review): this pickles the whole module rather than its state_dict, so
# loading it back needs the CustomModel class to be importable — confirm this is intended.
checkpoint_path = "roberta_pos_ngram.pth"
torch.save(model, checkpoint_path)