In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
!pip install transformers
%cd drive/My\ Drive/NLP

In [2]:
from transformers import *
import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import tensorflow as tf
import torch.nn.functional as F
import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchsummary import summary
from keras.preprocessing.sequence import pad_sequences

# If there's a GPU available...
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")
    print('There is/are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

# Set the seed value all over the place to make this reproducible. Somehow this isn't working!
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

There is/are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


Using TensorFlow backend.


In [0]:
MODELS = [#(BertModel,                           BertTokenizer,       'bert-base-uncased'),
          #(BertForSequenceClassification,       BertTokenizer,       'bert-large-uncased'),
          #(OpenAIGPTModel,                      OpenAIGPTTokenizer,  'openai-gpt'),
          #(GPT2Model,                           GPT2Tokenizer,       'gpt2'),
          #(CTRLModel,                           CTRLTokenizer,       'ctrl'),
          #(TransfoXLModel,                      TransfoXLTokenizer,  'transfo-xl-wt103'),
          (XLNetModel,                          XLNetTokenizer,      'xlnet-base-cased'),
          #(XLNetForSequenceClassification,      XLNetTokenizer,      'xlnet-large-cased'),
          #(XLMModel,                            XLMTokenizer,        'xlm-mlm-enfr-1024'),
          #(XLMForSequenceClassification,        XLMTokenizer,        'xlm-mlm-enfr-1024'),
          #(RobertaModel,                        RobertaTokenizer,    'roberta-base'),
          #(RobertaForSequenceClassification,    RobertaTokenizer,    'roberta-large'),
          #(XLMRobertaModel,                     XLMRobertaTokenizer, 'xlm-roberta-base'),
          #(XLMRobertaForSequenceClassification, XLMRobertaTokenizer, 'xlm-roberta-base'),
         ]
FIRST_DATAPATH = "data/train_1.csv"
SECOND_DATAPATH = "data/train_2.csv"

In [4]:
ls data/Subtask-1-master/train.csv

data/Subtask-1-master/train.csv


# For the first sub-task

In [0]:
class ClassificationDataset(Dataset):
  def __init__(self, corpus, tokenizer_class, pretrained_weights, max_len):
    self.corpus = corpus.reset_index()
    self.corpus['sentence'].dropna(inplace=True)
    self.tokenizer = tokenizer_class.from_pretrained(pretrained_weights)# , do_lower_case=True)
    self.corpus['sentence'] = [self.tokenizer.encode(sent, add_special_tokens=True, max_length=max_len) for sent in self.corpus['sentence']]
    self.corpus['sentence'] = pad_sequences(self.corpus['sentence'], padding='post').tolist()
    self.weights = torch.tensor(self.corpus['gold_label'].value_counts(normalize=True).tolist()).to(device)
    print(self.corpus['gold_label'].value_counts(normalize=True))

  def __len__(self):
    return len(self.corpus)

  def __getitem__(self, idx):
    if torch.is_tensor(idx):
      idx = idx.tolist()
    # print(type(self.corpus['sentence'][idx]))
    X = torch.tensor(self.corpus['sentence'][idx]).to(device)
    y = torch.tensor(self.corpus['gold_label'][idx]).to(device)
    sample = (X, y)
    return sample

In [0]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds.detach().cpu().numpy(), axis=1).flatten()
    labels_flat = labels.cpu().numpy().flatten()

    # print(pred_flat, labels_flat)
    return np.sum(pred_flat == labels_flat)

In [0]:
# master_corpus = pd.read_csv(FIRST_DATAPATH, encoding='utf-8')
# train_corpus, test_corpus = train_test_split(master_corpus, random_state=seed_val, stratify=master_corpus['gold_label'])
train_corpus = pd.read_csv("train_train_1.csv", encoding='utf-8')
test_corpus = pd.read_csv("train_val_1.csv", encoding='utf-8')
#aster_corpus = pd.read_csv("data/Subtask-1-master/train.csv", encoding = 'utf-8')

In [0]:
class CustomModel(nn.Module):
  def __init__(self, model_class, pretrained_weights):
    super(CustomModel, self).__init__()
    self.trans = model_class.from_pretrained(pretrained_weights, output_hidden_states=False, output_attentions=False)
    Ci = 1
    Ks = [3,4,5]
    Co = 100
    self.convs1 = nn.ModuleList([nn.Conv2d(Ci, Co,(K, 768)) for K in Ks])   
    self.dropout = nn.Dropout(0.5)
    self.fc1 = nn.Linear(len(Ks)*Co, 2) 

  def forward(self, x):

    x = self.trans(x)[0]
    # print(x.size())

    x = x.unsqueeze(1)  # not sure about this

    x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1]  # [(N, Co, W), ...]*len(Ks)
    x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]
    x = torch.cat(x, 1)
    x = self.dropout(x)  # (N, len(Ks)*Co)
    logit = self.fc1(x)

    return logit

In [11]:
for model_class, tokenizer_class, pretrained_weights in MODELS:
  # Loading the data and splitting it
  # master_corpus = master_corpus
  train_corpus, test_corpus = train_corpus, test_corpus

  train_dataset = ClassificationDataset(train_corpus, tokenizer_class, pretrained_weights, max_len=128)
  test_dataset = ClassificationDataset(test_corpus, tokenizer_class, pretrained_weights, max_len=128)
  train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
  test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=True)

  # model= model_class.from_pretrained(pretrained_weights, num_labels=2, output_hidden_states=False, output_attentions=False)
  model = CustomModel(model_class, pretrained_weights)
  model.to(device)
  criterion = nn.CrossEntropyLoss(weight=train_dataset.weights)

  # Number of training epochs (authors recommend between 2 and 4)
  epochs = 10

  """For XLNet"""
  """
  param_optimizer = list(model.named_parameters())
  no_decay = ['bias', 'gamma', 'beta']
  optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
  ]
  # This variable contains all of the hyperparemeter information our training loop needs
  optimizer = AdamW(optimizer_grouped_parameters,
                    lr=1e-5)
"""
  

  """ For BERT """
  optimizer = AdamW(model.parameters(),
                    lr = 1e-5, # args.learning_rate - default is 5e-5, 1e-5 worked best for me
                    eps = 1e-8) # args.adam_epsilon  - default is 1e-8.

  # Total number of training steps is number of batches * number of epochs.
  total_train_steps = len(train_loader) * epochs
  # Create the learning rate scheduler.
  scheduler = get_linear_schedule_with_warmup(optimizer, 
                                              num_warmup_steps = 0, # Default value in run_glue.py
                                              num_training_steps = total_train_steps)
  
  for epoch in range(epochs):
    running_loss = 0.0
    total_loss = 0.0
    model.train()

    train_preds = None
    train_labels = None

    for i, data in enumerate(train_loader):
      inputs, labels = data
      optimizer.zero_grad()
      outputs = model(inputs) # labels=b_labels)
      # print(outputs.size())
      loss = criterion(outputs, labels)
      
      running_loss += loss.item()
      total_loss += loss.item()
      # train_accuracy += flat_accuracy(outputs[0], labels)

      if train_preds is None or train_labels is None:
        train_preds = np.argmax(outputs.detach().cpu().numpy(), axis=1).flatten()
        train_labels = labels.cpu().numpy().flatten()
      else:
        train_preds = np.concatenate((train_preds, np.argmax(outputs.detach().cpu().numpy(), axis=1).flatten()))
        train_labels = np.concatenate((train_labels, labels.cpu().numpy().flatten()))

      # Clip the norm of the gradients to 1.0.
      # This is to help prevent the "exploding gradients" problem.
      nn.utils.clip_grad_norm_(model.parameters(), 1.0)

      loss.backward()
      optimizer.step()
      scheduler.step()

      if i % 100 == 99:    # print every 100 mini-batches
        print('[%d, %5d] loss: %.5f' % (epoch + 1, i + 1, running_loss / 100))
        running_loss = 0.0
    
    print("Training loss in epoch %d is %.5f" % (epoch + 1, total_loss / len(train_loader)))
    print("Training accuracy in epoch %d is %.5f" % (epoch + 1, accuracy_score(train_labels, train_preds) * 100))
    print("Training precision in epoch %d is %.5f" % (epoch + 1, precision_score(train_labels, train_preds) * 100))
    print("Training recall in epoch %d is %.5f" % (epoch + 1, recall_score(train_labels, train_preds) * 100))
    print("Training F1-score in epoch %d is %.5f" % (epoch + 1, f1_score(train_labels, train_preds) * 100))

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()
    # Tracking variables 
    test_loss = 0.0

    test_preds = None
    test_labels = None

    with torch.no_grad():
      for data in test_loader:
        inputs, labels = data
        outputs = model(inputs) # labels=b_labels)
        loss = criterion(outputs, labels)
      
        test_loss += loss.item()
        if test_preds is None or test_labels is None:
          test_preds = np.argmax(outputs.detach().cpu().numpy(), axis=1).flatten()
          test_labels = labels.cpu().numpy().flatten()
        else:
          test_preds = np.concatenate((test_preds, np.argmax(outputs.detach().cpu().numpy(), axis=1).flatten()))
          test_labels = np.concatenate((test_labels, labels.cpu().numpy().flatten()))

        # test_accuracy += flat_accuracy(outputs[0], labels)
    print(test_preds)
    print("Test loss in epoch %d is %.5f" % (epoch + 1, test_loss / len(test_loader)))
    print("Test accuracy in epoch %d is %.5f" % (epoch + 1, accuracy_score(test_labels, test_preds) * 100))
    print("Test precision in epoch %d is %.5f" % (epoch + 1, precision_score(test_labels, test_preds) * 100))
    print("Test recall in epoch %d is %.5f" % (epoch + 1, recall_score(test_labels, test_preds) * 100))
    print("Test F1-score in epoch %d is %.5f" % (epoch + 1, f1_score(test_labels, test_preds) * 100))

0    0.888103
1    0.111897
Name: gold_label, dtype: float64
0    0.888308
1    0.111692
Name: gold_label, dtype: float64


HBox(children=(IntProgress(value=0, description='Downloading', max=690, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=467042463, style=ProgressStyle(description_…


[1,   100] loss: 0.09160
[1,   200] loss: 0.07096
[1,   300] loss: 0.04701
Training loss in epoch 1 is 0.06939
Training accuracy in epoch 1 is 89.45641
Training precision in epoch 1 is 95.65217
Training recall in epoch 1 is 6.04950
Training F1-score in epoch 1 is 11.37931
[0 0 0 ... 0 0 0]
Test loss in epoch 1 is 0.03546
Test accuracy in epoch 1 is 94.15385
Test precision in epoch 1 is 95.28796
Test recall in epoch 1 is 50.13774
Test F1-score in epoch 1 is 65.70397
[2,   100] loss: 0.03460
[2,   200] loss: 0.03629
[2,   300] loss: 0.03003
Training loss in epoch 2 is 0.03352
Training accuracy in epoch 2 is 94.67692
Training precision in epoch 2 is 96.12903
Training recall in epoch 2 is 54.62878
Training F1-score in epoch 2 is 69.66686
[0 1 0 ... 0 0 0]
Test loss in epoch 2 is 0.03017
Test accuracy in epoch 2 is 95.87692
Test precision in epoch 2 is 95.25692
Test recall in epoch 2 is 66.39118
Test F1-score in epoch 2 is 78.24675
[3,   100] loss: 0.02096
[3,   200] loss: 0.02273
[3,   30

In [0]:
a = torch.

[0m[01;34mcnn-text-classification-pytorch[0m/         [01;34msent-conv-torch[0m/   wat_cosst.md
[01;34mdata[0m/                                    train_train_1.csv
[01;34mEMNLP_2018_Causal_Explanation_Analysis[0m/  train_val_1.csv


In [13]:
torch.save(model, "xlnet_cnn.pth")

  "type " + obj.__name__ + ". It won't be checked "


In [0]:
class FilteringDataset(Dataset):
  def __init__(self, corpus, tokenizer_class, pretrained_weights, max_len):
    self.corpus = corpus.reset_index()
    self.corpus['sentence'].dropna(inplace=True)
    self.tokenizer = tokenizer_class.from_pretrained(pretrained_weights)# , do_lower_case=True)
    self.max_len = max_len
    # self.corpus['sentence'] = [self.tokenizer.encode(sent, add_special_tokens=True, max_length=max_len) for sent in self.corpus['sentence']]
    # self.corpus['sentence'] = pad_sequences(self.corpus['sentence'], padding='post').tolist()
    self.weights = torch.tensor(self.corpus['gold_label'].value_counts(normalize=True).tolist()).to(device)
    print(self.corpus['gold_label'].value_counts(normalize=True))

  def __len__(self):
    return len(self.corpus)

  def __getitem__(self, idx):
    if torch.is_tensor(idx):
      idx = idx.tolist()
    # print(type(self.corpus['sentence'][idx]))
    sentence = self.corpus['sentence'][idx]
    tok_seq = self.tokenizer.encode(sentence, add_special_tokens=True, max_length=self.max_len)
    while len(tok_seq) < self.max_len:
      tok_seq.append(0)
    X = torch.tensor(tok_seq).to(device)
    y = torch.tensor(self.corpus['gold_label'][idx]).to(device)
    sample = (X, y, sentence)
    return sample

NameError: ignored

In [0]:
!nvidia-smi -r

GPU Reset couldn't run because it failed to allocate group of reset devices : Uninitialized


In [0]:
for model_class, tokenizer_class, pretrained_weights in MODELS:
  # Loading the data and splitting it
  master_corpus = pd.read_csv("data/train_1.csv")
  master_dataset = FilteringDataset(master_corpus, tokenizer_class, pretrained_weights, max_len=128)
  master_loader = torch.utils.data.DataLoader(master_dataset, batch_size=1, shuffle=True)
  criterion = nn.CrossEntropyLoss(weight=master_dataset.weights)


  model = model_class.from_pretrained('bert-base')
  model.cuda()
  
  FP = []
  FN = []

  test_loss = 0
  test_preds = None
  test_labels = None
  for i, data in enumerate(master_loader):
    inp, labels, sent = data
    outputs = model(inp) # labels=b_labels)
    loss = criterion(outputs[0], labels)
      
    test_loss += loss.item()
    preds = np.argmax(outputs[0].detach().cpu().numpy(), axis=1).flatten()
    labels = labels.cpu().numpy().flatten()
    if test_preds is None or test_labels is None:
      test_preds = preds.copy()
      test_labels = labels.copy()
    else:
      test_preds = np.concatenate((test_preds, preds))
      test_labels = np.concatenate((test_labels, labels))

    for i in range(preds.shape[0]):
      if preds[i].item() is 0 and labels[i].item() is 1:
        FN.append(sent[i])
      elif preds[i].item() is 1 and labels[i].item() is 0:
        FP.append(sent[i])

  print("Test loss in epoch %d is %.5f" % (1, test_loss / len(master_loader)))
  print("Test accuracy in epoch %d is %.5f" % (1, accuracy_score(test_labels, test_preds) * 100))
  print("Test precision in epoch %d is %.5f" % (1, precision_score(test_labels, test_preds) * 100))
  print("Test recall in epoch %d is %.5f" % (1, recall_score(test_labels, test_preds) * 100))
  print("Test F1-score in epoch %d is %.5f" % (1, f1_score(test_labels, test_preds) * 100))

  for i in FP:
    print(i)
  print()
  for i in FN:
    print(i)

0    0.888154
1    0.111846
Name: gold_label, dtype: float64
Test loss in epoch 1 is 0.04509
Test accuracy in epoch 1 is 99.12308
Test precision in epoch 1 is 96.65738
Test recall in epoch 1 is 95.46080
Test F1-score in epoch 1 is 96.05536
As at DocX, the assignments were signed years after the mortgages should have been transferred to the investment trusts.
But it should have been clear that rules allowing retaliation against alleged currency manipulators, which would almost certainly fall foul of World Trade Organisation law, were a non-starter.
The 10 operations I had as a child and adolescent left scars on my ankles, knees, hips and shoulders that remind me daily of what was once there that should not have been.
Ashley Halsey from the Post: "The Federal Aviation Administration would have had 180 days to come up with regulations on the width, padding and leg room each seat should provide.
While the overall Medicaid budgets were routinely exceeded and that should have caused legislat

In [0]:
print(len(FP))
print(len(FN))

48
66


In [0]:
print(FP[np.argmin([len(x) for x in FP])])
print(FP[np.argmax([len(x) for x in FP])])
print(FN[np.argmin([len(x) for x in FN])])
print(FN[np.argmax([len(x) for x in FN])])

I thought that if I was just doing what the doctor said, I'd be fine.
The decision has been criticised by opponents, who say that rather than thinking about how the law would affect people like himselfwell-off, white, well-educatedthe governor ought to have thought about less privileged folk, who might find themselves under pressure from relatives or health-care providers to take a quick and cheap way out.
Yet it could have been far worse.
It's become fashionable to tell a disability story in a hopeful arc, where the heroine may have moments of discouragement or fear, but comes out into full life at the end - into mainstream schools, love and romance, full participation in the social world, and these stories have become so pervasive that if they were to spread to aliens they'd find them familiar.


In [0]:
print(len(FP))
print(len(FN))

132
1441


In [0]:
!pip install tensorflow-hub
import tensorflow_hub as hub

