In [1]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split


from transformers import XLNetModel, XLNetTokenizer, XLNetForSequenceClassification
from transformers import AdamW

from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
from src import utils

# Import label encoder
from sklearn import preprocessing
%matplotlib inline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = utils.load_data("data/train.jsonl")

In [3]:
# Create sentence and label lists
sentences = df.postText.to_list()

In [4]:
sentences = [sentence[0] + " [SEP] [CLS]" for sentence in sentences]
labels = [i[0] for i in df.tags.values]

In [5]:
# label_encoder object knows how to understand word labels.
label_encoder = preprocessing.LabelEncoder()
labels = label_encoder.fit_transform(labels)

In [22]:
print(labels)
written_labels = label_encoder.inverse_transform(labels)
print(written_labels)

[1 2 2 ... 0 2 2]
['passage' 'phrase' 'phrase' ... 'multi' 'phrase' 'phrase']


In [None]:
decoding_array = {1: "passage",
2 : "phrase",
0 : "multi"}

In [6]:
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased', do_lower_case=True)

tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
print ("Tokenize the first sentence:")
print (tokenized_texts[0])

Tokenize the first sentence:
['▁we', 's', '▁', 'wel', 'ker', '▁wanted', '▁dinner', '▁with', '▁to', 'm', '▁bra', 'dy', ',', '▁but', '▁patriot', 's', '▁', 'q', 'b', '▁had', '▁better', '▁idea', '▁[', 's', 'ep', ']', '▁[', 'cl', 's', ']']


In [7]:
# Set the maximum sequence length. The longest sequence in our training set is 47, but we'll leave room on the end anyway. 
MAX_LEN = 128
# Use the XLNet tokenizer to convert the tokens to their index numbers in the XLNet vocabulary
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
# Pad our input tokens
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [8]:
# Create attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask)
# Use train_test_split to split our data into train and validation sets for training

train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, 
                                                            random_state=2018, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)


In [9]:
print(train_inputs.shape)
print(validation_inputs.shape)
print(train_labels.shape)
print(validation_labels.shape)

(2880, 128)
(320, 128)
(2880,)
(320,)


In [10]:
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

In [11]:
# Select a batch size for training. For fine-tuning with XLNet, the authors recommend a batch size of 32, 48, or 128. We will use 32 here to avoid memory issues.
batch_size = 32

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memory

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [12]:
# Load XLNEtForSequenceClassification, the pretrained XLNet model with a single linear classification layer on top. 

model = XLNetForSequenceClassification.from_pretrained("xlnet-large-cased", num_labels=3)

Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]



In [14]:
# This variable contains all of the hyperparemeter information our training loop needs
optimizer = AdamW(optimizer_grouped_parameters,
                     lr=2e-5)



In [15]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [16]:
# Store our loss and accuracy for plotting
train_loss_set = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 6

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):
  
  
  # Training
  
  # Set our model to training mode (as opposed to evaluation mode)
  model.train()
  
  # Tracking variables
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0
  
  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()
    # Forward pass
    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
    loss = outputs[0]
    logits = outputs[1]
    train_loss_set.append(loss.item())    
    # Backward pass
    loss.backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()
    
    
    # Update tracking variables
    tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  print("Train loss: {}".format(tr_loss/nb_tr_steps))
    
    
  # Validation

  # Put model in evaluation mode to evaluate loss on the validation set
  model.eval()

  # Tracking variables 
  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps, nb_eval_examples = 0, 0

  # Evaluate data for one epoch
  for batch in validation_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
      # Forward pass, calculate logit predictions
      output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
      logits = output[0]
    
    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1

  print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))

Epoch:   0%|          | 0/6 [00:00<?, ?it/s]

Train loss: 1.0571460670895048


Epoch:  17%|█▋        | 1/6 [1:08:45<5:43:46, 4125.35s/it]

Validation Accuracy: 0.440625
Train loss: 1.0537956655025482


Epoch:  33%|███▎      | 2/6 [2:16:54<4:33:35, 4103.87s/it]

Validation Accuracy: 0.4375
Train loss: 1.0432039505905575


Epoch:  50%|█████     | 3/6 [3:26:10<3:26:23, 4127.72s/it]

Validation Accuracy: 0.53125
Train loss: 0.8903786096307966


Epoch:  67%|██████▋   | 4/6 [5:09:15<2:44:39, 4939.95s/it]

Validation Accuracy: 0.68125
Train loss: 0.7584818604919645


Epoch:  83%|████████▎ | 5/6 [6:44:13<1:26:53, 5213.35s/it]

Validation Accuracy: 0.68125
Train loss: 0.6399713009595871


Epoch: 100%|██████████| 6/6 [8:16:16<00:00, 4962.72s/it]  

Validation Accuracy: 0.659375





In [23]:
model.save_pretrained("xltask1_model.h5")

In [20]:
model.push_to_hub("EstrixDS/XLNet_SemEval_Task1",use_auth_token="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")

CommitInfo(commit_url='https://huggingface.co/EstrixDS/XLNet_SemEval_Task1/commit/51e56bb7e186aefca2c870907531d01b30525556', commit_message='Upload XLNetForSequenceClassification', commit_description='', oid='51e56bb7e186aefca2c870907531d01b30525556', pr_url=None, pr_revision=None, pr_num=None)

In [109]:
df = pd.DataFrame([
    {"uuid": "b2303ab7-f978-4576-b563-899f73397ed5", "postText": ["Guess who hasn't seen the Star Wars: The Force awakens trailer?"], "targetParagraphs": ["On YouTube alone, the teaser trailer for Star Wars: The Force Awakens has more than 12 million views, but you can count one person out of that number: George Lucas, the man who created the Star Wars universe.", "Page Six asked Lucas for his thoughts on the 88-second glimpse into the future of the galaxy far, far away, only to discover that he had none.", "\"I don't know anything about it,\" Lucas said. \"I haven't seen it yet.\"", "Asked why, he explained that it was \"Because it's not in the movie theater. I like going to the movies and watching the whole thing there.", "\"I plan to see it when it's released.\"", "The filmmaker sold Lucasfilm and its attending franchises to Disney in October 2012 in a deal worth more than $4 billion. At the announcement, Disney revealed that Star Wars: Episode VII was in production, based on Lucas' outline. According to Disney's announcement, Lucas would serve as a \"creative consultant\" for the franchise.", "Disney announced the cast of the next installment, which includes actors from the original trilogy, in April 2014. The first trailer hit on Black Friday. Directed by J.J. Abrams, Star Wars: The Force Awakens is slated to open Dec. 15, 2015. You can watch the teaser trailer below. For more on what it might mean, be sure to read Polygon's analysis."], "targetTitle": "George Lucas doesn't 'know anything about' the new Star Wars trailer", "targetDescription": "On YouTube alone, the teaser trailer for Star Wars: The Force Awakens has more than 12 million views, but you can count one person out of that number: George Lucas, the man who created the Star...", "targetUrl": "http://polygon.com/e/7119322", "provenance": {"source": "anonymized", "humanSpoiler": "George Lucas.", "spoilerPublisher": "SavedYouAClick"}, "spoiler": ["George Lucas"], "spoilerPositions": [[[0, 151], [0, 163]]], "tags": ["phrase"]},
    {"uuid": "09f9794e-134e-4e58-8ec2-8259ec40c136", "postText": ["Has \"Star Trek 3\" found its director?"], "targetParagraphs": ["Joe Cornish could replace J.J. Abrams as king of the \"Star Trek\" universe. That's the report from Deadline.com's Mike Fleming, who writes that Paramount \"is sweet\" on the idea of Cornish directing the franchise's next installment.", "This isn't the first time Cornish, who directed the cult hit \"Attack the Block\" and co-wrote the script for \"Ant-Man\" with Edgar Wright, has had his name attached to \"Star Trek 3.\" Back in May, Latino Review reporter Umberto \"El Mayimbe\" Gonzalez tweeted that Cornish was under consideration as a possible replacement for Abrams, who is next directing \"Star Wars: Episode VII.\"", "I guess y'all wanna know about who might be directing STAR TREK 3 if it ever goes. Heard Joe Cornish BUT also heard he's on a list of names. — elmayimbe (@elmayimbe) May 23, 2013", "I'm NOT saying Joe Cornish is the guy, but what I am saying is the he is definitely one of NUMEROUS contenders. — elmayimbe (@elmayimbe) May 23, 2013", "Other reported contenders for \"Star Trek 3\" have included Jon M. Chu and Rupert Wyatt. In an email to HuffPost Entertainment, however, Chu's representatives denied that \"G.I. Joe: Retaliation\" director was up for the job. Wyatt's involvement was never confirmed or denied, but Abrams did discuss the \"Rise of the Planet of the Apes\" director in an interview with HitFix.", "\"Whomever it is that directs the film will be someone we all know is going to keep the cast and crew in good hands,\" Abrams told Collider back in September. \"I feel very lucky to have been part of it, and it definitely feels like the right time to let someone come in and do their own thing. I certainly don’t want someone to come in and try to do what I would have done. We want to hire someone who's gonna come in and bring their own sensibility. I'm very excited to see what comes next, despite feeling jealous of whomever that person is.\"", "HuffPost Entertainment contacted Cornish's representatives for comment on the Deadline.com rumor; this post will be updated if they respond. For more on Cornish, meanwhile, head to Deadline.com.", "[via Deadline.com]"], "targetTitle": "Joe Cornish Rumored For 'Star Trek 3' Director Job", "targetDescription": "Joe Cornish could replace J.J. Abrams as king of the \"Star Trek\" universe. That's the report from Deadline.com's Mike Fleming,", "targetUrl": "http://huff.to/1aVGhr4", "provenance": {"source": "anonymized", "humanSpoiler": "This article doesn't know but the rumor is Joe Cornish", "spoilerPublisher": "HuffPoSpoilers"}, "spoiler": ["Joe Cornish could replace J.J. Abrams as king of the \"Star Trek\" universe."], "spoilerPositions": [[[0, 0], [0, 74]]], "tags": ["passage"]},
])

# Create sentence and label lists
sentences = df.postText.values

# We need to add special tokens at the beginning and end of each sentence for XLNet to work properly
sentences = [sentence[0] + " [SEP] [CLS]" for sentence in sentences]
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]


MAX_LEN = 128
# Use the XLNet tokenizer to convert the tokens to their index numbers in the XLNet vocabulary
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
# Pad our input tokens
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
# Create attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask) 

prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)
  
batch_size = 32  


prediction_data = TensorDataset(prediction_inputs, prediction_masks)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)
# Prediction on test set

# Put model in evaluation mode
model.eval()

# Tracking variables 
predictions , true_labels = [], []

# Predict 
for batch in prediction_dataloader:
  # Add batch to GPU
  batch = tuple(t.to(device) for t in batch)
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask = batch
  # Telling the model not to compute or store gradients, saving memory and speeding up prediction
  with torch.no_grad():
    # Forward pass, calculate logit predictions
    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    logits = outputs[0]

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  
  # Store predictions and true labels
  predictions.append(logits)
  decoding_array = {  
        1: "passage",
        2 : "phrase",
        0 : "multi"
        }
  for z in predictions[0]:
    label = np.where(z == z.max())[0][0]
    decoded_label = decoding_array[label]

passage
multi
