In [1]:
!pip install torch pandas numpy simpletransformers nltk





In [1]:
import json
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from nltk import sent_tokenize


In [2]:
with open('../data/qanta_train.json') as train_f:
  train_json_data = json.load(train_f)['questions']


In [3]:
vocab = sorted(list(set([q['page'] for q in train_json_data])))
vocab.append('kUNK')
word2idx = {}
idx2word = {}

for i, word in enumerate(vocab):
  word2idx[word] = i
  idx2word[i] = word

def word_to_idx(word):
  if word in word2idx:
    return word2idx[word]
  else:
    return len(vocab)

def idx_to_word(idx):
  if idx == len(vocab):
    return 'kUNK'
  else:
    return idx2word[idx]

In [5]:
i = 0
with open('../data/qanta_train_runs.txt', 'w', encoding='utf-8') as train_txt:
    for q in train_json_data:
        text = q['text'].replace("\n", " ")
        text = text.replace("\t", " ")
        text = text.replace("\r", " ")
        
        sents = sent_tokenize(text)
        run = ""
        
        label = str(word_to_idx(q['page']))
        
        for sent in sents:
#             run += " " + sent
#             train_txt.write(run + '|#|' + label +'\n')
            train_txt.write(sent + '|#|' + label +'\n')

# check formatting
with open('../data/qanta_train_runs.txt', 'r', encoding='utf-8') as train_txt:   
    for line in train_txt:
        if '|#|' not in line:
            print(line)


In [6]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs

model_args = {
    'num_train_epochs': 10,
    'lazy_loading': True,
    'lazy_loading_start_line': 0,
    'max_length': 512,
    'save_steps': -1,
    'truncation': True,
    'lazy_delimiter': '|#|',
    'output_dir': 'roberta_runs_output3',
}

model = ClassificationModel(
    'roberta',
    'roberta-base',
    num_labels=len(vocab) + 1,
    args=model_args,
) 

model.train_model('../data/qanta_train_runs.txt')

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 0 of 10:   0%|          | 0/71711 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  torch.nn.utils.clip_grad_norm_(


Running Epoch 1 of 10:   0%|          | 0/71711 [00:00<?, ?it/s]

Running Epoch 2 of 10:   0%|          | 0/71711 [00:00<?, ?it/s]

Running Epoch 3 of 10:   0%|          | 0/71711 [00:00<?, ?it/s]

Running Epoch 4 of 10:   0%|          | 0/71711 [00:00<?, ?it/s]

Running Epoch 5 of 10:   0%|          | 0/71711 [00:00<?, ?it/s]

Running Epoch 6 of 10:   0%|          | 0/71711 [00:00<?, ?it/s]

Running Epoch 7 of 10:   0%|          | 0/71711 [00:00<?, ?it/s]

Running Epoch 8 of 10:   0%|          | 0/71711 [00:00<?, ?it/s]

Running Epoch 9 of 10:   0%|          | 0/71711 [00:00<?, ?it/s]

(717110, 3.9531878683908945)

In [7]:
with open('../data/qanta_test.json') as test_f:
  test_json_data = json.load(test_f)['questions']
test_data = [[q['text'], word_to_idx(q['page'])] for q in test_json_data]
test_df = pd.DataFrame(test_data, columns=['text', 'labels'])

texts = [text for [text, label] in test_data]
labels = np.array([label for [text, label] in test_data])

predictions, raw_outputs = model.predict(texts)

print(np.sum(predictions == labels), 'of', len(texts), 'correct')
predictions

  0%|          | 0/4104 [00:00<?, ?it/s]

  0%|          | 0/513 [00:00<?, ?it/s]

783 of 4104 correct


array([ 8039, 21573, 25083, ...,  1146,  1919, 17767], dtype=int64)

In [8]:
with open('../data/qanta_dev.json') as test_f:
  test_json_data = json.load(test_f)['questions']
test_data = [[q['text'], word_to_idx(q['page'])] for q in test_json_data]
test_df = pd.DataFrame(test_data, columns=['text', 'labels'])

texts = [text for [text, label] in test_data]
labels = np.array([label for [text, label] in test_data])

predictions, raw_outputs = model.predict(texts)

print(np.sum(predictions == labels), 'of', len(texts), 'correct')
predictions

  0%|          | 0/2216 [00:00<?, ?it/s]

  0%|          | 0/277 [00:00<?, ?it/s]

506 of 2216 correct


array([ 7159, 19148,  8133, ..., 23409,  5314, 13634], dtype=int64)

In [9]:
np.where(predictions == word_to_idx('kUNK'))

(array([], dtype=int64),)

In [4]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs

model = ClassificationModel(
    'roberta',
    './roberta_runs_output3',
    num_labels=len(vocab) + 1,
#     args=model_args,
) 

In [8]:
prompt = """
Robert Walker argued that failing to take this action would lead to an overflow of Northern insane asylums and British intervention.
Juan Almonte resigned his diplomatic post in indignation over this event. 
Isaac Van Zandt began discussing this plan with Abel Upshur before Upshur died. 
Anson Jones proposed this plan, which reduced his own power but reserved the weaker side the right to split
into five parts in the future. Five years after it, a payment of 10 million dollars helped the area at issue repay debts. 
The Regulator-Moderator war was calmed just before this deal was struck. 
The question of whether the Nueces River became a southern border in this transaction led to war later in James K Polk's presidency.
For 10 points, name this deal that ended the independent Lone Star Republic.
"""

predictions, raw_outputs = model.predict(["For 10 points, name this deal that ended the independent Lone Star Republic."])

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [12]:
[idx_to_word(x) for x in predictions]

['Gadsden_Purchase']