In [1]:
import pandas as pd
import numpy as np
import json, re
from tqdm import tqdm_notebook

## Torch Modules
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

## PyTorch Transformer
from pytorch_transformers import RobertaModel, RobertaTokenizer
from pytorch_transformers import RobertaForSequenceClassification, RobertaConfig

## Importing Datasets

In [2]:
# Empty frame with the two columns used throughout the notebook:
# 'phrase' (utterance text) and 'intent' (its class label).
dataset = pd.DataFrame(columns = ['phrase', 'intent'])

In [3]:
# Read the training utterances of every intent and build `dataset` with one
# (phrase, intent) row per utterance.
intent_names = ['AddToPlaylist', 'BookRestaurant', 'GetWeather', 'PlayMusic', 'RateBook',
                'SearchCreativeWork', 'SearchScreeningEvent']
records = []
for intent in intent_names:
    with open("./2017-06-custom-intent-engines/" + intent + "/train_" + intent + ".json",
              encoding='cp1251') as data_file:
        data = json.load(data_file)
    print("Intent: {}, Length: {}".format(intent, len(data[intent])))
    for example in data[intent]:
        # An utterance is stored as a list of chunks; concatenating the
        # chunks' 'text' fields rebuilds the full phrase.
        text = ''.join(chunk['text'] for chunk in example['data'])
        records.append({'phrase': text, 'intent': intent})

# Build the frame once from the collected rows. Row-by-row DataFrame.append
# copied the whole frame on every call and is gone in pandas >= 2.0; assigning
# a fresh frame also keeps this cell idempotent when it is re-run.
dataset = pd.DataFrame(records, columns=['phrase', 'intent'])

Intent: AddToPlaylist, Length: 300
Intent: BookRestaurant, Length: 300
Intent: GetWeather, Length: 300
Intent: PlayMusic, Length: 300
Intent: RateBook, Length: 300
Intent: SearchCreativeWork, Length: 300
Intent: SearchScreeningEvent, Length: 300


In [4]:
dataset.intent.unique()

array(['AddToPlaylist', 'BookRestaurant', 'GetWeather', 'PlayMusic',
       'RateBook', 'SearchCreativeWork', 'SearchScreeningEvent'],
      dtype=object)

In [5]:
# Assign each distinct intent label a consecutive integer id, in order of
# first appearance in the dataset.
label_to_ix = {}
for label in dataset.intent:
    for word in label.split():
        # setdefault only inserts when the word has not been seen yet, so the
        # next free id is always the current dictionary size.
        label_to_ix.setdefault(word, len(label_to_ix))

In [6]:
list(label_to_ix.values())

[0, 1, 2, 3, 4, 5, 6]

## Feature Preparation

In [7]:
def prepare_features(seq_1, max_seq_length = 300,
             zero_pad = False, include_CLS_token = True, include_SEP_token = True):
    """Turn a raw string into model input ids plus an attention mask.

    Relies on the notebook-level ``tokenizer`` object, which must exist
    before this function is called.

    Args:
        seq_1: Text to encode.
        max_seq_length: Upper bound on the number of ids, special tokens included.
        zero_pad: When True, right-pad ids and mask up to ``max_seq_length``.
        include_CLS_token: Prepend ``tokenizer.cls_token``.
        include_SEP_token: Append ``tokenizer.sep_token``.

    Returns:
        ``(input_ids, input_mask)``: an integer tensor of shape ``(1, n)`` and
        a list of ``n`` ints (1 for real tokens, 0 for padding).
    """
    ## Tokenize input
    tokens_a = tokenizer.tokenize(seq_1)

    ## Truncate, reserving room only for the special tokens actually added
    n_special = int(bool(include_CLS_token)) + int(bool(include_SEP_token))
    max_tokens = max(max_seq_length - n_special, 0)
    if len(tokens_a) > max_tokens:
        tokens_a = tokens_a[:max_tokens]

    ## Assemble tokens with the optional CLS / SEP markers
    tokens = []
    if include_CLS_token:
        tokens.append(tokenizer.cls_token)
    tokens.extend(tokens_a)
    if include_SEP_token:
        tokens.append(tokenizer.sep_token)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    ## Input mask: 1 for every real token
    input_mask = [1] * len(input_ids)

    ## Pad sequence length
    if zero_pad:
        # Pad with the tokenizer's own padding id. A literal 0 is wrong for
        # RoBERTa, where id 0 is the <s> token and <pad> has a different id.
        pad_id = tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0]
        n_pad = max_seq_length - len(input_ids)
        if n_pad > 0:
            input_ids.extend([pad_id] * n_pad)
            input_mask.extend([0] * n_pad)
    return torch.tensor(input_ids).unsqueeze(0), input_mask

In [8]:
# Quick smoke test of prepare_features on a sample sentence.
# NOTE: prepare_features reads the notebook-level `tokenizer`, which is only
# created in a later cell; run that cell first. The NameError recorded as this
# cell's output comes from executing it before `tokenizer` existed.
msg = "My dog is cute!"
prepare_features(msg)

NameError: name 'tokenizer' is not defined

## Checking RoBERTa lib

In [9]:
# Start from the stock 'roberta-base' configuration and size the
# classification head to one output per entry in label_to_ix.
config = RobertaConfig.from_pretrained('roberta-base')
config.num_labels = len(label_to_ix)
config

{
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 7,
  "output_attentions": false,
  "output_hidden_states": false,
  "torchscript": false,
  "type_vocab_size": 1,
  "vocab_size": 50265
}

In [13]:
!ls

1.text_classifier_roberta.ipynb roberta-base-merges.txt
[1m[36m2017-06-custom-intent-engines[m[m   roberta-base-vocab.json


In [15]:
# Build the tokenizer from the vocabulary and merges files sitting next to the
# notebook (see the `!ls` listing above). This is the notebook-level
# `tokenizer` that prepare_features reads.
tokenizer = RobertaTokenizer(vocab_file = 'roberta-base-vocab.json', merges_file = 'roberta-base-merges.txt')
# Alternative kept for reference: load the tokenizer by model name instead of
# from the local files.
#tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [16]:
# Device used to map checkpoint tensors when loading saved weights.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Path to a saved state_dict for this model. TODO: fill in; while it is empty
# the model keeps the weights it was constructed with.
CHECKPOINT_PATH = ''

model = RobertaForSequenceClassification(config)
if CHECKPOINT_PATH:
    # Only attempt the load when a path is configured: torch.load('') cannot
    # succeed, and `device` was previously undefined.
    model.load_state_dict(torch.load(CHECKPOINT_PATH, map_location=device))

In [10]:
# Build the sequence-classification model from `config` (num_labels was set on
# it earlier in the notebook).
# NOTE(review): no from_pretrained call is visible for the model, so its
# weights are presumably randomly initialised rather than the pretrained
# roberta-base weights; confirm this is intended.
model = RobertaForSequenceClassification(config)

KeyboardInterrupt: 

In [None]:
# Encode a sample sentence and add a leading dimension with unsqueeze(0),
# then display the resulting tensor.
msg = "My dog is really cute"
input_ids = torch.tensor(tokenizer.encode(msg)).unsqueeze(0) 
input_ids

In [None]:
outputs = model(input_ids)
len(outputs)

## Output of RoBERTa Model
- last_hidden_state
- pooler_output
- hidden_states (`optional`, returned when ``config.output_hidden_states=True``)
- attentions (`optional`, returned when ``config.output_attentions=True``)

## Output of RobertaForSequenceClassification Model
- **loss** (`optional`, returned when ``labels`` is provided)
- **logits**
- **hidden_states** (`optional`, returned when ``config.output_hidden_states=True``)
- **attentions** (`optional`, returned when ``config.output_attentions=True``)

In [None]:
class Intents(Dataset):
    """Dataset over a DataFrame with 'phrase' and 'intent' columns.

    Each item is ``(X, y)``: ``X`` is the first value returned by
    ``prepare_features`` for the phrase and ``y`` is the integer id of the
    intent looked up in the notebook-level ``label_to_ix``.
    """

    def __init__(self, dataframe):
        self.len = len(dataframe)
        self.data = dataframe

    def __getitem__(self, index):
        # Positional lookup: works whatever index labels the frame carries,
        # whereas label-based `series[index]` needs a 0..n-1 index.
        row = self.data.iloc[index]
        X, _ = prepare_features(row['phrase'])
        y = label_to_ix[row['intent']]
        return X, y

    def __len__(self):
        return self.len

In [None]:
train_size = 0.8
# Sample the training rows while they still carry their original index labels,
# and take the test set as the complement by label. Resetting the index before
# the drop would remove rows 0..len(train)-1 instead of the sampled rows,
# leaking training examples into the test set.
train_rows = dataset.sample(frac=train_size, random_state=200)
test_dataset = dataset.drop(train_rows.index).reset_index(drop=True)
train_dataset = train_rows.reset_index(drop=True)
# The two splits must partition the full dataset.
assert len(train_dataset) + len(test_dataset) == len(dataset)

In [None]:
print("FULL Dataset: {}".format(dataset.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

In [None]:
# Wrap both splits in the Intents dataset so items come back as (X, y) pairs.
training_set = Intents(train_dataset)
testing_set = Intents(test_dataset)

In [None]:
training_set.__getitem__(0)[0].shape

In [None]:
model(training_set.__getitem__(0)[0])

## Training Params

In [None]:
# Parameters
# Parameters
# Keyword arguments shared by both DataLoaders below. Items are not padded to
# a common length by default (prepare_features has zero_pad=False), so
# batch_size stays at 1.
params = {'batch_size': 1,
          'shuffle': True,
          'num_workers': 1}

In [None]:
# Both loaders use the same settings from `params`, so the test loader is
# shuffled as well.
training_loader = DataLoader(training_set, **params)
testing_loader = DataLoader(testing_set, **params)

In [None]:
training_loader.batch_size

In [None]:
# Loss, optimizer and epoch count for the training loop.
loss_function = nn.CrossEntropyLoss()
# NOTE(review): 1e-3 looks high for fine-tuning a transformer with Adam;
# values around 1e-5 to 5e-5 are commonly used. Confirm before a long run.
learning_rate = 0.001
optimizer = optim.Adam(params =  model.parameters(), lr=learning_rate)
max_epochs = 10

In [None]:
inp = training_set.__getitem__(0)[0]
inp

In [None]:
output = model(inp)[0]
output

In [None]:
torch.max(output.data, 1)

In [None]:
def _evaluate_accuracy(model, data_loader):
    """Return the percentage of examples in data_loader the model classifies correctly."""
    was_training = model.training
    # Inference mode: disable dropout and skip autograd bookkeeping.
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for eval_sent, eval_label in data_loader:
            eval_sent = eval_sent.squeeze(0)
            logits = model(eval_sent)[0]
            _, predicted = torch.max(logits, 1)
            total += eval_label.size(0)
            correct += (predicted.cpu() == eval_label.cpu()).sum().item()
    # Put the model back in whatever mode the caller had it in.
    model.train(was_training)
    return 100.00 * correct / total if total else 0.0


max_epochs = 10
for epoch in tqdm_notebook(range(max_epochs)):
    print("EPOCH -- {}".format(epoch))
    model.train()
    for i, (sent, label) in enumerate(training_loader):
        optimizer.zero_grad()
        # The loader stacks a batch dimension on top of the (1, n) tensor each
        # item already has; squeeze(0) removes it (valid while batch_size is 1).
        sent = sent.squeeze(0)
        output = model(sent)[0]

        loss = loss_function(output, label)
        loss.backward()
        optimizer.step()

        if i%100 == 0:
            # Periodic check on the held-out set; uses its own local names so
            # the training batch variables are not overwritten.
            accuracy = _evaluate_accuracy(model, testing_loader)
            print('Iteration: {}. Loss: {}. Accuracy: {}%'.format(i, loss.item(), accuracy))