In [None]:
"https://colab.research.google.com/drive/1xg4UMQmXjDik3v9w-dAsk4kq7dXX_0Fm#scrollTo=-n3p7ncn6Xst"  #source link

In [1]:
import pandas as pd
import numpy as np
import json, re
from tqdm import tqdm_notebook
from uuid import uuid4

## Torch Modules
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

In [38]:
!pip install pytorch-transformers
!pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 7.5 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 43.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.9 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.5.1 tokenizers-0.12.1 transformers-4.18.0


In [82]:
from pytorch_transformers import RobertaModel, RobertaTokenizer
from pytorch_transformers import RobertaForSequenceClassification, RobertaConfig 




In [4]:
import yaml
import pandas as pd 




In [83]:
def parse_yaml(file_path):
    labels_number = 0
    a_yaml_file = open(file_path)
    parsed_yaml_file = yaml.load(a_yaml_file, Loader=yaml.FullLoader)

    df = pd.DataFrame(columns = ["Text", "label"])
    intents = parsed_yaml_file.get("intents")
    for intent in intents :
        sentences = intent["examples"]["en"]
        label = intent["name"]
        labels_number = labels_number + 1
        for sentence in sentences : 
            df = df.append(pd.Series([sentence, label], index=['Text','label']), ignore_index=True)

    return(df,labels_number)



In [84]:
parse_yaml("flow.yaml")[0]


Unnamed: 0,Text,label
0,good morning,greeting
1,hi,greeting
2,hello,greeting
3,how are you doing?,greeting
4,good evening,greeting
5,I want to cnacel my subscripton,cancel subscription
6,I do not want to get any new messages from you,cancel subscription
7,would you please cancel my subscriotion?,cancel subscription
8,Can I hold my subscription,cancel subscription
9,Please do not send me more emails,cancel subscription


In [85]:
label_to_ix = {}
for label in parse_yaml("flow.yaml")[0].label:
        if label not in label_to_ix:
            label_to_ix[label]=len(label_to_ix)
label_to_ix

{'cancel subscription': 1, 'greeting': 0, 'thanks': 2}

In [86]:
config = RobertaConfig.from_pretrained('roberta-base')
config.num_labels = len(list(label_to_ix.values()))
config

{
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 3,
  "output_attentions": false,
  "output_hidden_states": false,
  "pad_token_id": 1,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 1,
  "vocab_size": 50265
}

In [87]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification(config)

In [88]:
def prepare_features(seq_1, max_seq_length = 300, 
             zero_pad = False, include_CLS_token = True, include_SEP_token = True):
    ## Tokenzine Input
    tokens_a = tokenizer.tokenize(seq_1)
   

    ## Truncate
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0:(max_seq_length - 2)]
    ## Initialize Tokens
    tokens = []
    if include_CLS_token:
        tokens.append(tokenizer.cls_token)

    ## Add Tokens and separators
    for token in tokens_a:
        tokens.append(token)

    if include_SEP_token:
        tokens.append(tokenizer.sep_token)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    ## Input Mask 
    input_mask = [1] * len(input_ids)
    ## Zero-pad sequence lenght
    if zero_pad:
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
    return torch.tensor(input_ids).unsqueeze(0), input_mask

In [50]:
msg = "My dog is cute!"
prepare_features(msg)

(tensor([[    0,  2646, 10269,    83, 99942,    38,     2]]),
 [1, 1, 1, 1, 1, 1, 1])

In [89]:
from typing import Text
class Intents(Dataset):
    def __init__(self, dataframe):
        self.len = len(dataframe)
        self.data = dataframe
        
    def __getitem__(self, index):
        Text = self.data.Text[index]
        label = self.data.label[index]
        X, _  = prepare_features(Text)
        y = label_to_ix[self.data.label[index]]
        return X, y
    
    def __len__(self):
        return self.len

In [90]:
train_size = 0.8
train_dataset= parse_yaml("flow.yaml")[0].sample(frac=train_size,random_state=200).reset_index(drop=True)
test_dataset= parse_yaml("flow.yaml")[0].drop(train_dataset.index).reset_index(drop=True)

In [91]:
print("FULL Dataset: {}".format(parse_yaml("flow.yaml")[0].shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

FULL Dataset: (14, 2)
TRAIN Dataset: (11, 2)
TEST Dataset: (3, 2)


In [92]:
training_set = Intents(train_dataset)
testing_set = Intents(test_dataset)

In [93]:
training_set.__getitem__(0)[0].shape

torch.Size([1, 7])

In [94]:
model(training_set.__getitem__(0)[0])

(tensor([[-0.2019,  0.0441, -0.1673]], grad_fn=<AddmmBackward0>),)

In [95]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.cpu()

In [96]:
params = {'batch_size': 1,
          'shuffle': True,
          'drop_last': False,
          'num_workers': 1}

In [97]:
training_loader = DataLoader(training_set, **params)
testing_loader = DataLoader(testing_set, **params)

In [98]:
loss_function = nn.CrossEntropyLoss()
learning_rate = 1e-05
optimizer = optim.Adam(params =  model.parameters(), lr=learning_rate)

In [99]:
## Test Forward Pass
inp = training_set.__getitem__(0)[0].cpu()
output = model(inp)[0]
print(output.shape)

torch.Size([1, 3])


In [100]:
max_epochs = 3
model = model.train()
for epoch in tqdm_notebook(range(max_epochs)):
    print("EPOCH -- {}".format(epoch))
    for i, (sent, label) in enumerate(training_loader):
        optimizer.zero_grad()
        sent = sent.squeeze(0)
        if torch.cuda.is_available():
          sent = sent.cuda()
          label = label.cuda()
        output = model.forward(sent)[0]
        _, predicted = torch.max(output, 1)
        
        loss = loss_function(output, label)
        loss.backward()
        optimizer.step()
        
        if i%100 == 0:
            correct = 0
            total = 0
            for sent, label in testing_loader:
                sent = sent.squeeze(0)
                if torch.cuda.is_available():
                  sent = sent.cuda()
                  label = label.cuda()
                output = model.forward(sent)[0]
                _, predicted = torch.max(output.data, 1)
                total += label.size(0)
                correct += (predicted.cpu() == label.cpu()).sum()
            accuracy = 100.00 * correct.numpy() / total
            print('Iteration: {}. Loss: {}. Accuracy: {}%'.format(i, loss.item(), accuracy))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


  0%|          | 0/3 [00:00<?, ?it/s]

EPOCH -- 0
Iteration: 0. Loss: 0.9939078688621521. Accuracy: 0.0%
EPOCH -- 1
Iteration: 0. Loss: 1.083217740058899. Accuracy: 100.0%
EPOCH -- 2
Iteration: 0. Loss: 0.5837330222129822. Accuracy: 33.333333333333336%


In [30]:
torch.save(model.state_dict(), ''+ str(uuid4())+'.pth')

In [31]:
model_path = '/content/44843f8e-57ca-4fbb-a87b-9119bd4112cb.pth'

In [32]:
model.load_state_dict(torch.load(model_path, map_location=device))

<All keys matched successfully>

In [33]:
def get_reply(msg):
  model.eval()
  input_msg, _ = prepare_features(msg)
  if torch.cuda.is_available():
    input_msg = input_msg.cuda()
  output = model(input_msg)[0]
  _, pred_label = torch.max(output.data, 1)
  prediction=list(label_to_ix.keys())[pred_label]
  return prediction

In [34]:
label_to_ix.keys()

dict_keys(['greeting', 'cancel subscription', 'thanks'])

In [35]:
get_reply("I want to cancel my subscripton	")

'cancel subscription'