In [None]:
import pandas as pd
import numpy as np
import json, re
from tqdm import tqdm_notebook

try:
    from collections import OrderedDict
except ImportError:
    from ordereddict import OrderedDict

# Torch, Sklearn imports
from sklearn.model_selection import train_test_split
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader, RandomSampler
print(torch.__version__)

## NLP libs
from nltk import download
import gensim

## PyTorch Transformer
from transformers import RobertaModel, RobertaTokenizer
from transformers import RobertaForSequenceClassification, RobertaConfig
from transformers.optimization import AdamW

# Hard-coded set of English stopwords; transformText filters tokens against it
# when it is called with do_stop=True.
stopwords = {"ourselves", "hers", "between", "yourself", "but", "again", "there", "about", "once", "during", "out", "very", "having", "with", "they", "own", "an", "be", "some", "for", "do", "its", "yours", "such", "into", "of", "most", "itself", "other", "off", "is", "s", "am", "or", "who", "as", "from", "him", "each", "the", "themselves", "until", "below", "are", "we", "these", "your", "his", "through", "don", "nor", "me", "were", "her", "more", "himself", "this", "down", "should", "our", "their", "while", "above", "both", "up", "to", "ours", "had", "she", "all", "no", "when", "at", "any", "before", "them", "same", "and", "been", "have", "in", "will", "on", "does", "yourselves", "then", "that", "because", "what", "over", "why", "so", "can", "did", "not", "now", "under", "he", "you", "herself", "has", "just", "where", "too", "only", "myself", "which", "those", "i", "after", "few", "whom", "t", "being", "if", "theirs", "my", "against", "a", "by", "doing", "it", "how", "further", "was", "here", "than"}

In [None]:
# Load the pickled phrases and expose the "usersays" column under the name "phrase".
# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.
dataset = pd.read_pickle('intents_phrases_183.pkl').rename(columns={"usersays": "phrase"})
dataset.tail()

In [None]:
def transformText(text, do_stop=False, do_stem=False):
    """Normalise a phrase before it is tokenised.

    Steps, in order: lower-case the text; delete every "'s", "’s", "?" and
    "-"; replace each non-ASCII character with a space; collapse repeated
    whitespace; optionally drop tokens found in the module-level ``stopwords``
    set; strip punctuation with gensim's ``strip_punctuation2``; collapse
    whitespace again; optionally run gensim's ``stem_text``.

    Args:
        text: the string to clean.
        do_stop: stopwords are removed only when this compares equal to True.
        do_stem: stemming is applied only when this compares equal to True.

    Returns:
        The cleaned string.
    """
    cleaned = text.lower()

    # Possessive suffixes, question marks and hyphens are deleted outright.
    for fragment in ("'s", "’s", "?", "-"):
        cleaned = cleaned.replace(fragment, "")

    # Every non-ASCII character becomes a space, then whitespace runs are collapsed.
    cleaned = re.sub(r'[^\x00-\x7f]', r' ', cleaned)
    cleaned = gensim.corpora.textcorpus.strip_multiple_whitespaces(cleaned)

    tokens = cleaned.split()
    if do_stop == True:
        tokens = [tok for tok in tokens if tok not in stopwords]

    # Punctuation is stripped after stopword filtering, on the re-joined text.
    cleaned = gensim.parsing.preprocessing.strip_punctuation2(" ".join(tokens))
    cleaned = gensim.corpora.textcorpus.strip_multiple_whitespaces(cleaned)

    if do_stem == True:
        cleaned = gensim.parsing.preprocessing.stem_text(cleaned)
    return cleaned

In [None]:
# Store a cleaned copy of every phrase (stopwords removed, stemming left off).
dataset['preproc_text'] = dataset['phrase'].apply(transformText, do_stop=True)
dataset.tail()

In [None]:
## Build label vocabulary
# One integer id per distinct intent, numbered in order of first appearance.
# The full intent string is the key: the Intents dataset looks labels up with
# label_to_ix[<whole intent>], so splitting labels on whitespace would raise a
# KeyError for any multi-word intent and would count words instead of classes.
label_to_ix = {label: ix for ix, label in enumerate(dataset.intent.unique())}
print("# Labels: {}".format(len(label_to_ix)))

In [None]:
# Tokenizer comes from the published roberta-base checkpoint.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Start from the roberta-base configuration, size the classifier to the label
# vocabulary and override the hidden width.
config = RobertaConfig.from_pretrained('roberta-base')
config.num_labels = len(label_to_ix)
config.hidden_size = 1200

# NOTE(review): the model is constructed from the config object alone, so no
# pretrained checkpoint weights are loaded here — confirm that is intended.
model = RobertaForSequenceClassification(config)
config

In [None]:
def prepare_features(seq_1, zero_pad = False, max_seq_length = 300):
    """Encode one utterance with the module-level ``tokenizer``.

    Args:
        seq_1: the text to encode.
        zero_pad: when true, right-pad the encoding up to ``max_seq_length``.
        max_seq_length: length limit handed to the tokenizer and the target
            length for padding.

    Returns:
        The mapping returned by ``tokenizer.encode_plus``, guaranteed to hold
        ``'input_ids'`` and ``'token_type_ids'`` lists of equal length.
    """
    # Honour the caller's limit; it used to be hard-coded to 300 here.
    enc_text = tokenizer.encode_plus(seq_1, add_special_tokens=True, max_length=max_seq_length)

    # Some tokenizers do not emit segment ids; supply all-zero ones so callers
    # can always index 'token_type_ids'.
    if 'token_type_ids' not in enc_text:
        enc_text['token_type_ids'] = [0] * len(enc_text['input_ids'])

    if zero_pad:
        missing = max_seq_length - len(enc_text['input_ids'])
        if missing > 0:
            # Pad with the tokenizer's own padding id. In RoBERTa's vocabulary
            # id 0 is the "<s>" start token, not padding.
            pad_id = tokenizer.pad_token_id
            if pad_id is None:
                pad_id = 0
            enc_text['input_ids'].extend([pad_id] * missing)
            enc_text['token_type_ids'].extend([0] * missing)
    return enc_text

In [None]:
# Try the encoder on a short sample phrase with padding on; the cell displays the returned encoding.
prepare_features("testing this loved", zero_pad = True)

In [None]:
class Intents(Dataset):
    """Torch dataset over a dataframe of preprocessed utterances and intents.

    The dataframe must provide a ``preproc_text`` column (text to encode) and
    an ``intent`` column (label string present in ``label_to_ix``).
    """

    def __init__(self, dataframe):
        self.len = len(dataframe)
        self.data = dataframe

    def __getitem__(self, index):
        """Return ``(input_ids, token_type_ids, label_id)`` for row ``index``.

        Rows are addressed by position (``.iloc``), so the dataframe does not
        need a fresh 0..n-1 index for the DataLoader's integer indices to work.
        """
        utterance = self.data.preproc_text.iloc[index]
        intent = self.data.intent.iloc[index]
        X = prepare_features(utterance, zero_pad = True)
        y = label_to_ix[intent]
        return np.array(X['input_ids']), np.array(X['token_type_ids']), y

    def __len__(self):
        return self.len

In [None]:
train_size = 0.8

# Split the original rows BEFORE any duplication, and remove the sampled rows
# by their original index labels. Resetting the training index first would
# make drop() remove the first k rows instead of the sampled ones, leaving
# most test rows inside the training set as well.
_rows = dataset.reset_index(drop=True)  # unique 0..n-1 labels for a safe drop
_train_rows = _rows.sample(frac=train_size, random_state=200)
test_dataset = _rows.drop(_train_rows.index).reset_index(drop=True)

# Only the training rows are doubled, so no copy of a test phrase can end up
# in training. `dataset` itself is left untouched, which keeps this cell safe
# to re-run. Row order does not matter here: the training DataLoader shuffles.
train_dataset = pd.concat([_train_rows, _train_rows]).reset_index(drop=True)

training_set = Intents(train_dataset)
testing_set = Intents(test_dataset)

In [None]:
# Dataloaders Parameters
params = {'batch_size': 2,
          'shuffle': True,
          'drop_last': True,
          'num_workers': 0}
training_loader = DataLoader(training_set, **params)
testing_loader = DataLoader(testing_set, **params)
# Hyperparams
loss_function = nn.CrossEntropyLoss()
learning_rate = 1e-3
optimizer = AdamW(params=model.parameters(), lr=learning_rate)

In [None]:
# Pull a single batch from the training loader; ids and tokens are fed to the
# model in the forward-pass check a few cells below.
ids, tokens, labels = next(iter(training_loader))

In [None]:
# Report whether a CUDA device is usable; the cell displays the boolean.
torch.cuda.is_available()

In [None]:
# Move the model onto the GPU when one is usable; otherwise keep it as it is.
model = model.cuda() if torch.cuda.is_available() else model

In [None]:
# One forward pass on the sample batch as a smoke check. The batch is moved to
# the GPU only when CUDA is usable, matching the guard used when the model was
# moved, so this cell also runs on CPU-only machines.
if torch.cuda.is_available():
    ids, tokens = ids.cuda(), tokens.cuda()
out = model(ids, token_type_ids=tokens)[0]

In [None]:
max_epochs = 3


def _evaluate_accuracy(net, loader):
    """Return the accuracy of ``net`` over ``loader`` as a percentage.

    Runs in eval mode with gradient tracking off, then restores whatever mode
    the model was in. Returns NaN when the loader yields no samples.
    """
    was_training = net.training
    net.eval()
    device = next(net.parameters()).device  # send batches to wherever the model lives
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_ids, batch_tokens, batch_labels in loader:
            batch_ids = batch_ids.to(device)
            batch_tokens = batch_tokens.to(device)
            batch_labels = batch_labels.to(device)
            logits = net(batch_ids, token_type_ids=batch_tokens)[0]
            predicted = logits.argmax(dim=1)
            total += batch_labels.size(0)
            correct += (predicted == batch_labels).sum().item()
    net.train(was_training)
    return 100.00 * correct / total if total else float('nan')


model = model.train()
for epoch in tqdm_notebook(range(max_epochs)):
    print("EPOCH -- {}".format(epoch))
    for i, (ids, tokens, labels) in enumerate(training_loader):
        optimizer.zero_grad()
        if torch.cuda.is_available():
            ids = ids.cuda()
            tokens = tokens.cuda()
            labels = labels.cuda()
        output = model(ids, token_type_ids=tokens)[0]
        loss = loss_function(output, labels)
        loss.backward()
        optimizer.step()

        if i%100 == 0:
            # Accuracy comes from fresh predictions on each test batch. It was
            # previously computed from `out`, a stale tensor left over from the
            # earlier smoke-check cell, so the reported number was meaningless.
            accuracy = _evaluate_accuracy(model, testing_loader)
            print('Iteration: {}. Loss: {}. Accuracy: {}%'.format(i, loss.item(), accuracy))