In [None]:
#!pip install imbalanced-learn

In [1]:
import pandas as pd
import numpy as np
import json, re
from tqdm import tqdm_notebook

try:
    from collections import OrderedDict
except ImportError:
    from ordereddict import OrderedDict

# Torch, Sklearn imports
from sklearn.model_selection import train_test_split
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader, RandomSampler

## NLP libs
from nltk import download
import gensim

## PyTorch Transformer
import transformers

## Roberta
from transformers import RobertaModel, RobertaTokenizer
from transformers import RobertaForSequenceClassification, RobertaConfig

## DistilBert
from transformers import DistilBertModel, DistilBertTokenizer
from transformers import DistilBertForSequenceClassification, DistilBertConfig
from transformers.optimization import AdamW, WarmupLinearSchedule

from timebudget import timebudget
timebudget.report_atexit()  # Generate report when the program exits

import warnings
warnings.filterwarnings("ignore")
stopwords = {"ourselves", "hers", "between", "yourself", "but", "again", "there", "about", "once", "during", "out", "very", "having", "with", "they", "own", "an", "be", "some", "for", "do", "its", "yours", "such", "into", "of", "most", "itself", "other", "off", "is", "s", "am", "or", "who", "as", "from", "him", "each", "the", "themselves", "until", "below", "are", "we", "these", "your", "his", "through", "don", "nor", "me", "were", "her", "more", "himself", "this", "down", "should", "our", "their", "while", "above", "both", "up", "to", "ours", "had", "she", "all", "no", "when", "at", "any", "before", "them", "same", "and", "been", "have", "in", "will", "on", "does", "yourselves", "then", "that", "because", "what", "over", "why", "so", "can", "did", "not", "now", "under", "he", "you", "herself", "has", "just", "where", "too", "only", "myself", "which", "those", "i", "after", "few", "whom", "t", "being", "if", "theirs", "my", "against", "a", "by", "doing", "it", "how", "further", "was", "here", "than"}

print(torch.__version__)
print(transformers.__version__)

1.1.0
2.1.1


In [2]:
!ls

1.text_classifier_roberta.ipynb      4.nips2019_papers_simple_altair.ipynb
1.text_classifier_roberta_NEW.ipynb  4.sample_dpp.ipynb
2017-06-custom-intent-engines	     intents_phrases_183.pkl
2.uncertainty_swag.ipynb	     intents_phrases_186.pkl
3.causality_review.ipynb	     model_elmo_swag_uncertainty.pth
4.dpp_diversity_phrases.ipynb	     nips_2018_bert.pkl
4.dpp_image.ipynb		     nips_2018_elmo.pkl
4.nips2019_papers.ipynb		     nips_2018.pkl


In [3]:
dataset_path = "intents_phrases_186.pkl"
dataset = pd.read_pickle(dataset_path)
dataset = dataset.rename(columns={"usersays":"phrase"})
dataset.tail()

Unnamed: 0,intent,phrase
2770,workerVeteranStatus.update,modify her military information
2771,workerVeteranStatus.update,modify worker military status
2772,workerVeteranStatus.update,change employee veteran status
2773,workerVeteranStatus.update,change his military status
2774,workerVeteranStatus.update,update Brian's veteran status


In [4]:
dataset.intent.value_counts()

positionRelationships.update              49
workerMaritalStatus.update                39
associateGovernmentRegistration.update    39
question.detect                           37
personMaritalStatus.update                35
                                          ..
location.read                              7
view.job                                   6
jobSearch.update                           6
jobSearch.create                           6
jobSearch.delete                           5
Name: intent, Length: 186, dtype: int64

In [5]:
# # Make shorter version of the dataset
# selected_intents = ['position.update',
#                     'jobBoard.update',
#                     'job.create',
#                     'lateralMove',
#                     'band.update',
#                     'adjustment',
#                    'worker.changeManager']
# dataset = dataset[dataset.intent.isin(selected_intents)].reset_index(drop=True)
# print(len(set(dataset.intent)))
# dataset.tail()

In [6]:
# dataset_path = './2017-06-custom-intent-engines/'
# ## Another dataset
# dataset = pd.DataFrame(columns = ['utterance', 'label'])
# for intent in ['AddToPlaylist', 'BookRestaurant', 'GetWeather', 'PlayMusic', 'RateBook', 'SearchCreativeWork',
#                'SearchScreeningEvent']:
#     with open(dataset_path + intent + "/train_" + intent + ".json",
#               encoding='cp1251') as data_file:
#         data = json.load(data_file)
#     print("Class: {}, # utterances: {}".format(intent,len(data[intent])))
#     texts = []
#     for i in range(len(data[intent])):
#         text = ''
#         for j in range(len(data[intent][i]['data'])):
#             text += data[intent][i]['data'][j]['text']
#         dataset = dataset.append({'utterance': text, 'label': intent}, ignore_index=True)
# dataset = dataset.rename(columns={"utterance":"phrase", "label":"intent"})
# dataset.tail()

## Over-sampling dataset

In [7]:
def resample_dataset(dataframe,
                     label_column = 'intent',
                     feature_column = 'phrase',
                     max_samples = 100):
    
    from imblearn.over_sampling import SMOTE, RandomOverSampler, ADASYN

    ## Build label vocabulary
    label_to_ix = {}
    for label in dataset[label_column]:
        for word in label.split():
            if word not in label_to_ix:
                label_to_ix[word]=len(label_to_ix)
          
    ## Define Sampling Strategy based on number of samples
    classes_sample = {}
    for cls in list(set(label_to_ix.values())):
        classes_sample.update({cls:max_samples})

    sampler = RandomOverSampler(sampling_strategy = classes_sample, random_state=42)
    x = np.array(dataset.index).reshape(-1, 1)
    y = np.array(list(dataset[label_column].apply(lambda x: label_to_ix[x])))
    
    ## Oversampling
    x_resampled, y_resampled = sampler.fit_sample(x, y)
    dataset_resampled = pd.DataFrame(columns=[feature_column,label_column])
    
    ## Iterating
    for i, item in enumerate(x_resampled):
        row = {
            feature_column :dataset[feature_column].loc[item[0]],
            label_column: list(label_to_ix.keys())[y_resampled[i]]
        }
        dataset_resampled = dataset_resampled.append(row, ignore_index=True)
    return dataset_resampled

In [8]:
dataset = resample_dataset(dataset, max_samples = 50)
dataset.intent.value_counts()

personVeteranStatus.update                50
personCitizenship.update                  50
personDeathDate.inform                    50
band.activate                             50
workerStartDate.update                    50
                                          ..
legalEntity.deactivate                    50
termination.revoke                        50
position.update                           50
team.leave                                50
associateGovernmentRegistration.update    50
Name: intent, Length: 186, dtype: int64

In [9]:
dataset.tail()

Unnamed: 0,phrase,intent
9295,modify her military information,workerVeteranStatus.update
9296,edit military service for him,workerVeteranStatus.update
9297,change his military status,workerVeteranStatus.update
9298,change his military status,workerVeteranStatus.update
9299,They're no longer in the military,workerVeteranStatus.update


## Cleaning Dataset

In [10]:
def transformText(text, do_stop=False, do_stem=False):
    # Convert text to lower
    text = text.lower()
    
    # Cleaning input
    text = text.replace("'s","")
    text = text.replace("’s","")
    text = text.replace("?","")
    text = text.replace("-","")
    
    # Removing non ASCII chars    
    text = re.sub(r'[^\x00-\x7f]',r' ',text)
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    # Removing all the stopwords
    if (do_stop==True):
        filtered_words = [word for word in text.split() if word not in stopwords]
    else:
        filtered_words = [word for word in text.split()]
    # Preprocessed text after stop words removal
    text = " ".join(filtered_words)
    # Remove the punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    if (do_stem==True):
        # Stemming
        text = gensim.parsing.preprocessing.stem_text(text)
    return text

In [67]:
dataset['preproc_text'] = dataset['phrase'].apply(lambda x: transformText(x, do_stop=False))
dataset.tail(5)

Unnamed: 0,phrase,intent,preproc_text
9295,I'm unmarried now,personMaritalStatus.update,i m unmarried now
9296,schedule evaluation,evaluation.schedule,schedule evaluation
9297,launch pay band,band.activate,launch pay band
9298,add some sick days to compensation,compensationPlan.create,add some sick days to compensation
9299,I divorced,personMaritalStatus.update,i divorced


In [69]:
## Build label vocabulary
label_to_ix = {}
for label in dataset.intent:
    for word in label.split():
        if word not in label_to_ix:
            label_to_ix[word]=len(label_to_ix)
print("# Labels: {}".format(len(label_to_ix)))

# Labels: 186


In [None]:
model_type = 'roberta'
## Distilbert
if model_type == 'distilbert':
    print("DistilBERT")
    config = DistilBertConfig.from_pretrained('distilbert-base-uncased')
    config.num_labels = len(list(label_to_ix.values()))
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    model = DistilBertForSequenceClassification(config)
    print(config)
elif model_type == 'roberta':
    print("RoBERTa")
    config = RobertaConfig.from_pretrained('roberta-base')
    config.num_labels = len(list(label_to_ix.values()))
    config.num_hidden_layers = 4
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    model = RobertaForSequenceClassification(config)
    print(config)

RoBERTa


## New Model 

In [25]:
def prepare_features(seq_1, zero_pad = False, max_seq_length = 150):
    enc_text = tokenizer.encode_plus(seq_1, add_special_tokens=True, max_length=300)
    if zero_pad:
        while len(enc_text['input_ids']) < max_seq_length:
            enc_text['input_ids'].append(0)
            enc_text['token_type_ids'].append(0)
    return enc_text

In [26]:
prepare_features("testing this loved", zero_pad = True)

{'special_tokens_mask': [1, 0, 0, 0, 1],
 'input_ids': [0,
  33959,
  42,
  2638,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,

In [27]:
class Intents(Dataset):
    def __init__(self, dataframe):
        self.len = len(dataframe)
        self.data = dataframe
        
    def __getitem__(self, index):
        utterance = self.data.preproc_text[index]
        label = self.data.intent[index]
        X = prepare_features(utterance, zero_pad = True)
        y = label_to_ix[self.data.intent[index]]
        return np.array(X['input_ids']), np.array(X['token_type_ids']), y
    
    def __len__(self):
        return self.len

In [28]:
dataset.tail(10)

Unnamed: 0,phrase,intent,preproc_text
9290,change legal entity,legalEntity.update,change legal entity
9291,update my sexual orientation,personLGBT.update,update sexual orientation
9292,close,logout,close
9293,say no to job app,jobApplication.reject,say job app
9294,job board correction,jobBoard.update,job board correction
9295,complete pre-screening,assessment.submit,complete prescreening
9296,Setup new job,job.create,setup new job
9297,new worker form I9 section 1,worker.usI9Screening.section1.generate,new worker form i9 section 1
9298,activation of a new job,job.activate,activation new job
9299,edit position level,level.update,edit position level


In [29]:
train_size = 0.8
# dataset = pd.concat([dataset, dataset]).reset_index(drop=True)
dataset = dataset.sample(frac=1).reset_index(drop=True)
dataset.head()

Unnamed: 0,phrase,intent,preproc_text
0,Let me create new job offer,jobOffer.create,let create new job offer
1,questions help please,helper.commands,questions help please
2,include new paygrade,payGrade.create,include new paygrade
3,correct job posting,jobPosting.update,correct job posting
4,shut off job family,jobFamily.deactivate,shut job family


In [30]:
train_dataset=dataset.sample(frac=train_size,random_state=42).reset_index(drop=True)
test_dataset=dataset.drop(train_dataset.index).reset_index(drop=True)

In [31]:
training_set = Intents(train_dataset)
testing_set = Intents(test_dataset)

In [32]:
train_dataset.tail()

Unnamed: 0,phrase,intent,preproc_text
7435,to-dos for next week,my.todos,todos next week
7436,modify personal contacts,personPersonalContacts.update,modify personal contacts
7437,Bruno is male,workerGender.update,bruno male
7438,passed away on September 2000,personDeathDate.inform,passed away september 2000
7439,modify campus,campus.update,modify campus


In [33]:
train_dataset.intent.value_counts()

personBirthInformation.update         46
level.create                          46
personPersonalPhoneNumber.update      46
my.todos                              45
logout                                45
                                      ..
password.update                       35
team.update                           34
campus.delete                         33
worker.changeManager                  33
worker.usI9Screening.status.update    30
Name: intent, Length: 186, dtype: int64

In [34]:
test_dataset.intent.value_counts()

personGovernmentRegistration.update    17
worker.changeOrganization              16
personName.update                      16
workerStartDate.update                 16
job.update                             15
                                       ..
jobFamily.create                        5
band.deactivate                         5
personStudentStatus.update              5
jobPosting.create                       2
jobApplication.reject                   2
Name: intent, Length: 186, dtype: int64

In [35]:
### Dataloaders Parameters
params = {'batch_size': 4,
          'shuffle': True,
          'drop_last': True,
          'num_workers': 0}
training_loader = DataLoader(training_set, **params)
testing_loader = DataLoader(testing_set, **params)
loss_function = nn.CrossEntropyLoss()
learning_rate = 3.5e-06
optimizer = optim.Adam(params =  model.parameters(), lr=learning_rate)
if torch.cuda.is_available():
    print("GPU is AVAILABLE!🤘🙌💪")
    model = model.cuda()

GPU is AVAILABLE!🤘🙌💪


In [36]:
ids, tokens, labels = next(iter(training_loader))
ids.shape, tokens.shape, labels.shape

(torch.Size([4, 150]), torch.Size([4, 150]), torch.Size([4]))

In [37]:
if model_type == 'roberta':
    print("RoBERTa")
    out = model.forward(ids.cuda(), token_type_ids=tokens.cuda(), head_mask=None)[0]
elif model_type == 'distilbert':
    print("DistilBERT")
    out = model.forward(ids.cuda())[0]
print(loss_function(out, labels.cuda()))
print(out.shape)

RoBERTa
tensor(5.2800, device='cuda:0', grad_fn=<NllLossBackward>)
torch.Size([4, 186])


In [38]:
optimizer.param_groups[0]['lr']

3.5e-06

In [39]:
@timebudget
def train(model, epochs):
    max_epochs = epochs
    model = model.train()
    for epoch in tqdm_notebook(range(max_epochs)):
        print("EPOCH -- {}".format(epoch))
        for i, (ids, tokens, labels) in enumerate(training_loader):
            optimizer.zero_grad()
            if torch.cuda.is_available():
                ids = ids.cuda()
                tokens = tokens.cuda()
                labels = labels.cuda()
            if model_type == 'roberta':
                output = model.forward(ids,token_type_ids=tokens)[0]
            elif model_type == 'distilbert':
                output = model.forward(ids)[0]
            loss = loss_function(output, labels)
            loss.backward()
            optimizer.step()
            if i%500 == 0:
                correct = 0
                total = 0
                for (ids, tokens, labels) in testing_loader:
                    if torch.cuda.is_available():
                        ids = ids.cuda()
                        tokens = tokens.cuda()
                        labels = labels.cuda()
                    if model_type == 'roberta':
                        output = model.forward(ids,token_type_ids=tokens)[0]
                    elif model_type == 'distilbert':
                        output = model.forward(ids)[0]
                    _, predicted = torch.max(output.data, 1)
                    total += labels.size(0)
                    correct += (predicted.cpu() == labels.cpu()).sum()
                accuracy = 100.00 * correct.numpy() / total
                print('Iteration: {}. Loss: {}. Accuracy: {}%'.format(i, loss.item(), accuracy))
    return "Training finished!"

In [40]:
train(model, 20)

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

EPOCH -- 0
Iteration: 0. Loss: 5.197153091430664. Accuracy: 0.5376344086021505%
Iteration: 500. Loss: 5.293965816497803. Accuracy: 0.3225806451612903%
Iteration: 1000. Loss: 5.110079765319824. Accuracy: 0.43010752688172044%
Iteration: 1500. Loss: 5.379077434539795. Accuracy: 0.5913978494623656%
EPOCH -- 1
Iteration: 0. Loss: 5.486606597900391. Accuracy: 0.43010752688172044%
Iteration: 500. Loss: 5.320595741271973. Accuracy: 0.4838709677419355%
Iteration: 1000. Loss: 5.325439929962158. Accuracy: 0.5913978494623656%
Iteration: 1500. Loss: 5.194285869598389. Accuracy: 0.7526881720430108%
EPOCH -- 2
Iteration: 0. Loss: 5.413905620574951. Accuracy: 0.9139784946236559%
Iteration: 500. Loss: 5.2936506271362305. Accuracy: 0.8602150537634409%
Iteration: 1000. Loss: 5.287839889526367. Accuracy: 0.7526881720430108%
Iteration: 1500. Loss: 5.284292221069336. Accuracy: 1.5591397849462365%
EPOCH -- 3
Iteration: 0. Loss: 5.176022529602051. Accuracy: 1.6666666666666667%
Iteration: 500. Loss: 4.98792934

'Training finished!'

In [43]:
type(timebudget.report())

timebudget report...
                    train: 1511706.63ms for      1 execs


NoneType

In [44]:
(1511706/1000)/60

25.1951

In [61]:
transformText("I moved", do_stop=True)

'moved'

In [45]:
def get_reply(msg):
    features = prepare_features(msg, zero_pad = True)
    ids = torch.tensor(features['input_ids']).unsqueeze(0).cuda()
    tokens = torch.tensor(features['token_type_ids']).unsqueeze(0).cuda()
    output = model.forward(ids,token_type_ids=tokens)[0]
    _, predicted = torch.max(output.data, 1)
    return list(label_to_ix.keys())[predicted]

In [60]:
msg = "I moved"
for i in range(10):
    print(get_reply(msg))

workerPersonalAddress.update
workerPersonalAddress.update
workerPersonalAddress.update
workerPersonalAddress.update
workerPersonalAddress.update
workerPersonalAddress.update
workerPersonalAddress.update
workerPersonalAddress.update
workerPersonalAddress.update
workerPersonalAddress.update


## Parameters

### External dataset
| Model     | LR       | Batch Size   | Epochs    | Accuracy    |
| :---:     | :---:    | :---:        | :---:     |  :---:      |
| RoBERTa   |   1e-05  |     2        |    5      |   97.73%    | 
| RoBERTa   |   1e-05  |     4        |    5      |   98.57%    | 
| RoBERTa   |   2e-05  |     2        |    5      |   14.04%    | 
| RoBERTa   |   2e-05  |     4        |    5      |   13.69%    | 
 

### Internal dataset (7 intents)
| Model     | LR       | Batch Size   | Epochs    | Accuracy    |
| :---:     | :---:    | :---:        | :---:     |  :---:      |
| RoBERTa   |   1e-06  |     2        |    5      |  88.69%     | 
| RoBERTa   |   1e-06  |     4        |    5      |  59.64%     |
| RoBERTa   |   5e-06  |     2        |    5      |  100% (after 2 epochs)        | 
| RoBERTa   |   5e-06  |     4        |    5      |          |
| RoBERTa   |   1e-05  |     2        |    5      |          | 
| RoBERTa   |   1e-05  |     4        |    5      |          | 
| DistilBERT   |   1e-06  |     2        |    5      |       | 
| DistilBERT   |   1e-06  |     4        |    5      |       |
| DistilBERT   |   5e-06  |     2        |    5      |          | 
| DistilBERT   |   5e-06  |     4        |    5      |          |
| DistilBERT   |   1e-05  |     2        |    5      |          | 
| DistilBERT   |   1e-05  |     4        |    5      |          | 

### Internal dataset (complete)
| Model     | LR       | Batch Size   | Epochs    | Accuracy    |  Hiddden Layers |  Training  Time |
| :---:     | :---:    | :---:        | :---:     |  :---:      |    :---:        |      :---:      |
| RoBERTa   |   1e-06  |     2        |    5      |   0.64%     |      12         |        na       |
| RoBERTa   |   2e-06  |     2        |    5      |   9.13%     |      12         |        na       |
| RoBERTa   |   3e-06  |     2        |    5      |   26.02%    |      12         |        na       |
| RoBERTa   |   4e-06  |     2        |    5      |   45.91%    |      12         |        na       |
| RoBERTa   |   5e-06  |     2        |    5      |   12.68%    |      12         |        na       |
| RoBERTa   |   7e-06  |     2        |    5      |   0.91%     |      12         |        na       |
| RoBERTa   |   3.5e-06  |     4        |    8      |  62.20%   |      12         |        na       |
| RoBERTa   |   3.5e-06  |     4        |    12      |   89.67% |      6          |      25 min     |
| RoBERTa   |   3.5e-06  |     4        |    12      |   82.09% |      5          |      23.36 min  |
| RoBERTa   |   3.5e-06  |     4        |    12      |   82.04% |      4          |      18.76 min  |
| RoBERTa   |   3.5e-06  |     4        |    16      |   94.35% |      4          |      25 min     |
| RoBERTa   |   3.5e-06  |     4        |    20      |   95.86% |      3          |      25.19 min  |
| RoBERTa   |   3.5e-06  |     4        |    25 (18)      |   99.46% |      4          |      111 min  |

In [None]:
msg = "radiohead playlist"
input_msg = prepare_features(msg, zero_pad=True)
ids = torch.tensor(input_msg['input_ids'])
tokens = torch.tensor(input_msg['token_type_ids'])
ids
out = model.forward(ids.cuda(),token_type_ids=tokens.cuda())

#model.forward(input_msg['input_ids'],token_type_ids=input_msg['token_type_ids'])

In [None]:
def get_reply(msg):
    model.eval()
    input_msg, _ = prepare_features(msg)
    if torch.cuda.is_available():
        input_msg = input_msg.cuda()
    output = model(input_msg)[0]
    _, pred_label = torch.max(output.data, 1)
    prediction=list(label_to_ix.keys())[pred_label]
    return prediction