In [1]:
#!pip install imbalanced-learn

In [2]:
import pandas as pd
import numpy as np
import json, re
from tqdm import tqdm_notebook

try:
    from collections import OrderedDict
except ImportError:
    from ordereddict import OrderedDict

# Torch, Sklearn imports
from sklearn.model_selection import train_test_split
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader, RandomSampler

## NLP libs
from nltk import download
import gensim

## PyTorch Transformer
import transformers

## Roberta
from transformers import RobertaModel, RobertaTokenizer
from transformers import RobertaForSequenceClassification, RobertaConfig

## DistilBert
from transformers import DistilBertModel, DistilBertTokenizer
from transformers import DistilBertForSequenceClassification, DistilBertConfig
from transformers.optimization import AdamW, WarmupLinearSchedule

import warnings
warnings.filterwarnings("ignore")
stopwords = {"ourselves", "hers", "between", "yourself", "but", "again", "there", "about", "once", "during", "out", "very", "having", "with", "they", "own", "an", "be", "some", "for", "do", "its", "yours", "such", "into", "of", "most", "itself", "other", "off", "is", "s", "am", "or", "who", "as", "from", "him", "each", "the", "themselves", "until", "below", "are", "we", "these", "your", "his", "through", "don", "nor", "me", "were", "her", "more", "himself", "this", "down", "should", "our", "their", "while", "above", "both", "up", "to", "ours", "had", "she", "all", "no", "when", "at", "any", "before", "them", "same", "and", "been", "have", "in", "will", "on", "does", "yourselves", "then", "that", "because", "what", "over", "why", "so", "can", "did", "not", "now", "under", "he", "you", "herself", "has", "just", "where", "too", "only", "myself", "which", "those", "i", "after", "few", "whom", "t", "being", "if", "theirs", "my", "against", "a", "by", "doing", "it", "how", "further", "was", "here", "than"}

print(torch.__version__)
print(transformers.__version__)

1.1.0
2.1.1


In [3]:
!ls

1.text_classifier_roberta.ipynb      4.nips2019_papers_simple_altair.ipynb
1.text_classifier_roberta_NEW.ipynb  4.sample_dpp.ipynb
2017-06-custom-intent-engines	     intents_phrases_183.pkl
2.uncertainty_swag.ipynb	     intents_phrases_186.pkl
3.causality_review.ipynb	     model_elmo_swag_uncertainty.pth
4.dpp_diversity_phrases.ipynb	     nips_2018_bert.pkl
4.dpp_image.ipynb		     nips_2018_elmo.pkl
4.nips2019_papers.ipynb		     nips_2018.pkl


In [4]:
dataset_path = "intents_phrases_186.pkl"
dataset = pd.read_pickle(dataset_path)
dataset = dataset.rename(columns={"usersays":"phrase"})
dataset.tail()

Unnamed: 0,intent,phrase
2770,workerVeteranStatus.update,modify her military information
2771,workerVeteranStatus.update,modify worker military status
2772,workerVeteranStatus.update,change employee veteran status
2773,workerVeteranStatus.update,change his military status
2774,workerVeteranStatus.update,update Brian's veteran status


In [5]:
dataset.intent.value_counts()

positionRelationships.update              49
associateGovernmentRegistration.update    39
workerMaritalStatus.update                39
question.detect                           37
personMaritalStatus.update                35
                                          ..
location.read                              7
jobSearch.update                           6
view.job                                   6
jobSearch.create                           6
jobSearch.delete                           5
Name: intent, Length: 186, dtype: int64

In [6]:
# # Make shorter version of the dataset
# selected_intents = ['position.update',
#                     'jobBoard.update',
#                     'job.create',
#                     'lateralMove',
#                     'band.update',
#                     'adjustment',
#                    'worker.changeManager']
# dataset = dataset[dataset.intent.isin(selected_intents)].reset_index(drop=True)
# print(len(set(dataset.intent)))
# dataset.tail()

In [7]:
# dataset_path = './2017-06-custom-intent-engines/'
# ## Another dataset
# dataset = pd.DataFrame(columns = ['utterance', 'label'])
# for intent in ['AddToPlaylist', 'BookRestaurant', 'GetWeather', 'PlayMusic', 'RateBook', 'SearchCreativeWork',
#                'SearchScreeningEvent']:
#     with open(dataset_path + intent + "/train_" + intent + ".json",
#               encoding='cp1251') as data_file:
#         data = json.load(data_file)
#     print("Class: {}, # utterances: {}".format(intent,len(data[intent])))
#     texts = []
#     for i in range(len(data[intent])):
#         text = ''
#         for j in range(len(data[intent][i]['data'])):
#             text += data[intent][i]['data'][j]['text']
#         dataset = dataset.append({'utterance': text, 'label': intent}, ignore_index=True)
# dataset = dataset.rename(columns={"utterance":"phrase", "label":"intent"})
# dataset.tail()

## Over-sampling dataset

In [8]:
def resample_dataset(dataframe,
                     label_column = 'intent',
                     feature_column = 'phrase',
                     max_samples = 100):
    
    from imblearn.over_sampling import SMOTE, RandomOverSampler, ADASYN

    ## Build label vocabulary
    label_to_ix = {}
    for label in dataset[label_column]:
        for word in label.split():
            if word not in label_to_ix:
                label_to_ix[word]=len(label_to_ix)
          
    ## Define Sampling Strategy based on number of samples
    classes_sample = {}
    for cls in list(set(label_to_ix.values())):
        classes_sample.update({cls:max_samples})

    sampler = RandomOverSampler(sampling_strategy = classes_sample, random_state=42)
    x = np.array(dataset.index).reshape(-1, 1)
    y = np.array(list(dataset[label_column].apply(lambda x: label_to_ix[x])))
    
    ## Oversampling
    x_resampled, y_resampled = sampler.fit_sample(x, y)
    dataset_resampled = pd.DataFrame(columns=[feature_column,label_column])
    
    ## Iterating
    for i, item in enumerate(x_resampled):
        row = {
            feature_column :dataset[feature_column].loc[item[0]],
            label_column: list(label_to_ix.keys())[y_resampled[i]]
        }
        dataset_resampled = dataset_resampled.append(row, ignore_index=True)
    return dataset_resampled

In [9]:
dataset = resample_dataset(dataset, max_samples = 50)
dataset.intent.value_counts()

assessment.take                               50
associateWageGarnishmentInstruction.create    50
jobOffer.revoke                               50
position.deactivate                           50
personPersonalAddress.update                  50
                                              ..
jobApplication.reject                         50
worker.terminate                              50
team.leave                                    50
worker.changeOrganization                     50
legalEntity.create                            50
Name: intent, Length: 186, dtype: int64

In [10]:
dataset.tail()

Unnamed: 0,phrase,intent
9295,modify her military information,workerVeteranStatus.update
9296,edit military service for him,workerVeteranStatus.update
9297,change his military status,workerVeteranStatus.update
9298,change his military status,workerVeteranStatus.update
9299,They're no longer in the military,workerVeteranStatus.update


## Cleaning Dataset

In [11]:
def transformText(text, do_stop=False, do_stem=False):
    # Convert text to lower
    text = text.lower()
    
    # Cleaning input
    text = text.replace("'s","")
    text = text.replace("’s","")
    text = text.replace("?","")
    text = text.replace("-","")
    
    # Removing non ASCII chars    
    text = re.sub(r'[^\x00-\x7f]',r' ',text)
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    # Removing all the stopwords
    if (do_stop==True):
        filtered_words = [word for word in text.split() if word not in stopwords]
    else:
        filtered_words = [word for word in text.split()]
    # Preprocessed text after stop words removal
    text = " ".join(filtered_words)
    # Remove the punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    if (do_stem==True):
        # Stemming
        text = gensim.parsing.preprocessing.stem_text(text)
    return text

In [12]:
dataset['preproc_text'] = dataset['phrase'].apply(lambda x: transformText(x, do_stop=True))
dataset.tail(5)

Unnamed: 0,phrase,intent,preproc_text
9295,modify her military information,workerVeteranStatus.update,modify military information
9296,edit military service for him,workerVeteranStatus.update,edit military service
9297,change his military status,workerVeteranStatus.update,change military status
9298,change his military status,workerVeteranStatus.update,change military status
9299,They're no longer in the military,workerVeteranStatus.update,they re longer military


In [13]:
## Build label vocabulary
label_to_ix = {}
for label in dataset.intent:
    for word in label.split():
        if word not in label_to_ix:
            label_to_ix[word]=len(label_to_ix)
print("# Labels: {}".format(len(label_to_ix)))

# Labels: 186


In [14]:
model_type = 'roberta'
## Distilbert
if model_type == 'distilbert':
    print("DistilBERT")
    config = DistilBertConfig.from_pretrained('distilbert-base-uncased')
    config.num_labels = len(list(label_to_ix.values()))
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    model = DistilBertForSequenceClassification(config)
    print(config)
elif model_type == 'roberta':
    print("RoBERTa")
    config = RobertaConfig.from_pretrained('roberta-base')
    config.num_labels = len(list(label_to_ix.values()))
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    model = RobertaForSequenceClassification(config)
    print(config)

RoBERTa
{
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 186,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 1,
  "use_bfloat16": false,
  "vocab_size": 50265
}



## New Model 

In [15]:
def prepare_features(seq_1, zero_pad = False, max_seq_length = 150):
    enc_text = tokenizer.encode_plus(seq_1, add_special_tokens=True, max_length=300)
    if zero_pad:
        while len(enc_text['input_ids']) < max_seq_length:
            enc_text['input_ids'].append(0)
            enc_text['token_type_ids'].append(0)
    return enc_text

In [16]:
prepare_features("testing this loved", zero_pad = True)

{'special_tokens_mask': [1, 0, 0, 0, 1],
 'input_ids': [0,
  33959,
  42,
  2638,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,

In [17]:
class Intents(Dataset):
    def __init__(self, dataframe):
        self.len = len(dataframe)
        self.data = dataframe
        
    def __getitem__(self, index):
        utterance = self.data.preproc_text[index]
        label = self.data.intent[index]
        X = prepare_features(utterance, zero_pad = True)
        y = label_to_ix[self.data.intent[index]]
        return np.array(X['input_ids']), np.array(X['token_type_ids']), y
    
    def __len__(self):
        return self.len

In [18]:
dataset.tail(10)

Unnamed: 0,phrase,intent,preproc_text
9290,adjust veteran information for Vincent,workerVeteranStatus.update,adjust veteran information vincent
9291,Worker is in the military,workerVeteranStatus.update,worker military
9292,change his military status,workerVeteranStatus.update,change military status
9293,adjust veteran information for Vincent,workerVeteranStatus.update,adjust veteran information vincent
9294,Worker is in the military,workerVeteranStatus.update,worker military
9295,modify her military information,workerVeteranStatus.update,modify military information
9296,edit military service for him,workerVeteranStatus.update,edit military service
9297,change his military status,workerVeteranStatus.update,change military status
9298,change his military status,workerVeteranStatus.update,change military status
9299,They're no longer in the military,workerVeteranStatus.update,they re longer military


In [19]:
train_size = 0.8
# dataset = pd.concat([dataset, dataset]).reset_index(drop=True)
dataset = dataset.sample(frac=1).reset_index(drop=True)
dataset.head()

Unnamed: 0,phrase,intent,preproc_text
0,edit my blood type,personBloodGroup.update,edit blood type
1,Add W4,us.taxWithholding.update,add w4
2,correct my blood group for employee,workerBloodGroup.update,correct blood group employee
3,respond to assessment,assessment.take,respond assessment
4,modify search,jobSearch.update,modify search


In [20]:
train_dataset=dataset.sample(frac=train_size,random_state=42).reset_index(drop=True)
test_dataset=dataset.drop(train_dataset.index).reset_index(drop=True)

In [21]:
training_set = Intents(train_dataset)
testing_set = Intents(test_dataset)

In [22]:
train_dataset.tail()

Unnamed: 0,phrase,intent,preproc_text
7435,eliminate job pos,position.delete,eliminate job pos
7436,update Anthony org,worker.changeOrganization,update anthony org
7437,Please submit job app internal,internalJobApplication.submit,please submit job app internal
7438,update Roberto's student status,workerStudentStatus.update,update roberto student status
7439,new campus create,campus.create,new campus create


In [23]:
train_dataset.intent.value_counts()

worker.retire                             47
location.create                           46
workerHeight.update                       46
workerBloodGroup.update                   46
jobBoard.create                           45
                                          ..
jobRequisition.cancel                     35
candidate.refer                           35
worker.usI9Screening.section2.generate    35
vendor.activate                           34
location.activate                         33
Name: intent, Length: 186, dtype: int64

In [24]:
test_dataset.intent.value_counts()

location.update           18
jobBoard.update           18
jobReferral.cancel        17
payGrade.deactivate       17
workerEthnicity.update    17
                          ..
campus.delete              5
jobApplication.reject      5
password.update            4
worker.changeLevel         4
worker.changeJob           3
Name: intent, Length: 186, dtype: int64

In [25]:
### Dataloaders Parameters
params = {'batch_size': 2,
          'shuffle': True,
          'drop_last': True,
          'num_workers': 0}
training_loader = DataLoader(training_set, **params)
testing_loader = DataLoader(testing_set, **params)
loss_function = nn.CrossEntropyLoss()
learning_rate = 2e-06
optimizer = optim.Adam(params =  model.parameters(), lr=learning_rate)
if torch.cuda.is_available():
    print("GPU is AVAILABLE!🤘🙌💪")
    model = model.cuda()

GPU is AVAILABLE!🤘🙌💪


In [26]:
ids, tokens, labels = next(iter(training_loader))
ids.shape, tokens.shape, labels.shape

(torch.Size([2, 150]), torch.Size([2, 150]), torch.Size([2]))

In [27]:
ids.shape

torch.Size([2, 150])

In [28]:
if model_type == 'roberta':
    print("RoBERTa")
    out = model.forward(ids.cuda(), token_type_ids=tokens.cuda(), head_mask=None)[0]
elif model_type == 'distilbert':
    print("DistilBERT")
    out = model.forward(ids.cuda())[0]
print(loss_function(out, labels.cuda()))
print(out.shape)

RoBERTa
tensor(5.3807, device='cuda:0', grad_fn=<NllLossBackward>)
torch.Size([2, 186])


In [29]:
max_epochs = 5
model = model.train()
for epoch in tqdm_notebook(range(max_epochs)):
    print("EPOCH -- {}".format(epoch))
    for i, (ids, tokens, labels) in enumerate(training_loader):
        optimizer.zero_grad()
        if torch.cuda.is_available():
            ids = ids.cuda()
            tokens = tokens.cuda()
            labels = labels.cuda()
        if model_type == 'roberta':
            output = model.forward(ids,token_type_ids=tokens)[0]
        elif model_type == 'distilbert':
            output = model.forward(ids)[0]
        loss = loss_function(output, labels)
        loss.backward()
        optimizer.step()
        if i%100 == 0:
            correct = 0
            total = 0
            for (ids, tokens, labels) in testing_loader:
                if torch.cuda.is_available():
                    ids = ids.cuda()
                    tokens = tokens.cuda()
                    labels = labels.cuda()
                if model_type == 'roberta':
                    output = model.forward(ids,token_type_ids=tokens)[0]
                elif model_type == 'distilbert':
                    output = model.forward(ids)[0]
                _, predicted = torch.max(output.data, 1)
                total += labels.size(0)
                correct += (predicted.cpu() == labels.cpu()).sum()
            accuracy = 100.00 * correct.numpy() / total
            print('Iteration: {}. Loss: {}. Accuracy: {}%'.format(i, loss.item(), accuracy))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

EPOCH -- 0
Iteration: 0. Loss: 5.142804145812988. Accuracy: 0.4838709677419355%
Iteration: 100. Loss: 5.146523475646973. Accuracy: 0.3763440860215054%
Iteration: 200. Loss: 5.4411516189575195. Accuracy: 0.6451612903225806%
Iteration: 300. Loss: 5.3452043533325195. Accuracy: 0.4838709677419355%
Iteration: 400. Loss: 5.518860816955566. Accuracy: 0.4838709677419355%
Iteration: 500. Loss: 5.2680864334106445. Accuracy: 0.4838709677419355%
Iteration: 600. Loss: 5.222952842712402. Accuracy: 0.5376344086021505%
Iteration: 700. Loss: 5.282913684844971. Accuracy: 0.43010752688172044%
Iteration: 800. Loss: 5.540057182312012. Accuracy: 0.6451612903225806%
Iteration: 900. Loss: 5.132499694824219. Accuracy: 0.5913978494623656%
Iteration: 1000. Loss: 5.313654899597168. Accuracy: 0.8064516129032258%
Iteration: 1100. Loss: 5.735642433166504. Accuracy: 0.10752688172043011%
Iteration: 1200. Loss: 4.8106794357299805. Accuracy: 0.8602150537634409%
Iteration: 1300. Loss: 5.398765563964844. Accuracy: 0.75268

EPOCH -- 3
Iteration: 0. Loss: 5.291882514953613. Accuracy: 0.5913978494623656%
Iteration: 100. Loss: 5.258023262023926. Accuracy: 0.7526881720430108%
Iteration: 200. Loss: 5.293707847595215. Accuracy: 0.5376344086021505%
Iteration: 300. Loss: 5.320935249328613. Accuracy: 0.3763440860215054%
Iteration: 400. Loss: 5.143135070800781. Accuracy: 0.5376344086021505%
Iteration: 500. Loss: 5.085205078125. Accuracy: 0.5376344086021505%
Iteration: 600. Loss: 5.149209022521973. Accuracy: 0.26881720430107525%
Iteration: 700. Loss: 5.2869157791137695. Accuracy: 0.4838709677419355%
Iteration: 800. Loss: 5.167595863342285. Accuracy: 0.5913978494623656%
Iteration: 900. Loss: 5.265769958496094. Accuracy: 0.5376344086021505%
Iteration: 1000. Loss: 5.130925178527832. Accuracy: 0.26881720430107525%
Iteration: 1100. Loss: 5.269412994384766. Accuracy: 0.4838709677419355%
Iteration: 1200. Loss: 5.3119120597839355. Accuracy: 1.2365591397849462%
Iteration: 1300. Loss: 5.449476718902588. Accuracy: 0.8064516129

## Parameters

### External dataset
| Model     | LR       | Batch Size   | Epochs    | Accuracy    |
| :---:     | :---:    | :---:        | :---:     |  :---:      |
| RoBERTa   |   1e-05  |     2        |    5      |   97.73%    | 
| RoBERTa   |   1e-05  |     4        |    5      |   98.57%    | 
| RoBERTa   |   2e-05  |     2        |    5      |   14.04%    | 
| RoBERTa   |   2e-05  |     4        |    5      |   13.69%    | 
 

### Internal dataset (7 intents)
| Model     | LR       | Batch Size   | Epochs    | Accuracy    |
| :---:     | :---:    | :---:        | :---:     |  :---:      |
| RoBERTa   |   1e-06  |     2        |    5      |  88.69%     | 
| RoBERTa   |   1e-06  |     4        |    5      |  59.64%     |
| RoBERTa   |   5e-06  |     2        |    5      |  100% (after 2 epochs)        | 
| RoBERTa   |   5e-06  |     4        |    5      |          |
| RoBERTa   |   1e-05  |     2        |    5      |          | 
| RoBERTa   |   1e-05  |     4        |    5      |          | 
| DistilBERT   |   1e-06  |     2        |    5      |       | 
| DistilBERT   |   1e-06  |     4        |    5      |       |
| DistilBERT   |   5e-06  |     2        |    5      |          | 
| DistilBERT   |   5e-06  |     4        |    5      |          |
| DistilBERT   |   1e-05  |     2        |    5      |          | 
| DistilBERT   |   1e-05  |     4        |    5      |          | 

### Internal dataset (complete)
| Model     | LR       | Batch Size   | Epochs    | Accuracy    |
| :---:     | :---:    | :---:        | :---:     |  :---:      |
| RoBERTa   |   1e-06  |     2        |    5      |   0.64%     | 
| RoBERTa   |   2e-06  |     2        |    5      |   9.13%     | 
| RoBERTa   |   3e-06  |     2        |    5      |   26.02%    | 
| RoBERTa   |   4e-06  |     2        |    5      |       | 
| RoBERTa   |   5e-06  |     2        |    5      |   12.68%    | 
| RoBERTa   |   7e-06  |     2        |    5      |   0.91%     | 
| RoBERTa   |   9e-06  |     2        |    5      |          | 

In [None]:
msg = "radiohead playlist"
input_msg = prepare_features(msg, zero_pad=True)
ids = torch.tensor(input_msg['input_ids'])
tokens = torch.tensor(input_msg['token_type_ids'])
ids
out = model.forward(ids.cuda(),token_type_ids=tokens.cuda())

#model.forward(input_msg['input_ids'],token_type_ids=input_msg['token_type_ids'])

In [None]:
def get_reply(msg):
    model.eval()
    input_msg, _ = prepare_features(msg)
    if torch.cuda.is_available():
        input_msg = input_msg.cuda()
    output = model(input_msg)[0]
    _, pred_label = torch.max(output.data, 1)
    prediction=list(label_to_ix.keys())[pred_label]
    return prediction