In [42]:
import pandas as pd
import numpy as np
import json, re
from tqdm import tqdm_notebook
import pickle

# Torch, Sklearn imports
from sklearn.model_selection import train_test_split
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader, RandomSampler

## NLP libs
from nltk import download
import gensim

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Shell escape: list the contents of the current working directory
# (long format, including hidden files, human-readable sizes).
!ls -lah

total 67656
drwxr-xr-x  29 rsilvei  ADESI\Domain Users   928B Oct 28 16:09 [1m[36m.[m[m
drwxr-xr-x  24 rsilvei  ADESI\Domain Users   768B Aug 21 10:43 [1m[36m..[m[m
drwxr-xr-x  18 rsilvei  ADESI\Domain Users   576B Oct 28 13:22 [1m[36m.ipynb_checkpoints[m[m
-rw-r--r--   1 rsilvei  ADESI\Domain Users    30K Sep  9 17:08 1.text_classifier_roberta.ipynb
-rw-r--r--   1 rsilvei  ADESI\Domain Users    22K Oct 28 15:19 1.text_classifier_roberta_datsaset_resampler.ipynb
-rw-r--r--   1 rsilvei  ADESI\Domain Users    23K Aug 30 10:53 2.uncertainty_swag.ipynb
drwxr-xr-x@ 10 rsilvei  ADESI\Domain Users   320B Aug 19 09:36 [1m[36m2017-06-custom-intent-engines[m[m
-rw-r--r--   1 rsilvei  ADESI\Domain Users    28K Sep  5 11:24 3.causality_review.ipynb
-rw-r--r--   1 rsilvei  ADESI\Domain Users   112K Sep 23 15:45 4.dpp_diversity_phrases.ipynb
-rw-r--r--   1 rsilvei  ADESI\Domain Users   156K Sep 18 13:48 4.dpp_image.ipynb
-rw-r--r--   1 rsilvei  ADESI\Domain Users    19K Se

In [51]:
# Read the raw intents CSV, extract the en-US text from each JSON-encoded
# TrainPhrase into a `phrase` column, drop the raw column and lower-case the
# `Intent` column name -- all in one chain, without in-place mutation.
dataset = (
    pd.read_csv('intent_186.csv')
    .assign(
        phrase=lambda frame: frame['TrainPhrase'].apply(
            lambda raw: json.loads(raw)['en-US']
        )
    )
    .drop(columns=['TrainPhrase'])
    .rename(columns={"Intent": "intent"})
)
dataset.tail(10)

Unnamed: 0,intent,phrase
2765,workerVeteranStatus.update,Worker is in the military
2766,workerVeteranStatus.update,She is in the military
2767,workerVeteranStatus.update,correct veteran status for her
2768,workerVeteranStatus.update,edit military service for him
2769,workerVeteranStatus.update,adjust veteran information for Vincent
2770,workerVeteranStatus.update,modify her military information
2771,workerVeteranStatus.update,modify worker military status
2772,workerVeteranStatus.update,change employee veteran status
2773,workerVeteranStatus.update,change his military status
2774,workerVeteranStatus.update,update Brian's veteran status


In [52]:
# Number of training phrases per intent, most frequent first.
dataset['intent'].value_counts()

positionRelationships.update                  49
workerMaritalStatus.update                    39
associateGovernmentRegistration.update        39
question.detect                               37
personMaritalStatus.update                    35
personBirthInformation.update                 33
workerTobaccoUsageStatus.update               32
personGovernmentRegistration.update           30
associateGovernmentRegistration.create        28
workerBirthInformation.update                 28
personTobaccoUsageStatus.update               27
workerPersonalEmail.update                    27
workerPersonalPhoneNumber.update              26
associateWageGarnishmentInstruction.create    26
workerLGBT.update                             25
personPersonalPhoneNumber.update              24
workerDeathDate.inform                        24
personEthnicity.update                        24
personLGBT.update                             23
worker.changeOrganization                     23
personGovernmentRegi

In [53]:
# Serialize the prepared DataFrame to a local pickle file.
with open('intents_phrases_186.pkl', 'wb') as f:
    pickle.dump(dataset,f)

In [54]:
# Read the DataFrame back from the local pickle file and rebind `dataset` to it.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load pickle files that you produced yourself or otherwise trust.
with open('intents_phrases_186.pkl', 'rb') as f:
    dataset = pickle.load(f)

In [55]:
# Show the last five rows of the reloaded DataFrame.
dataset.tail(n=5)

Unnamed: 0,intent,phrase
2770,workerVeteranStatus.update,modify her military information
2771,workerVeteranStatus.update,modify worker military status
2772,workerVeteranStatus.update,change employee veteran status
2773,workerVeteranStatus.update,change his military status
2774,workerVeteranStatus.update,update Brian's veteran status


In [47]:
#dataset_path = "intents_phrases_183.pkl"
#dataset = pd.read_pickle(dataset_path)
#dataset = dataset.rename(columns={"usersays":"phrase"})
#dataset.tail()

In [56]:
# ## Make a shorter version of the dataset (disabled: uncomment to keep only the selected intents)
# selected_intents = ['position.update',
#                     'jobBoard.update',
#                     'job.create',
#                     'lateralMove',
#                     'band.update',
#                     'adjustment',
#                    'worker.changeManager']
# dataset = dataset[dataset.intent.isin(selected_intents)].reset_index(drop=True)
# print(len(set(dataset.intent)))
# dataset.tail()

In [57]:
def resample_dataset(dataframe,
                     label_column = 'intent',
                     feature_column = 'phrase',
                     max_samples = 100):
    """Randomly oversample ``dataframe`` so each label has at least ``max_samples`` rows.

    Rows of under-represented labels are duplicated (sampling with replacement,
    fixed seed 42). The original rows are kept, in their original order, at the
    top of the result; the duplicated rows follow.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input data. Only ``feature_column`` and ``label_column`` are used.
        The frame is not modified.
    label_column : str
        Name of the column holding the class label of each row.
    feature_column : str
        Name of the column holding the sample (e.g. the phrase text).
    max_samples : int
        Target number of rows per label. Labels that already have more rows
        than this are left untouched (they are never down-sampled).

    Returns
    -------
    pandas.DataFrame
        New frame with columns ``[feature_column, label_column]`` and a fresh
        0..n-1 index.

    Raises
    ------
    ValueError
        If ``label_column`` contains missing values.

    Notes
    -----
    Uses ``imblearn.over_sampling.RandomOverSampler`` when imbalanced-learn is
    installed. Otherwise it falls back to an equivalent NumPy implementation
    with the same seed; the per-label counts are identical, but the particular
    rows chosen for duplication may differ from imblearn's draw.
    """
    columns = [feature_column, label_column]

    # Nothing to resample: return an empty frame with the expected columns.
    if len(dataframe) == 0:
        return pd.DataFrame(columns=columns)

    ## Build label vocabulary
    # Each distinct label (taken whole, so labels containing whitespace are
    # fine) gets an integer id in order of first appearance.
    codes, _ = pd.factorize(dataframe[label_column])
    if (codes < 0).any():
        raise ValueError(
            "column %r contains missing labels; drop or fill them before resampling"
            % (label_column,)
        )

    ## Define Sampling Strategy based on number of samples
    # A target below the current class size is rejected by the sampler, so
    # classes that are already large enough keep their current size.
    counts = np.bincount(codes)
    targets = {ix: max(int(count), max_samples) for ix, count in enumerate(counts)}

    # Resample row *positions* rather than index labels, so frames with a
    # non-default or duplicated index are handled correctly.
    positions = np.arange(len(dataframe))

    try:
        from imblearn.over_sampling import RandomOverSampler
    except ImportError:
        RandomOverSampler = None

    ## Oversampling
    if RandomOverSampler is not None:
        sampler = RandomOverSampler(sampling_strategy = targets, random_state=42)
        # fit_resample is the current name of the former fit_sample method.
        resampled, _ = sampler.fit_resample(positions.reshape(-1, 1), codes)
        picked = resampled[:, 0]
    else:
        warnings.warn(
            "imbalanced-learn is not installed; using the NumPy fallback for "
            "random oversampling"
        )
        rng = np.random.RandomState(42)
        extras = [
            rng.choice(np.flatnonzero(codes == ix),
                       size=target - counts[ix],
                       replace=True)
            for ix, target in targets.items()
            if target > counts[ix]
        ]
        picked = np.concatenate([positions] + extras)

    # Materialise all selected rows in one shot (no row-by-row append).
    return dataframe[columns].iloc[picked].reset_index(drop=True)

In [58]:
# Oversample the intents dataset with a per-intent target of 50 phrases.
new_dataset = resample_dataset(dataframe=dataset, max_samples=50)

In [59]:
# Per-intent phrase counts after resampling.
new_dataset['intent'].value_counts()

worker.terminate                              50
workerTobaccoUsageStatus.update               50
job.create                                    50
worker.usI9Screening.section1.generate        50
jobPosting.cancel                             50
worker.usI9Screening.status.update            50
location.update                               50
jobFamily.deactivate                          50
band.delete                                   50
compensationPlan.create                       50
jobReferral.hold                              50
jobRequisitionRecruiter.assign                50
data.import                                   50
workerStudentStatus.update                    50
adjustment                                    50
personEthnicity.update                        50
worker.changeManager                          50
associateWageGarnishmentInstruction.create    50
workAssignment.create                         50
personDeathDate.inform                        50
legalEntity.activate

In [62]:
# Inspect the resampled rows belonging to a single intent.
new_dataset.loc[new_dataset['intent'] == 'jobOffer.revoke']

Unnamed: 0,phrase,intent
797,Pls rvk job offer,jobOffer.revoke
798,Please I want you to revoke job offer,jobOffer.revoke
799,I need you to revoke this job offer,jobOffer.revoke
800,Make a job offering revoked,jobOffer.revoke
801,Help me to revoke a job offering,jobOffer.revoke
802,Revoke this job offer,jobOffer.revoke
803,Pls revoke job offer,jobOffer.revoke
804,Let me revoke a job offer,jobOffer.revoke
805,Can you please help me to revoke this job offe...,jobOffer.revoke
806,Rvk job offer,jobOffer.revoke
