In [10]:
import numpy as np
import pandas as pd
import random
import os
import json

<h2>Functions</h2>

<h3>General Functions</h3>

In [11]:
# Check if a Value is Nan
# param x - Value to check
# return boolean - If it is a Nan
def is_nan(x):
    return (x is np.nan or x != x)

# Check All Values in List are Nan
# param x - List to check
# return boolean - If it is all Nan
def list_is_nan(x):
    for i in x:
        if not is_nan(i): return False
    return True

# Converts names to single string
# param names (list) - List of names in the format [['Mia', 'Brown'],[..]]
# return unique (list) - List of names in the format ['Mia Brown', ..]
def name_to_str(names):
    return [",".join(name).replace(",", " ") for name in names]

# Converts names to single string
# param names (list) - List of names in the format ['Mia Brown', ..]
# return names_list (list) - List of names in the format [['Mia', 'Brown'],[..]]
def str_name_to_list(names):
    for i in range(len(names)):
        full_name = names[i].split()
        names[i] = [full_name[0], full_name[len(full_name) - 1]]
    return names

# Extracts All of the Elements in a List of (One or Two D) Lists
# param two_d_list (list) List of (One or Two D) Lists
# return new_elems (list) List of elements
def extract_2d_list(two_d_list):
    elems, new_elems = [], []
    for elem in two_d_list: elems.extend(elem)
    for elem in elems:
        if isinstance(elem, list): new_elems.extend(elem)
        else: new_elems.append(elem)
    return new_elems

<h3>Get Unique Values</h3>

In [12]:
# Get the Names of Employees in the HR Dataset
# param df (dataframe) - Dataframe with names
# param column (str) - Column can be 'Employee Name' or 'Manager Name'
# Returns names (list) - Array of names (['First', 'Last'])
def get_names(df, column='Employee Name'):
    names = [df[[column]].iloc[i][0] for i in range(df.shape[0] - 1)]
    for i in range(len(names)):
        try:
            full_name = names[i].split(',')
            first = full_name[1].strip().split()[0]
            last = full_name[0]
            names[i] = [first, last]
        except:
            names[i] = ['Jeremy', 'Prater']
    return names

# Filter to list of unique names
# param names (list) - List of names in the format [['Mia', 'Brown'],[..]]
# param avoid (list) - List of names to avoid duplicates of
# return unique (list) - List of names in the format [['Mia', 'Brown'],[..]]
def filter_names_helper(names, avoid = []):
    unique = []
    first, last = [], []
    if(len(avoid) != 0):
        first = [n[0] for n in avoid]
        last = [n[1] for n in avoid]
    for name in names:
        if((name[0] not in first) and (name[1] not in last)):
            first.append(name[0])
            last.append(name[1])
            unique.append(name)
    return unique

# Filter to list of unique names that don't overlap with manager names
# param df (dataframe) - Dataframe with names 
# return unique (list) - List of names in the format [['Mia', 'Brown'],[..]]
def get_filtered_names(df):
    managers = str_name_to_list(get_uniq_str(df, "Manager Name"))
    managers = filter_names_helper(managers)
    survey_users = [['Mia','Brown'], ['Ivan','Rogers'], ['Julia','Soto'], ['Nan','Singh']]
    user_names = get_names(hr_data)
    user_names = filter_names_helper(user_names, avoid=survey_users)
    user_names = filter_names_helper(user_names, avoid=managers)
    user_names.extend(survey_users)
    return user_names

# Update Name Synonyms
def name_syn_update(ent_dict):
    syn_names = []
    for name in ent_dict['name'][0]:
        name_splt = name.split()
        name_syn = [name.lower(), name_splt[0], name_splt[0].lower(),
                    name_splt[1], name_splt[1].lower()]
        syn_names.append(name_syn)
    return syn_names

In [13]:
# Get the Unique Names of Employees in the HR Dataset
# param df (dataframe) - Dataframe with names
# return unique (list) - List of names in the format ['Mia Brown', ..]
def get_uniq_names(df):
    return filter_names(get_names(df))

# Get Unique String Values of a Dataframe Column
# param df (dataframe) - Source Dataframe
# param col_name (str) - Name of Column to get unique values'
# return uniq_arr (arr) - Array of Unique Values
def get_uniq_str(df, col_name):
    return [i.lower().strip() for i in df[col_name].unique()[:-1]]

# Get Unique String Values of a Dataframe Column
# param df (dataframe) - Source Dataframe
# param col_name (str) - Name of Column to get unique values'
# return uniq_arr (arr) - Array of Unique Values
def get_uniq_num(df, col_name):
    return [str(int(i)) for i in df[col_name].unique()[:-1]]

# Generate a Dictionary with Unique Values for Select Columns
# param df (dataframe) - Source Dataframe
# return uniq (dict) - Dictionary that maps entities to unique values
def gen_uniq_dict(df):
    # Get Unique Values of Relevant Columns
    uniq = {}
    # Predefined
    uniq['name'] = name_to_str(get_filtered_names(df))
    uniq['sex'] = ['male', 'female']
    uniq['employment_status'] = ['active', 'voluntarily terminated', 'terminated for a cause',
                          'on a leave of absence' + 'going to start work in the future']
    uniq['performance_score'] = ['fully meet performance expecations', 'are too early to review', 
                         'meet 90-day expectations', 'are exceptional', 'need improvement', 'exeed expecations']
    # Custom Preprocessing
    uniq['state'] = hr_data['State'].unique()[:-1]
    # Standard Preprocessing 
    uniq['age'] = get_uniq_num(hr_data, 'Age')
    uniq['maritaldesc'] = get_uniq_str(hr_data, "MaritalDesc")
    uniq['citizendesc'] = get_uniq_str(hr_data, "CitizenDesc")
    uniq['racedesc'] = get_uniq_str(hr_data, "RaceDesc")
    uniq['department'] = get_uniq_str(hr_data, "Department")
    uniq['position'] = get_uniq_str(hr_data, "Position")
    #uniq['manager'] = name_to_str(str_name_to_list(get_uniq_str(hr_data, "Manager Name")))
    uniq['employee_source'] = get_uniq_str(hr_data, "Employee Source")
    return uniq

# Generate a Dictionary with Unique Values for Select Columns
# param uniq (dict) - Dictionary that maps entities to unique values
# param ent_dict (dict) - Dictionary that Contains Entity Information
# return uniq (dict) - Dictionary that maps entities to unique values
def uniq_dict_update(uniq, ent_dict):
    uniq['money'] = ent_dict['money'][1][0]
    uniq['time_interval'] = extract_2d_list(ent_dict['time_interval'])
    uniq['time_recur'] = extract_2d_list(ent_dict['time_recur'])
    uniq['function'] = extract_2d_list(ent_dict['function'])
    uniq['extreme'] = extract_2d_list(ent_dict['extreme'])
    uniq['employment_action'] = extract_2d_list(ent_dict['employment_action'])
    uniq['date_compare'] = extract_2d_list(ent_dict['date_compare'])
    uniq['manager'] = extract_2d_list(ent_dict['manager'])
    return uniq

In [14]:
#extract_2d_list(ent_dict['manager'])

<h3>Load, Parse and Clean Entity Data</h3>

In [15]:
# Load entity data frame from CSV
# param path (str) - Path to csv
# return ent_df (dataframe) - Entity Info Dataframe
def load_ent(path):
    df = pd.read_csv(path).iloc[:, :4]
    ct = 0
    while (ct < len(df) and not list_is_nan(df.iloc[ct, :].values)): ct += 1
    return df.iloc[:ct, :]

# Parse CSV that contains [Entity, Options, Synonyms, Gazetteer]
# param ent_df (dataframe) - Contains the columns listed above
# return ent_dict (dict) - Uncleaned dictionary where the keys are the entities
#                          and the values are [options (arr), synonyms (arr), gazetteer (arr)]
def ent_parse(ent_df):
    ent_dict = {}
    ct = 0
    curr_ent = ''
    while(ct < len(ent_df)):
        row = ent_df.iloc[ct]
        # update curr_ent if new entity
        if(not is_nan(row['Entity'])): 
            curr_ent = row['Entity']
            ent_dict[curr_ent] = [[], [], []]
        # check if options is a list
        if(not is_nan(row['Options']) and row['Options'].count(',') > 2):
            ent_dict[curr_ent][0] = row['Options']
            if(not is_nan(row['Synonyms'])): 
                ent_dict[curr_ent][1] = row['Synonyms'].split(',')
            if(not is_nan(row[3])): ent_dict[curr_ent][2] = row['Gazetteer']
        # single entry in option column
        else:
            ent_dict[curr_ent][0].append(row['Options'])
            ent_dict[curr_ent][1].append(row['Synonyms'])
            ent_dict[curr_ent][2].append(row['Gazetteer'])
        ct += 1
    return ent_dict

# Cleans the gazetteers of the ent dictionary
# param ent_dict (dictionary) - Entity Dictionary
# return ent_dict (dictionary) - Cleaned Entity Dictionary
def clean_options(ent_dict):
    for i in ent_dict.keys():
        old_opt = ent_dict[i][0]
        new_opt = []
        for word in old_opt:
            if(not is_nan(word)):
                new_opt.append(word.replace("'", "").replace(",", ""))
        ent_dict[i][0] = new_opt
    return ent_dict

# Cleans the gazetteers of the ent dictionary
# param ent_dict (dictionary) - Entity Dictionary
# return ent_dict (dictionary) - Cleaned Entity Dictionary
def clean_synonyms(ent_dict):
    for i in ent_dict.keys():
        old_syn = ent_dict[i][1]
        new_syn = []
        if(len(old_syn) != 0):
            for j in old_syn:
                if(not is_nan(j)): new_syn.append([string.strip() for string in j.split(',')])
                else: new_syn.append(j)
        ent_dict[i][1] = new_syn
    return ent_dict

# Cleans the gazetteers of the ent dictionary
# param ent_dict (dictionary) - Entity Dictionary
# return ent_dict (dictionary) - Cleaned Entity Dictionary
def clean_gazetteer(ent_dict):
    for i in ent_dict.keys():
        old_gaz = ent_dict[i][2]
        clean_gaz = []
        if (len(old_gaz) != 0):        
            if (type(old_gaz) is str): clean_gaz = old_gaz.split(',')
            elif(not is_nan(old_gaz[0])): clean_gaz = (old_gaz[0].split(','))
        ent_dict[i][2] = [i.strip() for i in clean_gaz]
    return ent_dict

# Cleans an Entity Dictionary
# ent_dict (dict) - Uncleaned dictionary of parsed entity information
# uniq (dict) - Dictionary that maps entities to unique values
# return cleaned_ent_dict (dict) - Cleaned dictionary
def clean_ent_dict(ent_dict, uniq):
    ent_dict = clean_options(ent_dict)
    ent_dict = clean_synonyms(ent_dict)
    ent_dict = clean_gazetteer(ent_dict)
    ent_dict['name'][0] = uniq['name']
    ent_dict['state'][0] = uniq['state']
    ent_dict['age'][0] = uniq['age']
    return ent_dict

# Parse and Clean Entity Data
# Parse CSV that contains [Entity, Options, Synonyms, Gazetteer]
# param ent_df (dataframe) - Contains the columns listed above
def parse_clean_ent(ent_df, uniq):
    ent = ent_parse(ent_df)
    return clean_ent_dict(ent, uniq)

<h3>File Generation</h3>

In [16]:
# Convert a list into a text file
# param folder (str) - Name of the folder to write the file
# param lines (list) - List of values to write to file
def list_to_txt_file(folder, lines):
    directory = '../hr_assistant/entities/' + folder + '/'
    if not os.path.exists(directory): os.makedirs(directory)
    with open(directory + 'gazetteer.txt', 'w+') as filehandle:  
        filehandle.writelines("%s\n" % line for line in lines)
        
# Convert a dict into a json file
# param folder (str) - Name of the folder to write the file
# param dict (dict) - Json dict to write to file
def dict_to_json_file(folder, json_dict):
    directory = '../hr_assistant/entities/' + folder + '/'
    if not os.path.exists(directory): os.makedirs(directory)
    with open(directory + "mapping.json", "w+") as f:
        json_str = json.dumps(json_dict, indent=4)
        f.write(json_str)
        
# Generate a Mapping JSON Dict to create Mapping.json file
# param ent_dict (dict) - Dictionary that maps entities to options, synonyms, and gazetteer
# param entity (str) - The entity to generate the gazetteer list for
# return json_dict (dict) - Json Dict in the proper format for mapping.json
def gen_map_json(ent_dict, entity):
    opt = ent_dict[entity][0]
    syn = ent_dict[entity][1]
    # Check if the Options and the Synonyms Align
    if(len(opt) != len(syn)): syn = [[] for i in range(len(opt))]
    return gen_map_json_helper(opt, syn)

# Helper function to Generate a Mapping JSON Dictionary
# param options (list) - Array of options
# param synonyms (2d list) - Array of synonym arrays corresponding to options
# return json_dict (dict) - Json Dict in the proper format for mapping.json
def gen_map_json_helper(options, synonyms):
    json_dict = {}
    json_dict['entities'] = []
    for i in range(len(options)):
        new_dict = {}
        if(is_nan(synonyms[i])): new_dict['whitelist'] = []
        else: new_dict['whitelist'] = synonyms[i]
        new_dict['cname'] = options[i]
        json_dict['entities'].append(new_dict)
    return json_dict

# Create Mapping.json files for Every Entity in an Entity Dictionary
# param ent_dict (dict) - Dictionary that maps entities to options, synonyms, and gazetteer
def gen_map_json_files(ent_dict):
    for entity in ent_dict.keys():
        json_dict = gen_map_json(ent_dict, entity)
        if(not is_nan(json_dict)): dict_to_json_file(entity, json_dict)

# Collect All Synonyms for a Single Entity in Entity Dict
# param ent_dict (dict) - Dictionary that maps entities to options, synonyms, and gazetteer
# param entity (str) - The entity to get Synyonyms for
# return synonyms (list) - List of Synonyms of Specified entity
def get_synonyms(ent_dict, entity):
    synonyms = []
    for syn in ent_dict[entity][1]:
        if(not is_nan(syn)): synonyms.extend(syn)
    return synonyms

# Generate a Gazetter List for an Entity
# param ent_dict (dict) - Dictionary that maps entities to options, synonyms, and gazetteer
# param entity (str) - The entity to generate the gazetteer list for
# return gaz (list) - List of gazetteer word relevant to the entity
def gen_gaz_list(ent_dict, entity):
    gaz = []
    gaz.extend(ent_dict[entity][0])
    gaz.extend(get_synonyms(ent_dict, entity))
    gaz.extend(ent_dict[entity][2])
    return gaz

# Create Gazetteer Files for Every Entity in an Entity Dictionary
# param ent_dict (dict) - Dictionary that maps entities to options, synonyms, and gazetteer
def gen_gazetteers(ent_dict):
    for entity in ent_dict.keys():
        gaz_list = gen_gaz_list(ent_dict, entity)
        list_to_txt_file(entity, gaz_list)

<h3>Debug Labelling</h3>

In [17]:
# Find all labels in a sentence
# pram text (str) String that may or may not contain labels in the form of {ent_type|ent}
# return labels (list) List of labels found in the text
def get_labels(text):
    chars = list(text)
    start, stop = 0, 0
    labels = []
    positions = []
    for i in range(len(chars)):
        if chars[i] == '{': 
            start = i
        if chars[i] == '}':
            stop = i
            txt = chars[start:(stop + 1)]
            labels.append("".join(chars[start:(stop + 1)]))
            positions.append([start, stop])
    return labels, positions

# Given a list of labels, Seperate Key and Values
# param labels (list) List of labels found in the text
# return kv (2d list) List of the Keys list and Values list [[key_array], [value_array]]
def get_kv(labels):
    k = []
    v = []
    for label in labels:
        if '|' in label:
            k.append(label.split('|')[0].replace("{", ""))
            v.append(label.split('|')[1].replace("}", ""))
    return [k,v]

# Check if a Synonym Exists in a Sentence that it is labelled
# param sentence (str) String that may or may not contain labels in the form of {ent_type|ent}
# param ent_dict (dict) - Dictionary that maps entities to options, synonyms, and gazetteer
# param kv_labels (2d list) List of the Keys list and Values list [[key_array], [value_array]]
# param col (str) Column in the dataframe
# param idx (int) Index of the Current Sentence in the Dataframe
def label_chk_helper(sentence, ent_dict, kv_labels, col, idx):
    sent_split = [word.lower() for word in sentence.split()]
    for ent in ent_dict:
        # Check for 'Entity Options' in text
        options = ent_dict[ent][0]
        for opt in options:
            if opt in sent_split and opt not in kv_labels[0] and opt not in kv_labels[1]:
                print("POTENTIAL OPTION-MISSING LABEL")
                print("OPTION: " + opt)
                print("ENTITY: " + ent)
                print("SENTENCE: " + sentence)
                print("Column: " + col)
                print("Index: " + str(idx + 2))
                print("==========================================")
        # Check for 'Synonym Options' in text
        syns = get_synonyms(ent_dict, ent)
        for syn in syns:
            if syn.lower() in sent_split and syn not in kv_labels[0]:
                print("POTENTIAL SYNONYM-MISSING LABEL")
                print("SYNONYM: " + syn)
                print("ENTITY: " + ent)
                print("SENTENCE: " + sentence)
                print("Column: " + col)
                print("Index: " + str(idx + 2))
                print("==========================================")
                
# Finds mismatches between labels in the Dataframe
# param kv_labels (2d list) List of the Keys list and Values list [[key_array], [value_array]]
# param sentence (str) String that may or may not contain labels in the form of {ent_type|ent}
# param l_dict (dict) Dictionary that keeps track of entity label mappings passed in from the main function
# param col (str) Column in the dataframe
# param idx (int) Index of the Current Sentence in the Dataframe
def chk_mismatch(labels, sentence, l_dict, col, idx):
    if(len(labels[0]) != 0):
        for ct in range(len(labels[0])):
            k = labels[0][ct]
            v = labels[1][ct]
            
            if k in l_dict: 
                if l_dict[k] != v:
                    print("POTENTIAL MISMATCH for: " + k)
                    print("CURRENT LABEL: " + v)
                    print("IN DICT: " + l_dict[k])
                    print("SENTENCE: " + sentence)
                    print("Column: " + col)
                    print("Index: " + str(idx + 2))
                    print("==========================================")
            else: l_dict[k] = v  

# Check if the Synonyms to Entities Were not Labelled in Training
# param df (dataframe) - Dataframe where each column is an intent and each row has sentence examples
# param ent_dict (dict) - Dictionary that contains entities as keys and synonyms
def label_chk(df, ent_dict):
    for col in df:
        print(col.upper() + "=============================================================")
        idx = 2
        nan = is_nan(df[col][idx])
        l_dict = {}
        while(idx < len(df)):
            if(is_nan(df[col][idx])): break
            sentence = df[col][idx]
            labels, pos = get_labels(sentence)
            labels = get_kv(labels)
            label_chk_helper(sentence, ent_dict, labels, col, idx)
            chk_mismatch(labels, sentence, l_dict, col, idx)
            idx += 1

<h2>Workflow</h2>

<h4>Load Data</h4>

In [20]:
hr_data = pd.read_csv('old_core_dataset.csv')
hr_data

Unnamed: 0,Employee Name,Employee Number,State,Zip,DOB,Age,Sex,MaritalDesc,CitizenDesc,Hispanic/Latino,...,Date of Hire,Date of Termination,Reason For Term,Employment Status,Department,Position,Pay Rate,Manager Name,Employee Source,Performance Score
0,"Brown, Mia",1.103024e+09,MA,1450.0,11/24/1985,32.0,Female,Married,US Citizen,No,...,10/27/2008,,N/A - still employed,Active,Admin Offices,Accountant I,28.50,Brandon R. LeBlanc,Diversity Job Fair,Fully Meets
1,"LaRotonda, William",1.106027e+09,MA,1460.0,4/26/1984,33.0,Male,Divorced,US Citizen,No,...,1/6/2014,,N/A - still employed,Active,Admin Offices,Accountant I,23.00,Brandon R. LeBlanc,Website Banner Ads,Fully Meets
2,"Steans, Tyrone",1.302053e+09,MA,2703.0,9/1/1986,31.0,Male,Single,US Citizen,No,...,9/29/2014,,N/A - still employed,Active,Admin Offices,Accountant I,29.00,Brandon R. LeBlanc,Internet Search,Fully Meets
3,"Howard, Estelle",1.211051e+09,MA,2170.0,9/16/1985,32.0,Female,Married,US Citizen,No,...,2/16/2015,4/15/2015,N/A - still employed,Active,Admin Offices,Administrative Assistant,21.50,Brandon R. LeBlanc,Pay Per Click - Google,N/A- too early to review
4,"Singh, Nan",1.307060e+09,MA,2330.0,5/19/1988,29.0,Female,Single,US Citizen,No,...,5/1/2015,,N/A - still employed,Active,Admin Offices,Administrative Assistant,16.56,Brandon R. LeBlanc,Website Banner Ads,N/A- too early to review
5,"Smith, Leigh Ann",7.110077e+08,MA,1844.0,6/14/1987,30.0,Female,Married,US Citizen,No,...,9/26/2011,9/25/2013,career change,Voluntarily Terminated,Admin Offices,Administrative Assistant,20.50,Brandon R. LeBlanc,Diversity Job Fair,Fully Meets
6,"LeBlanc, Brandon R",1.102024e+09,MA,1460.0,6/10/1984,33.0,Male,Married,US Citizen,No,...,1/5/2016,,N/A - still employed,Active,Admin Offices,Shared Services Manager,55.00,Janet King,Monster.com,Fully Meets
7,"Quinn, Sean",1.206043e+09,MA,2045.0,11/6/1984,33.0,Male,Married,Eligible NonCitizen,No,...,2/21/2011,8/15/2015,career change,Voluntarily Terminated,Admin Offices,Shared Services Manager,55.00,Janet King,Diversity Job Fair,Fully Meets
8,"Boutwell, Bonalyn",1.307060e+09,MA,2468.0,4/4/1987,30.0,Female,Married,US Citizen,No,...,2/16/2015,,N/A - still employed,Active,Admin Offices,Sr. Accountant,34.95,Brandon R. LeBlanc,Diversity Job Fair,90-day meets
9,"Foster-Baker, Amy",1.201031e+09,MA,2050.0,4/16/1979,38.0,Female,Married,US Citizen,no,...,1/5/2009,,N/A - still employed,Active,Admin Offices,Sr. Accountant,34.95,Board of Directors,Other,Fully Meets


In [13]:
#hr_data.columns

<h4>Create Unique Values Dictionary</h4>

In [21]:
uniq = gen_uniq_dict(hr_data)

In [15]:
#uniq

<h4>Load Entities</h4>

In [22]:
ent_df = load_ent('HR Manager Schema - Entities.csv')
ent_dict = parse_clean_ent(ent_df, uniq)
ent_dict['name'][1] = name_syn_update(ent_dict)
uniq = uniq_dict_update(uniq, ent_dict)
ent_dict

{'name': [['William LaRotonda',
   'Tyrone Steans',
   'Estelle Howard',
   'Leigh Smith',
   'Brandon LeBlanc',
   'Sean Quinn',
   'Bonalyn Boutwell',
   'Amy Foster-Baker',
   'Janet King',
   'Jennifer Zamora',
   'Renee Becker',
   'Taisha Goble',
   'Daniff Hernandez',
   'Jayne Horton',
   'Noelle Johnson',
   'Thomas Murray',
   'Randall Pearson',
   'Thelma Petrowsky',
   'Lori Roby',
   'Jason Salter',
   'Kramer Simard',
   'Simon Roup',
   'Ricardo Ruiz',
   'Peter Monroe',
   'Eric Dougall',
   'Rick Clayton',
   'Lisa Galia',
   'Leonara Lindsay',
   'Alejandro Bacong',
   'Anthony Cisco',
   'Linda Dolan',
   'Maria Gonzalez',
   'Carlos Merlos',
   'Tanya Morway',
   'Anita Shepard',
   'Neville Tredinnick',
   'Jumil Turpin',
   'Karthikeyan Ait Sidi',
   'Claudia Carr',
   'Donald Favis',
   'Bianca Roehrich',
   'Ann Daniele',
   'Jyoti Lajiri',
   'Jeremiah Semizoglou',
   'Joe South',
   'Sarah Warfield',
   'Elisa Bramante',
   'Michael Albert',
   'Charles Bozzi'

In [56]:
ent_dict['name'][1][240]

['andrew szabo', 'Andrew', 'andrew', 'Szabo', 'szabo']

<h4>Labelling Check</h4>

In [23]:
intent_txt = pd.read_csv('HR Manager Schema - intent_master.csv')
intent_txt = intent_txt.iloc[2:, :]
intent_txt

Unnamed: 0,get_info,get_aggregate,get_employees,get_salary,get_salary_aggregate,get_salary_employees,get_date,get_date_range_aggregate,get_date_range_employees,get_hierarchy
2,what is {Phylicia Gosciminski|name}s org role,{percent|function} employees {below|comparator...,{female|sex} employees,Amount that {Julia|name} gets {paid|money},among all of the employees that found their jo...,Which {Sr. DBA|position} {earns|money} the {mo...,What year was {Lily DiNocco|name} {let go|empl...,{percent|function} of employees {born|dob} in ...,I want {male|sex} {born|dob} in the {1930s|tim...,can i have the names of employees who report t...
3,What position is {ivan|name} in?,{count|function} of workers are {less than|com...,employees {hispanic|racedesc}?,{joanne handschiegl|name} {each month|time_rec...,{sum|function} {pay|money} for {female|sex}?,Which employee(s) have {lowest|extreme} {incom...,Has {Sarah Warfield|name} been working here fo...,{1974|sys_time} {born|dob} employees {percent|...,Which employees did we {get rid of|employment_...,is {Charles Bozzi|name} the {mentor|manager} f...
4,Why did {Megan|name} get {fired|employment_act...,{How many|function} employees are {C-levels|po...,get me the {youngest|extreme} {five|sys_number...,{Mia|name} {earns|money} what amount {each day...,give me the {mean|function} {salary|money} for...,give me the {earners|money} for all of the emp...,What was the exact date when {desiree|name} wa...,What {percentage|function} of employees were {...,{forties|time_interval} {born|dob} employees w...,is {Peter Monroe|name} {managing|manager} {Amy...
5,Which department is {adrienne homberger|name} in?,Gimme the {percent|function} of {50|age} year ...,give me a list of {separated employees|marital...,What does {mia|name}'s {paycheck|money} look l...,what is the {highest|extreme} {amount|money} t...,give me the {earnings|money} for all of the em...,How long has Mr.{Knapp|name} worked here?,{1945|sys_time} {born|dob} employees {percent|...,Which are the employees such that in {2005|sys...,who is {helen billis|name}s {managing|manager}...
6,is {abdellah veera|name} a {cio|position} or not?,{How many|function} people are {performing bad...,employees that live in {california|state},{webster|name} is {earning|money} what amount ...,What is the {total|function} {earnings|money} ...,what are {network engineers|position} {making|...,Has {Nicole|name} been working here for {4 yea...,What {percent|function} of employees were {hir...,i want the employees that have been {hired |em...,which employees is {Ivan singh|name} the {mana...
7,Is {Mohammed Latif|name} a citizen of the us?,{cumulative|function} {count|function} of empl...,Which employees have been {terminated|employme...,"Does {54,000|sys_number} exceed what {jessica|...",what are {women|sex} {making|money} on {averag...,all the {earnings|money} of {female|sex} in th...,What was the date when {ivan rogers|name} was ...,What {percentage|function} of employees have b...,Give me the employees that have a {join date|e...,Who are those employees that are {under|compar...
8,Where does {ivan rogers|name} live?,{average|function} age of the employees that a...,Who {has worked here|employment_action} based ...,{ivan|name} {Salary|money} {Yearly|time_recur},"of all the {sales manager|position}s, what is ...",who {makes|money} the {minimum|extreme} {incom...,When {Sophia Theamstern|name} was {hired|emplo...,can you please tell me what {fraction|function...,Fetch me a list of workers that have their {bi...,I want to know if {Amy Dunn|name} is a {manger...
9,I want {sarah warfield|name}'s state,What's the {summed|function} {num of|function}...,give me a list of employees that are based in ...,What does {Rose Ivey|name} get for {income|mon...,get me the {average|function} amount that the ...,Get me the {lowest|extreme} {six|sys_number} {...,Fetch me {Francesco Barone|name}'s {Bday|dob},What {pct|function} of our staff have a {bday|...,get me {senior database admins|position} {born...,who is the {manager|manager} assigned to {luis...
10,how did {dawn|name} hear about our corporation,What is the {total|function} {number of|functi...,Which employees have been with the company lon...,"According to the {payroll|money}, how much doe...",{number of|function} people {earning|money} {f...,which employees are {making|money} {less than|...,{Leigh Smith|name} {date of birth|dob},What {percentage|function} of employees were {...,I want all of the employees in the {sales depa...,I want to know if {Sam Athwal|name} {works for...
11,Does {Michael|name} {still work at|employment_...,{average|function} age of workers who are {old...,employees are {under|comparator} {45|sys_numbe...,What is {Brooke oliver|name}'s {each year|time...,give me the {typical|function} take home {sala...,{non-citizen|citizendesc} {paycheck|money}s,{Ashley Rose|name} {birthday|dob},I want the {total|function} {number of|functio...,Can you tell me whether there are any {June|sy...,{Jenna Dietrich|name} is the {supervisor|manag...


In [24]:
label_chk(intent_txt, ent_dict)

POTENTIAL SYNONYM-MISSING LABEL
SYNONYM: IS
ENTITY: department
SENTENCE: what is {Phylicia Gosciminski|name}s org role
Column: get_info
Index: 4
POTENTIAL SYNONYM-MISSING LABEL
SYNONYM: IS
ENTITY: department
SENTENCE: What position is {ivan|name} in?
Column: get_info
Index: 5
POTENTIAL SYNONYM-MISSING LABEL
SYNONYM: IS
ENTITY: department
SENTENCE: Which department is {adrienne homberger|name} in?
Column: get_info
Index: 7
POTENTIAL SYNONYM-MISSING LABEL
SYNONYM: IS
ENTITY: department
SENTENCE: is {abdellah veera|name} a {cio|position} or not?
Column: get_info
Index: 8
POTENTIAL SYNONYM-MISSING LABEL
SYNONYM: citizen
ENTITY: citizendesc
SENTENCE: Is {Mohammed Latif|name} a citizen of the us?
Column: get_info
Index: 9
POTENTIAL SYNONYM-MISSING LABEL
SYNONYM: IS
ENTITY: department
SENTENCE: Is {Mohammed Latif|name} a citizen of the us?
Column: get_info
Index: 9
POTENTIAL SYNONYM-MISSING LABEL
SYNONYM: IS
ENTITY: department
SENTENCE: is {Ivan Rogers|name} {currently active|employment_statu

Column: get_aggregate
Index: 38
POTENTIAL SYNONYM-MISSING LABEL
SYNONYM: meet
ENTITY: performance_score
SENTENCE: What {typically|function} is the age of employees who {are too early to review|performance_score} or currently {fully meet performance expectations|performance_score}?
Column: get_aggregate
Index: 38
POTENTIAL SYNONYM-MISSING LABEL
SYNONYM: IS
ENTITY: department
SENTENCE: What is the {distribution|function} of employees from different states?
Column: get_aggregate
Index: 40
POTENTIAL SYNONYM-MISSING LABEL
SYNONYM: IS
ENTITY: department
SENTENCE: What is the {total|function} {number of|function} {us citizens|citizendesc}?
Column: get_aggregate
Index: 41
POTENTIAL SYNONYM-MISSING LABEL
SYNONYM: IS
ENTITY: department
SENTENCE: What is the {percentage|function} of employees that {exceeds expectations|performance_score} on his or her {performance score|performance_score}?
Column: get_aggregate
Index: 57
POTENTIAL SYNONYM-MISSING LABEL
SYNONYM: IS
ENTITY: department
SENTENCE: Wha

POTENTIAL SYNONYM-MISSING LABEL
SYNONYM: IS
ENTITY: department
SENTENCE: who is not {widowed|maritaldesc}
Column: get_employees
Index: 135
POTENTIAL OPTION-MISSING LABEL
OPTION: manager
ENTITY: manager
SENTENCE: {it manager - db|position} employees
Column: get_employees
Index: 147
POTENTIAL SYNONYM-MISSING LABEL
SYNONYM: manager
ENTITY: manager
SENTENCE: {it manager - db|position} employees
Column: get_employees
Index: 147
POTENTIAL SYNONYM-MISSING LABEL
SYNONYM: IS
ENTITY: department
SENTENCE: Who is {married|maritaldesc} in {Collaboration Department|department}?
Column: get_employees
Index: 148
POTENTIAL SYNONYM-MISSING LABEL
SYNONYM: IS
ENTITY: department
SENTENCE: Who is {married|maritaldesc} in {software engineering|department} dept
Column: get_employees
Index: 152
POTENTIAL SYNONYM-MISSING LABEL
SYNONYM: IS
ENTITY: department
SENTENCE: who is {reports to|manager} {shared services manager|position}?
Column: get_employees
Index: 163
POTENTIAL SYNONYM-MISSING LABEL
SYNONYM: after
EN

ENTITY: department
SENTENCE: Is it true that {ivan|name} has a {six figure|sys_number} {salary|money}?
Column: get_salary
Index: 147
POTENTIAL SYNONYM-MISSING LABEL
SYNONYM: IS
ENTITY: department
SENTENCE: What is the {every week|time_recur} {income|money} for {Rosalie Hutter|name} at Cisco?
Column: get_salary
Index: 149
POTENTIAL SYNONYM-MISSING LABEL
SYNONYM: IS
ENTITY: department
SENTENCE: What is {Ivan Rogers|name}'s {annual|time_recur} {salary|money}?
Column: get_salary
Index: 155
POTENTIAL SYNONYM-MISSING LABEL
SYNONYM: Cisco
ENTITY: name
SENTENCE: How much {pay|money} does {Anthony Cisco|name} {earner|money} from Cisco {hourly|time_recur}?
Column: get_salary
Index: 158
POTENTIAL SYNONYM-MISSING LABEL
SYNONYM: cisco
ENTITY: name
SENTENCE: How much {pay|money} does {Anthony Cisco|name} {earner|money} from Cisco {hourly|time_recur}?
Column: get_salary
Index: 158
POTENTIAL SYNONYM-MISSING LABEL
SYNONYM: Cisco
ENTITY: name
SENTENCE: How much does Cisco give {Susan Ferguson|name} in {

SENTENCE: of all the {cio|position}s, what is the {min|extreme} {earnings|money}
Column: get_salary_aggregate
Index: 102
POTENTIAL SYNONYM-MISSING LABEL
SYNONYM: IS
ENTITY: department
SENTENCE: What is the {average|function} {pay rate|money} for {executives|position}?
Column: get_salary_aggregate
Index: 103
POTENTIAL SYNONYM-MISSING LABEL
SYNONYM: IS
ENTITY: department
SENTENCE: among all of the employees that found their job from an {pay per click - google|employee_source}, what is the {typical|function} {earning|money}?
Column: get_salary_aggregate
Index: 107
POTENTIAL OPTION-MISSING LABEL
OPTION: sales
ENTITY: department
SENTENCE: fetch me the {bottom|extreme} {$|money} for a {area sales manager|position} at this organization
Column: get_salary_aggregate
Index: 110
POTENTIAL SYNONYM-MISSING LABEL
SYNONYM: IS
ENTITY: department
SENTENCE: What is the {sum|function} {income|money} of all the employees?
Column: get_salary_aggregate
Index: 111
POTENTIAL SYNONYM-MISSING LABEL
SYNONYM: IS


Column: get_date
Index: 78
POTENTIAL SYNONYM-MISSING LABEL
SYNONYM: IS
ENTITY: department
SENTENCE: {Catherine Ybarra|name}'s {bday|dob} is in which month
Column: get_date
Index: 85
POTENTIAL SYNONYM-MISSING LABEL
SYNONYM: Ait
ENTITY: name
SENTENCE: How long has {Karthikeyan Ait Sidi|name} worked here?
Column: get_date
Index: 93
POTENTIAL SYNONYM-MISSING LABEL
SYNONYM: ait
ENTITY: name
SENTENCE: How long has {Karthikeyan Ait Sidi|name} worked here?
Column: get_date
Index: 93
POTENTIAL SYNONYM-MISSING LABEL
SYNONYM: IS
ENTITY: department
SENTENCE: {Shana Maurice|name}'s {bday|dob} is in which month
Column: get_date
Index: 94
POTENTIAL SYNONYM-MISSING LABEL
SYNONYM: IS
ENTITY: department
SENTENCE: {mia brown|name}'s {bday|dob} is in which month
Column: get_date
Index: 96
POTENTIAL SYNONYM-MISSING LABEL
SYNONYM: started
ENTITY: employment_action
SENTENCE: What was {Brandon LeBlanc|name}s date when he started to {work here|employment_action}?
Column: get_date
Index: 105
POTENTIAL SYNONYM-M

POTENTIAL SYNONYM-MISSING LABEL
SYNONYM: IS
ENTITY: department
SENTENCE: is {Charles Bozzi|name} the {mentor|manager} for {Brannon Miller|name}
Column: get_hierarchy
Index: 5
POTENTIAL SYNONYM-MISSING LABEL
SYNONYM: IT
ENTITY: department
SENTENCE: is {Peter Monroe|name} {managing|manager} {Amy Dunn|name} or is it the other way around?
Column: get_hierarchy
Index: 6
POTENTIAL SYNONYM-MISSING LABEL
SYNONYM: IS
ENTITY: department
SENTENCE: is {Peter Monroe|name} {managing|manager} {Amy Dunn|name} or is it the other way around?
Column: get_hierarchy
Index: 6
POTENTIAL OPTION-MISSING LABEL
OPTION: other
ENTITY: employee_source
SENTENCE: is {Peter Monroe|name} {managing|manager} {Amy Dunn|name} or is it the other way around?
Column: get_hierarchy
Index: 6
POTENTIAL SYNONYM-MISSING LABEL
SYNONYM: IS
ENTITY: department
SENTENCE: who is {helen billis|name}s {managing|manager} {supervisor|manager}?
Column: get_hierarchy
Index: 7
POTENTIAL SYNONYM-MISSING LABEL
SYNONYM: IS
ENTITY: department
SENT

In [15]:
#get_labels("What is the {manager|manager} name of {Julia|name}?")

In [25]:
ent_dict

{'name': [['William LaRotonda',
   'Tyrone Steans',
   'Estelle Howard',
   'Leigh Smith',
   'Brandon LeBlanc',
   'Sean Quinn',
   'Bonalyn Boutwell',
   'Amy Foster-Baker',
   'Janet King',
   'Jennifer Zamora',
   'Renee Becker',
   'Taisha Goble',
   'Daniff Hernandez',
   'Jayne Horton',
   'Noelle Johnson',
   'Thomas Murray',
   'Randall Pearson',
   'Thelma Petrowsky',
   'Lori Roby',
   'Jason Salter',
   'Kramer Simard',
   'Simon Roup',
   'Ricardo Ruiz',
   'Peter Monroe',
   'Eric Dougall',
   'Rick Clayton',
   'Lisa Galia',
   'Leonara Lindsay',
   'Alejandro Bacong',
   'Anthony Cisco',
   'Linda Dolan',
   'Maria Gonzalez',
   'Carlos Merlos',
   'Tanya Morway',
   'Anita Shepard',
   'Neville Tredinnick',
   'Jumil Turpin',
   'Karthikeyan Ait Sidi',
   'Claudia Carr',
   'Donald Favis',
   'Bianca Roehrich',
   'Ann Daniele',
   'Jyoti Lajiri',
   'Jeremiah Semizoglou',
   'Joe South',
   'Sarah Warfield',
   'Elisa Bramante',
   'Michael Albert',
   'Charles Bozzi'

<h4>Generate Gazetteers</h4>

In [26]:
gen_gazetteers(ent_dict)

<h4>Generate Mapping.json</h4>

In [27]:
gen_map_json_files(ent_dict)

<h4>Data Augmentation</h4>

In [83]:
def entity_swap(sentence, positions, uniq):
    chars = list(sentence)
    for i in reversed(range(len(positions))):
        pos = positions[i]
        ent = ''.join(chars[pos[0]:(pos[1] + 1)])
        excluded = [] #['age']
        #print("ENTITY FOUND")
        #print(ent)
        kv = get_kv([ent])
        #print(kv)       
        for i in range(len(kv[0])):
            if kv[1][i] in uniq and kv[1][i] not in excluded:
                new_ent = "{" + random.choice(uniq[kv[1][i]]) + "|" + kv[1][i] + '}'
                chars[pos[0]:(pos[1]+1)] = list(new_ent)
                #print(new_ent)
    #print("FINAL")
    return "".join(chars)

In [19]:
#entity_swap("What is the {manager|manager} name of {Julia|name}?", [[12, 28], [38, 49]], uniq)


In [28]:
#intent_txt['get_info'][6]

In [85]:
def data_augment(df, uniq):
    df = intent_txt
    for col in df:
        augment = []
        print(col.upper() + "=============================================================")
        idx = 2
        nan = is_nan(df[col][idx])
        l_dict = {}
        while(idx < len(df)):
                if(is_nan(df[col][idx])): break
                #print(idx)
                sentence = df[col][idx]
                labels, pos = get_labels(sentence)
                for i in range(3): augment.append(entity_swap(sentence, pos, uniq))
                idx += 1
        # Create files
        #augment = set(augment)
        #if((250 - idx) < len(augment)): augment = random.sample(augment, 250 - idx)
        print("Augmented Lines Generated: " + str(len(augment)))
        with open('data_augment/' + col + ".txt", 'w+') as filehandle:  
            filehandle.writelines("%s\n" % line for line in augment)

data_augment(intent_txt, uniq)

Augmented Lines Generated: 747
Augmented Lines Generated: 675
Augmented Lines Generated: 894
Augmented Lines Generated: 216
Augmented Lines Generated: 207
Augmented Lines Generated: 207
Augmented Lines Generated: 213
Augmented Lines Generated: 69
Augmented Lines Generated: 189
Augmented Lines Generated: 591
