In [1]:
import numpy as np
import pandas as pd
import math
import random
import os
import re

In [2]:
def is_nan(x):
    """Check whether a value is NaN.

    The identity test catches the ``np.nan`` object itself; the self-inequality
    test catches any other value that does not compare equal to itself.

    param x - Value to check
    return - True when x is ``np.nan``; otherwise the result of ``x != x``
    """
    if x is np.nan:
        return True
    return x != x

In [19]:
def create_txt(folder, sentences, train_pct,
               base_dir='../hr_assistant_categories/domains/hr_assistant/'):
    """Shuffle sentences and write them to train.txt / test.txt under base_dir/folder.

    The first ``floor(len(sentences) * train_pct)`` shuffled sentences go to
    ``train.txt`` and the remainder to ``test.txt``, one sentence per line.
    Existing files with those names are overwritten.

    param folder (str) - sub-folder (created if missing) that receives the txt files
    param sentences (sequence) - sentences to split; the caller's object is NOT modified
    param train_pct (float) - fraction of lines used for training, e.g. 0.8
                              (a fraction, not a 0-100 percentage)
    param base_dir (str) - root directory under which ``folder`` is created
    """
    # Shuffle a private copy so the caller's list keeps its original order.
    # np.random.shuffle draws from NumPy's global RNG, so np.random.seed()
    # still controls the split.
    shuffled = list(sentences)
    np.random.shuffle(shuffled)

    # Separate
    train_ct = math.floor(len(shuffled) * train_pct)
    train_data = shuffled[:train_ct]
    test_data = shuffled[train_ct:]

    # Create Directory (exist_ok avoids the check-then-create race)
    directory = os.path.join(base_dir, folder)
    os.makedirs(directory, exist_ok=True)

    # Create Files -- explicit UTF-8 so non-ASCII sentences are written the
    # same way on every platform instead of depending on the locale default.
    with open(os.path.join(directory, 'train.txt'), 'w', encoding='utf-8') as filehandle:
        filehandle.writelines("%s\n" % train_line for train_line in train_data)
    with open(os.path.join(directory, 'test.txt'), 'w', encoding='utf-8') as filehandle:
        filehandle.writelines("%s\n" % test_line for test_line in test_data)

In [4]:
def scan_punctuation(sentence):
    """Scan a sentence to check that its entity markup looks mislabelled.

    A sentence is reported as mismatched when the number of '{' differs from
    the number of '}', or when there are fewer '|' separators than '{'.
    Only counts are compared; the order of the characters is not checked.

    param sentence (str) - Sentence to be scanned (NaN cells are accepted)
    return (bool) - True if a mismatch is detected, False otherwise
                    (including for NaN input)
    """
    if is_nan(sentence):
        return False
    open_ct = sentence.count('{')
    return open_ct != sentence.count('}') or sentence.count('|') < open_ct

In [5]:
def scan_names(sentence):
    """Replace the first known user name found in a sentence with a random one.

    The known names are tried in list order (full names before first names).
    For the first name that occurs as a whole word, every whole-word occurrence
    of that name is replaced by one randomly chosen entry of the same list, and
    the result is returned. No further names are tried after that, even when
    the random pick happens to equal the matched name.

    param sentence (str) - Sentence to be scanned (NaN cells are accepted)
    return (str or None) - the rewritten sentence, the sentence unchanged when
                           it contains no known name, or None for NaN input
    """
    if is_nan(sentence):
        return None

    users = ['mia brown', 'Mia Brown', 'ivan rogers', 'Ivan Rogers', 'julia soto','Julia Soto',
             'nan singh', 'Nan Singh', 'mia','Mia','ivan', 'Ivan', 'julia', 'Julia','nan', 'Nan']

    for name in users:
        # \b anchors stop short names such as 'nan' from matching inside
        # ordinary words ("finance", "maintenance").
        pattern = r'\b' + re.escape(name) + r'\b'
        if re.search(pattern, sentence):
            replacement = random.choice(users)
            # A function replacement is inserted literally (no backslash
            # escape processing). Returning here keeps a later, shorter name
            # from rewriting part of a full name that was already handled.
            return re.sub(pattern, lambda _match: replacement, sentence)
    return sentence
    

In [17]:
def df_gen_txt_files(df, train_pct, first_row_label=2):
    """Create train/test text files from a dataframe of intents.

    Each column is treated as one intent: its name is printed and passed to
    create_txt as the folder name, so the Domains folder is generated
    automatically. Within a column, rows whose index label is below
    ``first_row_label`` are skipped and reading stops at the first NaN cell.

    Rows are walked in frame order by their own labels, so the function gives
    the same sentences for a full frame and for one already sliced with
    ``df.iloc[2:]``. Bounding a label-based loop by ``len(df)`` would silently
    drop the last rows of a sliced frame.

    param df (dataframe) - Dataframe where each column is an intent and each row
                           has sentence examples; assumes an integer index
    param train_pct (float) - Fraction of examples to be used for training
    param first_row_label (int) - Smallest index label that holds a sentence
    """
    for col in df:
        print(col)

        # Collect the leading run of non-NaN cells for this intent.
        sentences = []
        for label, cell in df[col].items():
            if label < first_row_label:
                continue
            if is_nan(cell):
                break
            sentences.append(cell)

        for i, sentence in enumerate(sentences):
            # Whitespace-only example
            if len(sentence.split()) == 0: print("FLAG")
            if scan_punctuation(sentence): print("MISMATCH DETECTED: " + sentence)
            sentences[i] = scan_names(sentence)

        create_txt(col, sentences, train_pct)


In [7]:
# Load the intent master sheet and keep only the rows from position 2 onward.
intent_txt = pd.read_csv('HR Manager Schema - intent_master.csv').iloc[2:, :]
intent_txt

Unnamed: 0,get_info,get_aggregate,get_employees,get_salary,get_salary_aggregate,get_salary_employees,get_date,get_date_range_aggregate,get date_range_employees,get_hierarchy
2,What is {nan|name}'s race?,What is the {total|function} {number of|functi...,Give me employees who are {single|maritaldesc},What is {Mia|name}'s {pay|money}?,What is the {median|function} {pay|money} of {...,Which employee(s) have {lowest|extreme} {incom...,What is the {date of hiring|employment_action}...,What {percentage|function} of employees were {...,Give me a list of people {hired|employment_act...,Who is {Mia|name}'s {manager|manager}?
3,Is {Michael|name} {married|maritaldesc}?,What {percent|function} of employees {exceeded...,All employees from {MA|state},Tell me who earned the {least|extreme} that wa...,what {percentage|function} of employees {make|...,who is the {highest|extreme} {earning|money} {...,When did {Amy|name} {join|employment_action} t...,What {percent|function} of employees were {hir...,Tell me about employees who {started|employmen...,Which employees have {Julia|name} as their {ma...
4,What is {Nan|name}'s official position?,What is the {percentage|function} of new grads...,Which employees have been recently {terminated...,What is the {pay rate|money} of {Julia|name}?,What is the {average|function} {pay rate|money}?,For employees {hired|employment_action} {betwe...,How long has {Ivan|name} been with the company?,What {percentage|function} of employees were {...,Which employees were not yet {born|dob} when {...,What is the name of {Julia|name}'s {manager|ma...
5,did {Nan|name} hear about us through {Glassdoo...,What is the {average|function} {age|age}?,Which employees are not {US citizens|citizende...,What is {Nan|name}'s {pay rate|money}?,What is the {average|function} {pay|money} of ...,what are the {salaries|money} for employees th...,When was {Nan|name} {fired|employment_action}?,What {percent|function} of all our employees w...,list the employees who {joined|employment_acti...,Who is {Michael|name}'s {manager|manager}?
6,give me {Nan|name}'s race please,What {percent|function} of employees are manag...,Which employees were {let go|employment_action...,How much does {Michael|name} {make|money}?,Calculate the {average|function} {pay rate|mon...,what are our {top|extreme} {earners|money} {ma...,How long was {Mia|name} working for?,What {percentage|function} of employees were {...,Which employess were {hired|employment_action}...,Who is {John Reeder|name}'s {manager|manager}?
7,What is {Mia|name}'s employment status?,What's the {average|function} {age of|age} emp...,{managers|position},Is {Mia|name} being {paid|money} {$|money}{40k...,{average|function} {pay rate|money} for {women...,{below|comparator} {average|function} {earning...,When was {Michael|name}'s {date of hire|employ...,{How many|function} employees were {hired|empl...,Which employees were {hired|employment_action}...,Who is {Mia Brown|name}'s {manager|manager}?
8,Is {Ivan|name} from out of state?,What {percentage|function} of the employees ar...,Which employees have been {terminated|employme...,Does {Mia|name} get {$|money}{70k|sys_number} ...,What's the {average|function} {pay rate|money}?,Tell me who all are {making|money} {more than|...,What is {Nan|name}'s {date of birth|dob}?,{How many|function} people were based out of {...,Who worked for Cisco for {less than|comparator...,Who is the {manager|manager} for {Bob|name}
9,Does {Michael|name} {still work at|employment_...,What {percentage|function} of employees are {e...,Which employees have been with the company lon...,When we let {Nan|name} go {fired|employment_ac...,What is the {average|function} {pay rate|money...,Which {software engineers|position} are {paid|...,What year was {Mia|name} {hired|employment_act...,What {percentage|function} of employees have b...,Which employees have been {hired|employment_ac...,What is {Mia|name}'s {manager|manager}'s name?
10,What is {Mia|name}'s {performance score|perfor...,What {percentage|function} of employees are {f...,Which employees have gotten only {positive fee...,{pay rate|money} of {Mia|name},{How many|function} employees are paid {above|...,"Of all the {Production Managers|position}, whi...",When did we {fire|employment_action} {Jeff|name}?,{How many|function} employees were {born|dob} ...,Which employee was {hired|employment_action} w...,Who {reports|manager} into {Nan Singh|name}
11,What position is {Julia|name} in?,{Percentage|function} of Employees in departme...,Which employees have a {spouse|maritaldesc}?,how much {money|money} does {Mia|name} make?,what {number of|function} people {earn|money} ...,above {average|function} {earning|money} emplo...,How long has {Mia|name} worked here?,{How many|function} people have {worked here|e...,Which employees have {been with us|employment_...,can i have the names of employees who report t...


In [20]:
# Generate the train/test text files for every intent column (0.8 of each column's sentences go to training).
df_gen_txt_files(intent_txt, train_pct=0.8)

get_info
get_aggregate
get_employees
get_salary
get_salary_aggregate
get_salary_employees
get_date
get_date_range_aggregate
get date_range_employees
get_hierarchy
