In [2]:
import numpy as np
import pandas as pd
import math
import random
import os
import re

In [3]:
# Check whether a value is NaN
# param x - Value to check
# return boolean - True if x is NaN
def is_nan(x):
    """Tell whether ``x`` is a NaN value.

    param x - Value to check
    return - True when ``x`` is the numpy NaN object; otherwise the result
             of comparing ``x`` with itself for inequality
    """
    # Identity check catches the numpy NaN object directly.
    if x is np.nan:
        return True
    # NaN is the only float value that does not compare equal to itself.
    return x != x

In [4]:
# Create training and test text files from an array of sentences
# param folder (str) - folder to store txt files
# param sentences (arr) - array of sentences
# param train_pct (float) - Fraction (0-1) of lines to use for training
# param domain (str) - domain folder that contains `folder`
def create_txt(folder, sentences, train_pct, domain='hr_assistant',
               base_dir='../hr_assistant/domains/'):
    """Shuffle ``sentences`` and write them to ``train.txt`` / ``test.txt``.

    The files are written to ``<base_dir>/<domain>/<folder>/``; the directory
    is created when it does not exist yet and existing files are overwritten.

    param folder (str) - folder (inside the domain) to store the txt files
    param sentences (arr) - sequence of sentences; it is NOT modified, a
                            shuffled copy is used for the split
    param train_pct (float) - fraction of the lines written to train.txt;
                              the count is floor(len(sentences) * train_pct)
                              and the value is not range-checked
    param domain (str) - domain folder that contains ``folder``
    param base_dir (str) - root directory holding the domain folders
    """
    # Shuffle a copy so the caller's sequence keeps its original order.
    shuffled = list(sentences)
    np.random.shuffle(shuffled)

    # The first train_ct shuffled lines are for training, the rest for testing.
    train_ct = math.floor(len(shuffled) * train_pct)
    train_data = shuffled[:train_ct]
    test_data = shuffled[train_ct:]

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    directory = os.path.join(base_dir, domain, folder)
    os.makedirs(directory, exist_ok=True)

    # Explicit UTF-8 so the output does not depend on the platform's default
    # text encoding.
    for file_name, lines in (('train.txt', train_data), ('test.txt', test_data)):
        with open(os.path.join(directory, file_name), 'w', encoding='utf-8') as filehandle:
            filehandle.writelines("%s\n" % line for line in lines)

In [5]:
# Scan to Make Sure that Entities have been labelled correctly
# param sentence (str) - Sentence to be scanned
def scan_punctuation(sentence):
    """Check a sentence for mismatched entity markup.

    A sentence is flagged when the number of '{' differs from the number of
    '}', or when it has fewer '|' separators than '{' openers.

    param sentence (str) - Sentence to be scanned
    return (bool) - True when a mismatch is detected, False otherwise
                    (including for non-string input such as NaN cells)
    """
    # Non-string cells (NaN padding, numbers) carry no markup to check.
    if not isinstance(sentence, str):
        return False

    opening = sentence.count('{')
    closing = sentence.count('}')
    separators = sentence.count('|')
    return opening != closing or separators < opening

In [6]:
# Scan sentence for known user names and replace a match with a randomly chosen user name
# param sentence (str) - Sentence to be scanned
def scan_names(sentence):
    """Swap the first known user name found in a sentence for a random one.

    The known names are tried in list order (full names before first names).
    For the first name that occurs as a whole word, every occurrence of it is
    replaced by one randomly chosen known name and the result is returned.
    The random pick may equal the matched name, in which case the sentence
    comes back unchanged.

    param sentence (str) - Sentence to be scanned
    return - the (possibly) updated sentence; non-string input such as a NaN
             cell is returned unchanged
    """
    if not isinstance(sentence, str):
        return sentence

    users = ['mia brown', 'Mia Brown', 'ivan rogers', 'Ivan Rogers', 'julia soto','Julia Soto',
             'nan singh', 'Nan Singh', 'mia','Mia','ivan', 'Ivan', 'julia', 'Julia','nan', 'Nan']

    for name in users:
        # Word boundaries keep short names from matching inside other words
        # (e.g. 'nan' in "finance" or 'Nan' in "Nancy").
        pattern = r'\b' + re.escape(name) + r'\b'
        if re.search(pattern, sentence):
            replacement = random.choice(users)
            # Stop at the first matching name even when the pick leaves the
            # text unchanged; falling through to the shorter first-name
            # patterns would rewrite only part of a full name.
            # A callable replacement keeps the name from being parsed as a
            # regex template.
            return re.sub(pattern, lambda _match: replacement, sentence)

    return sentence
    

In [7]:
# Create Text Files From a DataFrame
# Automatically generates the Domains Folder
# param df (dataframe) - Dataframe where each column is an intent and each row has sentence examples
# param train_pct (float) - Fraction (0-1) of examples to be used for training
def df_gen_txt_files(df, train_pct):
    """Write train/test text files for every intent column of ``df``.

    For each column, the cells from index label 2 onward are collected until
    the first NaN. Each collected sentence is checked (whitespace-only and
    markup-mismatch warnings are printed), passed through ``scan_names``, and
    the resulting list is handed to ``create_txt``.

    param df (dataframe) - Dataframe where each column is an intent and each
                           row has sentence examples; assumes a sorted
                           integer row index
    param train_pct (float) - Fraction of examples to be used for training
    """
    for col in df:
        print(col)

        # Label-based slice: reads every row whose label is >= 2 no matter
        # how many leading rows were dropped beforehand. Comparing a label
        # counter with len(df) would stop two rows early on a frame that was
        # already sliced with iloc[2:].
        sentences = []
        for cell in df[col].loc[2:]:
            # The first NaN marks the end of this column's examples.
            if is_nan(cell):
                break
            sentences.append(cell)

        for i, sentence in enumerate(sentences):
            if len(sentence.split()) == 0:
                print("NAN FLAG")
            if scan_punctuation(sentence):
                print("MISMATCH DETECTED: " + sentence)
            sentences[i] = scan_names(sentence)

        # The domain folder is picked from a keyword in the column name.
        gen_folder = 'general'
        if "date" in col:
            gen_folder = 'date'
        elif "salary" in col:
            gen_folder = "salary"
        elif "hierarchy" in col:
            gen_folder = "hierarchy"

        create_txt(col, sentences, train_pct, gen_folder)
        print("Sentences generated for " + col + "||  Line count: " + str(len(sentences)) )


In [20]:
# Load the intent sheet and keep the rows from position 2 onward. iloc keeps
# the original row labels, so the first remaining row still has label 2.
# NOTE(review): rows 0-1 are presumably non-example rows in the sheet - confirm.
intent_txt = pd.read_csv('HR Manager Schema - intent_master.csv').iloc[2:, :]
intent_txt

Unnamed: 0,get_info,get_aggregate,get_employees,get_salary,get_salary_aggregate,get_salary_employees,get_date,get_date_range_aggregate,get_date_range_employees,get_hierarchy
2,what is {Phylicia Gosciminski|name}s org role,{percent|function} employees {below|comparator...,{female|sex} employees,Amount that {Julia|name} gets {paid|money},among all of the employees that found their jo...,Which {Sr. DBA|position} {earns|money} the {mo...,What year was {Lily DiNocco|name} {let go|empl...,{percent|function} of employees {born|dob} in ...,I want {male|sex} {born|dob} in the {1930s|tim...,can i have the names of employees who report t...
3,What position is {ivan|name} in?,{count|function} of workers are {less than|com...,employees {hispanic|racedesc}?,{joanne handschiegl|name} {each month|time_rec...,{sum|function} {pay|money} for {female|sex}?,Which employee(s) have {lowest|extreme} {incom...,Has {Sarah Warfield|name} been working here fo...,{1974|sys_time} {born|dob} employees {percent|...,Which employees did we {get rid of|employment_...,is {Charles Bozzi|name} the {mentor|manager} f...
4,Why did {Megan|name} get {fired|employment_act...,{How many|function} employees are {C-levels|po...,get me the {youngest|extreme} {five|sys_number...,{Mia|name} {earns|money} what amount {each day...,give me the {mean|function} {salary|money} for...,give me the {earners|money} for all of the emp...,What was the exact date when {desiree|name} wa...,What {percentage|function} of employees were {...,{forties|time_interval} {born|dob} employees w...,is {Peter Monroe|name} {managing|manager} {Amy...
5,Which department is {adrienne homberger|name} in?,Gimme the {percent|function} of {50|age} year ...,give me a list of {separated employees|marital...,What does {mia|name}'s {paycheck|money} look l...,what is the {highest|extreme} {amount|money} t...,give me the {earnings|money} for all of the em...,How long has Mr.{Knapp|name} worked here?,{1945|sys_time} {born|dob} employees {percent|...,Which are the employees such that in {2005|sys...,who is {helen billis|name}s {managing|manager}...
6,is {abdellah veera|name} a {cio|position} or not?,{How many|function} people are {performing bad...,employees that live in {california|state},{webster|name} is {earning|money} what amount ...,What is the {total|function} {earnings|money} ...,what are {network engineers|position} {making|...,Has {Nicole|name} been working here for {4 yea...,What {percent|function} of employees were {hir...,i want the employees that have been {hired |em...,which employees is {Ivan singh|name} the {mana...
7,Is {Mohammed Latif|name} a citizen of the us?,{cumulative|function} {count|function} of empl...,Which employees have been {terminated|employme...,"Does {54,000|sys_number} exceed what {jessica|...",what are {women|sex} {making|money} on {averag...,all the {earnings|money} of {female|sex} in th...,What was the date when {ivan rogers|name} was ...,What {percentage|function} of employees have b...,Give me the employees that have a {join date|e...,Who are those employees that are {under|compar...
8,Where does {ivan rogers|name} live?,{average|function} age of the employees that a...,Who {has worked here|employment_action} based ...,{ivan|name} {Salary|money} {Yearly|time_recur},"of all the {sales manager|position}s, what is ...",who {makes|money} the {minimum|extreme} {incom...,When {Sophia Theamstern|name} was {hired|emplo...,can you please tell me what {fraction|function...,Fetch me a list of workers that have their {bi...,I want to know if {Amy Dunn|name} is a {manger...
9,I want {sarah warfield|name}'s state,What's the {summed|function} {num of|function}...,give me a list of employees that are based in ...,What does {Rose Ivey|name} get for {income|mon...,get me the {average|function} amount that the ...,Get me the {lowest|extreme} {six|sys_number} {...,Fetch me {Francesco Barone|name}'s {Bday|dob},What {pct|function} of our staff have a {bday|...,get me {senior database admins|position} {born...,who is the {manager|manager} assigned to {luis...
10,how did {dawn|name} hear about our corporation,What is the {total|function} {number of|functi...,Which employees have been with the company lon...,"According to the {payroll|money}, how much doe...",{number of|function} people {earning|money} {f...,which employees are {making|money} {less than|...,{Leigh Smith|name} {date of birth|dob},What {percentage|function} of employees were {...,I want all of the employees in the {sales depa...,I want to know if {Sam Athwal|name} {works for...
11,Does {Michael|name} {still work at|employment_...,{average|function} age of workers who are {old...,employees are {under|comparator} {45|sys_numbe...,What is {Brooke oliver|name}'s {each year|time...,give me the {typical|function} take home {sala...,{non-citizen|citizendesc} {paycheck|money}s,{Ashley Rose|name} {birthday|dob},I want the {total|function} {number of|functio...,Can you tell me whether there are any {June|sy...,{Jenna Dietrich|name} is the {supervisor|manag...


In [21]:
# Seed both generators used downstream (random.choice in scan_names and
# np.random.shuffle in create_txt) so re-running the notebook reproduces the
# same name substitutions and the same train/test split.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# 90% of each intent's sentences go to train.txt, the rest to test.txt.
df_gen_txt_files(intent_txt, train_pct=0.9)

get_info
Sentences generated for get_info||  Line count: 252
get_aggregate
Sentences generated for get_aggregate||  Line count: 226
get_employees
Sentences generated for get_employees||  Line count: 297
get_salary
Sentences generated for get_salary||  Line count: 279
get_salary_aggregate
Sentences generated for get_salary_aggregate||  Line count: 182
get_salary_employees
Sentences generated for get_salary_employees||  Line count: 173
get_date
Sentences generated for get_date||  Line count: 208
get_date_range_aggregate
Sentences generated for get_date_range_aggregate||  Line count: 66
get_date_range_employees
Sentences generated for get_date_range_employees||  Line count: 184
get_hierarchy
Sentences generated for get_hierarchy||  Line count: 200
