In [1]:
import calendar
import json
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from dateutil.parser import parse

KeyboardInterrupt: 

<h2>Functions</h2>

In [None]:
def df_get_names(df, column='Employee Name'):
    """Get the names of employees in the HR dataset as ['First', 'Last'] pairs.

    param df (dataframe) - Dataframe with names stored as 'Last, First [Middle ...]'
    param column (str) - Column holding the names, e.g. 'Employee Name' or 'Manager Name'
    Returns names (list) - One ['First', 'Last'] list per row, in row order

    Rows that cannot be parsed (non-string value, no comma, or nothing after the
    comma) are replaced by the placeholder ['Jeremy', 'Prater'].
    """
    names = []
    # Iterate the column values directly: no per-row frame lookup and no
    # dependence on the index labels.
    for raw in df[column]:
        try:
            parts = raw.split(',')
            first = parts[1].split()[0]   # first token after the comma
            last = parts[0].strip()
            names.append([first, last])
        except (AttributeError, IndexError):
            # AttributeError: value is not a string (e.g. NaN / None).
            # IndexError: no comma, or nothing after it.
            names.append(['Jeremy', 'Prater'])
    return names

def df_split_name_col(df):
    """Split the 'emp_name' column of the dataframe into first and last name.

    param df (dataframe) - Dataframe with an 'emp_name' column
    Returns df (dataframe) - The same dataframe object, with 'first_name' and
                             'last_name' columns added (modified in place)
    """
    names = df_get_names(df, 'emp_name')
    # Assign whole columns at once: the lists line up with the rows by position,
    # so this works for any index, not only the default 0..n-1 labels.
    df['first_name'] = [first for first, _ in names]
    df['last_name'] = [last for _, last in names]
    return df

def reformat_date(date_str, pivot=20):
    """Reformat a date from 'MM/DD/YY' (or 'MM/DD/YYYY') to 'YYYY-MM-DD'.

    param date_str (str) - Date to reformat, month/day/year separated by '/'
    param pivot (int) - Two-digit years below this value are read as 20YY,
                        all others as 19YY (default 20)
    Returns date_str (str) - Date in the form 'YYYY-MM-DD'

    Raises IndexError if the string has fewer than three '/'-separated parts and
    ValueError if the parts do not form a valid date.
    """
    parts = date_str.split('/')
    year = parts[2]
    # Only expand short years; a year that already carries its century is kept.
    if len(year) <= 2:
        century = 2000 if int(year) < pivot else 1900
        parts[2] = str(century + int(year))
    return datetime.strptime('/'.join(parts), '%m/%d/%Y').strftime('%Y-%m-%d')

def df_reformat_date(df):
    """Reformat the 'doh', 'dob' and 'dot' columns to 'YYYY-MM-DD'.

    param df (dataframe) - Dataframe with 'doh', 'dob' and 'dot' date columns
    Returns df (dataframe) - The same dataframe object with the three columns
                             rewritten (modified in place)

    Values equal to the '1800-01-01' placeholder are left untouched; every other
    value is passed to reformat_date.
    """
    for col in ('doh', 'dob', 'dot'):
        # Rebuild the column positionally so the index labels do not matter.
        df[col] = [
            value if value == '1800-01-01' else reformat_date(value)
            for value in df[col]
        ]
    return df

<h3>File Writing</h3>

In [None]:
def df_to_json_file(df, path="../hr_assistant/data/user_data.json"):
    """Convert a knowledge base dataframe to a JSON file.

    param df (dataframe) - Dataframe to write to file
    param path (str) - Destination file; defaults to the HR assistant data file

    The file is overwritten with a JSON array holding one object per row,
    indented by 4 spaces.
    """
    records = df.to_dict(orient='records')
    with open(path, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=4)

<h3>Knowledge Base Query Helpers</h3>

In [None]:
# Get a List of Names from a QA Result
# param qa_out (list) Output of QA from a query
def _get_names(qa_out):
    return [out['emp_name'] for out in qa_out]
  
# Filter the output of the Question Answerer
# param ent (str) Entity to filter on
# param val (str) Value to filter for
# param qa_out (list) List of Json Objects Representing Users
# Return qa_out_filtered (list) List if JSON Objects filtered by entity and value
def _categ_filter(ent, val, qa_out): 
    return [x for x in users if x[ent] == val]

# Convert Date object to Str Date Format 'YYYY-MM-DD'
# param date_obj (Datetime object) - Datetime object to convert
def _d_to_ymd(d): return d.strftime('%Y-%m-%d')

# Get the Datetime Str for a Certain number of Years, Months, Days ago from the present time
# param years (int) - Number of years ago 
# param months (int) - Number of months ago 
# param weeks (int) - Number of weeks ago
# param days (int) - Number of days ago 
# return date_obj (Datetime Object) - Datetime Object
def _get_ago_t(years=0, months=0, weeks=0, days=0):
    total_days = years*52*7 + months*4*7 + weeks*7 + days
    d = get_now_time() - timedelta(days=total_days)
    return d

# Get the Datetime Object for the current time
# return date_obj (Datetime Object) - Datetime Object
def _get_now_t(): return datetime.now()

def _get_now_t_str():
    """Get the date string for the current time.

    Returns date_str (str) - Current date in the format 'YYYY-MM-DD'
    """
    # Use the helpers under the names they are actually defined with.
    return _d_to_ymd(_get_now_t())


def _filter_by_d(date_type, qa_out, start_d='1900-01-01', end_d=None):
    """Query the knowledge base for users whose date falls in a range.

    param date_type (str) - Date type to filter on: 'doh', 'dob', 'dot'
    param qa_out (list) - List of JSON objects representing users. Currently
                          unused: the search runs against the 'user_data_3'
                          index through the notebook-level `qa` object.
    param start_d (str) - Start date (inclusive) in the format 'YYYY-MM-DD'
    param end_d (str) - End date (inclusive) in the format 'YYYY-MM-DD';
                        defaults to today's date at call time
    Returns qa_out_filtered - Result of executing the filtered search
    """
    # Resolve "today" per call; a default computed in the signature would be
    # frozen at definition time.
    if end_d is None:
        end_d = _d_to_ymd(_get_now_t())
    search = qa.build_search(index='user_data_3')
    return search.filter(field=date_type, lte=end_d, gte=start_d).execute()

# Filter the output of the Question Answerer by Date
# param d_type (str) Date Type to Filter On: 'doh', 'dob', 'dot'
# param qa_out (list) List of Json Objects Representing Users
# param gt (str) Greater than Start Date in the format 'YYYY-MM-DD'
# param gte (str) Greater than or equal to Start Date in the format 'YYYY-MM-DD'
# param lt (str) Greater than Start Date in the format 'YYYY-MM-DD'
# param lte (str) Greater than or equal to Start Date in the format 'YYYY-MM-DD'
# Return qa_out_filtered (list) List if JSON Objects filtered by Date Type
def _filter_by_d_custom(d_type, qa_out, gt, gte, lt, lte):
    # Less than (or equal to)
    if lt is not None:
        lt = _ymd_to_d(lt)
        qa_out = [x for x in qa_out if _ymd_to_d(x[d_type]) < lt]
    elif lte is not None:
        lte = _ymd_to_d(lte)
        qa_out = [x for x in qa_out if _ymd_to_d(x[d_type]) <= lte]
    # Greater than (or equal to)
    if gt is not None:
        gt = _ymd_to_d(gt)
        qa_out = [x for x in qa_out if _ymd_to_d(x[d_type]) > gt]
    elif gte is not None:
        gte = _ymd_to_d(gte)
        qa_out = [x for x in qa_out if _ymd_to_d(x[d_type]) >= gte]
    # Remove "Null" values in DOT that were replaced with "1800-01-01"
    if d_type == 'dot': qa_out = [x for x in qa_out if _ymd_to_d(x[d_type]) > _ymd_to_d("1800-01-01")] 
    return qa_out

# Function Helper that does Sum, Average and Percent Calculations
# param qa_out (list) List of Json Objects Representing Users
# param func (str) - Function Type: 'avg','sum', 'ct', 'pct'
# param num_col (str) - Numerical Column Type : 'money', or 'age'
# returns result (float) - Resulting Value from function operation
def _agg_function(qa_out, func='avg', num_col='money'):
    if(func=='avg'): return np.mean([emp[num_col] for emp in qa_out])
    elif(func=='sum'): return np.sum([emp[num_col] for emp in qa_out])
    elif(func=='ct'): return len(qa_out)
    elif(func=='pct'): return len(qa_out)/300

# Get the Salary Amount Based on a Recurring Period of Time
# param recur_ent (str): 'yearly', 'monthly', 'weely', 'daily', 'hourly'
# param money (float): Hourly Salary of an employee
def _get_interval_amount(recur_ent, money):
    intv_mult = { "yearly": 12*4*5*8, "monthly": 4*5*8, "weekly":5*8, "daily": 8,"hourly": 1}
    return round(intv_mult[recur_ent] * money, 2)         

<h2>Workflow</h2>

<h4>Create Json</h4>

In [None]:
# Build the knowledge base frame in one pass: load the raw CSV, replace missing
# values with the "1800-01-01" placeholder, split the employee names and
# normalise the date columns. Nothing is mutated in place at cell level, so the
# cell can be re-run safely.
kb = (
    pd.read_csv('core_dataset.csv')
    .fillna("1800-01-01")
    .pipe(df_split_name_col)
    .pipe(df_reformat_date)
)
# Preview only the first rows instead of dumping the whole frame.
kb.head()

In [None]:
# Write the prepared kb frame to the JSON file that is loaded into the
# knowledge base further down.
df_to_json_file(kb)

In [None]:
#kb['doh'][3]

<h4>Knowledge Base</h4>

In [None]:
# Create the MindMeld QuestionAnswerer for the 'hr_assistant' app and load the
# exported JSON into it under the index name 'user_data_3'.
# NOTE(review): this import sits mid-notebook; consider moving it to the import
# cell at the top.
from mindmeld.components import QuestionAnswerer
qa = QuestionAnswerer(app_path='hr_assistant')
qa.load_kb('hr_assistant', 'user_data_3', '../hr_assistant/data/user_data.json')

In [None]:
# Fetch records with sex='Male' from the 'user_data_3' index and display them.
qa_out = qa.get(index='user_data_3', sex='Male')
qa_out

In [None]:
#qa._kb_field_info.get(field='dob')

In [None]:
# Search object on the 'user_data_3' index, reused by the cells below.
s = qa.build_search(index='user_data_3')

In [None]:
# Filter on 'doh' with an upper bound of 2019-01-01 (lte) and run the search.
s.filter(field='doh', lte='2019-01-01').execute()

In [None]:
# Rebind qa_out to the sex='Male' query results, executed with size=300
# (presumably the maximum number of results - confirm against MindMeld docs).
qa_out = s.query(sex='Male').execute(size=300)

In [None]:
# Display the current query results.
qa_out

In [None]:
# Inspect the 'doh' value of the first result.
qa_out[0]['doh']

In [None]:
# Check that a 'YYYY-MM-DD' string parses with the '%Y-%m-%d' format.
war_start = '2011-01-03'
datetime.strptime(war_start, '%Y-%m-%d')

In [None]:
# Convert the first result's 'doh' value with _ymd_to_d.
# NOTE(review): _ymd_to_d is not defined anywhere in the visible notebook, so
# this cell fails on a fresh kernel unless it is defined elsewhere - confirm.
_ymd_to_d(qa_out[0]['doh'])

In [None]:
# for user in qa_out:
#     print(datetime.strptime((user['doh'])))

In [None]:
#[x for x in qa_out if _ymd_to_d(x['dob']) > _ymd_to_d('1990-01-01')]

In [None]:
# Keep the first result's 'dob' value for the comparison in the next cell.
date_str = qa_out[0]['dob']

In [None]:
# Check that converting the same string twice yields equal values.
# NOTE(review): relies on _ymd_to_d, which is not defined in the visible notebook.
_ymd_to_d(date_str) == _ymd_to_d(date_str)

In [None]:
# Check that the "1800-01-01" placeholder date parses with '%Y-%m-%d'.
datetime.strptime("1800-01-01", '%Y-%m-%d')

In [None]:
# Daily amount for an hourly rate of 32.4545 (8 hours, rounded to 2 decimals).
_get_interval_amount("daily", 32.4545)

In [None]:
# Results with 'dot' strictly after 1980-01-01 and on or before 2020-01-01.
_filter_by_d_custom(d_type='dot', qa_out=qa_out, gt="1980-01-01", gte=None, lt=None, lte="2020-01-01")

In [None]:
# Count the results of a sex="Female" query.
# NOTE(review): size=300 is passed to query() here but to execute() in other
# cells - confirm which call is meant to receive it.
len(s.query(sex="Female", size=300).execute() )  #.filter(field="age", lte='100', size=300).filter(field="money", gte=10, size=300).execute())

In [None]:
# Bind users to the sex='Female' records fetched with size=301.
users = qa.get(index='user_data_3', sex='Female',size=301)

In [None]:
# Same query as the previous cell, displayed instead of assigned.
qa.get(index='user_data_3', sex='Female',size=301)

In [None]:
# Start from a fresh search object and count the results of a sex='Male' query.
s = qa.build_search(index='user_data_3')
len(s.query(sex='Male', size=300).execute())

In [None]:
# Display the records currently bound to users.
users

In [None]:
# Fetch records with sex='Male' and display them.
# NOTE(review): this queries index 'user_data', while the KB above is loaded as
# 'user_data_3' - confirm which index is intended.
qa_out = qa.get(index='user_data', sex='Male')
qa_out

In [None]:
# Fresh search object on the 'user_data_3' index.
s = qa.build_search(index='user_data_3')

In [None]:
# Count results with 'age' below 50; note the bound is passed as the string '50'.
len(s.filter(field="age", lt='50').execute(size=300))

In [None]:
# Sum the 'money' field over the current qa_out records.
np.sum([emp['money'] for emp in qa_out])

In [None]:
# Today's date as 'YYYY-MM-DD' (the helper is defined as _get_now_t_str).
_get_now_t_str()

In [None]:
# Date five years before now as 'YYYY-MM-DD' (helpers are defined with a
# leading underscore).
_d_to_ymd(_get_ago_t(years=5))

In [None]:
# Rebind users to an unfiltered fetch from 'user_data_3' with size=301 and display it.
users = qa.get(index='user_data_3',size=301)
users

In [None]:
# Keep only the users whose 'sex' is 'Male'. The notebook defines this filter
# as _categ_filter; filter_qa_out does not exist.
_categ_filter('sex', 'Male', users)

In [None]:
# Note: passing the size parameter to execute() means that another filter cannot be added after the execute() call.