In [1]:
import numpy as np
import pandas as pd
import os
import json
import pprint

In [2]:
# Base name shared by every output file generated in this notebook.
dataset_name = "employee_attrition"

In [3]:
# Directory layout: the raw CSV is read from `input_dir`; every generated
# artefact is written under `output_dir`.
input_dir = './raw'
output_dir = './processed'

# File name of the raw CSV inside `input_dir`.
inp_fname = 'employee_attrition.csv'

# All output files share the prefix "<output_dir>/<dataset_name>" and differ
# only in their suffix.
outp_prefix = os.path.join(output_dir, dataset_name)
outp_fname = outp_prefix + '.csv'                       # full dataset
outp_train_fname = outp_prefix + '_train.csv'           # training split
outp_test_fname = outp_prefix + '_test.csv'             # test split without the target
outp_test_key_fname = outp_prefix + '_test_key.csv'     # id + target for the test split
outp_infer_instances = outp_prefix + '_infer_req.json'  # sample inference request

# Read Data

In [4]:
# Load the raw CSV into a DataFrame and preview the first rows.
raw_csv_path = os.path.join(input_dir, inp_fname)
data = pd.read_csv(raw_csv_path)
data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,department,salary,left
0,0.38,0.53,2,157,3,0,0,sales,low,1
1,0.8,0.86,5,262,6,0,0,sales,medium,1
2,0.11,0.88,7,272,4,0,0,sales,medium,1
3,0.72,0.87,5,223,5,0,0,sales,low,1
4,0.37,0.52,2,159,3,0,0,sales,low,1


In [5]:
# Column names used throughout the notebook: the row identifier and the
# prediction target.
id_col, target_col = "id", "left"

# Shuffle Data

In [6]:
# Randomly permute all rows (frac=1 keeps every row); the fixed seed makes the
# resulting order reproducible. The original index labels are kept.
shuffle_seed = 42
data = data.sample(frac=1, random_state=shuffle_seed)
data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,department,salary,left
6723,0.65,0.96,5,226,2,1,0,marketing,medium,0
6473,0.88,0.8,3,166,2,0,0,IT,low,0
4679,0.69,0.98,3,214,2,0,0,sales,low,0
862,0.41,0.47,2,154,3,0,0,sales,low,1
7286,0.87,0.76,5,254,2,1,0,hr,low,0


# Insert Id Column

In [7]:
# Add a sequential identifier as the first column, but only when the frame
# does not already carry one (so re-running this cell is harmless).
id_missing = id_col not in data.columns
if id_missing:
    N = len(data)
    data.insert(loc=0, column=id_col, value=np.arange(N))
    print(data.head())

# Identifiers are stored as strings regardless of where they came from.
data[id_col] = data[id_col].astype(str)

      id  satisfaction_level  last_evaluation  number_project  \
6723   0                0.65             0.96               5   
6473   1                0.88             0.80               3   
4679   2                0.69             0.98               3   
862    3                0.41             0.47               2   
7286   4                0.87             0.76               5   

      average_montly_hours  time_spend_company  Work_accident  \
6723                   226                   2              1   
6473                   166                   2              0   
4679                   214                   2              0   
862                    154                   3              0   
7286                   254                   2              1   

      promotion_last_5years department  salary  left  
6723                      0  marketing  medium     0  
6473                      0         IT     low     0  
4679                      0      sales     low     0 

# Save Main Data File

In [8]:
# Persist the full shuffled dataset, including the id column, as one CSV.
# The output directory is created first: this is the first write of the
# notebook, and DataFrame.to_csv does not create missing parent directories,
# so a fresh checkout without the directory would otherwise fail here.
os.makedirs(output_dir, exist_ok=True)
data.to_csv(outp_fname, index=False)

# Train Test Split

In [9]:
from sklearn.model_selection import train_test_split

# Fraction of rows held out for the test split.
test_size = 0.1

# Seeded random split so the same rows land in each partition on every run.
data_train, data_test = train_test_split(
    data,
    test_size=test_size,
    random_state=42,
)
print(data_train.shape, data_test.shape)

# Training split: all columns, target included.
data_train.to_csv(outp_train_fname, index=False)

# Test split: written without the target column.
test_features = data_test.drop(columns=[target_col])
test_features.to_csv(outp_test_fname, index=False)

# Test key: only the id and the target, written to its own file.
test_key = data_test[[id_col, target_col]]
test_key.to_csv(outp_test_key_fname, index=False)

(13499, 11) (1500, 11)


# JSON inference request instance

In [10]:
# Build a sample inference request from the first row of the test split,
# with the target column removed.
first_row = data_test.drop(columns=[target_col]).iloc[0]

# Map missing values to None on this single row so they are written as JSON
# null (json.dump would otherwise emit the non-standard token NaN). Doing it
# here avoids copying and rewriting the whole test frame to extract one row.
instance = {
    key: (None if pd.isna(value) else value)
    for key, value in first_row.to_dict().items()
}
infer_req_instance_dict = {"instances": [instance]}
pprint.pprint(infer_req_instance_dict)

# ensure_ascii=False keeps any non-ASCII text readable in the UTF-8 file.
with open(outp_infer_instances, 'w', encoding='utf8') as f:
    json.dump(infer_req_instance_dict, f, indent=2, ensure_ascii=False)

{'instances': [{'Work_accident': 0,
                'average_montly_hours': 223,
                'department': 'technical',
                'id': '6723',
                'last_evaluation': 0.91,
                'number_project': 3,
                'promotion_last_5years': 0,
                'salary': 'low',
                'satisfaction_level': 0.58,
                'time_spend_company': 2}]}
