In [2]:
import numpy as np, pandas as pd, seaborn as sns, matplotlib.pyplot as plt
# Show every column when displaying wide DataFrames (None lifts the column limit).
# Call the public pd.set_option directly rather than going through `pd.pandas`.
pd.set_option('display.max_columns', None)

In [9]:
# Load the HR employee-attrition CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer='uploads/WA_Fn-UseC_-HR-Employee-Attrition.csv')

In [3]:
# Display the (rows, columns) dimensions of the loaded DataFrame.
df.shape

(1470, 35)

## Removing constant values

In [1]:
# Container for every preprocessing setting collected in this notebook;
# it is filled in by the cells below.
preprocess_dict = {}

In [3]:
# Columns that are removed from the data before any encoding is applied.
preprocess_dict['rem_constant_features'] = [
    'EmployeeCount',
    'EmployeeNumber',
    'Over18',
    'StandardHours',
]

## Encoding

In [4]:
# Binary encoding for the Attrition column.
preprocess_dict['Attrition'] = {
    'Yes': 1,
    'No': 0,
}

In [5]:
# Ordinal encoding for BusinessTravel: a higher code means more travel.
preprocess_dict['BusinessTravel'] = dict([
    ('Travel_Frequently', 2),
    ('Travel_Rarely', 1),
    ('Non-Travel', 0),
])

In [7]:
# Nominal columns that are one-hot encoded.
preprocess_dict['onehot_encoded_features'] = [
    'Department',
    'EducationField',
    'Gender',
    'JobRole',
    'MaritalStatus',
    'OverTime',
]

In [17]:
# One-hot encode the nominal features with drop_first=True and keep only the
# names of the resulting dummy columns.
onehot_cols = pd.get_dummies(
    df[['Department','EducationField','Gender','JobRole','MaritalStatus','OverTime']],
    drop_first=True,
).columns.values

# Group the retained category labels by source feature:
# {feature: [category, ...]}.
onehot_cols_dict = dict()

for col in onehot_cols:
    # Dummy columns are named '<feature>_<category>'. Split on the FIRST
    # underscore only so that a category label which itself contains '_'
    # is kept whole instead of being truncated. This relies on the feature
    # names containing no '_', which holds for the six features above.
    feature, category = col.split('_', 1)
    onehot_cols_dict.setdefault(feature, []).append(category)
onehot_cols_dict

{'Department': ['Research & Development', 'Sales'],
 'EducationField': ['Life Sciences',
  'Marketing',
  'Medical',
  'Other',
  'Technical Degree'],
 'Gender': ['Male'],
 'JobRole': ['Human Resources',
  'Laboratory Technician',
  'Manager',
  'Manufacturing Director',
  'Research Director',
  'Research Scientist',
  'Sales Executive',
  'Sales Representative'],
 'MaritalStatus': ['Married', 'Single'],
 'OverTime': ['Yes']}

In [25]:
# For each one-hot encoded feature, find the category that did not get a dummy
# column: the values present in the data minus the categories listed in
# onehot_cols_dict.
onehot_dropped_cols = {
    feature: set(df[feature].unique()).difference(onehot_cols_dict[feature])
    for feature in ['Department','EducationField','Gender','JobRole','MaritalStatus','OverTime']
}

onehot_dropped_cols

{'Department': {'Human Resources'},
 'EducationField': {'Human Resources'},
 'Gender': {'Female'},
 'JobRole': {'Healthcare Representative'},
 'MaritalStatus': {'Divorced'},
 'OverTime': {'No'}}

In [26]:
# Record the dropped (reference) category of every one-hot encoded feature.
preprocess_dict.update(onehot_dropped_col_dict=onehot_dropped_cols)

## Save Preprocessing Dict

In [28]:
import pickle

# Persist the preprocessing settings to the file 'preprocess_dict'.
# The context manager closes (and therefore flushes) the file handle even if
# pickling raises; the bare open() used before never closed it.
# NOTE: only unpickle this file from a trusted location, since loading a
# pickle can execute arbitrary code.
with open('preprocess_dict', 'wb') as pickle_file:
    pickle.dump(preprocess_dict, pickle_file)

## Example: applying the preprocessing to new data

In [31]:
# Load the example data set; note that this rebinds `df`.
df = pd.read_csv(filepath_or_buffer='Ex_Data/Ex.csv')

In [32]:
# Remove the columns listed under 'rem_constant_features'.
df = df.drop(columns=preprocess_dict['rem_constant_features'])

In [35]:
# Replace the BusinessTravel labels with their stored ordinal codes.
df = df.assign(
    BusinessTravel=df['BusinessTravel'].map(preprocess_dict['BusinessTravel'])
)

In [39]:
# One-hot encode the configured nominal features, keeping every category for
# now; the reference categories are dropped in a later step.
df1 = pd.get_dummies(
    data=df,
    columns=preprocess_dict['onehot_encoded_features'],
)

In [44]:
# Build the dummy-column name '<feature>_<category>' of the stored dropped
# category for each one-hot encoded feature.
cols_to_be_dropped = [
    col + '_' + list(preprocess_dict['onehot_dropped_col_dict'][col])[0]
    for col in preprocess_dict['onehot_encoded_features']
]
cols_to_be_dropped

['Department_Human Resources',
 'EducationField_Human Resources',
 'Gender_Female',
 'JobRole_Healthcare Representative',
 'MaritalStatus_Divorced',
 'OverTime_No']

In [48]:
# Drop the reference-category dummy columns collected in cols_to_be_dropped.
df1 = df1.drop(columns=cols_to_be_dropped)

In [49]:
# Display the (rows, columns) dimensions of the fully preprocessed example frame.
df1.shape

(500, 43)