In [1]:
import os

import pandas as pd

from sklearn.preprocessing import LabelEncoder

In [2]:
def load_data():
    """Read the train and test CSV files from ``./data``.

    Returns
    -------
    dict
        ``{'train': DataFrame, 'test': DataFrame}``, where each frame is the
        result of ``pd.read_csv`` with default options. Paths are relative to
        the current working directory.
    """
    train_frame = pd.read_csv('./data/train.csv')
    test_frame = pd.read_csv('./data/test.csv')
    return {'train': train_frame, 'test': test_frame}

# Load both splits once; `dataset` maps 'train' / 'test' to their DataFrames.
dataset = load_data()

# Preview the first three rows of the training split.
dataset['train'].head(3)

Unnamed: 0,EmployeeNo,Division,Qualification,Gender,Channel_of_Recruitment,Trainings_Attended,Year_of_birth,Last_performance_score,Year_of_recruitment,Targets_met,Previous_Award,Training_score_average,State_Of_Origin,Foreign_schooled,Marital_Status,Past_Disciplinary_Action,Previous_IntraDepartmental_Movement,No_of_previous_employers,Promoted_or_Not
0,YAK/S/00001,Commercial Sales and Marketing,"MSc, MBA and PhD",Female,Direct Internal process,2,1986,12.5,2011,1,0,41,ANAMBRA,No,Married,No,No,0,0
1,YAK/S/00002,Customer Support and Field Operations,First Degree or HND,Male,Agency and others,2,1991,12.5,2015,0,0,52,ANAMBRA,Yes,Married,No,No,0,0
2,YAK/S/00003,Commercial Sales and Marketing,First Degree or HND,Male,Direct Internal process,2,1987,7.5,2012,0,0,42,KATSINA,Yes,Married,No,No,0,0


In [3]:
# Work on a copy so the frame stored in `dataset['train']` is not modified by the steps below.
data = dataset['train'].copy()

def add_features(data, reference_year=2019):
    """Fill missing qualifications and derive age / tenure columns.

    The frame is modified in place and the same object is returned, so both
    ``add_features(df)`` and ``df = add_features(df)`` work.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``'Qualification'``, ``'Year_of_birth'`` and
        ``'Year_of_recruitment'``.
    reference_year : int, default 2019
        Year from which birth and recruitment years are subtracted. The
        default keeps the value that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself, with ``'Qualification'`` nulls replaced by
        ``'Special'`` and the new columns ``'Age'`` and ``'work_duration'``.
    """
    # Missing qualifications get their own category instead of staying null.
    data['Qualification'] = data['Qualification'].fillna('Special')

    data['Age'] = reference_year - data['Year_of_birth']
    data['work_duration'] = reference_year - data['Year_of_recruitment']
    return data

# Apply the feature engineering to the working copy.
data = add_features(data)

# Assign the result back instead of calling replace(..., inplace=True) on a
# `.loc` selection: that selection can be a temporary copy, in which case the
# in-place edit never reaches `data`.
# NOTE(review): only 'More than 5' is mapped here; if the remaining values are
# numeric strings the column stays mixed-type — confirm whether a numeric cast
# is wanted.
data['No_of_previous_employers'] = data['No_of_previous_employers'].replace({'More than 5': 6})

# Keep the trailing five characters of the employee number and store them as
# integers. Plain column assignment (rather than `.loc[:, col] = ...`) lets the
# column take the integer dtype instead of writing into the existing column.
data['EmployeeNo'] = data['EmployeeNo'].str[-5:].astype(int)

In [4]:
# List the columns now present on the working frame.
data.columns

Index(['EmployeeNo', 'Division', 'Qualification', 'Gender',
       'Channel_of_Recruitment', 'Trainings_Attended', 'Year_of_birth',
       'Last_performance_score', 'Year_of_recruitment', 'Targets_met',
       'Previous_Award', 'Training_score_average', 'State_Of_Origin',
       'Foreign_schooled', 'Marital_Status', 'Past_Disciplinary_Action',
       'Previous_IntraDepartmental_Movement', 'No_of_previous_employers',
       'Promoted_or_Not', 'Age', 'work_duration'],
      dtype='object')

Next, replace each categorical column with integer codes using `LabelEncoder`:

In [5]:
# Columns that are replaced by integer label codes in the encoding step below.
label_encode = [
    'Division', 
    'Qualification', 
    'Gender', 
    'Channel_of_Recruitment',
    'State_Of_Origin',
    'Foreign_schooled',
    'Marital_Status',
    'Past_Disciplinary_Action',
    'Previous_IntraDepartmental_Movement'
]

def replace_with_label_encodings(data, cols):
    """Replace each listed column with its ``LabelEncoder`` integer codes.

    A fresh encoder is fitted per column on that column's own values. The
    fitted encoders are not kept, so the mapping cannot be reapplied to another
    frame from here. The frame is modified in place and also returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column named in ``cols``.
    cols : iterable of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself, with the listed columns holding integer codes.
    """
    for col_name in cols:
        print('encoding {}'.format(col_name))
        # Plain column assignment replaces the column outright, so it takes the
        # encoder's integer dtype. Writing through `.loc[:, col_name] = ...`
        # sets values into the existing column and, on recent pandas versions,
        # leaves it as object dtype.
        data[col_name] = LabelEncoder().fit_transform(data[col_name])
    return data

# Encode every column named in `label_encode`; progress is printed per column.
data = replace_with_label_encodings(data, label_encode)

encoding Division
encoding Qualification
encoding Gender
encoding Channel_of_Recruitment
encoding State_Of_Origin
encoding Foreign_schooled
encoding Marital_Status
encoding Past_Disciplinary_Action
encoding Previous_IntraDepartmental_Movement


In [6]:
# Preview the first three rows after cleaning and encoding.
data.head(3)

Unnamed: 0,EmployeeNo,Division,Qualification,Gender,Channel_of_Recruitment,Trainings_Attended,Year_of_birth,Last_performance_score,Year_of_recruitment,Targets_met,...,Training_score_average,State_Of_Origin,Foreign_schooled,Marital_Status,Past_Disciplinary_Action,Previous_IntraDepartmental_Movement,No_of_previous_employers,Promoted_or_Not,Age,work_duration
0,1,1,1,0,1,2,1986,12.5,2011,1,...,41,3,0,0,0,0,0,0,33,8
1,2,2,0,1,0,2,1991,12.5,2015,0,...,52,3,1,0,0,0,0,0,28,4
2,3,1,0,1,1,2,1987,7.5,2012,0,...,42,20,1,0,0,0,0,0,32,7


In [7]:
# Split the processed frame into the target column (kept as a one-column
# DataFrame) and the remaining feature columns.
y = data[['Promoted_or_Not']]
x = data.drop(columns='Promoted_or_Not')

# Validate the split before anything is written, so a mismatch can never end
# up on disk.
assert y.shape[0] == x.shape[0]

# Persist both parts; the output directory is created if it does not exist.
os.makedirs('./data/processed', exist_ok=True)
y.to_csv('./data/processed/y.csv', index=False)
x.to_csv('./data/processed/x.csv', index=False)

In [8]:
# Summary statistics of the target column.
y.describe()

Unnamed: 0,Promoted_or_Not
count,38312.0
mean,0.084595
std,0.278282
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0
