## Hackathon to predict whether an employee is promoted
https://datahack.analyticsvidhya.com/contest/wns-analytics-hackathon-2018/

In [15]:
import pandas as pd
import category_encoders as ce
import os as os
import numpy as np

In [16]:
import seaborn as sns
import matplotlib.pyplot as plt

# Comment this if the data visualisations doesn't work on your side
%matplotlib inline

plt.style.use('bmh')

In [38]:
working_dir = !pwd
working_dir = working_dir[0]
print(working_dir)

C:\Users\aruna\Documents\GitHub\datascience\hackathons\WNS-promotion-prediction


In [52]:
#reading the data
train_df = pd.read_csv(os.path.join(working_dir, "train.csv"))
print('training set shape:', train_df.shape)

training set shape: (54808, 14)


In [54]:
train_df.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


In [55]:
#missing values are filled with 'missing' for categorical values
train_df[['education', 'department', 'region', 'gender', 'recruitment_channel']] = train_df[['education', 'department', 'region', 'gender', 'recruitment_channel']].fillna(value='missing')

## previous_year_rating is missing for all length_of_service=0 rows, may be new joinees. Assume they perform average 3        
train_df[['previous_year_rating']] = train_df[['previous_year_rating']].fillna(3)

#'employee_id is not useful'
train_df.drop(['employee_id'], axis=1, inplace=True)
train_df.head()

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


In [56]:
#add features calculated from combination of other feacutes
train_df['work_fraction'] = train_df['length_of_service'] / train_df['age']
train_df['start_year'] = train_df['age'] - train_df['length_of_service']
train_df['avg_training_score_scaled_mean_department_region'] = train_df['avg_training_score']/train_df.groupby(['department', 'region'])['avg_training_score'].transform('mean')

In [59]:
#deal with categorical variables
#excellent article https://towardsdatascience.com/7-data-types-a-better-way-to-think-about-data-types-for-machine-learning-939fae99a689

## transformation for education level is ordinal, rest categories are nominal
ce_edulevel = ce.OrdinalEncoder(mapping=[{
    "col":"education",    
    "mapping": [
        ('Master\'s & above',4), 
        ('Bachelor\'s',3), 
        ('missing',2), 
        ('Below Secondary',1)        
    ]}
])

## transforming region, department and recruitment channel as one hot
ce_target = ce.TargetEncoder(cols=['department', 'region', 'recruitment_channel', 'gender'], drop_invariant=True)

In [60]:
X = train_df.drop(['is_promoted'], axis=1)
Y = train_df['is_promoted']

In [61]:
X = ce_edulevel.fit_transform(X)
X = ce_target.fit_transform(X, Y)

In [62]:
X.head()

Unnamed: 0,department,region,education,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,work_fraction,start_year,avg_training_score_scaled_mean_department_region
0,0.072031,0.106546,4,1,35,5.0,8,1,0,49,0.228571,27,0.973555
1,0.090148,0.114188,3,1,30,5.0,4,0,0,60,0.133333,26,0.992765
2,0.072031,0.060641,3,1,34,3.0,7,0,0,50,0.205882,27,0.997331
3,0.072031,0.116596,3,2,39,1.0,10,0,0,50,0.25641,29,0.992011
4,0.107593,0.063274,3,1,45,3.0,2,0,0,73,0.044444,43,0.91247


In [63]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn import metrics
from sklearn.model_selection import train_test_split


In [68]:
train_X,test_X, train_Y, test_Y = train_test_split(X, Y, test_size=0.20, random_state=101)

model = LogisticRegressionCV(cv=5,penalty='l2', Cs=10,max_iter=5000, n_jobs = -1)
model.fit(train_X,train_Y)
prediction=model.predict(test_X)

The accuracy of the Logistic Regression is 0.9322203977376391


In [75]:
print('The accuracy of the Logistic Regression: ',metrics.accuracy_score(prediction,test_Y))
print('The AUC of Logistic Regression is: ',metrics.roc_auc_score(prediction,test_Y))

The accuracy of the Logistic Regression:  0.9322203977376391
The AUC of Logistic Regression is:  0.8802111316212561


In [77]:
print('Random guess accuracy:',test_Y.value_counts(normalize=True))

Random guess accuracy: 0    0.915891
1    0.084109
Name: is_promoted, dtype: float64


## We can see logistic regression increased the accuracy by about 2%
Next is to try random forests