In [None]:
!wget https://datahack-prod.s3.amazonaws.com/train_file/train_LZdllcl.csv -O train.csv

!wget https://datahack-prod.s3.amazonaws.com/test_file/test_2umaH9m.csv -O test.csv

!wget https://datahack-prod.s3.amazonaws.com/sample_submission/sample_submission_M0L0uXE.csv -O sample_submission.csv

In [None]:
# Import the required packages
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
%matplotlib inline 
import seaborn as sns

In [None]:
# Read the train and test data
train=pd.read_csv("train.csv")
train.drop('employee_id',inplace=True,axis = 1)
test=pd.read_csv("test.csv")

In [None]:
# Check the variables in train data
train.columns

In [None]:
# Print datatype of each variable
train.dtypes

In [None]:
# Dimension of the train dataset
train.shape

In [None]:
# Print the head of train dataset
train.head()

In [None]:
# Unique values in each variable of train dataset
train.nunique()

### Univariate Analysis

#### Target Variable

In [None]:
train['is_promoted'].value_counts(normalize=True)

In [None]:
# Around 91% trainee have promoted
# Unbalanced dataset 

#### Categorical Independent Variables

In [None]:
plt.figure(1)
plt.subplot(221)
train['department'].value_counts(normalize=True).plot.bar(figsize=(20,10), title= 'Department')

plt.subplot(222)
train['awards_won?'].value_counts(normalize=True).plot.bar(title= 'Awards won')

plt.subplot(223)
train['education'].value_counts(normalize=True).plot.bar(title= 'Education')

plt.subplot(224)
train['gender'].value_counts(normalize=True).plot.bar(title= 'Gender')

plt.show()

In [None]:
# Most of the trainee are enrolled for Y and T program_type.
# More number of trainee enrolment for offline test than online test.
# Most of the test are easy in terms of difficulty level.

In [None]:
train['KPIs_met >80%'].value_counts(normalize=True).plot.bar(title= 'KPI met greater than 80')


In [None]:
plt.figure(1)
plt.subplot(221)
train['region'].value_counts(normalize=True).plot.bar(figsize=(20,10), title= 'Region')

plt.subplot(222)
train['recruitment_channel'].value_counts(normalize=True).plot.bar(title='Recruitment Channels')

plt.subplot(223)
train['no_of_trainings'].value_counts(normalize=True).plot.bar(title= 'No of Trainings')

plt.subplot(224)
train['previous_year_rating'].value_counts(normalize=True).plot.bar(title= 'Previous year ratings')

plt.show()

In [None]:
# More male trainee as compared to female trainee
# Most of the trainee have diploma
# Most of the trainee belongs to tier 3 city
# 10% of the trainee are handicapped

#### Numerical Independent Variables

In [None]:
sns.distplot(train['age']);

In [None]:
# Most of the trainee are in the age range of 20-30 and 40-50

In [None]:
sns.distplot(train['length_of_service']);

In [None]:
sns.distplot(train['avg_training_score']);

### Bivariate Analysis

In [None]:
# Correlation between numerical variables
matrix = train.corr()
f, ax = plt.subplots(figsize=(9, 6))
sns.heatmap(matrix, vmax=.8, square=True, cmap="BuPu");

In [None]:
# Not much correlation between the variables

In [None]:
# program_id vs is_pass
plt.figure(figsize=(12,4))
sns.barplot(train['department'], train['is_promoted'])

In [None]:
plt.figure(figsize=(20,8))
# program_type vs is_pass
sns.barplot(train['region'], train['is_promoted'])

In [None]:
# Trainee in X and Y program type have higher chances to pass the test

In [None]:
# test_type vs is_pass
sns.barplot(train['recruitment_channel'], train['is_promoted'])

In [None]:
# Trainee attending online mode of test have higher chances to pass the test

In [None]:
# difficulty_level vs is_pass
sns.barplot(train['no_of_trainings'], train['is_promoted'])

In [None]:
# If the difficulty level of the test is easy, chances to pass the test are higher

In [None]:
# Gender vs is_pass
sns.barplot(train['previous_year_rating'], train['is_promoted'])

In [None]:
# Gender does not affect the chances to pass the test

In [None]:
# education vs is_pass
plt.figure(figsize=(12,4))
sns.barplot(train['education'], train['is_promoted'])

In [None]:
# Trainee with Masters education level have more chances to pass the test

In [None]:
plt.figure(figsize=(20,8))
# is_handicapped vs is_pass
sns.barplot(train['length_of_service'], train['is_promoted'])

In [None]:
# Handicapped trainee have less chances to pass the test

In [None]:
# city_tier vs is_pass
sns.barplot(train['KPIs_met >80%'], train['is_promoted'])

In [None]:
# Trainee from city tier 1 have higher chances to pass the test

In [None]:
# trainee_engagement_rating vs is_pass
sns.barplot(train['awards_won?'], train['is_promoted'])

In [None]:
# As the trainee engagement rating increases, chances to pass the test also increases

### Missing Values Treatment

In [None]:
# Check the number of missing values in each variable
train.isnull().sum()

In [None]:
# age and trainee_engagement_rating variables have missing values in it.

In [119]:
test = pd.read_csv('test.csv')
test.drop('employee_id',inplace=True,axis = 1)
test.head()

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,Technology,region_26,Bachelor's,m,sourcing,1,24,,1,1,0,77
1,HR,region_4,Bachelor's,f,other,1,31,3.0,5,0,0,51
2,Sales & Marketing,region_13,Bachelor's,m,other,1,31,1.0,4,0,0,47
3,Procurement,region_2,Bachelor's,f,other,3,31,2.0,9,0,0,65
4,Finance,region_29,Bachelor's,m,sourcing,1,30,4.0,7,0,0,61


In [120]:
test['education'].fillna('other',inplace=True)
test['previous_year_rating'].fillna(99,inplace=True)

train['education'].fillna('other',inplace=True)
train['previous_year_rating'].fillna(99,inplace=True)

### Logistic Regression

In [121]:
train.head()

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


In [126]:
# Save target variable in separate dataset
X = train.drop('is_promoted',axis=1)
y = train.is_promoted

In [129]:
test

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,Technology,region_26,Bachelor's,m,sourcing,1,24,99.0,1,1,0,77
1,HR,region_4,Bachelor's,f,other,1,31,3.0,5,0,0,51
2,Sales & Marketing,region_13,Bachelor's,m,other,1,31,1.0,4,0,0,47
3,Procurement,region_2,Bachelor's,f,other,3,31,2.0,9,0,0,65
4,Finance,region_29,Bachelor's,m,sourcing,1,30,4.0,7,0,0,61
5,Procurement,region_7,Bachelor's,m,sourcing,1,36,3.0,2,0,0,68
6,Finance,region_2,Bachelor's,m,other,1,33,5.0,3,1,0,57
7,Analytics,region_22,Bachelor's,m,sourcing,2,36,3.0,3,0,0,85
8,Technology,region_7,Master's & above,m,other,1,51,4.0,11,0,0,75
9,Technology,region_22,Bachelor's,m,sourcing,1,29,5.0,2,1,0,76


In [130]:
# Apply dummies to the dataset
X=pd.get_dummies(X)
test=pd.get_dummies(test)

In [131]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

In [135]:
# Logistic regression using 5 fold stratified cross validation
i=1
kf = StratifiedKFold(n_splits=5,random_state=1,shuffle=True)
for train_index,test_index in kf.split(X,y):
     print('\n{} of kfold {}'.format(i,kf.n_splits))
     xtr,xvl = X.loc[train_index],X.loc[test_index]
     ytr,yvl = y[train_index],y[test_index]
    
     model = LogisticRegression(random_state=1)
     model.fit(xtr, ytr)
     pred=model.predict_proba(xvl)[:,1]
     score = f1_score(yvl,pred.round())
     print('f1_score',score)
     i+=1
# Making predictions for the test data
pred=model.predict_proba(test)[:,1]


1 of kfold 5
f1_score 0.3530391340549542

2 of kfold 5
f1_score 0.38372093023255816

3 of kfold 5
f1_score 0.3692564745196324

4 of kfold 5
f1_score 0.38259441707717573

5 of kfold 5
f1_score 0.37615449202350965


In [136]:
# Read the submission file
submission=pd.read_csv("sample_submission.csv")

In [137]:
submission.head()

Unnamed: 0,employee_id,is_promoted
0,8724,0
1,74430,0
2,72255,0
3,38562,0
4,64486,0


In [138]:
# Fill the is_pass variable with the predictions
submission['is_promoted']=pred.round()

In [139]:
submission.head()

Unnamed: 0,employee_id,is_promoted
0,8724,0.0
1,74430,0.0
2,72255,0.0
3,38562,0.0
4,64486,0.0


In [140]:
submission['is_promoted'].value_counts()

0.0    22935
1.0      555
Name: is_promoted, dtype: int64

In [141]:
# Converting the submission file to csv format
submission.to_csv('logistic_submission.csv', index=False)

score on leaderboard - 0.71145