In [78]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving train.csv to train.csv
Saving test.csv to test.csv
Saving sample_submission.csv to sample_submission.csv
User uploaded file "train.csv" with length 3759647 bytes
User uploaded file "test.csv" with length 1565147 bytes
User uploaded file "sample_submission.csv" with length 208067 bytes


In [0]:
import numpy as np
import pandas as pd 

#keras
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD
from keras.optimizers import RMSprop
from keras.optimizers import Adam
from keras.layers import BatchNormalization
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping
from keras.callbacks import ReduceLROnPlateau
from keras.wrappers.scikit_learn import KerasClassifier

#sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold


#visualisation
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [0]:
# fix random seed for reproducibility
seed = 7
np.random.seed(seed)

In [0]:
import warnings
warnings.filterwarnings('ignore')

In [0]:
from keras import backend as K

def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [83]:
train = pd.read_csv('train.csv')
train.drop('employee_id',inplace=True,axis = 1)
train.head()

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


In [84]:
train.dtypes

department               object
region                   object
education                object
gender                   object
recruitment_channel      object
no_of_trainings           int64
age                       int64
previous_year_rating    float64
length_of_service         int64
KPIs_met >80%             int64
awards_won?               int64
avg_training_score        int64
is_promoted               int64
dtype: object

In [85]:
train.isnull().sum()

department                 0
region                     0
education               2409
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    4124
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
is_promoted                0
dtype: int64

In [86]:
train.nunique()

department               9
region                  34
education                3
gender                   2
recruitment_channel      3
no_of_trainings         10
age                     41
previous_year_rating     5
length_of_service       35
KPIs_met >80%            2
awards_won?              2
avg_training_score      61
is_promoted              2
dtype: int64

In [0]:
train['education'].fillna('other',inplace=True)

In [0]:
train['previous_year_rating'].fillna(99,inplace=True)

In [89]:
train.isnull().sum()

department              0
region                  0
education               0
gender                  0
recruitment_channel     0
no_of_trainings         0
age                     0
previous_year_rating    0
length_of_service       0
KPIs_met >80%           0
awards_won?             0
avg_training_score      0
is_promoted             0
dtype: int64

In [90]:
train['is_promoted'].value_counts(normalize = True)

0    0.91483
1    0.08517
Name: is_promoted, dtype: float64

In [91]:
train.shape

(54808, 13)

In [0]:
y = train['is_promoted']
train = train.drop(['is_promoted'],axis = 1)

In [93]:
train.head()

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49
1,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60
2,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50
3,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50
4,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73


In [0]:
X_train, X_valid, y_train, y_valid = train_test_split(train,y,test_size=0.2)

In [95]:
print('Xtrain shape',X_train.shape)
print('Xvalid shape',X_valid.shape)
print('ytrain shape',y_train.shape)
print('yvalid shape',y_valid.shape)

Xtrain shape (43846, 12)
Xvalid shape (10962, 12)
ytrain shape (43846,)
yvalid shape (10962,)


In [0]:
categoical_vars = ['department','education','gender','region','recruitment_channel']
continous_vars = ['age','no_of_trainings','previous_year_rating','length_of_service','KPIs_met >80%','awards_won?','avg_training_score']
categoical_binary_vars = []

In [97]:
print ("categorical binary vars: ", len(categoical_binary_vars))
print ("categorical non binary vars: ", len(categoical_vars))
print ("continues vars: ", len(continous_vars))

categorical binary vars:  0
categorical non binary vars:  5
continues vars:  7


In [98]:
for cat_var in categoical_vars:
    print (cat_var, train[cat_var].nunique())

department 9
education 4
gender 2
region 34
recruitment_channel 3


In [0]:
all_cols = train.columns
other_cols = [i for i in all_cols if i not in categoical_vars ]

In [100]:
other_cols

['no_of_trainings',
 'age',
 'previous_year_rating',
 'length_of_service',
 'KPIs_met >80%',
 'awards_won?',
 'avg_training_score']

In [0]:
# Label Encoder
for c in categoical_vars:
    lbl = LabelEncoder()
    lbl.fit(list(X_train[c].values) + list(X_valid[c].values))
    X_train[c] = lbl.transform(list(X_train[c].values))
    X_valid[c] = lbl.transform(list(X_valid[c].values))

In [103]:
X_train.head()

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
50264,1,24,0,1,2,2,27,99.0,1,1,0,60
51276,7,4,0,1,2,1,34,3.0,6,1,0,55
13368,0,11,0,1,2,1,29,99.0,1,1,0,83
4932,4,31,0,0,0,1,36,5.0,7,1,0,58
20504,2,14,0,1,0,2,30,1.0,4,0,0,51


In [0]:
def preproc(X_train ) : 

    input_list_train = []
    
    
    #the cols to be embedded: rescaling to range [0, # values)
    for c in categoical_vars :
        raw_vals = np.unique(X_train[c])
        val_map = {}
        for i in range(len(raw_vals)):
            val_map[raw_vals[i]] = i       
        input_list_train.append(X_train[c].map(val_map).values)
        
     
    #the rest of the columns
    
    input_list_train.append(X_train[other_cols].values)
  
    
    return input_list_train 

In [105]:
X_train[other_cols].ndim

2

In [0]:
X_train = preproc(X_train)


In [0]:
X_valid  = preproc(X_valid)

In [108]:
len(X_train)

6

In [109]:
len(X_train[5]) 

43846

In [110]:
len(categoical_vars)

5

In [111]:
len(X_train[4])

43846

In [112]:
X_train[5].ndim


2

In [113]:
len(X_train[5][0]) 

7

In [0]:
from keras.layers import *
from keras.models import *

models = []

for categoical_var in categoical_vars :

    model = Sequential()
    no_of_unique_cat  = train[categoical_var].nunique()
    embedding_size = min(np.ceil((no_of_unique_cat)/2),50)
    embedding_size = int(embedding_size)
    model.add(  Embedding( no_of_unique_cat+1, embedding_size, input_length = 1 ) )
    model.add(Reshape(target_shape=(embedding_size,)))
    models.append( model )


model_rest = Sequential()
model_rest.add(Dense(16, input_dim = 7 ))

models.append(model_rest)

full_model = Sequential()

full_model.add(Merge(models, mode='concat'))

full_model.add(Dense(256))
full_model.add(Activation('relu'))
full_model.add(Dropout(0.5))
full_model.add(BatchNormalization())

full_model.add(Dense(128))
full_model.add(Activation('relu'))
full_model.add(Dropout(0.5))
full_model.add(BatchNormalization())

full_model.add(Dense(64))
full_model.add(Activation('relu'))
full_model.add(Dropout(0.5))
full_model.add(BatchNormalization())

full_model.add(Dense(1))
full_model.add(Activation('sigmoid'))


In [126]:
len(models)


6

In [0]:
sgd = SGD(lr=0.001, decay=1e-6, momentum=0.9, nesterov=True)


In [0]:
rmsprop = RMSprop(lr=0.001, rho=0.9, epsilon=None, decay=0.0)


In [0]:
adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)


In [0]:
checkpointer = ModelCheckpoint(filepath='best_weights.hdf5', verbose=1, save_best_only=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,patience=3, min_lr=0.00001,verbose = 1)
early_stopping = EarlyStopping(monitor='val_loss',min_delta=0.0001, patience=5,verbose=1)

In [0]:
full_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy',f1])

In [144]:
full_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
merge_6 (Merge)              (None, 43)                0         
_________________________________________________________________
dense_27 (Dense)             (None, 256)               11264     
_________________________________________________________________
activation_21 (Activation)   (None, 256)               0         
_________________________________________________________________
dropout_10 (Dropout)         (None, 256)               0         
_________________________________________________________________
batch_normalization_10 (Batc (None, 256)               1024      
_________________________________________________________________
dense_28 (Dense)             (None, 128)               32896     
_________________________________________________________________
activation_22 (Activation)   (None, 128)               0         
__________

In [145]:
model_info = full_model.fit(X_train,y_train,epochs=50,batch_size=32,validation_data=(X_valid,y_valid),verbose = 1)

Train on 43846 samples, validate on 10962 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50

KeyboardInterrupt: ignored

In [45]:
test = pd.read_csv('test.csv')
test.drop('employee_id',inplace=True,axis = 1)
test.head()

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,Technology,region_26,Bachelor's,m,sourcing,1,24,,1,1,0,77
1,HR,region_4,Bachelor's,f,other,1,31,3.0,5,0,0,51
2,Sales & Marketing,region_13,Bachelor's,m,other,1,31,1.0,4,0,0,47
3,Procurement,region_2,Bachelor's,f,other,3,31,2.0,9,0,0,65
4,Finance,region_29,Bachelor's,m,sourcing,1,30,4.0,7,0,0,61


In [46]:
test.isnull().sum()

department                 0
region                     0
education               1034
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    1812
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
dtype: int64

In [47]:
test['education'].fillna('other',inplace=True)
test['previous_year_rating'].fillna(99,inplace=True)
test.isnull().sum()

department              0
region                  0
education               0
gender                  0
recruitment_channel     0
no_of_trainings         0
age                     0
previous_year_rating    0
length_of_service       0
KPIs_met >80%           0
awards_won?             0
avg_training_score      0
dtype: int64

In [0]:
test = pd.get_dummies(test)

In [49]:
test.head()

Unnamed: 0,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,department_Analytics,department_Finance,department_HR,...,region_region_9,education_Bachelor's,education_Below Secondary,education_Master's & above,education_other,gender_f,gender_m,recruitment_channel_other,recruitment_channel_referred,recruitment_channel_sourcing
0,1,24,99.0,1,1,0,77,0,0,0,...,0,1,0,0,0,0,1,0,0,1
1,1,31,3.0,5,0,0,51,0,0,1,...,0,1,0,0,0,1,0,1,0,0
2,1,31,1.0,4,0,0,47,0,0,0,...,0,1,0,0,0,0,1,1,0,0
3,3,31,2.0,9,0,0,65,0,0,0,...,0,1,0,0,0,1,0,1,0,0
4,1,30,4.0,7,0,0,61,0,1,0,...,0,1,0,0,0,0,1,0,0,1
