In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))
        
import datetime as dt
from datetime import datetime

import warnings
# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Read the competition train and test CSV files, then report their dimensions.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/predicting-employee-attrition/train_MpHjUjU.csv",
        "../input/predicting-employee-attrition/test_hXY9mYw.csv",
    )
)

for label, frame in (('Train', train_df), ('Test', test_df)):
    print(f'{label} Data Shape {frame.shape}')

In [3]:
# Peek at five random training rows; the fixed seed keeps the displayed rows
# identical across Restart & Run All.
train_df.sample(5, random_state=0)

In [4]:
# Peek at five random test rows; the fixed seed keeps the displayed rows
# identical across Restart & Run All.
test_df.sample(5, random_state=0)

In [5]:
# Check that every test employee id is present in the train data frame.
# `.all()` is True only when each test Emp_ID is found in train; taking `len()` of
# the boolean mask would just return the number of test rows, whatever the mask holds.
test_df['Emp_ID'].isin(train_df['Emp_ID']).all()

In [6]:
# Print a concise summary of the training frame (columns, non-null counts, dtypes).
train_df.info()

In [7]:
### Names of the columns that contain at least one missing value
train_df.columns[train_df.isna().any()].tolist()

In [8]:
### Report how many distinct values each column holds
separator = '-' * 60
for column_name in train_df.columns:
    distinct_count = train_df[column_name].nunique()
    print(separator)
    print('Number of Unique Values in {} column is:'.format(column_name), distinct_count)

##### *We can see that the total number of unique employees is only 2381, meaning that employees appear in more than one record. It could be that an employee has grown within the company, or, since the data is reported monthly, that the same employee is repeated each month, etc. We will test these hypotheses later...*

In [9]:
# Show the distinct values of every column under a banner with the column name.
for column_name in train_df.columns:
    banner = '***********{}************'.format(column_name)
    distinct_values = train_df[column_name].unique()
    print(banner)
    print(distinct_values, "\n")

##### Data Finding:
* Data does not contain any erroneous values.
* The only column with missing values is **LastWorkingDate**, which makes sense as not all the employees have left the company

##### Data Preprocessing/ Feature Engineering:
* All the date columns are of object type and hence need to be converted to datetime
  - We can create some features from these date variables indicating the time period
* Label Encoding is required for categorical variables
* We have a few ordinal variables, so there is no need to do any encoding on them

In [10]:
# Latest and earliest reporting month.
# The column is parsed to datetime first so that max/min compare dates
# chronologically; on the raw object column they would compare strings.
reporting_month = pd.to_datetime(train_df['MMM-YY'])
reporting_month.max(), reporting_month.min()

In [11]:
## Casting variables to their correct data type
# Parse the reporting month, the joining date and the last working date.
# A missing LastWorkingDate stays NaT instead of being filled with 0 and turned
# into a 1970-01-01 sentinel, and each column is assigned back explicitly rather
# than through an inplace fillna on a column selection.
for date_col in ['MMM-YY', 'Dateofjoining', 'LastWorkingDate']:
    train_df[date_col] = pd.to_datetime(train_df[date_col]).dt.tz_localize(None)

## Create Target variable - 'Attrition {0: current employee, 1: ex-employee}'
# A row is flagged 1 when it carries a last working date.
train_df['Attrition'] = train_df['LastWorkingDate'].notna().astype(int)

## Data Aggregation: one row per employee
# The row-level Attrition flag is aggregated (max) rather than used as a grouping
# key. Grouping by it put an employee with both flagged and unflagged rows into
# two groups, and the later max() over Tenure(M) then kept the tenure measured up
# to the reference date instead of up to the last working date.
# Columns are selected through named aggregation; selecting several groupby
# columns with a bare tuple (groupby(...)['a', 'b']) was removed in pandas 2.0.
emp_keys = ['Emp_ID', 'Gender', 'Education_Level', 'City']
new_train_df0 = train_df.groupby(emp_keys, as_index=False).agg(**{
    'Attrition': ('Attrition', 'max'),
    'Total Business Value': ('Total Business Value', 'sum'),
    'Age': ('Age', 'max'),
    'Salary': ('Salary', 'max'),
    'Joining Designation': ('Joining Designation', 'max'),
    'Quarterly Rating': ('Quarterly Rating', 'max'),
    'Dateofjoining': ('Dateofjoining', 'max'),
    'LastWorkingDate': ('LastWorkingDate', 'max'),
})

## Calculating Tenure: whole months between joining and the last working date,
## or between joining and the reference date for employees without one
tenure_reference_date = pd.to_datetime('2017-12-31 23:59:59')
# Explicit average month length (30.436875 days). This is the value older pandas
# used for np.timedelta64(1, 'M'); newer versions reject that ambiguous unit.
average_month = pd.Timedelta(seconds=2629746)
tenure_end = new_train_df0['LastWorkingDate'].fillna(tenure_reference_date)
new_train_df0['Tenure(M)'] = (
    (tenure_end - new_train_df0['Dateofjoining']) / average_month
).astype(int)

# Final modelling frame: keys, target, then the numeric features.
new_train_df3 = new_train_df0[
    emp_keys + ['Attrition', 'Total Business Value',
                'Age', 'Salary', 'Quarterly Rating', 'Tenure(M)']
].copy()
print(new_train_df3.shape, new_train_df3.Emp_ID.nunique())
new_train_df3.head()

In [12]:
# Spot-check the aggregated record(s) of a single employee (Emp_ID 2124).
new_train_df3[new_train_df3['Emp_ID']==2124]

In [13]:
# Percentage share of every level of the categorical variables
cat_var = ['Gender', 'Education_Level', 'Attrition', 'Quarterly Rating', 'City']
total_rows = len(new_train_df3)
for column_name in cat_var:
    share_pct = (new_train_df3[column_name].value_counts() / total_rows * 100).round(2)
    print('-'*50, '\n', share_pct)

In [14]:
## Encoding Variables
# Gender is mapped to 0/1; Education_Level and City are expanded into dummy columns.
encoded_df = new_train_df3.assign(
    Gender=new_train_df3['Gender'].map({'Male':0, 'Female':1})
)
edu_encoded_df = pd.get_dummies(encoded_df['Education_Level'])
city_encoded_df = pd.get_dummies(encoded_df['City'])

# Append the dummy columns, then remove the two source columns they replace.
final_df0 = pd.concat([encoded_df, edu_encoded_df], axis=1)
final_df1 = (
    pd.concat([final_df0, city_encoded_df], axis=1)
    .drop(columns=['Education_Level', 'City'])
)
print(final_df1.shape)
final_df1.head()

In [15]:
# List the columns whose dtype is not one of the integer dtypes named below.
final_df1.select_dtypes(exclude=['int', 'uint8', 'int64']).columns

In [16]:
# Share (in %) of each Attrition class among all employees
attrition_counts = final_df1['Attrition'].value_counts()
(attrition_counts / len(final_df1) * 100).round(2)

In [17]:
# Attach the engineered per-employee features to the test employees
test_df1 = pd.merge(test_df, final_df1, on=['Emp_ID'])
print(test_df1.shape, test_df1.Emp_ID.nunique())
print(test_df1.Attrition.value_counts())

# Author's observation on the counts printed above:
# none of the employee has left the company (non attrition)
test_df1 = test_df1.drop(columns=['Attrition'])
test_df1.head()

In [18]:
# Keep only the employees that are not part of the test set
is_test_employee = final_df1['Emp_ID'].isin(test_df['Emp_ID'])
final_train_df = final_df1.loc[~is_test_employee]
print(final_train_df.shape, final_train_df.Emp_ID.nunique())
final_train_df.head()

In [19]:
# checking target variable distribution
# Need to apply SMOTE as target is highly imbalanced.
# (This note is a comment on purpose: as a trailing string literal it was the cell's
# last expression, so the notebook displayed the string instead of the distribution.)
round(final_train_df.Attrition.value_counts() / len(final_train_df) * 100, 2)

In [20]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import confusion_matrix, accuracy_score, log_loss, classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

In [21]:
# Features: everything except the target and the employee identifier.
X = final_train_df.drop(['Attrition', 'Emp_ID'], axis=1)
# Target as a 1-D Series; a single-column DataFrame is a column vector, which
# scikit-learn estimators expecting a 1-D y have to convert (with a warning).
y = final_train_df['Attrition']

# Hold out 15% of the rows for validation. The split is stratified so that both
# parts keep the class ratio of the imbalanced target.
train, test, target_train, target_val = train_test_split(
    X, y, train_size=0.85, random_state=0, stratify=y
)

# Oversample the minority class on the training part only.
oversampler = SMOTE(random_state=0)
smote_train, smote_target = oversampler.fit_resample(train, target_train)

In [22]:
# Print label/shape pairs for the four split parts, then show the SMOTE output shapes.
shape_report = []
for label, part in (('train', train), ('test', test),
                    ('target_train', target_train), ('target_val', target_val)):
    shape_report += [label, part.shape]
print(*shape_report)
smote_train.shape, smote_target.shape

#### RF Classifier

In [23]:
seed = 0   # We set our random seed to zero for reproducibility
# Random Forest parameters
# 'max_features' used to be listed twice (0.3 and 'sqrt'). A dict literal keeps only
# the last value for a repeated key, so the effective value 'sqrt' is kept and the
# silently discarded 0.3 entry is removed.
rf_params = {
    'n_jobs': -1,
    'n_estimators': 1000,
    'max_features': 'sqrt',
    'max_depth': 4,
    'min_samples_leaf': 2,
    'random_state': seed,
    'verbose': 0
}

In [24]:
# Fit a random forest with default hyper-parameters on the SMOTE-balanced data.
# random_state is fixed so that Restart & Run All reproduces the scores below.
rf = RandomForestClassifier(random_state=seed)
# np.ravel hands the estimator a 1-D target whether smote_target is a Series or a
# single-column DataFrame.
rf.fit(smote_train, np.ravel(smote_target))
print("Fitting of Random Forest finished")

rf_predictions = rf.predict(test)
print("Predictions finished")

# Scoring the model on the validation split
print("Accuracy score: {}".format(accuracy_score(target_val, rf_predictions)))
print("="*80)
print(classification_report(target_val, rf_predictions))

In [25]:
# Gradient Boosting Parameters (kept for reference; the model below does not use them)
# 'max_features' used to be listed twice (0.9 and 'sqrt'). A dict literal keeps only
# the last value for a repeated key, so the effective value 'sqrt' is kept.
gb_params = {
    'n_estimators': 1500,
    'max_features': 'sqrt',
    'learning_rate': 0.25,
    'max_depth': 4,
    'min_samples_leaf': 2,
    'subsample': 1,
    'random_state': seed,
    'verbose': 0
}

# Default hyper-parameters; random_state is fixed so the scores are reproducible.
gb = GradientBoostingClassifier(random_state=seed)
# Fit the model to our SMOTEd train and target (np.ravel guarantees a 1-D target)
gb.fit(smote_train, np.ravel(smote_target))
# Get our predictions
gb_predictions = gb.predict(test)
print("Predictions have finished")

print(accuracy_score(target_val, gb_predictions))
print(classification_report(target_val, gb_predictions))

In [26]:
### classifier models
# (name, estimator) pairs to compare. The estimators that draw random numbers get a
# fixed random_state so that the comparison is reproducible between runs.
models = [
    ('LogisticRegression', LogisticRegression()),
    ('Naive Bayes', GaussianNB()),
    ('GradientBoost', GradientBoostingClassifier(random_state=seed)),
    ('RandomForest', RandomForestClassifier(random_state=seed)),
    ('Decision Tree', DecisionTreeClassifier(random_state=seed)),
    ('KNN', KNeighborsClassifier(n_neighbors=5)),
]

In [27]:
# Fit every candidate on the SMOTE-balanced data and score it on the validation split.
# accuracy_score is already imported with the other scikit-learn names, so the import
# statement that used to be re-executed on every loop iteration is gone.
for name, model in models:
    print(name)
    model.fit(smote_train, smote_target)

    # Make predictions.
    predictions = model.predict(test)

    # Per-class precision/recall/F1, then overall accuracy.
    print(classification_report(target_val, predictions))
    print(accuracy_score(target_val, predictions))
    print('\n')

In [28]:
# Predict attrition for the test employees with the gradient boosting model (`gb`)
# and show the first five predictions.
# NOTE(review): the previous comment here said "highest accuracy score is with
# RandomForest", but the code predicts with `gb` - confirm which model was intended.
final_test_df = test_df1.drop('Emp_ID', axis=1)
val_pred = gb.predict(final_test_df)
val_pred[:5]

In [29]:
# Store the predictions on test_df as the 'Target' column.
# The assignment is positional, so val_pred must have one entry per test_df row.
test_df['Target'] = val_pred
print(test_df.shape)
test_df.head()

In [30]:
# Number of test employees per predicted class.
test_df.Target.value_counts()

In [31]:
# Write submission V1 without the row index (`index` is documented as a bool;
# None only worked because it is falsy).
test_df.to_csv('Emp_Attr_V1.csv', index=False)

### --------- Now following the same attributes and method, but on the full dataset (including the test dataset Emp_IDs) -------

In [32]:
# Fraction of employees in each Attrition class over the full employee table.
final_df1.Attrition.value_counts() / len(final_df1)

In [33]:
# Features/target built from every employee in final_df1 (test employees included).
X = final_df1.drop(['Attrition', 'Emp_ID'], axis=1)
# 1-D target Series instead of a single-column DataFrame (column-vector y).
y = final_df1['Attrition']

# 80/20 split, stratified so both parts keep the class ratio of the target.
train, test, target_train, target_val = train_test_split(
    X, y, train_size=0.80, random_state=0, stratify=y
)

# Oversample the minority class on the training part only.
oversampler = SMOTE(random_state=0)
smote_train, smote_target = oversampler.fit_resample(train, target_train)

### classifier models
# Estimators that draw random numbers get a fixed random_state for reproducibility.
models = [
    ('LogisticRegression', LogisticRegression()),
    ('Naive Bayes', GaussianNB()),
    ('GradientBoost', GradientBoostingClassifier(random_state=seed)),
    ('RandomForest', RandomForestClassifier(random_state=seed)),
    ('Decision Tree', DecisionTreeClassifier(random_state=seed)),
    ('KNN', KNeighborsClassifier(n_neighbors=5)),
]

# Fit every candidate on the SMOTE-balanced data and score it on the validation split.
# accuracy_score is already imported with the other scikit-learn names, so it is not
# re-imported inside the loop.
for name, model in models:
    print(name)
    model.fit(smote_train, smote_target)

    # Make predictions.
    predictions = model.predict(test)

    # Per-class precision/recall/F1, then overall accuracy.
    print(classification_report(target_val, predictions))
    print(accuracy_score(target_val, predictions))
    print('\n')

In [34]:
# Fit a random forest with default hyper-parameters on the SMOTE-balanced data.
# random_state is fixed so that Restart & Run All reproduces the scores below.
rf = RandomForestClassifier(random_state=seed)
# np.ravel hands the estimator a 1-D target whether smote_target is a Series or a
# single-column DataFrame.
rf.fit(smote_train, np.ravel(smote_target))
print("Fitting of Random Forest finished")

rf_predictions = rf.predict(test)
print("Predictions finished")

# Scoring the model on the validation split
print("Accuracy score: {}".format(accuracy_score(target_val, rf_predictions)))
print("="*80)
print(classification_report(target_val, rf_predictions))

In [35]:
# Submission V2: predictions of the random forest (`rf`) for the test employees.
# NOTE(review): when `rf` was fitted on a split of final_df1, the test employees were
# part of that table, so many of these rows were seen during training - confirm intended.
final_test_df = test_df1.drop('Emp_ID', axis=1)
val_pred = rf.predict(final_test_df)
# Plain assignment overwrites an existing 'Target' column and creates it when absent;
# dropping the column first raised a KeyError whenever it did not exist yet.
test_df['Target'] = val_pred

# `index` is documented as a bool; None only worked because it is falsy.
test_df.to_csv('Emp_Attr_V2.csv', index=False)

print(test_df.shape)
test_df.head()

In [36]:
# Number of test employees per predicted class.
test_df.Target.value_counts()

### ----- Now Without SMOTE and on Overall data (including test employee) ------

In [37]:
# Features/target built from every employee in final_df1 (test employees included).
X = final_df1.drop(['Attrition', 'Emp_ID'], axis=1)
# 1-D target Series instead of a single-column DataFrame (column-vector y).
y = final_df1['Attrition']

# 80/20 split, stratified so both parts keep the class ratio of the target.
train, test, target_train, target_val = train_test_split(
    X, y, train_size=0.80, random_state=0, stratify=y
)

# No oversampling in this section: the models are fitted on the raw training split.

### classifier models
# Estimators that draw random numbers get a fixed random_state for reproducibility.
models = [
    ('LogisticRegression', LogisticRegression()),
    ('Naive Bayes', GaussianNB()),
    ('GradientBoost', GradientBoostingClassifier(random_state=seed)),
    ('RandomForest', RandomForestClassifier(random_state=seed)),
    ('Decision Tree', DecisionTreeClassifier(random_state=seed)),
    ('KNN', KNeighborsClassifier(n_neighbors=5)),
]

# Fit every candidate on the training split and score it on the validation split.
# accuracy_score is already imported with the other scikit-learn names, so it is not
# re-imported inside the loop.
for name, model in models:
    print(name)
    model.fit(train, target_train)

    # Make predictions.
    predictions = model.predict(test)

    # Per-class precision/recall/F1, then overall accuracy.
    print(classification_report(target_val, predictions))
    print(accuracy_score(target_val, predictions))
    print('\n')

In [38]:
# Fit gradient boosting with default hyper-parameters on the training split.
# random_state is fixed so that Restart & Run All reproduces the scores below.
gb = GradientBoostingClassifier(random_state=seed)
# np.ravel hands the estimator a 1-D target whether target_train is a Series or a
# single-column DataFrame.
gb.fit(train, np.ravel(target_train))
# The message used to say "Random Forest" although this cell fits gradient boosting.
print("Fitting of Gradient Boosting finished")

gb_predictions = gb.predict(test)
print("Predictions finished")

# Scoring the model on the validation split
print("Accuracy score: {}".format(accuracy_score(target_val, gb_predictions)))
print("="*80)
print(classification_report(target_val, gb_predictions))

In [39]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer

# Mean accuracy of `gb` over a 10-fold cross-validation on X and y
cv_accuracy = cross_val_score(gb, X, y, cv=10, scoring='accuracy')
print(cv_accuracy.mean())

In [40]:
# Submission V3: predictions of the gradient boosting model (`gb`) for the test employees.
final_test_df = test_df1.drop('Emp_ID', axis=1)
val_pred = gb.predict(final_test_df)
# Plain assignment overwrites an existing 'Target' column and creates it when absent;
# dropping the column first raised a KeyError whenever it did not exist yet.
test_df['Target'] = val_pred

# `index` is documented as a bool; None only worked because it is falsy.
test_df.to_csv('Emp_Attr_V3.csv', index=False)

print(test_df.shape)
print(test_df.head(3))
test_df.Target.value_counts()

In [41]:
# Submission V4: same training split as V3, but with a random forest.
# random_state is fixed so that the submission file is reproducible.
rf = RandomForestClassifier(random_state=seed)
# np.ravel hands the estimator a 1-D target whether target_train is a Series or a
# single-column DataFrame.
rf.fit(train, np.ravel(target_train))
print("Fitting of Random Forest finished")

# Report the validation accuracy; the predictions used to be computed but never shown.
rf_predictions = rf.predict(test)
print("Accuracy score: {}".format(accuracy_score(target_val, rf_predictions)))

final_test_df = test_df1.drop('Emp_ID', axis=1)
val_pred = rf.predict(final_test_df)
# Plain assignment overwrites an existing 'Target' column and creates it when absent;
# dropping the column first raised a KeyError whenever it did not exist yet.
test_df['Target'] = val_pred

# `index` is documented as a bool; None only worked because it is falsy.
test_df.to_csv('Emp_Attr_V4.csv', index=False)

print(test_df.shape)
print(test_df.head(3))
test_df.Target.value_counts()

In [42]:


# Features exclude the target and the employee identifier. Emp_ID was previously left
# in X here, so an arbitrary id was used as a predictor, unlike in the other experiments.
X = final_train_df.drop(['Attrition', 'Emp_ID'], axis=1)
# 1-D target Series instead of a single-column DataFrame (column-vector y).
y = final_train_df['Attrition']

# train_test_split's default sizes are used here (no train_size/test_size given).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
log_reg=LogisticRegression(C=1000, max_iter=10000)
log_reg.fit(X_train, y_train)
pred = log_reg.predict(X_test)

print('--------------------------------------------------------------------------')
print('Logistic Regression:')
print('Traning Model accruracy scores: {:.3f}'.format(log_reg.score(X_train,y_train)))
print('Test Model accruracy scores: {:.3f}'.format(log_reg.score(X_test,y_test)))
# confusion_matrix expects (y_true, y_pred); with the arguments swapped the matrix
# was transposed (rows were predictions instead of true labels).
print('Compute the error: \n', confusion_matrix(y_test, pred))
print('--------------------------------------------------------------------------')

In [43]:
from sklearn.model_selection import cross_val_score

# Mean accuracy of the logistic regression over a 5-fold cross-validation on X and y
fold_accuracy = cross_val_score(log_reg, X, y, cv=5, scoring ='accuracy')
print(fold_accuracy.mean())