In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for input_path in (os.path.join(dirname, name) for name in filenames):
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training split of the employee-attrition data set and preview the first rows.
train_csv_path = '/kaggle/input/employee-attrition/employee_attrition_train.csv'
data = pd.read_csv(train_csv_path)
data.head()

In [None]:
# Column names, dtypes and non-null counts of the freshly loaded frame.
data.info()

# UNDERSTANDING DATA DISTRIBUTION

In [None]:
# Check the target for class imbalance.
print(data['Attrition'].unique())

# value_counts() orders the classes by frequency, so take the wedge labels from its
# index instead of hard-coding ['No', 'Yes']; hard-coded labels would be attached to
# the wrong wedges whenever 'Yes' is the more frequent class.
attrition_counts = data['Attrition'].value_counts()
plt.pie(attrition_counts, autopct='%1.1f%%', labels=attrition_counts.index);

In [None]:
def plot_category(feature, figsize=None):
    """Draw grouped bars of the attrition split (in percent) per level of a column.

    For every level of ``feature`` in the global ``data`` frame, one bar shows the
    percentage of rows with ``Attrition == 'Yes'`` and one the percentage with
    ``Attrition == 'No'``. Each bar is annotated with its value.

    Parameters
    ----------
    feature : str
        Name of the column of ``data`` to group by.
    figsize : tuple, optional
        Passed to ``plt.subplots``; matplotlib's default size is used when omitted.
    """
    totals = data.groupby([feature]).size()
    labels = totals.index

    # Align both class counts on the full set of levels. Without this, a level that
    # has no 'Yes' (or no 'No') rows yields NaN after the division, and the number of
    # bars can differ from the number of tick positions.
    yes_count = (data[data['Attrition']=='Yes'].groupby([feature]).size()
                 .reindex(labels, fill_value=0))
    no_count = (data[data['Attrition']=='No'].groupby([feature]).size()
                .reindex(labels, fill_value=0))
    yes_pct = (yes_count * 100 / totals).round(2)
    no_pct = (no_count * 100 / totals).round(2)

    x = np.arange(len(labels))  # the label locations
    width = 0.35  # the width of the bars

    if figsize:
        fig, ax = plt.subplots(figsize=figsize)
    else:
        fig, ax = plt.subplots()
    rects1 = ax.bar(x - width/2, yes_pct, width, label='Yes')
    rects2 = ax.bar(x + width/2, no_pct, width, label='No')

    # The bars hold percentages of each level, not raw counts.
    ax.set_ylabel('Percentage')
    ax.set_title('Based on %s'%feature)
    ax.set_xticks(x)
    ax.set_xticklabels(labels, rotation=80)
    ax.legend()

    ax.bar_label(rects1, padding=1)
    ax.bar_label(rects2, padding=1)

    fig.tight_layout()
    plt.show()
    
def plot_numerical(feature, figsize=None):
    """Overlay the kernel density of a numeric column for each attrition class.

    Parameters
    ----------
    feature : str
        Name of the column of the global ``data`` frame to plot.
    figsize : tuple, optional
        Figure size; defaults to ``(10, 6)`` when omitted.
    """
    # Honour the caller's figsize; it used to be accepted but silently ignored.
    fig = plt.figure(figsize=figsize if figsize else (10, 6))

    # Plot order matters: the legend labels below are matched to the curves by position.
    sns.kdeplot(data[data['Attrition']=='No'][feature])
    sns.kdeplot(data[data['Attrition']=='Yes'][feature])

    fig.legend(labels=['Attrition No', 'Attrition Yes'])
    plt.title('Based on %s'%feature)
    plt.show()

In [None]:
# Density of Age for each attrition class.
for feature in ('Age',):
    plot_numerical(feature)

In [None]:
# Density of the three pay-rate columns for each attrition class.
rate_features = ('DailyRate', 'HourlyRate', 'MonthlyRate')
for feature in rate_features:
    plot_numerical(feature)

In [None]:
# Attrition split per level of BusinessTravel.
for feature in ('BusinessTravel',):
    plot_category(feature)

So the more an employee travels, the higher the chance of attrition.

In [None]:
# Attrition split per level of the department, role and education columns.
job_and_education_features = ('Department', 'JobRole', 'Education', 'EducationField')
for feature in job_and_education_features:
    plot_category(feature, figsize=(8,5))

In [None]:
# Density of DistanceFromHome for each attrition class.
for feature in ('DistanceFromHome',):
    plot_numerical(feature)

In [None]:
# Attrition split per level of the satisfaction / involvement columns.
satisfaction_features = ('EnvironmentSatisfaction', 'JobInvolvement', 'JobSatisfaction')
for feature in satisfaction_features:
    plot_category(feature, figsize=(8,5))

In [None]:
# Attrition split per level of Gender.
for feature in ('Gender',):
    plot_category(feature)

In [None]:
# Attrition split per level of JobLevel.
for feature in ('JobLevel',):
    plot_category(feature)

In [None]:
# Attrition split per level of MaritalStatus and Over18.
marital_and_age_flag = ('MaritalStatus', 'Over18')
for feature in marital_and_age_flag:
    plot_category(feature)

In [None]:
# Density of MonthlyIncome and TotalWorkingYears for each attrition class.
income_and_experience = ('MonthlyIncome', 'TotalWorkingYears')
for feature in income_and_experience:
    plot_numerical(feature)

In [None]:
# Density of NumCompaniesWorked for each attrition class.
for feature in ('NumCompaniesWorked',):
    plot_numerical(feature, figsize=(8,5))

In [None]:
# Attrition split per level of OverTime, StandardHours and WorkLifeBalance.
work_pattern_features = ('OverTime', 'StandardHours', 'WorkLifeBalance')
for feature in work_pattern_features:
    plot_category(feature)

In [None]:
# Density of PercentSalaryHike for each attrition class.
for feature in ('PercentSalaryHike',):
    plot_numerical(feature)

In [None]:
# Attrition split per level of PerformanceRating.
for feature in ('PerformanceRating',):
    plot_category(feature)

In [None]:
# Density of the tenure-related columns for each attrition class.
tenure_features = (
    'YearsSinceLastPromotion',
    'YearsInCurrentRole',
    'YearsAtCompany',
    'YearsWithCurrManager',
)
for feature in tenure_features:
    plot_numerical(feature)

In [None]:
# Attrition split per level of RelationshipSatisfaction, StockOptionLevel and TrainingTimesLastYear.
remaining_level_features = ('RelationshipSatisfaction', 'StockOptionLevel', 'TrainingTimesLastYear')
for feature in remaining_level_features:
    plot_category(feature)

**Columns that seem to contribute towards Attrition:**<br>
1. YearsWithCurrManager < 5
2. YearsAtCompany < 5
3. YearsInCurrentRole < 4
4. TotalWorkingYears < 10
6. DailyRate < 1000
7. NumCompaniesWorked > 5
8. MonthlyIncome < 5000
9. Age < 35
10. TrainingTimesLastYear - 0
11. StockOptionLevel - 0 
12. OverTime - yes 
13. JobRole - Sales Representative
14. MaritalStatus - Single
15. JobLevel - 1
16. BusinessTravel - travel frequently
17. EducationField - Technical Field, Human Resources
18. WorkLifeBalance - 1
19. EnvironmentSatisfaction - 1
20. JobInvolvement - 1
21. JobSatisfaction - 1

In [None]:
# Columns that are label / one-hot encoded before modelling.
# The order is significant: it fixes the column order of every frame built from these lists.
categorical_features = [
    'BusinessTravel',
    'Department',
    'JobRole',
    'Education',
    'EducationField',
    'Gender',
    'MaritalStatus',
    'OverTime',
]

# Columns used as-is (numeric or ordinal-coded).
numerical_features = [
    'Age',
    'DailyRate',
    'DistanceFromHome',
    'EnvironmentSatisfaction',
    'HourlyRate',
    'JobInvolvement',
    'JobLevel',
    'JobSatisfaction',
    'MonthlyIncome',
    'MonthlyRate',
    'NumCompaniesWorked',
    'PercentSalaryHike',
    'PerformanceRating',
    'RelationshipSatisfaction',
    'StockOptionLevel',
    'TotalWorkingYears',
    'TrainingTimesLastYear',
    'WorkLifeBalance',
    'YearsAtCompany',
    'YearsInCurrentRole',
    'YearsSinceLastPromotion',
    'YearsWithCurrManager',
]

# contain only single unique value
to_drop = ['StandardHours', 'Over18', 'EmployeeCount', 'EmployeeNumber']

# Label Encoding Categorical Features for Correlation (includes missing values)

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import os
import joblib

In [None]:
# Label-encode every categorical column of a copy of `data` so it can enter the
# correlation analysis. Missing values have not been imputed yet at this point,
# so the encoders are fitted on the columns as they are.
df = data.copy()
path = '/kaggle/working'

# Create the directory for the saved encoders once, instead of re-checking per feature.
encoding_dir = os.path.join(path, "TextEncoding")
os.makedirs(encoding_dir, exist_ok=True)

for feature in categorical_features:
    le = LabelEncoder()

    # Fit the encoder and replace the column by its integer codes.
    df[feature] = le.fit_transform(df[feature])

    # Save the encoder. Passing a filename lets joblib open *and close* the file;
    # the previous `joblib.dump(obj, open(...))` form never closed its handles.
    joblib.dump(le, os.path.join(encoding_dir, "le_{}.sav".format(feature)))

    # Class names prefixed with the feature; the first class is dropped to avoid
    # the dummy variable trap.
    columns = [feature + ' ' + str(cls) for cls in le.classes_][1:]

    # Save the class-derived column names.
    joblib.dump(columns, os.path.join(encoding_dir, "le_{}_classes.sav".format(feature)))

# CORRELATION

In [None]:
# Bivariate Analysis Correlation plot with the Numeric variables
# Compute the Pearson correlation matrix once and reuse it for the heatmap;
# `corr_mat` is also read by the following cell.
corr_mat = data[numerical_features].corr()

plt.figure(figsize=(15, 15))
sns.heatmap(corr_mat.round(2), annot=True,
            mask=None, cmap='GnBu')
plt.show()

In [None]:
# Correlated Features
# Keep only the strict upper triangle of the symmetric matrix so that every pair of
# distinct features appears exactly once. De-duplicating on the correlation *value*
# (the previous approach) kept one spurious self-correlation of 1.0 and would have
# dropped any two different pairs that happen to share the same coefficient.
upper_triangle = np.triu(np.ones(corr_mat.shape, dtype=bool), k=1)
s = corr_mat.where(upper_triangle).unstack().dropna()
so = s.sort_values(kind="quicksort")
res1 = so[so>=0.5]
print(res1)

**Observations:**

1. As age increases the TotalWorkingYears(experience) increases.
2. Monthly income is positively correlated with JobLevel and TotalWorkingYears: employees in higher positions and with more experience earn more.
3. A higher performance rating brings a higher percent salary hike.
4. The TotalWorkingYears–YearsAtCompany correlation suggests that people with more experience tend to continue their association with the company.
5. YearsAtCompany, YearsWithCurrManager, YearsInCurrentRole shows a positive correlation among each other.

In [None]:
# Bivariate Analysis Correlation plot with the Categorical variables
# Spearman rank correlation over the label-encoded categorical columns together
# with the numeric ones.
all_features = categorical_features + numerical_features
spearman_corr = df[all_features].corr(method='spearman')

plt.figure(figsize=(20, 20))
sns.heatmap(round(spearman_corr, 2), annot=True, mask=None, cmap='GnBu')
plt.show()

**Observations:**

1. Department - JobRole
2. MaritalStatus - StockOptionLevel (-)

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Calculating VIF
# Features left out of the VIF computation (the same names are excluded from the
# model's feature list further down).
vif_excluded = ['PerformanceRating', 'JobLevel',
                'Age', 'PercentSalaryHike',
                'WorkLifeBalance', 'JobInvolvement',
                'Department', 'YearsAtCompany']
vif_features = [name for name in categorical_features+numerical_features
                if name not in vif_excluded]

temp = df.dropna()
# Build the design matrix once; it is the same for every column index.
# NOTE(review): no intercept column is added before computing the VIFs - confirm
# that this is intended.
design_matrix = temp[vif_features].values

vif = pd.DataFrame()
vif["variables"] = vif_features
vif["VIF"] = [variance_inflation_factor(design_matrix, i) for i in range(len(vif_features))]
print(vif)

# Handling Missing Values

In [None]:
# Percentage of missing entries per column; only columns with at least one gap are shown.
missing_pct = data.isnull().sum()*100/len(data)
missingValueFeatures = pd.DataFrame({'missing %': missing_pct})
has_missing = missingValueFeatures['missing %']>0
missingValueFeatures[has_missing]

|Column|Correlation|
|---|---|
|Age|TotalWorkingYears| 
|BusinessTravel|NA| 
|DailyRate|   NA|
|DistanceFromHome|NA|
|MaritalStatus|StockOptionLevel|

We can impute missing values as per the correlation table above. Columns marked NA in the table (no strongly correlated feature) can be imputed with the mean, mode, median or a back-fill.

In [None]:
# Imputing BusinessTravel with Back fill
print('Before Imputation:')
print(data[['BusinessTravel']].value_counts())

# Assign the filled column back explicitly. `data[col].fillna(..., inplace=True)` is
# chained assignment, which does not update `data` under pandas Copy-on-Write, and
# `fillna(method=...)` is deprecated in favour of `.bfill()`.
# A back-fill cannot fill gaps at the very end of the column, so fall back to a
# forward-fill for any that remain.
data['BusinessTravel'] = data['BusinessTravel'].bfill().ffill()

print('\nAfter Imputation:')
print(data[['BusinessTravel']].value_counts())

In [None]:
# Imputing DailyRate and DistanceFromHome with Mean values
mean_imputed_cols = ['DailyRate', 'DistanceFromHome']

print('Before Imputation:')
print(data[mean_imputed_cols].describe().T)

# Each column is filled with its own mean (fillna matches the means by column name).
column_means = data[mean_imputed_cols].mean()
data[mean_imputed_cols] = data[mean_imputed_cols].fillna(column_means)

print('\nAfter Imputation:')
print(data[mean_imputed_cols].describe().T)

In [None]:
# Imputing Age as per TotalWorkingYears

print('Before Imputation:')
print(data[['Age']].describe().T)

# Order the ages by TotalWorkingYears and back-fill, so a missing Age takes the value
# of the next row in experience order. The sort is done on a temporary view only:
# sorting `data` itself in place left the frame with a permuted index, which breaks
# later cells that combine it with freshly indexed frames. A stable sort keeps the
# order of ties reproducible.
age_by_experience = data.sort_values(by='TotalWorkingYears', kind='stable')['Age']

# Assignment aligns on the index, so each filled value returns to its own row.
# The trailing ffill covers gaps at the end of the sorted order that bfill cannot reach.
data['Age'] = age_by_experience.bfill().ffill()

print('\nAfter Imputation:')
print(data[['Age']].describe().T)

In [None]:
# Imputing MaritalStatus as per StockOptionLevel
# First look at how the two columns relate, then at the StockOptionLevel of the rows
# whose MaritalStatus is missing.
marital_by_stock = pd.crosstab(data['MaritalStatus'], data['StockOptionLevel'])
print(marital_by_stock)

print('\nStockOptionLevel Distribution across missing MaritalStatus values:')
marital_missing = data['MaritalStatus'].isna()
print(data.loc[marital_missing, 'StockOptionLevel'])

In [None]:
# For StockOptionLevel 1 & 2 mode of MaritalStatus is Married.
# The code below fills every missing MaritalStatus with the overall mode of the column.
print('\nBefore Imputation:')
print(data[['MaritalStatus']].value_counts())

# Assign the filled column back explicitly: `data[col].fillna(..., inplace=True)` is
# chained assignment and does not update `data` under pandas Copy-on-Write.
most_common_status = data['MaritalStatus'].mode()[0]
data['MaritalStatus'] = data['MaritalStatus'].fillna(most_common_status)

print('\nAfter Imputation:')
print(data[['MaritalStatus']].value_counts())

In [None]:
# verifying missing values: the non-null counts printed here show whether any
# column still has gaps after the imputation cells above
data.info()

# Looking at Outliers

In [None]:
# Boxplots of the numeric columns, leaving out the four income / rate columns,
# which are drawn in their own cells below.
separately_plotted = ['MonthlyIncome', 'MonthlyRate', 'DailyRate', 'HourlyRate']
NumericData = data[[feature for feature in numerical_features
                    if feature not in separately_plotted]]

# Long format: one row per (variable, value) so seaborn draws one box per column.
NumericMelt = NumericData.melt()

plt.figure(figsize=(15,10))
plt.title("Boxplots for Numerical variables")
bp = sns.boxplot(x='variable', y='value', data=NumericMelt)
bp.set_xticklabels(bp.get_xticklabels(), rotation=90)
plt.show()

In [None]:
# Boxplots of DailyRate and HourlyRate, drawn apart from the other numeric columns
# because of their very different range of values.
rate_columns = ['DailyRate', 'HourlyRate']
NumericData = data[rate_columns]
NumericMelt = NumericData.melt()

plt.figure(figsize=(8,5))
plt.title("Boxplots for Numerical variables")
bp = sns.boxplot(x='variable', y='value', data=NumericMelt)
bp.set_xticklabels(bp.get_xticklabels(), rotation=90)
plt.show()

In [None]:
# Boxplots of MonthlyIncome and MonthlyRate, drawn apart from the other numeric
# columns because of their very different range of values.
monthly_columns = ['MonthlyIncome', 'MonthlyRate']
NumericData = data[monthly_columns]
NumericMelt = NumericData.melt()

plt.figure(figsize=(8,5))
plt.title("Boxplots for Numerical variables")
bp = sns.boxplot(x='variable', y='value', data=NumericMelt)
bp.set_xticklabels(bp.get_xticklabels(), rotation=90)
plt.show()

**Columns with Outlier values:**<br>
1. TotalWorkingYears
2. YearsAtCompany
3. YearsInCurrentRole
4. YearsSinceLastPromotion
5. YearsWithCurrManager
6. TrainingTimesLastYear
7. NumCompaniesWorked
8. MonthlyIncome<br>
Other columns like PerformanceRating are not considered in outliers as they have very few unique values

In [None]:
# Percentage of outliers present in each variable
# A value counts as an outlier when it lies outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
outlier_percentage = {}
for feature in ['TotalWorkingYears', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
                'YearsWithCurrManager', 'TrainingTimesLastYear', 'NumCompaniesWorked', 'MonthlyIncome']:
    # Quantiles and the comparisons below do not depend on row order, so no sort is needed.
    tempData = data[feature]
    Q1, Q3 = tempData.quantile([0.25, 0.75])
    IQR = Q3 - Q1
    Lower_range = Q1 - (1.5 * IQR)
    Upper_range = Q3 + (1.5 * IQR)
    # Use the fences computed above instead of re-deriving them inline.
    is_outlier = (tempData<Lower_range) | (tempData>Upper_range)
    outlier_percentage[feature] = round((is_outlier.sum()/tempData.shape[0])*100,2)
outlier_percentage

In [None]:
# Outlier treatment for the features with more than 4% outlier values
# Values outside the IQR fences are capped to the nearest fence. `data` is left
# untouched; the treated frame is `df_outlier`.
df_outlier = data.copy()
for feature in ['TotalWorkingYears', 'YearsAtCompany', 'YearsSinceLastPromotion', 'TrainingTimesLastYear', 'MonthlyIncome']:
    Q1, Q3 = df_outlier[feature].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    Lower_range = Q1 - (1.5 * IQR)
    Upper_range = Q3 + (1.5 * IQR)
    # clip() caps low outliers at the lower fence and high outliers at the upper one.
    # The previous assignment wrote Upper_range to *both* tails, turning the smallest
    # values into the largest.
    df_outlier[feature] = df_outlier[feature].clip(lower=Lower_range, upper=Upper_range)

# Handling Categorical Features (Label and One Hot Encoding)

In [None]:
# Label-encode every categorical feature of the outlier-treated frame and one-hot
# encode those with more than two classes. The fitted encoders and the derived column
# names are saved under <path>/TextEncoding; the encoded columns are collected in
# `encodedData`.
df = df_outlier.copy()
path = '/kaggle/working'

# Create the directory for the saved encoders once, instead of re-checking per feature.
encoding_dir = os.path.join(path, "TextEncoding")
os.makedirs(encoding_dir, exist_ok=True)

encoded_parts = []
for feature in categorical_features:

    le = LabelEncoder()

    # Fit the label encoder and replace the column by its integer codes.
    df[feature] = le.fit_transform(df[feature])
    # Save the encoder. Passing a filename lets joblib open and close the file itself;
    # the previous `joblib.dump(obj, open(...))` form never closed its handles.
    joblib.dump(le, os.path.join(encoding_dir, "le_{}.sav".format(feature)))

    # Class names prefixed with the feature; the first class is dropped to avoid
    # the dummy variable trap.
    columns = [feature + ' ' + str(cls) for cls in le.classes_][1:]
    joblib.dump(columns, os.path.join(encoding_dir, "le_{}_classes.sav".format(feature)))

    if len(le.classes_)>2:
        # `sparse` was renamed to `sparse_output` in newer scikit-learn releases and
        # later removed; try the new keyword first and fall back to the old one.
        try:
            ohe = OneHotEncoder(sparse_output=False)
        except TypeError:
            ohe = OneHotEncoder(sparse=False)

        # Fit the one-hot encoder on the integer codes and save it.
        ohe.fit(df[[feature]])
        joblib.dump(ohe, os.path.join(encoding_dir, "ohe_{}.sav".format(feature)))

        # Drop the first encoded column (dummy variable trap) and label the rest.
        # The frame must reuse df's index: concat(axis=1) aligns rows by index label,
        # and a default 0..n-1 index would pair these columns with the wrong rows
        # whenever df is not in its original row order.
        tempData = pd.DataFrame(ohe.transform(df[[feature]])[:, 1:],
                                columns=columns, index=df.index)
    else:
        # Two classes: the label-encoded 0/1 column is already the dummy variable.
        tempData = df[feature]

    encoded_parts.append(tempData)

# Combine all encoded pieces in one step; this also works when the first feature is
# binary (a Series), which the previous `tempData.columns` access did not.
encodedData = pd.concat(encoded_parts, axis=1)

In [None]:
# merge numerical features and categorical encoded features
# Keep the numeric columns and the target, then append the encoded categorical columns.
kept_columns = numerical_features+['Attrition']
df = pd.concat([df[kept_columns], encodedData], axis=1)
df.info()

# Training Model

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn import metrics, preprocessing
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVC

In [None]:
train_data = df.copy()

# Columns that are not model inputs: the target itself plus the features left out
# of the model.
excluded_cols = {'Attrition', 'PerformanceRating', 'JobLevel',
                 'Age', 'PercentSalaryHike',
                 'WorkLifeBalance', 'JobInvolvement',
                 'YearsAtCompany', 'Department Research & Development',
                 'Department Sales'}
feature_cols = [col for col in train_data.columns if col not in excluded_cols]

# Rescaling to [0,1]
# NOTE(review): the scaler is fitted on all rows, before the train/validation split
# made in the next cell - confirm that this is acceptable for the evaluation.
scaler = MinMaxScaler().fit(train_data[feature_cols])
train_data[feature_cols] = scaler.transform(train_data[feature_cols])

In [None]:
X = train_data[feature_cols]

# Encode the target as 0/1 and write it back to train_data explicitly. An in-place
# replace on `train_data['Attrition']` is not guaranteed to reach the parent frame
# (it does not under pandas Copy-on-Write), yet later cells read the column again.
# The 0 and 1 keys keep the mapping safe when this cell is re-run.
train_data['Attrition'] = train_data['Attrition'].map({'No': 0, 'Yes': 1, 0: 0, 1: 1})
y = train_data['Attrition']

# Hold out 25% of the rows for validation, preserving the class ratio in both parts.
validation_size = 0.25
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=validation_size,
                                                    random_state=4, stratify=y)

# Model 1: Logistic Regression

In [None]:
# Weight the positive class (1) ten times as much as the negative class (0).
attrition_class_weight = {0: 1, 1: 10}
model = LogisticRegression(class_weight=attrition_class_weight)
model.fit(X_train, y_train)

In [None]:
# Confusion matrix and classification report on the training split, then on the
# held-out split.
evaluation_splits = (
    ('Train metrics...', X_train, y_train),
    ('Validation metrics...', X_test, y_test),
)
for heading, split_X, split_y in evaluation_splits:
    y_pred = model.predict(split_X)

    print(heading)
    print(confusion_matrix(split_y, y_pred))
    print(classification_report(split_y, y_pred))

In [None]:
# Metrics on the full data set (training and validation rows together).

def make_cm(matrix, columns):
    """Wrap a confusion matrix in a DataFrame with two-level row and column labels.

    Parameters
    ----------
    matrix : array-like
        Square confusion matrix with one row and one column per class.
    columns : list of str
        Class names, in the order used by ``matrix``.

    Returns
    -------
    pandas.DataFrame
        ``matrix`` with rows labelled ('actual Attrition', class) and columns
        labelled ('prediction Attrition', class).
    """
    n = len(columns)
    act = ['actual Attrition'] * n
    pred = ['prediction Attrition'] * n

    cm = pd.DataFrame(matrix,
        columns=[pred, columns], index=[act, columns])
    return cm

# Use X and y, the feature matrix and 0/1 target that were split for training.
# Reading train_data['Attrition'] back instead only works if the in-place recoding
# of `y` also changed the parent frame, which pandas does not guarantee.
y_pred = model.predict(X)

df_matrix = make_cm(confusion_matrix(y, y_pred), ['No', 'Yes'])

display(df_matrix)
print(classification_report(y, y_pred))

# Model 2: SVM

In [None]:
# Weight the positive class (1) ten times as much as the negative class (0).
attrition_class_weight = {0: 1, 1: 10}
model = SVC(class_weight=attrition_class_weight)
model.fit(X_train, y_train)

In [None]:
# Confusion matrix and classification report on the training split, then on the
# held-out split.
evaluation_splits = (
    ('Train metrics...', X_train, y_train),
    ('Test metrics...', X_test, y_test),
)
for heading, split_X, split_y in evaluation_splits:
    y_pred = model.predict(split_X)

    print(heading)
    print(confusion_matrix(split_y, y_pred))
    print(classification_report(split_y, y_pred))

In [None]:
# Metrics on the full data set (training and validation rows together).
# make_cm is defined once, in the logistic-regression metrics cell above; the
# identical second definition that used to live here has been removed.

# Use X and y, the feature matrix and 0/1 target that were split for training.
# Reading train_data['Attrition'] back instead only works if the in-place recoding
# of `y` also changed the parent frame, which pandas does not guarantee.
y_pred = model.predict(X)

df_matrix = make_cm(confusion_matrix(y, y_pred), ['No', 'Yes'])

display(df_matrix)
print(classification_report(y, y_pred))