In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
# Ignore every warning for the rest of the session. NOTE(review): this also hides
# deprecation notices emitted by the libraries used below.
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
# Lift pandas' display limits so every column and row is shown in cell outputs.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics

In [None]:
# reading the input csv file with pandas
data = pd.read_csv(
    '../input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv'
)

In [None]:
# Preview the first rows of the loaded frame.
data.head()

We need to check whether any independent variables are highly correlated. To build a better model we need to 
reduce collinearity between the independent variables.<br/>
<b>Why remove collinearity between the independent variables?</b><br>
When variables are highly correlated, a change in one variable is accompanied by a change in another, so the model's 
estimates become unstable. Even a small change in the data can result in a large change in the model results.<br><br>
<b>How can we check whether there is collinearity between the independent variables?</b><br>
1) By correlation matrix <br>
2) By variance inflation factor <br><br>

I am using the correlation matrix to identify the collinearity.<br>

In [None]:
# Correlate only the numeric predictors. DataFrame.corr() cannot use the string-valued
# columns, and pandas >= 2.0 raises on them instead of silently skipping them.
corr = data.drop(['Attrition'], axis=1).select_dtypes(include='number').corr()
plt.figure(figsize=(10,10))
sns.heatmap(corr, cmap='YlGnBu')
plt.show()

In the upcoming code we will get rid of the independent variables that exhibit multicollinearity.<br>
The Age column is highly correlated with columns such as JobLevel, MonthlyIncome, NumCompaniesWorked etc.

In [None]:
# Encode the target as 1 for 'Yes' and 0 for every other value, then show the class counts.
data['Attrition'] = data['Attrition'].eq('Yes').astype('int64')
data['Attrition'].value_counts()

The data is imbalanced: the employees who were not retained (Attrition = 1) are very few compared with those who stayed.

<b>Checking if any column contains null values</b>

In [None]:
# Count the missing values in each column.
data.isna().sum()

<b>Checking if any column data type is not correct</b>

In [None]:
# Print the frame summary so each column's data type can be checked.
data.info()

<b>Getting the descriptive stats of the dataframe</b>

In [None]:
# Descriptive statistics of the frame.
data.describe()

There are some numerical columns from which we can create categorical variables: <br>
1) <b>DistanceFromHome</b><br>
   Distance from the office can be converted to NearBy, MidDistance and Far<br>
2) <b>YearsInCurrentRole</b><br>
   Years in current role can be converted to short, medium and long<br>
Similarly, there are other columns such as <b>YearsWithCurrManager</b>, <b>YearsSinceLastPromotion</b>, <b>YearsAtCompany</b> etc.<br>

In [None]:
# Inclusive (low, high) ranges shared by the tenure-style groupers:
# 0-3 -> 'short', 4-8 -> 'medium', anything else -> 'long'.
_TENURE_RANGES = ((0, 3), (4, 8))
_TENURE_LABELS = ('short', 'medium', 'long')


def _label_for(value, ranges, labels):
    """Return the label whose inclusive integer range contains ``int(value)``.

    ``ranges`` holds one ``(low, high)`` pair for every label except the last.
    The last label is the fallback for any value outside all ranges, which
    includes values below the first range (e.g. negatives).
    """
    number = int(value)  # same truncation the groupers have always applied
    for (low, high), label in zip(ranges, labels):
        if low <= number <= high:
            return label
    return labels[-1]


def groupDistanceFromHome(data):
    """Bucket a distance: 1-5 -> 'NearBy', 6-15 -> 'MidDistance', otherwise 'Far'."""
    return _label_for(data, ((1, 5), (6, 15)), ('NearBy', 'MidDistance', 'Far'))


def groupYearsInCurrentRole(data):
    """Bucket years in current role: 0-3 -> 'short', 4-8 -> 'medium', otherwise 'long'."""
    return _label_for(data, _TENURE_RANGES, _TENURE_LABELS)


def groupYearsWithCurrManager(data):
    """Bucket years with current manager: 0-3 -> 'short', 4-8 -> 'medium', otherwise 'long'."""
    return _label_for(data, _TENURE_RANGES, _TENURE_LABELS)


def groupYearsSinceLastPromotion(data):
    """Bucket years since last promotion: 0-3 -> 'short', 4-8 -> 'medium', otherwise 'long'."""
    return _label_for(data, _TENURE_RANGES, _TENURE_LABELS)


def groupYearsAtCompany(data):
    """Bucket years at the company: 0-3 -> 'short', 4-8 -> 'medium', otherwise 'long'."""
    return _label_for(data, _TENURE_RANGES, _TENURE_LABELS)


def groupTotalWorkingYears(data):
    """Bucket total working years: 0-8 -> 'short', 9-15 -> 'medium', otherwise 'long'."""
    return _label_for(data, ((0, 8), (9, 15)), _TENURE_LABELS)


def groupPercentSalaryHike_by_rating(data):
    """Map 3 -> 'good', 4 -> 'better' and any other value -> 'best'."""
    return _label_for(data, ((3, 3), (4, 4)), ('good', 'better', 'best'))


def groupAverageWorkingYearInEachComp(data):
    """Bucket average years per company: 0-3 -> 'short', 4-8 -> 'medium', otherwise 'long'."""
    return _label_for(data, _TENURE_RANGES, _TENURE_LABELS)


def getAvgWorkingYearInEachComp(TotalWorkingYears, NumCompaniesWorked):
    """Return total working years divided by the number of companies worked at.

    When ``NumCompaniesWorked`` is 0 the total is returned unchanged, which
    avoids a division by zero.
    """
    if NumCompaniesWorked == 0:
        return TotalWorkingYears
    return TotalWorkingYears / NumCompaniesWorked

In [None]:
# Average tenure per employer, truncated to whole years and then bucketed.
# This has to run before 'TotalWorkingYears' is replaced by its label below.
tenure_inputs = data[['TotalWorkingYears', 'NumCompaniesWorked']]
avg_years_per_company = tenure_inputs.apply(
    lambda rec: getAvgWorkingYearInEachComp(rec.TotalWorkingYears, rec.NumCompaniesWorked),
    axis=1,
)
data['AverageWorkingYearInEachComp'] = (
    avg_years_per_company.astype(int).apply(groupAverageWorkingYearInEachComp)
)

# Replace each numeric column with its categorical label.
for column, grouper in [
    ('DistanceFromHome', groupDistanceFromHome),
    ('YearsInCurrentRole', groupYearsInCurrentRole),
    ('YearsWithCurrManager', groupYearsWithCurrManager),
    ('YearsSinceLastPromotion', groupYearsSinceLastPromotion),
    ('YearsAtCompany', groupYearsAtCompany),
    ('TotalWorkingYears', groupTotalWorkingYears),
]:
    data[column] = data[column].apply(grouper)

In [None]:
# Monthly income divided by age, truncated to an integer.
data['MontlyIncomeByAge'] = data['MonthlyIncome'].div(data['Age']).astype(int)

In [None]:
# Remove these four columns from the frame before modelling.
data = data.drop(columns=['EmployeeNumber', 'EmployeeCount', 'StandardHours', 'Over18'])

In [None]:
# The frame now mixes numeric columns with string labels. Correlate the numeric ones
# only, because pandas >= 2.0 raises when corr() meets non-numeric columns.
corr = data.select_dtypes(include='number').corr()
plt.figure(figsize=(10,10))
sns.heatmap(corr, cmap='YlGnBu')
plt.show()

<b>Getting the categorical columns and numerical columns</b>

In [None]:
# DataFrame.iteritems() was removed in pandas 2.0; items() is the supported spelling
# and yields the same (column name, Series) pairs.
predictors = data.drop(['Attrition'], axis=1)
cat_columns = [col for col, value in predictors.items() if value.dtype == 'object']
num_columns = predictors.columns.difference(cat_columns)

In [None]:
# Show which predictors were classified as categorical and which as numerical.
print(f"categorical columns - {cat_columns}")
print()
print(f"numerical columns - {num_columns}")

In [None]:
# Split the frame into the target, the categorical predictors and the numeric predictors.
attrition_data = data.loc[:, 'Attrition']
cat_data = data.loc[:, cat_columns]
num_data = data.loc[:, num_columns]

<b>Creating dummy variables for the categorical variables</b>

In [None]:
# Replace the categorical columns with their dummy (indicator) columns.
cat_data = pd.get_dummies(cat_data)

Now concatenating the dummy-variable columns with the numerical data and the dependent variable.

In [None]:
# Join the dummy columns, the numeric columns and the target side by side (axis=1),
# then preview the combined frame.
final_data = pd.concat([cat_data, num_data, attrition_data], axis=1)
final_data.head()

<b>Splitting the data into train and test sets</b><br>
using a train_size of 0.7 and a test_size of 0.3

In [None]:
# Fix the seed so the split, and every result derived from it, is reproducible.
# Stratify on the target so both partitions keep the same attrition ratio.
train_data, test_data = train_test_split(
    final_data,
    train_size=0.7,
    test_size=0.3,
    random_state=42,
    stratify=final_data['Attrition'],
)

In [None]:
# Separate the target column from the predictors in each partition.
X_train, y_train = train_data.drop(columns='Attrition'), train_data['Attrition']
X_test, y_test = test_data.drop(columns='Attrition'), test_data['Attrition']

Normalizing the numerical values.<br>
<b>Why is normalizing the data required?</b><br>
Some of the numerical columns may contain outliers, and the columns are on very different scales. To make our model more robust, and to keep large-valued columns from dominating the others, we need to normalize the data.<br>

I am using the MinMaxScaler to normalize the numerical columns.

In [None]:
scaler = MinMaxScaler()
# fit_transform returns a bare array. Rebuild the frame with the original column names
# and row index so the scaled rows stay aligned with y_train.
X_train_transformed = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)

Using the code snippet below, I compute the correlation matrix and, for every pair of columns whose absolute 
correlation is greater than 0.8, mark one of the two columns for removal.

In [None]:
corr_matrix = X_train_transformed.corr()
# Look only below the diagonal: a column is flagged when its absolute correlation with
# any column that precedes it exceeds 0.8.
strongly_correlated = np.tril(corr_matrix.abs().to_numpy() > 0.8, k=-1)
corr_features = set(corr_matrix.columns[strongly_correlated.any(axis=1)])
corr_features

In [None]:
# Remove the columns whose absolute correlation with an earlier column exceeded 0.8.
X_train_transformed = X_train_transformed.drop(columns=list(corr_features))

<b>Applying LogisticRegression with RFECV to get the top features that lead to employee attrition (the first 12 are used below)</b><br>
Logistic Regression is used for <b>binary classification</b>: it assigns each data point to one of two categories.<br><br>

<b><u>RFECV</u></b> - Recursive Feature Elimination with Cross-Validation


In [None]:
# Recursive feature elimination scored with 5-fold stratified cross-validation;
# at least 5 features are always kept.
log_reg = LogisticRegression()
rfe = RFECV(
    estimator=log_reg,
    min_features_to_select=5,
    cv=StratifiedKFold(n_splits=5),
    scoring='neg_mean_squared_error',
)
rfe.fit(X_train_transformed, y_train)

In [None]:
# Keep only the columns that RFECV marked as selected.
X_train_transformed = X_train_transformed.loc[:, rfe.support_]

In [None]:
# Pair every selected feature with its coefficient from the estimator fitted by RFECV.
important_cols = pd.DataFrame({
    'Cols': X_train_transformed.columns,
    'Percent': rfe.estimator_.coef_[0],
})

In [None]:
# Order the features from the largest to the smallest coefficient and display the table.
important_cols = important_cols.sort_values(by='Percent', ascending=False)
important_cols

Selecting the first 12 features and creating a Logistic Regression model using them.

In [None]:
# Keep only the first 12 features of the sorted coefficient table.
# NOTE(review): the variable name says 10, but 12 columns are selected.
X_train_10_imp_feature = X_train_transformed[important_cols['Cols'].head(12).tolist()]

In [None]:
# Fit the logistic regression on the reduced training feature set.
log_reg.fit(X_train_10_imp_feature, y_train)

In [None]:
# Reuse the scaler fitted on the training data (transform only, no refit). Keep the
# original column names and row index so the scaled rows stay aligned with y_test.
X_test_transform = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

In [None]:
# Keep only the columns the final model was trained on, in the same order.
X_test_transform_required_cols = X_test_transform[X_train_10_imp_feature.columns]

In [None]:
# Predict the attrition label for every test row.
y_test_pred = log_reg.predict(X_test_transform_required_cols)

In [None]:
# accuracy_score is documented as (y_true, y_pred): pass the ground truth first.
metrics.accuracy_score(y_test, y_test_pred)