In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import seaborn as sns
import matplotlib.pyplot as plt

# Show every column when a DataFrame is displayed (no "..." truncation).
# Uses the public `pd.set_option` rather than the accidental `pd.pandas` self-reference.
pd.set_option('display.max_columns', None)

In [None]:
# Load the attrition CSV from the Kaggle input mount into the working DataFrame.
csv_path = '/kaggle/input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the freshly loaded data.
df.head()

In [None]:
# (rows, columns) of the raw data.
df.shape

We can drop these columns, as they are just indicators:
* EmployeeCount
* EmployeeNumber
* StandardHours

In [None]:
# Remove the three indicator columns. `columns=` already implies the axis, so the
# redundant `axis=1` is gone. Reassigning (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run once the columns have been dropped.
df = df.drop(columns=['EmployeeCount', 'EmployeeNumber', 'StandardHours'], errors='ignore')

**Attrition** is the `target` column

Obtain a count plot of the column

In [None]:
# Name of the label column; the plotting helpers and feature lists below read this global.
target = 'Attrition'

In [None]:
# Class balance of the label column.
sns.countplot(data=df, x=target)

From the countplot it can be observed that the target is imbalanced.

Later in this notebook, we will rebalance it.

Now check whether any feature contains null values.

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values per column.
df.isna().sum()

So from the above series it can be concluded that there are no missing values

### Now it's time to do some data visualization
We will start with the categorical variables

In [None]:
# Categorical variables

# Every object-dtype column except the label is treated as categorical.
cat_vars = [col for col in df.columns if col != target and df[col].dtype == 'O']

# Some features are stored as numbers but only take a handful of discrete
# levels, so they are analysed together with the categorical ones.
xtra_vars = [
    'Education', 'EnvironmentSatisfaction', 'JobInvolvement', 'JobLevel',
    'JobSatisfaction', 'PerformanceRating', 'RelationshipSatisfaction',
    'WorkLifeBalance', 'NumCompaniesWorked', 'StockOptionLevel',
    'PercentSalaryHike', 'TrainingTimesLastYear',
]

cat_vars.extend(xtra_vars)

In [None]:
# Display the combined list of categorical feature names.
cat_vars

In [None]:
def plot_cat(var, dataframe):
    """Draw a count plot of column `var` from `dataframe`, with bars split by `target`."""
    fig, ax = plt.subplots(figsize=(16, 4))
    sns.countplot(data=dataframe, x=var, hue=target, ax=ax)
    plt.show()

In [None]:
# One count plot per categorical feature.
for cat_var in cat_vars:
    plot_cat(cat_var, df)

## Let's plot the percentage of employees
## who left for each level of every feature

In [None]:
def plot_cat_percent(var, dataframe):
    """Bar-plot the attrition rate (in percent) for each level of a feature.

    Parameters
    ----------
    var : str
        Column of `dataframe` to group by.
    dataframe : pandas.DataFrame
        Data containing `var` and the module-level `target` column, whose
        positive class is the string 'Yes'.

    Each bar is the share of rows with ``target == 'Yes'`` within one level
    of `var`, multiplied by 100.
    """
    plt.figure(figsize=(16, 4))
    # Work on the frame that was passed in, not on the notebook-global `df`.
    left = dataframe[target] == 'Yes'
    # The mean of a boolean mask is the fraction of 'Yes' rows per level. Dividing
    # two separately grouped counts would produce NaN for any level that has no
    # 'Yes' (or no 'No') rows; this yields 0 % (or 100 %) instead.
    rat = left.groupby(dataframe[var]).mean() * 100
    rat.plot(kind='bar')
    plt.ylabel('Attrition rate (%)')
    plt.show()

In [None]:
# One attrition-rate plot per categorical feature.
for cat_var in cat_vars:
    plot_cat_percent(cat_var, df)

These columns do not contribute much to the attrition rate:

**`Gender, Over18, PerformanceRating`**

For those features the ratio of employees leaving is almost the same across categories, so we can drop those columns.

## Numerical Variables

In [None]:
# Everything that is neither categorical nor the label is treated as numerical.
not_numeric = set(cat_vars) | {target}
num_vars = [col for col in df.columns if col not in not_numeric]

In [None]:
# Display the list of numerical feature names.
num_vars

In [None]:
# Histogram grid of the numerical features.
df.hist(column=num_vars, figsize=(15, 15), bins=30)
plt.show()

In [None]:
# Distribution of each numerical feature, split by the label.
for num_var in num_vars:
    sns.boxplot(data=df, x=target, y=num_var)
    plt.show()


In [None]:
# Summary statistics of the numerical features for each label value.
df.groupby(target)[num_vars].describe()

**HourlyRate**, **MonthlyRate**, **YearsSinceLastPromotion** are not significant features here  

In [None]:
# Take the features judged insignificant above out of the numerical list.
for weak_col in ('HourlyRate', 'MonthlyRate', 'YearsSinceLastPromotion'):
    num_vars.remove(weak_col)

In [None]:
# Pairwise correlation between the remaining numerical features.
fig, ax = plt.subplots(figsize=(12, 6))
corr_matrix = df[num_vars].corr()
sns.heatmap(corr_matrix, annot=True, ax=ax)

From the heatmap it can be observed that there are some mutually correlated features

In [None]:
# Monthly income against total working years.
sns.scatterplot(data=df, x='TotalWorkingYears', y='MonthlyIncome')

That is predictable: the more experience, the higher the salary.

In [None]:
# Total working years against age.
sns.scatterplot(data=df, x='Age', y='TotalWorkingYears')

Same inference as above

In [None]:
# Total working years against years at the company.
sns.scatterplot(data=df, x='YearsAtCompany', y='TotalWorkingYears')

In [None]:
# Years in the current role against total working years.
sns.scatterplot(data=df, x='TotalWorkingYears', y='YearsInCurrentRole')

In [None]:
# Years with the current manager against total working years.
sns.scatterplot(data=df, x='TotalWorkingYears', y='YearsWithCurrManager')

We can remove `Age` from the feature set, because age is correlated with total working years.


In [None]:
# Take 'Age' out of the numerical feature list (list.remove raises ValueError if it is already gone).
num_vars.remove('Age')

In [None]:
# Take the categorical features with a near-constant attrition ratio out of the list.
for flat_col in ('Gender', 'Over18', 'PerformanceRating'):
    cat_vars.remove(flat_col)

In [None]:
# Display the numerical features that remain after the removals above.
num_vars

In [None]:
# Selected features: numerical first, then categorical, with the label last.
fqs = [*num_vars, *cat_vars, target]

In [None]:
# Number of selected columns versus number of columns currently in df.
print(len(fqs), df.shape[1])

In [None]:
# Keep only the selected columns, in the order given by fqs.
df = df.loc[:, fqs]

Now we will encode various categories

We will encode **OverTime** and **Attrition** with `OrdinalEncoder`, and the remaining categorical columns with `OneHotEncoder`.

In [None]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [None]:
# Encode the two columns selected below as ordinal numeric codes.
enc = OrdinalEncoder()
binary_values = df[['OverTime', 'Attrition']].to_numpy()
oc_data = enc.fit_transform(binary_values)

In [None]:
# Drop the original string columns and replace them with their encoded versions.
ord_cols = ['OverTime','Attrition']
# pd.concat(axis=1) aligns rows on the index. Giving the encoded frame df's own
# index guarantees row-for-row alignment even when df does not carry a default
# 0..n-1 index; a fresh RangeIndex would otherwise misalign and introduce NaNs.
oc_df = pd.DataFrame(oc_data, columns=ord_cols, index=df.index)
df = df.drop(columns=ord_cols)
df = pd.concat([df, oc_df], axis=1)

In [None]:
# One-hot encode the nominal columns, dropping the first level of each one.
ohc_cols = ['BusinessTravel', 'Department', 'EducationField', 'JobRole', 'MaritalStatus']

# `sparse=` was renamed to `sparse_output=` in scikit-learn 1.2 and later removed.
# Leaving the encoder at its default (sparse) output and densifying the result
# works on both old and new versions.
ohc = OneHotEncoder(drop='first')
ohc_data = ohc.fit_transform(df[ohc_cols].values).toarray()

# `get_feature_names()` was removed in scikit-learn 1.2; prefer its replacement
# `get_feature_names_out()` and fall back only on old installations.
if hasattr(ohc, 'get_feature_names_out'):
    ohc_names = ohc.get_feature_names_out()
else:
    ohc_names = ohc.get_feature_names()

# Reuse df's index so the column-wise concat aligns row-for-row.
ohc_df = pd.DataFrame(ohc_data, columns=ohc_names, index=df.index)
df = df.drop(columns=ohc_cols)
df = pd.concat([df, ohc_df], axis=1)

In [None]:
# Check the column dtypes after encoding.
df.dtypes

In [None]:
# Preview the encoded feature table.
df.head()

In [None]:
# Separate the label vector from the feature matrix.
y = df['Attrition'].to_numpy()
X = df.drop(columns='Attrition')

In [None]:
# Number of labels.
y.shape

Now it's time to use the SMOTE method to balance the output classes. This is a simple implementation; you can fine-tune it later.

In [None]:
from imblearn.over_sampling import SMOTE

oversample = SMOTE()
Xo, yo = oversample.fit_resample(X, y)

In [None]:
yo.shape

In [None]:
sns.countplot(x=yo)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(Xo, yo, test_size=0.2, random_state=41)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training features only, then apply it to both parts.
scaler = MinMaxScaler().fit(X_train)
X_train_sc = scaler.transform(X_train)
X_test_sc = scaler.transform(X_test)

In [None]:
# Shapes of the scaled train and test matrices.
print(X_train_sc.shape, X_test_sc.shape)

## First Model

## Simple ANN model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Take the input width from the scaled training matrix instead of hard-coding 38,
# so the network still builds if the selected or encoded feature set changes.
n_features = X_train_sc.shape[1]

# Fully connected binary classifier: three ReLU hidden layers, sigmoid output.
model = Sequential()

model.add(Dense(38, input_shape=(n_features,), activation='relu'))
model.add(Dense(19, activation='relu'))
model.add(Dense(9, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.summary()

In [None]:
# Binary cross-entropy loss with the Adam optimizer; track accuracy while training.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train for 50 epochs, evaluating on the held-out split after every epoch.
model.fit(
    X_train_sc,
    y_train,
    validation_data=(X_test_sc, y_test),
    epochs=50,
)

In [None]:
# Per-epoch metrics recorded by the last fit() call, one column per metric.
training_log = model.history.history
model_loss = pd.DataFrame(training_log)
model_loss.plot()

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Threshold the predicted probabilities at 0.5 to obtain 0/1 labels.
pred = (model.predict(X_test_sc) > 0.5).astype(int)

c_m = confusion_matrix(y_test, pred)
print(c_m)
print(classification_report(y_test, pred))

## Model 2

## For this model, we add early stopping (it is not required; I am doing it for learning purposes)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop training once the validation loss has not improved for 10 epochs.
early_stop = EarlyStopping(monitor='val_loss', patience=10, mode='min', verbose=1)

In [None]:
# Take the input width from the scaled training matrix instead of hard-coding 38,
# so the network still builds if the selected or encoded feature set changes.
n_features = X_train_sc.shape[1]

# Same architecture as the first model; this one is trained with early stopping.
model_e = Sequential()

model_e.add(Dense(38, input_shape=(n_features,), activation='relu'))
model_e.add(Dense(19, activation='relu'))
model_e.add(Dense(9, activation='relu'))
model_e.add(Dense(1, activation='sigmoid'))

model_e.summary()

model_e.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train for up to 50 epochs; the early-stopping callback may end the run sooner.
model_e.fit(
    X_train_sc,
    y_train,
    validation_data=(X_test_sc, y_test),
    epochs=50,
    callbacks=[early_stop],
)

In [None]:
# Per-epoch metrics recorded by the last fit() call of the second model.
training_log = model_e.history.history
model_loss = pd.DataFrame(training_log)
model_loss.plot()

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Threshold the predicted probabilities at 0.5 to obtain 0/1 labels.
pred = (model_e.predict(X_test_sc) > 0.5).astype(int)

c_m = confusion_matrix(y_test, pred)
print(c_m)
print(classification_report(y_test, pred))

### That's it from my end. This notebook was part of my deep learning journey

### PLzzzzzzz like / star if it's good