In [None]:
import numpy as np # linear algebra
import pandas as pd 
import matplotlib.pyplot as plt
from matplotlib.pyplot import show
import seaborn as sns

In [None]:
# Load the IBM HR employee-attrition CSV into a DataFrame
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv"
)

In [None]:
# Preview the first five rows of the dataset
df.head(n=5)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

In [None]:
# Draw one 20-bin histogram per numeric column on a large canvas
df.hist(figsize=(20, 20), bins=20)

In [None]:
# Check data types and non-null counts for every column
df.info()

In [None]:
# Drop columns the analysis treats as unnecessary (presumably an identifier
# column and constant-valued columns -- confirm against the summary statistics).
# Reassigning instead of using inplace=True, together with errors="ignore",
# keeps this cell safe to re-run: a second execution no longer raises KeyError
# for columns that are already gone.
df = df.drop(
    columns=["EmployeeNumber", "EmployeeCount", "StandardHours", "Over18"],
    errors="ignore",
)

In [None]:
# Count missing values per column and show the counts as a bar chart
df.isna().sum().plot(kind="bar")

No missing values at all in the data!


In [None]:
# Plot the correlation matrix
# Correlation is only defined for numeric data, and the frame still contains
# string columns at this point. Recent pandas versions raise on them instead of
# silently skipping them, so restrict the frame to numeric columns explicitly.
f, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(df.select_dtypes(include="number").corr(), annot=True, ax=ax)

**Summary from correlation plot**
* Job level is strongly correlated with total working years
* Monthly income is strongly correlated with job level
* Monthly income is strongly correlated with total working years
* Age is strongly correlated with monthly income

In [None]:
# Stack six count plots vertically, each split by Attrition
plt.figure(figsize=[20,20])
for panel_index, column_name in enumerate(
    ['JobRole', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'DistanceFromHome', 'Age'],
    start=1,
):
    # Panels occupy rows 1..6 of a 6x1 grid
    plt.subplot(6, 1, panel_index)
    sns.countplot(x=column_name, hue='Attrition', data=df)

**Summary from above plots**
* Half of the sales representatives tend to leave, followed by laboratory technicians and research scientists
* Employees with a JobInvolvement score of 3 tend to quit
* Most of the staff in JobLevel 1 are likely to leave
* Employees between the ages of 26 and 33 have a high tendency to leave


In [None]:
# Plot the count of Attrition, labelling each bar with its count and its
# share of all employees
plt.figure(figsize=[12,12])
total = float(len(df))
# Pass the column by keyword: newer seaborn releases treat the first
# positional argument as `data`, not `x`, so a positional Series no longer
# produces the intended per-category count.
ax = sns.countplot(x="Attrition", data=df)
for bar in ax.patches:
    bar_height = bar.get_height()
    # get_x() places the label horizontally; the offset added to the bar
    # height lifts the text above the bar
    ax.text(bar.get_x()+0.3, bar_height+5,
            str(bar_height), fontsize=15,
            color='dimgrey')
    # The percentage label sits slightly higher so the two labels do not overlap
    ax.text(bar.get_x()+0.3, bar_height+35,
            '{:1.2f}%'.format(bar_height/total*100), fontsize=15,
            color='red')
show()


In [None]:
# What is the median monthly income of each job role?

plt.figure(figsize=(15,10))
sns.boxplot(data=df, x="MonthlyIncome", y="JobRole")

**Summary:**

* Managers and Research directors are highly paid
* Sales Representative, Laboratory Technician and Research Scientist get paid pretty much the same.

In [None]:
# Do people in higher-paid roles have more total working years, and vice versa?
plt.figure(figsize=(15,10))
sns.boxplot(data=df, x="TotalWorkingYears", y="JobRole")

**Summary:**

* Managers and Research directors are highly paid and stay for longer
* Sales Representatives, Laboratory Technicians and Research Scientists get paid pretty much the same and are more likely to quit often

In [None]:
# Do more years with the current manager go along with a longer stay at the company?
plt.figure(figsize=(15,10))
sns.boxplot(data=df, x="YearsWithCurrManager", y="YearsAtCompany")

**Summary**

Employees tend to stay longer in the company if they stay more years under the same manager

In [None]:
# Label-encode the target: convert Attrition to a categorical and keep only
# its integer category codes
df["Attrition"] = df["Attrition"].astype('category').cat.codes

In [None]:
# Inspect the encoded target column
df.loc[:, "Attrition"]

In [None]:
# Collect the remaining string-typed (object) columns; these are the
# categorical features that are one-hot encoded further down
Obj_col = df.select_dtypes(include=['object'])
Obj_col

In [None]:
# Number of distinct values in each object column
Obj_col.nunique()

Pandas has a built-in function, `get_dummies`, that one-hot encodes the given column(s).

In [None]:
# One-hot encode every object column in a single call; the other columns are
# carried over unchanged
df_encoded = pd.get_dummies(data=df, columns=Obj_col.columns)
df_encoded.head()

In [None]:
# Separate the target column from the feature columns
y = df_encoded["Attrition"]
X = df_encoded.drop(columns="Attrition")
print(X.head())
print(y.head())

In [None]:
# Scaling

from sklearn.preprocessing import MinMaxScaler
# NOTE(review): the scaler is fitted on the full feature matrix before the
# train/test split, so test-set min/max values influence the scaling.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
# np.asarray accepts both the original Series and an already-converted array,
# so re-running this cell no longer fails on a missing `.values` attribute.
y = np.asarray(y).reshape(-1, 1)
print(X.shape)
print(y.shape)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing. A fixed random_state makes the split
# (and every metric computed from it) reproducible across runs, and stratifying
# on the target keeps the Attrition class ratio the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

In [None]:
# Report the sizes of the training and test feature matrices, one per line
print(X_train.shape, X_test.shape, sep="\n")

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Fit a logistic-regression model on the training split.
# y_train is a column vector (the target is reshaped to (-1, 1) in the scaling
# cell), while scikit-learn estimators expect a 1-D target; flatten it so
# fit() does not emit a DataConversionWarning.
model_lr = LogisticRegression()
model_lr.fit(X_train, np.ravel(y_train))

In [None]:
# Predict class labels for the test split with the logistic-regression model
y_pred = model_lr.predict(X_test)

In [None]:
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
# Predicted probability of the positive class, used for the ROC AUC
y_score = model_lr.predict_proba(X_test)[:, 1]
cm = confusion_matrix(y_test, y_pred)
print('Accuracy {} %'.format(100 * accuracy_score(y_pred, y_test)))
print('\nRoc value ' + str(roc_auc_score(y_test, y_score)))
# Confusion matrix as an annotated heatmap with integer cell labels
sns.heatmap(cm, annot=True, fmt='d')

In [None]:
# Per-class precision, recall and F1 for the logistic-regression predictions
print(classification_report(y_true=y_test, y_pred=y_pred))

# Train and evaluate Random forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Fit a random forest on the training split.
# A fixed random_state makes the bootstrap samples and feature sub-sampling
# (and therefore the reported metrics) reproducible. The column-vector target
# is flattened because scikit-learn estimators expect a 1-D y.
model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X_train, np.ravel(y_train))

In [None]:
# Predict class labels for the test split with the random forest
# (overwrites the y_pred produced by the logistic-regression model)
y_pred = model_rf.predict(X_test)

In [None]:
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
# Positive-class probabilities from the forest feed the ROC AUC
y_score = model_rf.predict_proba(X_test)[:, 1]
cm = confusion_matrix(y_test, y_pred)
print('Accuracy {} %'.format(100 * accuracy_score(y_pred, y_test)))
print('\nRoc value ' + str(roc_auc_score(y_test, y_score)))
# Confusion matrix as an annotated heatmap with integer cell labels
sns.heatmap(cm, annot=True, fmt='d')

In [None]:
# Per-class precision, recall and F1 for the random-forest predictions
print(classification_report(y_true=y_test, y_pred=y_pred))

# Train and evaluate Deep learning model

In [None]:
import tensorflow as tf
# Feed-forward binary classifier: two 500-unit ReLU hidden layers followed by
# a single sigmoid output unit.
# The input width is taken from the training matrix instead of being
# hard-coded, so the model still builds if the number of encoded feature
# columns changes.
n_features = X_train.shape[1]
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(units=500, activation='relu', input_shape=(n_features,)))
model.add(tf.keras.layers.Dense(units=500, activation='relu'))
model.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))



In [None]:
# Print the layer-by-layer architecture and parameter counts
model.summary()

In [None]:
# Compile for binary classification with the Adam optimizer.
# `metrics` is passed as a list, which is the documented form; newer Keras
# releases reject a bare string here.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 100 epochs in mini-batches of 25 and keep the returned history
# object for plotting
epochs_hist = model.fit(x=X_train, y=y_train, batch_size=25, epochs=100)

In [None]:
# Network outputs for the test split (sigmoid activations, thresholded in the next cell)
y_pred=model.predict(X_test)

In [None]:
# Turn the sigmoid outputs into boolean class predictions with a 0.5 cut-off
y_pred = np.greater(y_pred, 0.5)

In [None]:
# Plot training loss and accuracy per epoch on shared axes
for history_key in ('loss', 'accuracy'):
    plt.plot(epochs_hist.history[history_key])
plt.legend(['loss','accuracy'])
plt.ylabel('percentage')
plt.xlabel('Epochs')
plt.title('Loss and Accuracy plot')

In [None]:
from sklearn.metrics import confusion_matrix,classification_report,roc_auc_score
cm = confusion_matrix(y_test,y_pred)
print('Accuracy {} %'.format(100* accuracy_score(y_test, y_pred)))
# ROC AUC must be computed from continuous scores, not thresholded labels.
# y_pred has already been converted to booleans, so query the network again
# for its sigmoid outputs. This matches how the logistic-regression and
# random-forest cells compute their ROC values.
y_score = model.predict(X_test).ravel()
print('\nRoc value '+ str(roc_auc_score(y_test, y_score)))
# Confusion matrix as an annotated heatmap with integer cell labels
sns.heatmap(cm,annot=True, fmt='d')

In [None]:
# Per-class precision, recall and F1 for the neural-network predictions
print(classification_report(y_true=y_test, y_pred=y_pred))