In [None]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas_profiling import ProfileReport
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import jaccard_score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor


warnings.filterwarnings('ignore')

def dist_box(df, col):
    """Draw a column's distribution with its box plot underneath.

    Creates a new two-row figure: the top axes holds a density-scaled
    histogram with a KDE overlay, the bottom axes a horizontal box plot of
    the same values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column to plot.
    col : str
        Label of the column to plot; looked up as ``df[col]``.

    Returns
    -------
    None
        The figure is left as the current matplotlib figure.
    """
    fig, (ax1, ax2) = plt.subplots(2, 1)
    # `distplot` is deprecated; `histplot(kde=True, stat='density')` is the
    # replacement seaborn documents for the histogram + KDE combination.
    sns.histplot(df[col], kde=True, stat='density', ax=ax1)
    # Pass the vector as `x=` explicitly: since seaborn 0.12 the first
    # positional argument is interpreted as `data`, not `x`.
    sns.boxplot(x=df[col], ax=ax2)

# Read the raw HR export from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/hr-analytics/HR_comma_sep.csv')

# Map the verbose source column labels to the short ones used below.
short_names = {
    'satisfaction_level': 'satisfaction',
    'last_evaluation': 'evaluation',
    'number_project': 'projects',
    'average_montly_hours': 'hours',
    'time_spend_company': 'experience',
    'Work_accident': 'accident',
    'promotion_last_5years': 'promotion',
    'Department': 'department',
}
df = df.rename(columns=short_names)

# Move the target column to the last position: popping removes it and the
# assignment appends it back at the end of the frame.
df['left'] = df.pop('left')

# Data Frame:

In [None]:
# Preview the first five rows of the renamed frame.
df.head(5)

# Data Shape & Types:

In [None]:
# Report the frame dimensions and the dtype of every column.
frame_shape = df.shape
column_dtypes = df.dtypes
print(">>> Shape:\n", frame_shape)
print("\n>>> Features types:\n", column_dtypes)


# Data Frame Description:

In [None]:
# Summary statistics, transposed so each feature is a row.
df.describe().transpose()

# Left by Groups:

In [None]:
# Per-class feature means (stayed vs. left).
# `department` and `salary` are still text columns at this point, so the
# aggregation must be restricted to numeric columns; pandas >= 2.0 raises a
# TypeError instead of silently dropping non-numeric columns.
df.groupby('left').mean(numeric_only=True)

# Distributions:

In [None]:
# One histogram (with KDE) per feature on a 5x2 grid, coloured by the target.
plt.figure(figsize=(20, 20), dpi=300)
feature_names = df.columns[:-1]  # every column except the trailing target
for position, feature in enumerate(feature_names, start=1):
    plt.subplot(5, 2, position)
    sns.histplot(df, x=feature, hue=df.left, bins=17, kde=True)

# Correlations on Heatmap:

In [None]:
# Pairwise correlation of the numeric features, annotated on a heatmap.
plt.figure(figsize=(10,10))
# The text columns (`department`, `salary`) cannot be correlated; select the
# numeric columns explicitly because pandas >= 2.0 raises on non-numeric
# input rather than dropping it.
numeric_df = df.select_dtypes(include='number')
p=sns.heatmap(numeric_df.corr(), annot=True,cmap ='RdYlGn')

# Profile Report:

In [None]:
# Build the profiling report for the frame and render it as the cell output.
profile = ProfileReport(df)
profile

# Total Churn Rate:

In [None]:
# Bar chart of the target classes, each bar annotated with its share of rows.
plt.figure(figsize=(9,9))
# Pass the vector as `x=`: since seaborn 0.12 the first positional argument
# is interpreted as `data`, which no longer yields one bar per class.
plot=sns.countplot(x=df['left'])
plt.xlabel('Target')
plt.ylabel('Count of target')
plt.title('Distribution of target')
total = len(df)
for p in plot.patches:
    # Bar height is the class count; express it as a percentage of all rows.
    plot.annotate('{:.1f}%'.format(100*p.get_height()/total), (p.get_x()+0.33, p.get_height()+100))

# Churn Rate of Salary Levels:

In [None]:
# Stayed/left counts per salary level, each bar annotated with its share of
# ALL rows (not of the rows within that salary level).
# Create the axes up front and hand it to pandas: `DataFrame.plot` otherwise
# opens its own figure and a preceding `plt.figure()` is left behind empty.
fig, ax = plt.subplots(figsize=(9,9))
plot=pd.crosstab(df.salary,df.left).plot(kind='bar',ax=ax)
ax.set_xlabel('Salary level')
ax.set_ylabel('Count of leavers')
ax.set_title('Distribution of salary')
total = len(df)
for p in plot.patches:
    plot.annotate('{:.1f}%'.format(100*p.get_height()/total), (p.get_x(), p.get_height()+100))

# Churn Rate of Departments:

In [None]:
# Stayed/left counts per department, each bar annotated with its share of
# ALL rows (not of the rows within that department).
# Create the axes up front and hand it to pandas: `DataFrame.plot` otherwise
# opens its own figure and a preceding `plt.figure()` is left behind empty.
fig, ax = plt.subplots(figsize=(9,9))
plot=pd.crosstab(df.department,df.left).plot(kind='bar',ax=ax)
ax.set_xlabel('Department name')
ax.set_ylabel('Count of leavers')
ax.set_title('Distribution of departments')
total = len(df)
for p in plot.patches:
    plot.annotate('{:.1f}%'.format(100*p.get_height()/total), (p.get_x(), p.get_height()+50))

In [None]:
# Continuous / count features to be drawn as box plots.
column = [
    'satisfaction',
    'evaluation',
    'projects',
    'hours',
    'experience',
]
# 1-based subplot position counter.
i = 1

# BoxPlots:

In [None]:
# One horizontal box plot per feature listed in `column`, side by side.
# The axes are created and paired with the features inside this cell, so the
# cell no longer relies on a counter initialised elsewhere and can be re-run
# safely (previously a second run asked for subplot position 6 of 5).
fig, axes = plt.subplots(1, len(column), figsize=(20,3))
for ax, j in zip(np.ravel(axes), column):  # ravel: also handles a single axes
    # `x=` keeps the box horizontal; since seaborn 0.12 a positional vector
    # would be interpreted as `data` instead.
    sns.boxplot(x=df[j], ax=ax)

In [None]:
# Categorical encoding
# NOTE: this cell consumes the text columns it encodes, so it must run exactly
# once after the data is loaded.

# salary: an ordinal feature. Encode it with an explicit low < medium < high
# order; plain `.cat.codes` on an unordered category sorts the labels
# alphabetically (high=0, low=1, medium=2), which scrambles the scale.
salary_levels = ['low', 'medium', 'high']
df['salary'] = pd.Categorical(df['salary'], categories=salary_levels, ordered=True).codes
# Labels outside `salary_levels` are coded as -1; fail loudly rather than
# feeding that sentinel to the models.
assert (df['salary'] >= 0).all(), "unexpected salary level (or cell executed twice)"

# department: nominal feature -> one-hot columns, with 'sales' dropped to act
# as the reference category.
department_dummies = pd.get_dummies(df.department)
department_dummies = department_dummies.drop('sales', axis=1)
df = pd.concat([df.drop('department', axis=1), department_dummies], axis='columns')

# Keep the target as the last column: popping removes it and the assignment
# appends it back at the end.
df['left'] = df.pop('left')

# Categorical Encoding:

In [None]:
# Preview the encoded frame; a bounded head keeps the cell output small.
df.head()

In [None]:
# Defining X and y
# Cast to float up front: the frame mixes int, float and dummy columns.
X_raw = df.drop('left', axis=1).to_numpy(dtype=float)
y = df.left

# Train Test Split
# Split BEFORE scaling. The partition depends only on the row count and
# random_state, so the same rows land in each split as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X_raw, y, test_size=0.2, random_state=7)

# Scaling
# Fit the scaler on the training rows only; fitting on the full matrix would
# leak the test set's mean/variance into training and bias the test scores.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Full matrix on the same (train-fitted) scale.
X = scaler.transform(X_raw)

# Train/Test split:

In [None]:
# Show the feature-matrix and target shapes of each split.
for split_label, features, target in (
    ('Train set:', X_train, y_train),
    ('Test set:', X_test, y_test),
):
    print(split_label, features.shape, target.shape)

In [None]:
#ML models
#DecisionTree
# Fit one entropy-based tree per max_depth in 1..9 and record its accuracy on
# the train and test splits (plotted in the next cell).
# NOTE(review): the depth is later picked from the test-split curve, so the
# final test accuracy is optimistic; a validation split would avoid that.
test_scores = []
train_scores = []
n_range = list(range(1,10))

for i in n_range:

    # Fixed seed: the tree breaks ties between equally good splits randomly,
    # so without it the curve can change from run to run.
    model = DecisionTreeClassifier(criterion="entropy", max_depth = i, random_state=7)
    model.fit(X_train,y_train)

    train_scores.append(model.score(X_train,y_train))
    test_scores.append(model.score(X_test,y_test))

# ML Models:
## 1- Decision Tree

In [None]:
# Train vs. test accuracy for each decision-tree depth.
plt.figure(figsize=(20,5))
# x/y must be keywords: seaborn >= 0.12 accepts only `data` positionally and
# raises a TypeError for positional x and y.
p = sns.lineplot(x=n_range,y=train_scores,marker='*',label='Train Score')
p = sns.lineplot(x=n_range,y=test_scores,marker='o',label='Test Score')
p.set(xlabel='max_depth', ylabel='Accuracy')

In [None]:
# Final decision tree at depth 6; seeded so the fitted tree (and the metrics
# reported for it) are reproducible across runs.
dt_model = DecisionTreeClassifier(criterion="entropy", max_depth = 6, random_state=7).fit(X_train,y_train)

In [None]:
#KNN
# Fit one k-nearest-neighbours classifier per k in 1..9 and collect its
# accuracy on the train and test splits.
n_range = [*range(1, 10)]
train_scores = []
test_scores = []

for k in n_range:
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train, y_train)

    train_accuracy = model.score(X_train, y_train)
    test_accuracy = model.score(X_test, y_test)
    train_scores.append(train_accuracy)
    test_scores.append(test_accuracy)

## 2- KNN

In [None]:
# Train vs. test accuracy for each number of neighbours.
plt.figure(figsize=(20,5))
# x/y must be keywords: seaborn >= 0.12 accepts only `data` positionally and
# raises a TypeError for positional x and y.
p = sns.lineplot(x=n_range,y=train_scores,marker='*',label='Train Score')
p = sns.lineplot(x=n_range,y=test_scores,marker='o',label='Test Score')
p.set(xlabel='n_neighbors', ylabel='Accuracy')

In [None]:
# Final KNN classifier with a single neighbour, fitted on the training split.
knn_model = KNeighborsClassifier(n_neighbors=1)
knn_model.fit(X_train, y_train)

## 3- Logistic Regression

In [None]:
# Logistic regression with the liblinear solver and C=0.01, fitted on the
# training split.
lr_model = LogisticRegression(C=0.01, solver='liblinear')
lr_model.fit(X_train, y_train)

## 4- Gradient Boosting

In [None]:
# Gradient boosting: 100 depth-1 trees (stumps), learning rate 1.0, seeded.
# The target is passed as-is, like for the other models: it is already
# one-dimensional, and `Series.ravel()` is deprecated in recent pandas.
gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=1, random_state=7).fit(X_train, y_train)

# Model Evaluation


In [None]:
# Test-split accuracy of each fitted model, printed in a fixed order.
accuracy_rows = (
    ("Decision Tree's Accuracy: ", dt_model),
    ("KNN's Accuracy: ", knn_model),
    ("LR's Accuracy: ", lr_model),
    ("GB's Accuracy: ", gb_model),
)
for caption, fitted in accuracy_rows:
    print(caption, metrics.accuracy_score(y_test, fitted.predict(X_test)))

In [None]:
# Per-class precision / recall / F1 on the test split for each fitted model.
report_rows = (
    ("DT:", dt_model),
    ("KNN:", knn_model),
    ("LR:", lr_model),
    ("GB:", gb_model),
)
for tag, fitted in report_rows:
    predictions = fitted.predict(X_test)
    print(tag, classification_report(y_test, predictions))