In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report,confusion_matrix,plot_roc_curve,accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from imblearn.combine import SMOTETomek
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC


pd.set_option('display.max_columns',None)#to make all columns visible

In [None]:
df = pd.read_csv('/kaggle/input/employee-attrition/HR-Employee-Attrition.csv')

In [None]:
df.head()

### Exploratory Data Analysis - EDA

In [None]:
df.info()
#looks like there is no null values in the data

In [None]:
df.isna().sum()

In [None]:
def normal(col):
    """Print how many distinct values ``df[col]`` holds and how often each occurs.

    Reads the notebook-level ``df``; the summary is printed and nothing is returned.
    """
    column = df[col]
    distinct = column.nunique()
    frequencies = column.value_counts()
    print('No of unique values:\t', distinct)
    print('Values Count:\n', frequencies)

In [None]:
def percent_cat(col):
    """Print, for every value of ``df[col]``, the percentage of each Attrition class.

    One line is printed per (category, Attrition value) pair, followed by a blank
    separator after each category. The separator is driven by the category itself,
    so a category that contains only one Attrition value no longer shifts the
    grouping of the lines that follow it.

    Reads the notebook-level ``df``; nothing is returned.
    """
    attrition_counts = df.groupby(col)['Attrition'].value_counts()
    # Rows per category, computed once instead of re-filtering df for every line.
    category_sizes = df.groupby(col).size()

    for category, counts_in_category in attrition_counts.groupby(level=0, sort=False):
        for (_, attrition), count in counts_in_category.items():
            percent = round((count / category_sizes[category]) * 100, 2)
            print(f'Percent of {attrition} Employee Attrition of {category} employee in column {col} is {percent}%')
        print('\n')

In [None]:
normal('Age')

**Observation**:
* The largest number of employees are aged 35.

In [None]:
def insight(col):
    """Draw a count plot of ``df[col]`` split by Attrition and label every bar with its count.

    Reads the notebook-level ``df``; the figure is drawn on a new 16x8 canvas.
    """
    plt.figure(figsize=(16, 8))
    ax = sns.countplot(x=df[col], hue=df['Attrition'])

    # Write each bar's height slightly right of its left edge and just above its top.
    for bar in ax.patches:
        bar_height = bar.get_height()
        label_xy = (bar.get_x() + 0.15, bar_height + 13)
        ax.annotate(f'{bar_height:.0f}', label_xy)

In [None]:
insight('BusinessTravel')

In [None]:
percent_cat('BusinessTravel')

**Observation**:
* Employees who travel frequently appear more likely to change their job.

In [None]:
normal('DailyRate')

In [None]:
insight('Department')

In [None]:
percent_cat('Department')

**Observation**
* R&D Department employees have lesser chance of job change.
* Sales Department employees have higher chance of job change.

In [None]:
normal('DistanceFromHome')

In [None]:
# Count plot of DistanceFromHome split by Attrition, with each bar labelled by its height.
plt.figure(figsize=(16, 10))
ax = sns.countplot(x=df['DistanceFromHome'], hue=df['Attrition'], palette='Accent')

for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(f'{bar_height}', (bar.get_x() - 0.01, bar_height + 1))

In [None]:
percent_cat('DistanceFromHome')

In [None]:
# Mean commute distance per job role and attrition class. The string alias 'mean'
# replaces np.mean, which pandas deprecates as an argument to agg.
df.groupby(['JobRole', 'Attrition']).agg({'DistanceFromHome': 'mean'})

**Observation**
* Employees who live closer to work are less likely to leave.
* Employees who live farther from work may be more likely to leave.

In [None]:
insight('JobRole')

In [None]:
percent_cat('JobRole')

**Observation**
* From the above analysis, Sales Representatives changed their job the most.
* Only a small percentage of Research Directors changed their job.

In [None]:
insight('Education')

In [None]:
percent_cat('Education')

**Observation**
* Employees in education levels 3 and 1 have a higher likelihood of attrition.

In [None]:
insight('EducationField')

In [None]:
percent_cat('EducationField')

**Observation**
* Employees from the Life Sciences and Medical education fields are more likely to stay in their job.
* Employees from the Human Resources and Technical Degree fields changed their job at a higher rate.

In [None]:
normal('EmployeeCount')

**Observation**
* Employee count column has only one value in all rows, so probably we can drop this column.

In [None]:
normal('EmployeeNumber')

**Observation**
* Each row of Employee Number column has unique value, so we can drop this column during modelling

In [None]:
insight('EnvironmentSatisfaction')

In [None]:
percent_cat('EnvironmentSatisfaction')

**Observation**:
* Environment Satisfaction with value 3 & 4 have higher chance of not changing their job.
* Employees with environment satisfaction 1 have changed their job more.

In [None]:
insight('Gender')

In [None]:
percent_cat('Gender')

**Observation**:
* There is no significant difference between male and female employees in changing their job
* More number of employees were male.

In [None]:
normal('HourlyRate')

In [None]:
plt.figure(figsize=(10,6))
sns.kdeplot(x = df['HourlyRate']) 
#Hourly rate follows normal distribution

In [None]:
insight('JobInvolvement')

In [None]:
percent_cat('JobInvolvement')

**Observations:**
* JobInvolvement level 1 employees have higher number of job change, whereas Job Involvement level 4 employees have lesser number of job change.

In [None]:
insight('JobSatisfaction')

In [None]:
percent_cat('JobSatisfaction')

**Observation**:
* Job Satisfaction level 1 employees have higher number of job change, whereas Job Satisfaction level 4 employees have lesser number of job change.

In [None]:
insight('MaritalStatus')

In [None]:
percent_cat('MaritalStatus')

**Observation**:
* Interestingly, bachelor (single) employees changed their job at a high rate.

In [None]:
normal('MonthlyIncome')

In [None]:
normal('MonthlyRate')

In [None]:
# Mean monthly income per education level and attrition class. The string alias
# 'mean' replaces np.mean, which pandas deprecates as an argument to agg.
df.groupby(['Education', 'Attrition']).agg({'MonthlyIncome': 'mean'}).head(30)

**Observation**:
* Employees at education level 5 who did not change jobs have the highest average monthly income.
* At every education level, employees who changed their job had a lower average monthly income than those who stayed.

In [None]:
insight('NumCompaniesWorked')

In [None]:
percent_cat('NumCompaniesWorked')

**Observation**:
* Employees who have worked in less than 4 companies have lesser chance of changing their job.
* Higher the number of companies worked, higher the chance of changing their job.

In [None]:
normal('Over18')
# This column has only one unique value, so we can drop it

In [None]:
insight('OverTime')

In [None]:
percent_cat('OverTime')

**Observation**:
* Employees who did overtime have higher chance of changing their job

In [None]:
insight('PercentSalaryHike')

In [None]:
percent_cat('PercentSalaryHike')

**Observation**
* Lower the salary hike of employees, higher the chance of job change.

In [None]:
insight('PerformanceRating')

In [None]:
percent_cat('PerformanceRating')

**Observation**
* There is no significant difference between Performance Rating level 3 and 4.

In [None]:
insight('RelationshipSatisfaction')

In [None]:
percent_cat('RelationshipSatisfaction')

**Observation:**
* Employees with Relationship Satisfaction level 1 changed their job in large amount.


In [None]:
normal('StandardHours')
#There is only one value in this column

In [None]:
insight('StockOptionLevel')

In [None]:
percent_cat('StockOptionLevel')

**Observation:**
* Employees who have had no stock option have changed their job in high percent.

In [None]:
# Share of each StockOptionLevel within every JobLevel.
stock_counts = df.groupby('JobLevel')['StockOptionLevel'].value_counts()
# Rows per job level, computed once instead of re-filtering df for every line.
job_level_sizes = df.groupby('JobLevel').size()

for job_level, counts_in_level in stock_counts.groupby(level=0, sort=False):
    for (_, stock_level), count in counts_in_level.items():
        percent = round((count / job_level_sizes[job_level]) * 100, 2)
        print(f'Percent of Stock Option Level {stock_level} in joblevel {job_level} is {percent}%')
    # Separator is emitted per job level, so a level that lacks one of the
    # stock option values cannot misalign the blocks that follow.
    print('\n')

**Observation**
* From the above analysis, employees at job level 5 were given a higher share of stock options.

In [None]:
normal('JobLevel')

In [None]:
df.groupby(['JobLevel','StockOptionLevel'])['Attrition'].value_counts()

In [None]:
def group(col):
    """Return the Attrition value counts within each group of ``df[col]``.

    Reads the notebook-level ``df``; the result is a Series indexed by
    (``col`` value, Attrition value).
    """
    return df.groupby(col)['Attrition'].value_counts()

In [None]:
group('TotalWorkingYears')

In [None]:
percent_cat('TrainingTimesLastYear')

**Observation:**
* Employees who were given training recently have changed job in large amount.

In [None]:
percent_cat('WorkLifeBalance')

**Observation:**
* Employees with level 1 Work life balance have changed job in larger amount.

In [None]:
normal('YearsAtCompany')

In [None]:
percent_cat('YearsAtCompany')

**Observation**:
* Employees who joined company within one year have changed their job in higher percent.

In [None]:
percent_cat('YearsInCurrentRole')

**Observation**:
* Employees who have been in their current role for less than 1 year are more likely to change their job.

In [None]:
percent_cat('YearsSinceLastPromotion')

**Observation:**
* Employees whose last promotion was 15 years ago changed their job at a higher rate.

In [None]:
normal('YearsWithCurrManager')

In [None]:
percent_cat('YearsWithCurrManager')

**Observation:**
* Employees who only recently started working with their current manager changed their job at a higher rate, possibly due to pressure from the manager.

### Feature Engineering

In [None]:
# Encode the target as 0 ('No') / 1 (anything else). The dtype guard makes the cell
# safe to re-run: without it a second run compares 0/1 against 'No' and turns
# every row into 1.
if not pd.api.types.is_numeric_dtype(df['Attrition']):
    df['Attrition'] = np.where(df['Attrition'] == 'No', 0, 1)

In [None]:
normal('Attrition')

**Observation**

* Looks like data is imbalanced

In [None]:
final_df = df.drop(['EmployeeCount', 'Over18', 'StandardHours', 'EmployeeNumber'], axis =1)
# EmployeeCount, Over18 and StandardHours each hold a single constant value, and
# EmployeeNumber is unique per row, so none of them carries signal for modelling

In [None]:
final_df.head()

In [None]:
cat_columns = final_df.select_dtypes(exclude = np.number).columns

cat_columns

In [None]:
# One-hot encode the categorical columns, keeping every level (drop_first=False),
# and align the result to final_df's index.
dummies = pd.get_dummies(df[cat_columns], drop_first=False)
encoded_df = pd.DataFrame(data=dummies, index=final_df.index)

encoded_df.head()

In [None]:
encoded_df.shape

In [None]:
# Removing categorical columns. Rebinding instead of inplace=True, together with
# errors='ignore', keeps this cell safe to re-run once the columns are already gone.
final_df = final_df.drop(columns=cat_columns, errors='ignore')
final_df.head()

In [None]:
final_concat_df = pd.concat([final_df,encoded_df], axis =1)
final_concat_df.head()

In [None]:
X = final_concat_df.drop('Attrition', axis =1)
y = final_concat_df['Attrition']

#### Scaling

In [None]:
sc = StandardScaler()

sc.fit(X)

In [None]:
X_scaled = pd.DataFrame(data = sc.transform(X), columns = X.columns)

X_scaled.head()

#### Splitting train and test data

In [None]:
# Split the unscaled features first, then fit the scaler on the training rows only.
# Fitting StandardScaler on all of X before splitting leaks the test rows' mean and
# variance into training. The same random_state and sample count keep the row
# assignment identical to splitting the pre-scaled frame.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

sc = StandardScaler().fit(X_train)
X_train = pd.DataFrame(sc.transform(X_train), columns=X.columns, index=X_train.index)
X_test = pd.DataFrame(sc.transform(X_test), columns=X.columns, index=X_test.index)

In [None]:
y_train.value_counts()

In [None]:
def metrics(y_true, y_pred):
    """Print the confusion matrix, accuracy score and classification report.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels, in the same order as ``y_true``.
    """
    sections = (
        ('Confusion Matrix:\n', confusion_matrix),
        ('\n\nAccuracy Score:\n', accuracy_score),
        ('\n\nClassification Report: \n', classification_report),
    )
    for heading, scorer in sections:
        print(heading, scorer(y_true, y_pred))

#### Modelling

In [None]:
lg = LogisticRegression()

lg.fit(X_train, y_train)

In [None]:
train_pred = lg.predict(X_train)

test_pred = lg.predict(X_test)

In [None]:
print('Train Metrics')
metrics(y_train, train_pred)

In [None]:
print('Test Metrics')
metrics(y_test, test_pred)

In [None]:
# ROC curve of the logistic regression on the training split.
# NOTE(review): this shows fit quality, not generalisation; held-out performance
# would need X_test / y_test here.
plot_roc_curve(lg, X_train, y_train)

#### KNN

In [None]:
knn = KNeighborsClassifier(n_neighbors=5,n_jobs=-1)

knn.fit(X_train, y_train)

In [None]:
train_pred_knn = knn.predict(X_train)

test_pred_knn = knn.predict(X_test)

In [None]:
print('Train Metrics')

metrics(y_train, train_pred_knn)

In [None]:
print('Test Metrics')

metrics(y_test, test_pred_knn)

In [None]:
# ROC curve of the KNN model on the training split.
# NOTE(review): this shows fit quality, not generalisation; held-out performance
# would need X_test / y_test here.
plot_roc_curve(knn, X_train, y_train)

In [None]:
# Choose K by 5-fold cross-validation on the training split only. Scoring every K
# against X_test would tune the model on the same rows used for the final metrics.
k_values = list(range(1, 40))

# Will take some time
knn_search = GridSearchCV(
    KNeighborsClassifier(n_jobs=-1),
    param_grid={'n_neighbors': k_values},
    scoring='accuracy',
    cv=5,
)
knn_search.fit(X_train, y_train)

# Error rate = 1 - mean cross-validated accuracy, in the same order as k_values.
error_rate = list(1 - knn_search.cv_results_['mean_test_score'])

In [None]:
plt.figure(figsize=(10,6))
plt.plot(range(1,40),error_rate,color='blue', linestyle='dashed', marker='o',
         markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')

In [None]:
# n_neighbours 9 has lower error rate for test value, we can choose that

In [None]:
knn = KNeighborsClassifier(n_neighbors=9,n_jobs=-1)

knn.fit(X_train, y_train)

In [None]:
train_pred_knn = knn.predict(X_train)

test_pred_knn = knn.predict(X_test)

In [None]:
print('Train Metrics')

metrics(y_train, train_pred_knn)

In [None]:
print('Test Metrics')

metrics(y_test, test_pred_knn)

#### Naive Bayes

In [None]:
gb = GaussianNB()

gb.fit(X_train, y_train)

In [None]:
train_pred_gb = gb.predict(X_train)

test_pred_gb = gb.predict(X_test)

In [None]:
print('Train Metrics')

metrics(y_train, train_pred_gb)

In [None]:
print('Test Metrics')

metrics(y_test, test_pred_gb)

#### SMOTE

In [None]:
# Fixed random_state so the synthetic samples, and every metric computed from
# them, are the same on each run.
sm = SMOTETomek(sampling_strategy=0.5, n_jobs=-1, random_state=123)

In [None]:
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

print(f'The Number of class before the fit \n {y_train.value_counts()}')

print(f'The Number of class after the fit \n {y_train_sm.value_counts()}')

In [None]:
X_train.shape

In [None]:
X_train_sm.shape

In [None]:
lg = LogisticRegression(max_iter=100)

lg.fit(X_train_sm, y_train_sm)

In [None]:
train_pred = lg.predict(X_train_sm)

test_pred = lg.predict(X_test)

In [None]:
print('Test Metrics')

metrics(y_test, test_pred)

In [None]:
print('Train Metrics')
metrics(y_train_sm,train_pred)

### Conclusion:
* The Logistic Regression model gave the better result.
* The Logistic Regression model trained on SMOTE-resampled data gave a slight improvement in Recall and Precision.

### Decision Tree

In [None]:
dtree = DecisionTreeClassifier()

In [None]:
dtree.fit(X_train, y_train)

train_pred_tr = dtree.predict(X_train)

test_pred_tr = dtree.predict(X_test)

metrics(y_test,test_pred_tr)

In [None]:
metrics(y_train, train_pred_tr)

# The decision tree overfits the training data

### Bagging 

In [None]:
# The base learner is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, while the
# first positional slot works on both.
bag = BaggingClassifier(dtree)

In [None]:
bag.fit(X_train, y_train)

train_pred_bag = bag.predict(X_train)

test_pred_bag = bag.predict(X_test)

metrics(y_test,test_pred_bag)

In [None]:
metrics(y_train, train_pred_bag)

### Random Forest

In [None]:
rf = RandomForestClassifier()

In [None]:
rf.fit(X_train, y_train)

train_pred_rf = rf.predict(X_train)

test_pred_rf = rf.predict(X_test)

metrics(y_test,test_pred_rf)

### ADABoost

In [None]:
ada = AdaBoostClassifier()

In [None]:
ada.fit(X_train, y_train)

train_pred_ada = ada.predict(X_train)

test_pred_ada = ada.predict(X_test)

metrics(y_test,test_pred_ada)

#### Gradient Boosting

In [None]:
gb = GradientBoostingClassifier()

In [None]:
gb.fit(X_train, y_train)

train_pred_gb = gb.predict(X_train)

test_pred_gb = gb.predict(X_test)

metrics(y_test,test_pred_gb)

#### XGBoost

In [None]:
xgb = XGBClassifier()

In [None]:
xgb.fit(X_train, y_train)

train_pred_xgb = xgb.predict(X_train)

test_pred_xgb = xgb.predict(X_test)

metrics(y_test,test_pred_xgb)

In [None]:
params = {
    'learning_rate' : [0.02,0.05, 0.08],
    'max_depth' : [3, 4, 5, 6, 8],
    'min_child_weight': [1, 3, 5],
    'gamma' : [0.0,0.1,0.2], #less than 1 make sure
    'colsample_bytree':[0.3,0.4,0.5] #less than 1
}

In [None]:
# random_state fixes which parameter combinations are sampled, so the reported
# best_params_ are reproducible between runs.
r_xgb = RandomizedSearchCV(xgb, params, n_jobs=-1, verbose=3, random_state=123)

In [None]:
r_xgb.fit(X_train, y_train)

In [None]:
r_xgb.best_params_

In [None]:
# Build the tuned model from the search result itself rather than hand-copied
# values, so it always matches the best_params_ shown above.
best_xgb = XGBClassifier(**r_xgb.best_params_)

In [None]:
best_xgb.fit(X_train, y_train)

train_pred_xgb = best_xgb.predict(X_train)

test_pred_xgb = best_xgb.predict(X_test)

metrics(y_test,test_pred_xgb)

#### Light GBM

In [None]:
lgm = LGBMClassifier()

In [None]:
lgm.fit(X_train, y_train)

train_pred_lgm = lgm.predict(X_train)

test_pred_lgm = lgm.predict(X_test)

metrics(y_test,test_pred_lgm)

#### CAT Boost

In [None]:
cat = CatBoostClassifier()

In [None]:
cat.fit(X_train, y_train)

train_pred_cat = cat.predict(X_train)

test_pred_cat = cat.predict(X_test)

metrics(y_test,test_pred_cat)

In [None]:
params_cat = {
    'learning_rate' : [0.02,0.05, 0.07],
    'max_depth' : [3, 4, 5, 6, 8],
    'min_child_samples': [1, 3, 5],
    'l2_leaf_reg':[5,10,15]
}

In [None]:
# random_state fixes which parameter combinations are sampled, so the reported
# best_params_ are reproducible between runs.
r_cat = RandomizedSearchCV(cat, params_cat, n_jobs=-1, verbose=1, cv=3, random_state=123)

In [None]:
r_cat.fit(X_train,y_train)

In [None]:
r_cat.best_params_

In [None]:
# Build the tuned model from the search result itself rather than hand-copied
# values, so it always matches the best_params_ shown above.
best_cat = CatBoostClassifier(**r_cat.best_params_)

In [None]:
best_cat.fit(X_train, y_train)

train_pred_cat = best_cat.predict(X_train)

test_pred_cat = best_cat.predict(X_test)

metrics(y_test,test_pred_cat)

### SVM

In [None]:
svm = SVC()

In [None]:
svm.fit(X_train, y_train)

train_pred_svm = svm.predict(X_train)

test_pred_svm = svm.predict(X_test)

metrics(y_test,test_pred_svm)

In [None]:
c = [1, 0.25, 0.5, 0.75]
kernels = ['linear', 'rbf']
gammas = ['auto', 0.01, 0.001, 1]

In [None]:
grid_svm = GridSearchCV(estimator=svm, param_grid=dict(kernel=kernels, C=c, gamma = gammas), cv=3, verbose=3, n_jobs=-1)

In [None]:
grid_svm.fit(X_train, y_train)

In [None]:
grid_svm.best_score_

In [None]:
test_pred_grid = grid_svm.predict(X_test)

metrics(y_test,test_pred_grid)