## Problem statement:

## Importing libraries: 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Importing dataset: 

In [2]:
data= pd.read_csv('HR-Employee-Attrition.csv')
data

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,1,2061,...,3,80,1,17,3,3,5,2,0,3
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,1,2062,...,1,80,1,9,5,3,7,7,1,7
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,1,2064,...,2,80,1,6,0,3,6,2,0,3
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,1,2065,...,4,80,0,17,3,2,9,6,0,8


## Domain analysis: 

## Basic checks: 

In [None]:
data.head()

In [None]:
data.head(pd.set_option('display.max_columns', None)) # to display all the columns

In [None]:
data.info()

In [None]:
# no null values, both categorical and numerical columns (total 35 columns)

In [None]:
data.describe().T

#### Insights:
- std of employeecount is 0 hence constant feature
- employee number is a unique feature hence irrelevant.
- standard hours is a constant feature.

In [None]:
data.describe(include='O')

In [None]:
# over18 is a constant feature

## Exploratory data analysis:

### Univariate analysis: 

In [None]:
# automated EDA libraries make the visualization easier 
# examples: sweetviz, autoviz (for bivariate analysis), pandas profiling etc

In [None]:
import sweetviz as sv

report= sv.analyze(data)
report.show_html()

#### Insights: (generate many meaningful insights as much as possible )
- Age: People between the age group of 25-45 are the majority.
- Business travel: 70% people travel rarely, 20% people travel frequently and the rest do not travel.
- Daily Rate: uniform distribution
- Deprtment: Around 65% are in R&D, 30% in sales.
- Distance from home: around 40% are nearer to the office(i.e., distance from their home is less than or equal to 10.0)
- Education: around 60% people have educational qualification of 3 and 4.
- Education field:  around 40% are from life sciences and 30% are from medical science. 
- Environment satisfaction: around 60% people are almost satisfied with the environmental condition of the office with the ratings of more than 3.
- Gender: 60% male, 40% female
- Hourly rate: uniform distribution
- Job Involvement:around 60% have full job involvement and around 25% people have partial involvement.
- Job satisfaction: More than 60% employees seem to be satisfied with their job.
- Marital status: around 50% people are married, 30% single and the rest are divorced.
- Num companies worked:40% employees has worked in less than 1 company so they are freshers,
30% people have worked for more than five companies.


### Bivariate analysis:

#### Categorical features w.r.t target variable


In [None]:
data_cat= data[['BusinessTravel','Department','EducationField','Gender','JobRole','MaritalStatus','OverTime']]

In [None]:
plt.figure(figsize=(50,50), facecolor='white')
plotnumber=1

for column in data_cat:
    if plotnumber <=8:
        ax= plt.subplot(4,2,plotnumber)
        sns.countplot(data= data_cat, x= column, hue= data['Attrition'])
        plt.xlabel(column, fontsize = 40)
        plt.ylabel('count', fontsize = 40)
    plotnumber +=1
plt.tight_layout()

#### Insights:
- people who travel frequently are more likely to quit the job.
- people who don't do overtime have less probability of leaving the job.
- singles are more likely to quit the job.
- people in sales department are more likely to quit.

#### Discrete features w.r.t  target variable.

In [None]:
data_dis = data[['Education','EnvironmentSatisfaction', 'JobInvolvement','JobLevel','JobSatisfaction','NumCompaniesWorked','PerformanceRating','RelationshipSatisfaction','StockOptionLevel','TrainingTimesLastYear','WorkLifeBalance']]

In [None]:
plt.figure(figsize=(50,50), facecolor = 'white')
plotnumber= 1

for column in data_dis:
    if plotnumber <=12:
        ax = plt.subplot(4,3,plotnumber)
        sns.countplot(data= data_dis, x= column, hue= data.Attrition)
        plt.xlabel(column, fontsize= 40)
        plt.ylabel('count', fontsize =40)
    plotnumber +=1
plt.tight_layout()

#### Insights:
- People with education level of 1 are more likely to quit.
- Employees with the environment satisfaction of ratings 1 are more probably quitting.
- Job invlovement, Job level, Job satisfaction with level 1 are most liekly to quit.
- People who have worked for 5,6 or 9 companies are likely to quit than the rest.
- Stock option level 0f less than 1 make the employees more likely to quit the job.
- If there are no trainings given at all for the last year and 0 ratings of work life balance enable more employees to quit.

#### Continuous variables w. r. t target variable 

In [None]:
data.head(2)

In [None]:
# age, daily rate, distance from home, hourly rate, monthly income, monthly rate, percent salary hike, 
# total working years, years at company, years in current role, years since last promotion,
# years with curr mmanager

In [None]:
data_c= data[['Age','DailyRate','DistanceFromHome','HourlyRate','MonthlyIncome','MonthlyRate','PercentSalaryHike','TotalWorkingYears','YearsAtCompany','YearsInCurrentRole','YearsSinceLastPromotion','YearsWithCurrManager']]

In [None]:
plt.figure(figsize = (50, 70), facecolor = 'white')
plotnumber = 1

for column in data_c:
    if plotnumber <= 12:
        ax = plt.subplot(6, 2, plotnumber)
        sns.histplot(data= data_c, x = column, hue = data['Attrition'])
        plt.xlabel(column, fontsize = 40)
        plt.ylabel('Attrition', fontsize = 40)
    plotnumber +=1
plt.tight_layout()

#### Insights:
- People around the age of 20-25 are more likely to quit the job.
- features like daily rate, hourly rate, monthly rate are uniformly distributed hence 
not useful in deriving relationship with the output variable.
- People with monthly income of less than 2500 are more probable to leave the job.
- Employees with total working years of less than 2 are more likely to quit.
- If the years at the company is around 0-2, they are more likely to quit.


### final conclusions:


## Data preprocessing: 

### Checking null values: 

In [None]:
data.isnull().sum()

### Encoding: 

In [None]:
# for loop to fetch out only categorical columns

categorical_col =[]

for column in data.columns:
    if data[column].dtype == object and len(data[column].unique())<=50:
        categorical_col.append(column)
        print(f'{column} : {data[column].unique()}')
        print('=================')

In [None]:
# if u know the domain knowledge, mapping becomes useful to assign values based on the importance 
# of the different categories in a column.

# discussing with the client to get to know about the significance of each category in a column is 
# crucial to map the values .

In [None]:
# 1. attrition

data['Attrition']= data['Attrition'].map({'Yes': 1, 'No': 0})

In [None]:
# If there is no domain knowledge, it is better to map the values in the same order as given.

In [None]:
# 2. BusinessTravel

data['BusinessTravel'] = data['BusinessTravel'].map({'Travel_Rarely':0, 'Travel_Frequently': 1, 'Non-Travel': 2})

In [None]:
# 3. Department

data['Department'] = data['Department'].map({'Sales':0, "Research & Development":1,'Human Resources':2 })

In [None]:
#4. EducationField 

data['EducationField'] = data['EducationField'].map({'Life Sciences':0, 'Other':1, 'Medical':2, 'Marketing':3, 'Technical Degree':4, 'Human Resources':5})

In [None]:
# 5. Gender

# Since gender is a nominal data with equal importance to both the categories, 
# one hot encoder can be used.

data['Gender'] = pd.get_dummies(data['Gender'], drop_first = True)

In [None]:
# 6. JobRole 

data['JobRole'] = data['JobRole'].map({'Sales Executive':0, 'Research Scientist':1, 'Laboratory Technician':2, 'Manufacturing Director':3, 'Healthcare Representative':4, 'Manager':5, 'Sales Representative':6, 'Research Director':7,'Human Resources':8})

In [None]:
data['MaritalStatus'].value_counts()

In [None]:
# can check the weightage of each category and assign the values as per the importance.

In [None]:
# 7. MaritalStatus 

data['MaritalStatus'] = data['MaritalStatus'].map({'Single':0, 'Married':1, 'Divorced':2})

In [None]:
# 8. OverTime : ['Yes' 'No']

# label encoder is used here.

from sklearn.preprocessing import LabelEncoder
label= LabelEncoder()

data['OverTime'] = label.fit_transform(data['OverTime'])

### checking the duplicate records: 

In [None]:
data.duplicated().sum()

### Scaling and outliers handling are not required in decision tree 

### Feature Selection:


In [None]:
# fetching only numerical columns for correlation

data_c

In [None]:
# look for both negative correlation nd positive correlation

In [None]:
plt.figure(figsize= (30, 30))
sns.heatmap(data_c.corr(), annot = True, cmap = 'coolwarm', annot_kws ={'size':15})

In [None]:
# years at company and years with curr manager
# years at company and years in current role
# total working years and monthly income 

# all the above combination of features are showing little high positive correlation but not very high
correlation.
# hence no features need be dropped here.

In [None]:
# finding the features with std = 0 and removing them
# if std = 0, then there is not even slight variation in the distribution of data, hence all the 
# values are the same . Hence, it is a constant feature.

zero_std=[]

for column in data.columns:
    if data[column].dtype == 'int64':
        if np.std(data[column]) == 0:
            zero_std.append(column)
print(zero_std)

In [None]:
# features to be removed:
# 'EmployeeCount', 'StandardHours'- constant features
# 'EmployeeNumber'- unique feature
# 'Over18'- irrelevant since only one unique category

In [None]:
# dropping irrelevant columns

data.drop(['EmployeeCount', 'StandardHours', 'EmployeeNumber', 'Over18'],axis =1, inplace = True)

### Balancing the dataset: 

#### Splitting the data:


In [None]:
X= data.drop('Attrition', axis = 1)
y = data.Attrition

In [None]:
y.value_counts() # target variable is imbalanced

In [None]:
# train test split
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test= train_test_split(X, y, test_size= 0.20, random_state = 14)

In [None]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()
x_train_sm, y_train_sm = smote.fit_resample(x_train, y_train)

In [None]:
from collections import Counter

print(Counter(y_train))
print(Counter(y_train_sm))

### Model Creation: 

In [None]:
from sklearn.tree import DecisionTreeClassifier

model_dt = DecisionTreeClassifier()

model_dt.fit(x_train_sm, y_train_sm)

In [None]:
# prediction

y_pred_dt = model_dt.predict(x_test)

In [None]:
# evaluation

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

accuracy_score(y_test, y_pred_dt)

In [None]:
confusion_matrix(y_test, y_pred_dt)

In [None]:
print(classification_report(y_test, y_pred_dt))

In [None]:
# training accuracy

y_train_acc_dt = model_dt.predict(x_train_sm)
accuracy_score(y_train_sm, y_train_acc_dt)

### Hyperparameter tuning:

In [None]:
# so hyperparameter tuning is done and and the best parameters has to be found out.

from sklearn.model_selection import GridSearchCV

params= {
    'criterion': ('gini', 'entropy'),
    'max_depth': list(range(1,20)), 
    'min_samples_split': [2,3,4],
    'min_samples_leaf': list(range(1,20))
}
model = DecisionTreeClassifier(random_state = 3)
model_for_cv = GridSearchCV(model, params, scoring= 'f1', verbose = 1, n_jobs = -1, cv=3)

model_for_cv.fit(x_train_sm, y_train_sm)

best_params_= model_for_cv.best_params_
print(f'Best parameters: {best_params_}')

#### Applying the best parameters to the model after hyperparameter tuning: 

In [None]:
model_dt1 = DecisionTreeClassifier(criterion='gini', max_depth =10, min_samples_leaf =2, min_samples_split = 2)

model_dt1.fit(x_train_sm, y_train_sm)

In [None]:
# prediction

y_pred_dt1 = model_dt1.predict(x_test)


In [None]:
# evaluation

accuracy_score(y_pred_dt1, y_test)

In [None]:
confusion_matrix(y_pred_dt1, y_test)

In [None]:
print(classification_report(y_pred_dt1, y_test))

In [None]:
# training accuracy

y_train_acc_dt1 = model_dt1.predict(x_train_sm)
accuracy_score(y_train_acc_dt1, y_train_sm)

In [None]:
# training accuracy is slightly reduced after hyperparameter tuning.

In [97]:
plt.figure(figsize=(12, 12))
tree.plot_tree(model_dt)
plt.show()

NameError: name 'tree' is not defined

<Figure size 1200x1200 with 0 Axes>