In [None]:
#basic packages
import pandas as pd 
import numpy as np 

#preprocessing libraries and algorithm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,StandardScaler
from sklearn.linear_model import LogisticRegression

#visualization
import matplotlib.pyplot as plt
import seaborn as sns

#metrics

from sklearn.metrics import accuracy_score,precision_score,recall_score,roc_curve,f1_score,auc,confusion_matrix

# plot_roc_curve was removed in scikit-learn 1.2 (RocCurveDisplay replaces it).
# Import it on its own so a newer scikit-learn does not break the whole cell.
try:
    from sklearn.metrics import plot_roc_curve
except ImportError:
    plot_roc_curve = None

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
attr_df=pd.read_csv('/kaggle/input/employee-attrition/HR-Employee-Attrition.csv')

In [None]:
attr_df.shape

In [None]:
attr_df.info()

## EDA 

In [None]:
attr_df.isna().sum()

## Categorical data analysis

In [None]:
attr_df.select_dtypes('object').columns

In [None]:
attr_df['BusinessTravel'].value_counts()

In [None]:
plt.figure(figsize=(6,6))

In [None]:
def percentageAttritionPerCategory(df,featureColumn):
    """Print a summary of ``featureColumn`` and the attrition split for each of its values.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``featureColumn`` and an ``Attrition`` column holding
        ``"Yes"`` / ``"No"``. If an ``Attrition_Numerical`` column is present and
        the feature is numeric, their correlation matrix is printed as well.
    featureColumn : str
        Name of the column to summarise.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    print(f'{featureColumn}\n')
    print(df[featureColumn].describe(include='all'))
    # Correlation is only meaningful for numeric features; on pandas >= 2.0
    # corr() raises on a string column instead of silently dropping it.
    if 'Attrition_Numerical' in df and pd.api.types.is_numeric_dtype(df[featureColumn]):
        print(f'\n Correlation :{df[[featureColumn,"Attrition_Numerical"]].corr()}')
    # Group on the bare column name (not a one-element list) so the key is the
    # category value itself rather than a 1-tuple on pandas >= 2.0.
    for category,groupeddata in df.groupby(featureColumn)['Attrition']:
        total = len(groupeddata)
        if total == 0:
            # Unobserved categorical levels can produce empty groups; nothing to report.
            continue
        yes_count = int((groupeddata == "Yes").sum())
        no_count = int((groupeddata == "No").sum())
        print(f'\n{category}\n')
        print(f'Attrition: Yes : {(yes_count/total)*100}')
        print(f'Attrition: No : {(no_count/total)*100}')

In [None]:
def generateFigures(df,featureColumn,hueColumn):
    """Draw a grouped count plot and write the count above every bar.

    Parameters
    ----------
    df : pandas.DataFrame
        Data passed to ``sns.countplot``.
    featureColumn : str
        Column shown on the x axis.
    hueColumn : str
        Column used to split each x value into coloured bars.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the plot was drawn on, so callers can add titles or labels.
    """
    fig, ax = plt.subplots(figsize=(12,8))
    sns.countplot(x=featureColumn, hue=hueColumn, data=df, ax=ax)
    for p in ax.patches:
        height = p.get_height()
        # Skip patches that are not real bars: a missing feature/hue combination
        # can have NaN height, and some seaborn versions appear to add
        # zero-height legend proxies to ax.patches (verify for your version).
        if np.isnan(height) or height == 0:
            continue
        # Offsets (0.15 in x, 13 in y) are in data units, as in the original layout.
        ax.annotate('{:.0f}'.format(height), (p.get_x()+0.15, height+13))
    return ax

In [None]:
sns.countplot(x='BusinessTravel',data=attr_df,hue='Attrition')

In [None]:
# Share of leavers per travel category. Group by BusinessTravel only: grouping by
# Attrition as well makes every group all-"Yes" or all-"No", so the ratio is always 1.0 or 0.0.
attr_df.groupby('BusinessTravel')['Attrition'].agg(lambda groupeddata : (groupeddata=='Yes').mean())

In [None]:
percentageAttritionPerCategory(attr_df,'BusinessTravel')

In [None]:
# Group on the bare column name so `category` is the label itself,
# not a 1-tuple such as ('Travel_Rarely',) on pandas >= 2.0.
for category,groupeddata in attr_df.groupby('BusinessTravel')['Attrition']:
    print(category)
    # Fraction of employees in this travel category who left.
    print(len(groupeddata[groupeddata=='Yes'])/len(groupeddata))

In [None]:
sns.countplot(x='Department',data=attr_df,hue='Attrition')

In [None]:
percentageAttritionPerCategory(attr_df,'Department')
# Comments : People in the research department tend to stay

In [None]:
sns.countplot(x='EducationField',data=attr_df,hue='Attrition')

In [None]:
percentageAttritionPerCategory(attr_df,'EducationField')
#Comments : People from life science & medical backgrounds change jobs less frequently than those from other education fields

In [None]:
sns.countplot(x='Gender',data=attr_df,hue='Attrition')

In [None]:
percentageAttritionPerCategory(attr_df,'Gender')
#Comments : No significant difference in attrition

In [None]:
percentageAttritionPerCategory(attr_df,'JobRole')
#Comments : Attrition rate is very low for managers, healthcare representatives, manufacturing directors and research directors

In [None]:
percentageAttritionPerCategory(attr_df,'MaritalStatus')
#Comments : Single employees (bachelors) tend to leave sooner than others

In [None]:
percentageAttritionPerCategory(attr_df,'Over18')
#Comments : Seems like only 1 value is there 

In [None]:
attr_df['Over18'].value_counts()
#comments , this field can be ignored as it has only one value 

In [None]:
percentageAttritionPerCategory(attr_df,'OverTime')
# People who work overtime leave much more quickly, maybe out of frustration with the job

## Preprocessing

In [None]:
#processedData_df=PreprocessData(attr_df.copy(deep = True))

In [None]:
attr_df.info()

## Numerical data

In [None]:
# Use map rather than replace: replace on a string column relies on silent
# downcasting (deprecated in recent pandas) to end up with an integer column,
# while map builds the 0/1 integers directly.
attr_df['Attrition_Numerical']=attr_df['Attrition'].map({'Yes':1,'No':0})

In [None]:
attr_df.info()

### Age

In [None]:
plt.figure(figsize=(10,15))
sns.countplot(x='Age',data=attr_df,hue='Attrition')

In [None]:
plt.figure(figsize=(20,15))
sns.countplot(x='Age',data=attr_df,hue='Attrition')
# Higher job levels are held by older people
# Mid range age is ... (TODO: this observation was left unfinished)

### Daily rate

In [None]:
attr_df['DailyRate'].value_counts()

In [None]:
sns.histplot(x='DailyRate',data=attr_df)

### Education

In [None]:
sns.countplot(x='Education',hue='Attrition',data=attr_df )
# % Attrition across education level is same. Not a big contributor

In [None]:
sns.countplot(x='Education',hue='JobLevel',data=attr_df)

### Environment Satisfaction

In [None]:
sns.countplot(x='EnvironmentSatisfaction',hue='Attrition',data=attr_df)
# % attrition rate is comparatively high for less satisfied people

### HourlyRate

In [None]:
attr_df['HourlyRate'].value_counts()

In [None]:
sns.scatterplot(x='HourlyRate',y='Attrition',data=attr_df)

### Job Involvement

In [None]:
sns.countplot(x='JobInvolvement',hue='Attrition',data=attr_df)

# Very evident that people with less job involvement are moving out quickly

### Job Level

In [None]:
sns.countplot(x='JobLevel',hue='Attrition',data=attr_df)
# Reduction in attrition with job level . People at lower job level tend to jump more 

### JobSatisfaction

In [None]:
sns.countplot(x='JobSatisfaction',hue='Attrition',data=attr_df)
# Job satisfaction has got a significant relation with attrition

In [None]:
sns.countplot(x='JobSatisfaction',hue='JobInvolvement',data=attr_df)

In [None]:
sns.barplot(x='JobSatisfaction',y='MonthlyIncome',data=attr_df)

### NumCompaniesWorked

In [None]:
sns.countplot(x='NumCompaniesWorked',hue='Attrition',data=attr_df)

### PerformanceRating

In [None]:
sns.countplot(x='PerformanceRating',hue='Attrition',data=attr_df)
# Performance rating does not seem to have much relation with attrition

### RelationshipSatisfaction

In [None]:
sns.countplot(x='RelationshipSatisfaction',hue='Attrition',data=attr_df)
# seeing increased attrition for people with less relationship satisfaction

### WorkLifeBalance

In [None]:
sns.countplot(x='WorkLifeBalance',hue='Attrition',data=attr_df)
# People with poor work-life balance tend to leave more than people with good work-life balance

### StockOptionLevel

In [None]:
# This section is about StockOptionLevel; the cell previously plotted TotalWorkingYears.
sns.countplot(x='StockOptionLevel',hue='Attrition',data=attr_df)

### TrainingTimesLastYear

In [None]:
sns.countplot(x='TrainingTimesLastYear',hue='JobInvolvement',data=attr_df)
# Not indicating a good pattern

In [None]:
# Every numeric column of the dataset, kept for reference.
full_list = [
    'Age',
    'DailyRate',
    'DistanceFromHome',
    'Education',
    'EmployeeCount',
    'EmployeeNumber',
    'EnvironmentSatisfaction',
    'HourlyRate',
    'JobInvolvement',
    'JobLevel',
    'JobSatisfaction',
    'MonthlyIncome',
    'MonthlyRate',
    'NumCompaniesWorked',
    'PercentSalaryHike',
    'PerformanceRating',
    'RelationshipSatisfaction',
    'StandardHours',
    'StockOptionLevel',
    'TotalWorkingYears',
    'TrainingTimesLastYear',
    'WorkLifeBalance',
    'YearsAtCompany',
    'YearsInCurrentRole',
    'YearsSinceLastPromotion',
    'YearsWithCurrManager',
    'Attrition_Numerical',
]
# HourlyRate, MonthlyIncome, MonthlyRate

# Columns that are excluded from the automated pass below.
processed_list = [
    'Age',
    'DailyRate',
    'Education',
    'EmployeeCount',
    'EnvironmentSatisfaction',
    'HourlyRate',
    'JobInvolvement',
    'JobLevel',
    'JobSatisfaction',
    'MonthlyIncome',
    'MonthlyRate',
    'PerformanceRating',
    'RelationshipSatisfaction',
    'StandardHours',
    'EmployeeNumber',
    'DistanceFromHome',
]

# Summarise each remaining numeric column that has more than 10 distinct values.
numeric_features = attr_df.select_dtypes(np.number).columns
for feature in numeric_features:
    if feature in processed_list:
        continue
    if attr_df[feature].nunique() > 10:
        percentageAttritionPerCategory(attr_df, feature)

In [None]:
# attr_df still holds string columns; since pandas 2.0 corr() no longer drops
# them silently and raises instead, so restrict it to numeric columns explicitly.
attr_df.corr(numeric_only=True)

In [None]:
# One line per column: position, name, dtype and number of distinct non-null values.
# The header now names all four tab-separated fields that the loop prints.
print("#\tFeatureName\tDType\tDistinctRecords")
for index,feature in enumerate(attr_df.columns):
    print(f'{index}\t{feature}\t{attr_df[feature].dtype}\t{attr_df[feature].nunique()}')

In [None]:
def PreprocessData(df):
    """Return a model-ready copy of the attrition frame.

    Steps, in order:

    1. Encode ``Attrition`` as 1 (``"Yes"``) / 0 (``"No"``) if it is not numeric yet.
    2. One-hot encode the categorical columns that are present
       (``drop_first=True``, integer 0/1 columns) and drop the originals.
    3. Drop every column that holds a single distinct value.
    4. Drop the columns excluded after the exploratory analysis
       (``DailyRate``, ``Over18``, ``EmployeeNumber``, ``HourlyRate``,
       ``MonthlyRate``, ``PerformanceRating``) when present.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw attrition data. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new, processed frame.

    Raises
    ------
    ValueError
        If a non-numeric ``Attrition`` column holds anything other than
        ``"Yes"`` / ``"No"``.
    """
    categorical_columns = ['BusinessTravel','Department','EducationField','Gender','JobRole','MaritalStatus','OverTime']
    excluded_columns = ['DailyRate','Over18','EmployeeNumber','HourlyRate','MonthlyRate','PerformanceRating']

    # Work on a copy so the caller's frame is never touched.
    df = df.copy()

    # Handling the target. Checking "not numeric" rather than dtype == 'object'
    # also covers pandas' dedicated string dtype.
    if 'Attrition' in df and not pd.api.types.is_numeric_dtype(df['Attrition']):
        encoded = df['Attrition'].map({'Yes':1,'No':0})
        if encoded.isna().any():
            raise ValueError("Attrition must contain only 'Yes' or 'No'")
        df['Attrition'] = encoded.astype(int)

    # One-hot encode the remaining categorical fields. dtype=int keeps the
    # dummies as 0/1 integers instead of the bool columns newer pandas returns.
    present = [column for column in categorical_columns if column in df]
    if present:
        dummies = pd.get_dummies(df[present], drop_first=True, dtype=int)
        df = pd.concat([df.drop(columns=present), dummies], axis=1)

    # Dropping columns with only a single value (computed once, not while iterating).
    distinct_counts = df.nunique()
    df = df.drop(columns=distinct_counts[distinct_counts == 1].index)

    # Exclusions based on the data analysis (low correlation / identifiers).
    df = df.drop(columns=excluded_columns, errors='ignore')

    return df
    
    

In [None]:
preprocessed_df=PreprocessData(attr_df.copy(deep=True))

In [None]:
# Attrition_Numerical duplicates the target, so it must not stay among the features.
# Reassign (no inplace) and ignore a missing column so the cell can be re-run safely.
preprocessed_df=preprocessed_df.drop(columns='Attrition_Numerical',errors='ignore')

In [None]:
preprocessed_df.describe()

In [None]:
preprocessed_df.corr()

In [None]:
preprocessed_y=preprocessed_df.pop('Attrition')

In [None]:
# Split first, then fit the scaler on the training rows only. Fitting it on the
# full data lets test-set statistics leak into training. Splitting and scaling
# live in one cell so re-running it never scales already-scaled data, and
# preprocessed_df stays a DataFrame instead of being rebound to an ndarray.
train_x,test_x,train_y,test_y=train_test_split(preprocessed_df,preprocessed_y,test_size=.3,random_state=12)

scaler=StandardScaler()
train_x=scaler.fit_transform(train_x)
test_x=scaler.transform(test_x)

In [None]:
train_x.shape

In [None]:
test_x.shape

In [None]:
lgr=LogisticRegression()

In [None]:
lgr.fit(train_x,train_y)

In [None]:
predicted_y=lgr.predict(test_x)

In [None]:
predicted_y

In [None]:
print(f"Accuracy : {accuracy_score(test_y,predicted_y)}")
print(f"Precision : {precision_score(test_y,predicted_y)}")
print(f"Recall : {recall_score(test_y,predicted_y)}")


In [None]:
# A ROC curve needs continuous scores: hard 0/1 predictions give a single
# operating point and understate the AUC. Column 1 is the probability of class 1.
positive_scores = lgr.predict_proba(test_x)[:, 1]
fpr, tpr, threshold = roc_curve(test_y, positive_scores)
roc_auc = auc(fpr, tpr)

plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.plot([0, 1], [0, 1], 'r--')  # chance-level reference line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc = 'lower right')  # without a legend the AUC label is never shown

# The confusion matrix still uses the thresholded class predictions.
print(confusion_matrix(test_y, predicted_y))