In [None]:
# importing and file list
import numpy as np 
import pandas as pd 
import seaborn as sns
from matplotlib import pyplot as plt
from datetime import datetime, time
from datetime import date
import holidays

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Feature Extraction with In-time and Out-Time data 

- [x] number of days off work
- [x] number of late arrivals for work
- [x] number of early arrivals for work
- [x] number of late departures from work
- [x] number of early departures from work
- [x] mean working hours

In [None]:
# Clock-in timestamps: the first column is the employee id, the remaining
# columns are day labels.
df = pd.read_csv("/kaggle/input/hr-analytics-case-study/in_time.csv")
df.rename(columns={"Unnamed: 0":"EmployeeID"},inplace=True)

us_holidays = holidays.US()

# Day columns that fall on a US public holiday.
holiday_list = [day for day in df.columns[1:] if day in us_holidays]

# Day columns where every employee is missing. Comparing against len(df)
# instead of a hard-coded 4410 keeps this correct if the row count changes.
na_counts = df.isna().sum().sort_values(ascending=False)
tmp = na_counts[na_counts == len(df)].index.tolist()

holiday_list = tmp + list(set(holiday_list) - set(tmp))

df = df.drop(holiday_list,axis=1)

# Convert the day columns to datetime. Assigning by column label replaces the
# columns, so they really become datetime64; `df.iloc[:, 1:] = ...` writes into
# the existing object-dtype columns on pandas >= 2.0, which breaks `.dt` below.
date_cols = df.columns[1:]
df[date_cols] = df[date_cols].apply(pd.to_datetime)

# After the transpose each column is one employee, so the sums are per-employee
# counts. Missing days compare as False and are not counted either way.
late_entering = df[date_cols].T.apply(lambda dt : dt.dt.time > time(10,1,1)).sum()
early_entering = df[date_cols].T.apply(lambda dt : dt.dt.time < time(10,1,1)).sum()

df["late_entering"] = late_entering
df["early_entering"] = early_entering

# Number of days the employee did not come in (missing clock-in).
df["not_come"] = df.isna().sum(axis=1)

# Selecting by the saved day labels avoids positional off-by-one errors once
# feature columns have been appended.
df_in = df[date_cols].T
# .copy() so that later column assignments do not target a slice of `df`.
df_intime = df[["EmployeeID","late_entering","early_entering","not_come"]].copy()
df.head(3)

In [None]:
# Determining the morning start time: histogram of the 2015-01-05 column
df["2015-01-05"].hist(figsize=(10,5))

In [None]:
# Clock-out timestamps: the first column is the employee id, the remaining
# columns are day labels.
df = pd.read_csv("/kaggle/input/hr-analytics-case-study/out_time.csv")
df.rename(columns={"Unnamed: 0":"EmployeeID"},inplace=True)

us_holidays = holidays.US()

# Day columns that fall on a US public holiday.
holiday_list = [day for day in df.columns[1:] if day in us_holidays]

# Day columns where every employee is missing. Comparing against len(df)
# instead of a hard-coded 4410 keeps this correct if the row count changes.
na_counts = df.isna().sum().sort_values(ascending=False)
tmp = na_counts[na_counts == len(df)].index.tolist()

holiday_list = tmp + list(set(holiday_list) - set(tmp))

df = df.drop(holiday_list,axis=1)

# Convert the day columns to datetime. Assigning by column label replaces the
# columns, so they really become datetime64; `df.iloc[:, 1:] = ...` writes into
# the existing object-dtype columns on pandas >= 2.0, which breaks `.dt` below.
date_cols = df.columns[1:]
df[date_cols] = df[date_cols].apply(pd.to_datetime)

# After the transpose each column is one employee, so the sums are per-employee
# counts. Missing days compare as False and are not counted either way.
late_exit = df[date_cols].T.apply(lambda dt : dt.dt.time > time(17,1,1)).sum()
early_exit = df[date_cols].T.apply(lambda dt : dt.dt.time < time(17,1,1)).sum()
df["late_exit"] = late_exit
df["early_exit"] = early_exit

# Only two feature columns were appended here, so the previous positional
# slice `iloc[:, 1:-3]` silently dropped the last day. Select by label instead.
df_out = df[date_cols].T
# .copy() so that later in-place operations do not target a slice of `df`.
df_outtime = df[["EmployeeID","late_exit","early_exit"]].copy()
df.head(3)

In [None]:
# Working-hours feature: mean of (clock-out - clock-in) per employee, in hours.
# df_out / df_in have days as rows and employees as columns, so .mean() reduces
# over days.
df_cal = (df_out - df_in).mean()
# .assign builds a new frame instead of writing a column into a slice of `df`
# (avoids SettingWithCopyWarning); dividing by one hour is vectorized.
df_intime = df_intime.assign(working_hours=df_cal / np.timedelta64(1, 'h'))

In [None]:
# Determining the end-of-day time: histogram of the 2015-01-05 column
df["2015-01-05"].hist(figsize=(10,5))

In [None]:
# Load the three HR tables and key each one by EmployeeID.
general_data = pd.read_csv('/kaggle/input/hr-analytics-case-study/general_data.csv').set_index('EmployeeID')
employee_survey_data = pd.read_csv('/kaggle/input/hr-analytics-case-study/employee_survey_data.csv').set_index('EmployeeID')
manager_survey_data = pd.read_csv('/kaggle/input/hr-analytics-case-study/manager_survey_data.csv').set_index('EmployeeID')

# Key the engineered in/out-time features the same way.
df_intime = df_intime.set_index("EmployeeID")
df_outtime = df_outtime.set_index("EmployeeID")

# Column-wise concat aligns all five frames on the EmployeeID index.
data = pd.concat(
    [general_data, employee_survey_data, manager_survey_data, df_intime, df_outtime],
    axis=1,
)
data

In [None]:
 data.info()

# Missing Values

In [None]:
data.isna().sum()

In [None]:
# Missing NumCompaniesWorked is imputed as 0; the second line shows any rows
# that are still missing (expected to be empty).
data["NumCompaniesWorked"] = data["NumCompaniesWorked"].fillna(0)
data[data["NumCompaniesWorked"].isna()]

In [None]:
# Missing TotalWorkingYears is imputed as 0, then the distribution is plotted.
data["TotalWorkingYears"] = data["TotalWorkingYears"].fillna(0)

sns.histplot(data, x= "TotalWorkingYears")

In [None]:
# Plot the distribution before imputing, then fill missing ratings with 3
# (presumably the most frequent level in the histogram -- confirm).
data["EnvironmentSatisfaction"].hist()
data["EnvironmentSatisfaction"] = data["EnvironmentSatisfaction"].fillna(3)

In [None]:
# Plot the distribution before imputing, then fill missing ratings with 3
# (presumably the most frequent level in the histogram -- confirm).
data["JobSatisfaction"].hist()
data["JobSatisfaction"] = data["JobSatisfaction"].fillna(3)

In [None]:

# Plot the distribution before imputing, then fill missing ratings with 3
# (presumably the most frequent level in the histogram -- confirm).
data["WorkLifeBalance"].hist()
data["WorkLifeBalance"] = data["WorkLifeBalance"].fillna(3)

In [None]:
data.info()

# Exploratory Data Analysis

In [None]:
# TO-DO will refactor this func
def drw_per(ax, offset=40):
    """Annotate paired bars with each bar's share of the pair total.

    The bars in ``ax.patches`` are split into a first and a second half, and
    bar ``i`` of the first half is paired with bar ``i`` of the second half.
    Each bar of a pair is labelled with ``height / (left + right)`` as a
    whole-number percentage, centred above the bar.

    Parameters
    ----------
    ax : object exposing ``patches`` (bars with ``get_height``, ``get_x``,
        ``get_width``) and a ``text(x, y, s, ha=...)`` method, e.g. the Axes
        returned by ``sns.countplot``.
    offset : vertical distance, in data units, between the bar top and its
        label. Defaults to the previously hard-coded 40.

    Returns
    -------
    None. The labels are drawn on ``ax``.
    """
    bars = ax.patches
    half = len(bars) // 2

    # With an odd number of bars zip() leaves the last bar unlabelled.
    for left, right in zip(bars[:half], bars[half:]):
        height_l = left.get_height()
        height_r = right.get_height()
        total = height_l + height_r

        # An empty pair (total 0) would raise ZeroDivisionError and a NaN
        # height would print "nan%"; neither has a meaningful share, so skip.
        if not total > 0:
            continue

        for bar, height in ((left, height_l), (right, height_r)):
            ax.text(
                bar.get_x() + bar.get_width() / 2.,
                height + offset,
                '{0:.0%}'.format(height / total),
                ha="center",
            )

### BusinessTravel

In [None]:
data.groupby("BusinessTravel").groups.keys()

#### Observation

- Most employees who left are those who travel rarely.


In [None]:
plt.figure(figsize=(8,6))
df = data[data["Attrition"] == "Yes" ]

ax = sns.countplot(x='BusinessTravel', data= df, hue="Attrition")


### Department

In [None]:
data.groupby("Department").groups.keys()

#### Observation

- Most employees who left worked in Research & Development

In [None]:
plt.figure(figsize=(6,4))
df = data[data["Attrition"] == "Yes"]
ax = sns.countplot(x='Department', data=df, hue="Attrition")


In [None]:
data.columns

# DistanceFromHome

In [None]:
data["DistanceFromHome"].describe()

#### Observation

- Most employees who left live near the office.

In [None]:
plt.figure(figsize=(8,6))
ax = sns.histplot(x='DistanceFromHome', data=data[ data["Attrition"] == "Yes"])


### Education

In [None]:
data[["Education"]].value_counts()

#### Observation

- Most employees who left have education level 3 or 4.
- The fewest employees who left have education level 5.

In [None]:
plt.figure(figsize=(8,6))
ax = sns.countplot(x='Education', data=data[ data["Attrition"] == "Yes"], hue="Attrition")

In [None]:
df = data.groupby("Education")
df.groups.keys()

In [None]:
# One panel per Education group on a 2x3 grid, each showing the
# EducationField counts within that group.
plt.figure(figsize=(20,10))
n_rows, n_cols = 2, 3
for i, (key, grp) in enumerate(df, start=1):
    plt.subplot(n_rows, n_cols, i)
    ax = sns.countplot(x='Education', data=grp, hue="EducationField")

In [None]:
data.columns

### Gender

#### Observation

- Most employees who left are male.

In [None]:
plt.figure(figsize=(4,3))
ax = sns.countplot(x='Gender', data=data[ data["Attrition"] == "Yes"], hue="Attrition")
drw_per(ax)

In [None]:
data.columns

In [None]:
data["JobLevel"].value_counts()

In [None]:
data["JobRole"].value_counts()

In [None]:
df = data.groupby("JobRole")
print(len(df.groups.keys()))
df.groups.keys()

#### Observation

- The most common job roles among employees who left are Research Scientist, Sales Executive, and Laboratory Technician.

In [None]:
plt.figure(figsize=(16,8))

ax = sns.countplot(x='JobRole', data=data[ data["Attrition"] == "Yes"], hue="Attrition")
#drw_per(ax)

In [None]:
data.columns

#### Observation

- Most employees who left are single, although a considerable number of married employees left as well.

In [None]:
plt.figure(figsize=(8,6))
ax = sns.countplot(x='MaritalStatus', data=data[ data["Attrition"] == "Yes"], hue="Attrition")

### MonthlyIncome

In [None]:
data["MonthlyIncome"].describe()

In [None]:
# Left panel: MonthlyIncome distribution for rows with Attrition == "Yes".
plt.subplot(1,2,1)
sns.histplot(x='MonthlyIncome', data=data.loc[data["Attrition"] == "Yes"])

# Right panel: MonthlyIncome distribution for rows with Attrition == "No".
plt.subplot(1,2,2)
sns.histplot(x='MonthlyIncome', data=data.loc[data["Attrition"] == "No"])

In [None]:
data.columns

### NumCompaniesWorked

In [None]:
data["NumCompaniesWorked"].value_counts()

In [None]:
df = data.groupby("NumCompaniesWorked")
df.groups.keys()

#### Observation

- Most employees who left had worked for 1 company

In [None]:
plt.figure(figsize=(8,4))
ax = sns.countplot(x='NumCompaniesWorked', data= data[ data["Attrition"] == "Yes"], hue="Attrition")

In [None]:
data.columns

### PercentSalaryHike

#### Observation

- Most employees who left worked for 1 year

In [None]:
plt.figure(figsize=(8,4))
sns.countplot(x='PercentSalaryHike',data= data[ data["Attrition"] == "Yes"], hue="Attrition")

### TotalWorkingYears

In [None]:
data["TotalWorkingYears"].value_counts()

In [None]:
plt.figure(figsize=(8,4))
sns.countplot(x='TotalWorkingYears',data= data[ data["Attrition"] == "Yes"], hue="Attrition")

In [None]:
data.columns

### TrainingTimesLastYear

In [None]:
data["TrainingTimesLastYear"].value_counts()

In [None]:
plt.figure(figsize=(8,4))
ax = sns.countplot(x="TrainingTimesLastYear", data= data[ data["Attrition"] == "Yes"] , hue="Attrition")

### YearsAtCompany

In [None]:
data["YearsAtCompany"].describe()

#### Observation

- Most employees who left worked for 1 year at company

In [None]:
plt.figure(figsize=(16,4))
ax = sns.countplot( x= "YearsAtCompany", data=data[data["Attrition"] == "Yes"],hue="Attrition")

In [None]:
data.columns

### YearsWithCurrManager

In [None]:
data["YearsWithCurrManager"].value_counts()

In [None]:
plt.figure(figsize=(16,4))
ax = sns.countplot( x= "YearsWithCurrManager", data=data[data["Attrition"] == "Yes"], hue="Attrition")

### EnvironmentSatisfaction & JobSatisfaction &  WorkLifeBalance

#### Observation

- Environment satisfaction and job satisfaction vary among employees who left.
- Employees who left mostly report a normal work-life balance.

In [None]:
# Three side-by-side count plots of the survey ratings, restricted to rows
# with Attrition == "Yes".
plt.figure(figsize=(16,4))
df = data[ data["Attrition"] == "Yes"]
survey_columns = ("EnvironmentSatisfaction", "JobSatisfaction", "WorkLifeBalance")
for position, column in enumerate(survey_columns, start=1):
    plt.subplot(1, 3, position)
    ax = sns.countplot(x=column, data=df, hue="Attrition")


### JobInvolvement & PerformanceRating

In [None]:
# Count plots of the two manager-survey ratings for rows with
# Attrition == "Yes". The grid has three slots; only the first two are used.
plt.figure(figsize=(16,8))
df = data[ data["Attrition"] == "Yes"]
for position, column in enumerate(("JobInvolvement", "PerformanceRating"), start=1):
    plt.subplot(1, 3, position)
    ax = sns.countplot(x=column, data=df, hue="Attrition")



# Training

### Imports

In [None]:
# imports 
from scipy.stats import sem
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate,RepeatedKFold,RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, plot_confusion_matrix,roc_auc_score,plot_roc_curve
from sklearn import datasets, metrics, model_selection
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

### Preprocessing for Training

In [None]:
# Encode every 'Yes'/'No' value in the frame as 1/0.
# NOTE: this mutates `data` in place, so the EDA cells above (which filter on
# Attrition == "Yes") only work before this cell has been run.
data.replace({'Yes':1,'No':0},inplace=True)
X = data.drop(['Attrition','Over18','StandardHours','EmployeeCount'],axis=1)
# Make the target dtype explicit rather than relying on replace() to downcast.
Y = data['Attrition'].astype(int)
# One-hot encode the remaining categorical columns.
X = pd.get_dummies(X)

# A fixed seed makes the split (and every score reported on x_test)
# reproducible; stratifying keeps the class ratio equal in train and test.
x_train,x_test,y_train,y_test = train_test_split(
    X, Y, test_size=0.2, shuffle=True, random_state=42, stratify=Y
)

## Utils Funcs

In [None]:
def get_classifiers_models_scores_with_CV(X,Y,n_split=5, n_repeats = 5):
    """Cross-validate a fixed set of classifiers and tabulate their mean scores.

    Every classifier is scored on (X, Y) with a RepeatedStratifiedKFold
    splitter (random_state=42) and is then fitted on the notebook-level
    ``x_train`` / ``y_train`` split, which must exist when this is called.

    Parameters
    ----------
    X, Y : features and target handed to ``cross_validate``.
    n_split : number of folds per repeat.
    n_repeats : number of times the k-fold split is repeated.

    Returns
    -------
    pandas.DataFrame with one row per classifier and the columns Estimator
    (fitted object), Model (class name), Accuracy, Recall, Precision, F1,
    ROC_AUC (means over all folds) and SE (``scipy.stats.sem`` of the per-fold
    ROC AUC). Rows are sorted by ROC_AUC, best first, and keep their original
    index labels (label 0 is always the first classifier in the list).
    """
    candidates = [
        # Parameters chosen by the author's hyperparameter optimization
        # (the grid-search cell in this notebook).
        LogisticRegression(C = 0.1, penalty= 'l2', solver= 'newton-cg'),
        XGBClassifier(objective="binary:logistic"),
        CatBoostClassifier(verbose=0),
        LGBMClassifier(),
        SVC(),
    ]
    splitter = RepeatedStratifiedKFold(n_splits=n_split, n_repeats=n_repeats, random_state= 42)
    scoring = ["roc_auc","accuracy","recall","precision","f1"]

    # Output column -> key in the cross_validate result whose mean it reports.
    mean_columns = (
        ("Accuracy", "test_accuracy"),
        ("Recall", "test_recall"),
        ("Precision", "test_precision"),
        ("F1", "test_f1"),
        ("ROC_AUC", "test_roc_auc"),
    )
    table = {
        column: []
        for column in ("Estimator", "Model", "Accuracy", "Recall",
                       "Precision", "F1", "ROC_AUC", "SE")
    }

    for clf in candidates:
        table["Model"].append(type(clf).__name__)

        cv_scores = cross_validate(clf, X, Y, cv=splitter, scoring=scoring)
        for column, key in mean_columns:
            table[column].append(cv_scores[key].mean())

        # Standard error of the mean over all n_split * n_repeats fold scores.
        table["SE"].append(sem(cv_scores["test_roc_auc"]))

        # Keep a model fitted on the training split for later inspection.
        table["Estimator"].append(clf.fit(x_train, y_train))

    return pd.DataFrame(table).sort_values(by='ROC_AUC', ascending=False)

def print_score(y_test,y_pred):
    """Print accuracy (as a percentage), precision, recall, F1 and ROC AUC.

    Parameters
    ----------
    y_test : true labels.
    y_pred : predicted labels; ROC AUC is computed from these hard labels too.
    """
    print("Accuracy: %.2f%%" % (accuracy_score(y_test, y_pred) * 100.0))
    labelled_metrics = (
        ("Precision: ", precision_score),
        ("Recall: ", recall_score),
        ("F1 Score: ", f1_score),
        ("ROC_AUC :", roc_auc_score),
    )
    for label, metric in labelled_metrics:
        print(label, metric(y_test, y_pred))

# Train Models

In [None]:
%%time
models = get_classifiers_models_scores_with_CV(X,Y)

# Train Models Results

In [None]:
# sort scores with ROC_AUC 
models 

### Result Analysis 

In [None]:
# Pick a fitted estimator for the result analysis below.
# NOTE: .loc[0] selects by index *label*, not by position. `models` was sorted
# by ROC_AUC but keeps its original labels, so label 0 is the first entry of
# the classifier list (LogisticRegression) wherever it ranks in the table.
mdl = models.loc[0,"Estimator"] # LogisticRegression fitted on x_train / y_train
mdl
print_score(y_test, mdl.predict(x_test))

In [None]:
plot_confusion_matrix(mdl, x_test, y_test)

In [None]:
plot_roc_curve(mdl, x_test, y_test)

### Hyperparameter Optimization for Logistic Regression

In [None]:
# example of grid searching key hyperparametres for logistic regression
from sklearn.model_selection import GridSearchCV


model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear',"sag", "saga"]
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]


# Full grid: every solver x penalty x C combination.
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=1)

# error_score=0 scores a failing combination as 0 instead of aborting the search.
grid_search = GridSearchCV(estimator=model, param_grid=grid, cv=cv, scoring='roc_auc',error_score=0)
grid_result = grid_search.fit(X, Y)

# The search optimizes ROC AUC (scoring='roc_auc'), so report it as such;
# the message previously called it an F1 score.
print("Best ROC_AUC Score: %f using parameter %s" % (grid_result.best_score_, grid_result.best_params_))

# Per-combination mean/std test scores and their parameter sets.
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
results = (means,stds,params)


In [None]:
print("Best ROC_AUC Score: %f using parameter %s" % (grid_result.best_score_, grid_result.best_params_))

# Feature Importance and Selection

## With LogisticRegression

[plot_linear_model_coefficient_interpretation](https://scikit-learn.org/stable/auto_examples/inspection/plot_linear_model_coefficient_interpretation.html#checking-the-variability-of-the-coefficients)

In [None]:
from sklearn.linear_model import Ridge
from sklearn.metrics import median_absolute_error
from sklearn.model_selection import cross_validate,RepeatedKFold


model = LogisticRegression()

# Refit the model on every fold and keep the fitted estimators. The splitter is
# seeded so the folds (and the plot below) are reproducible, like the other
# cross-validation splitters in this notebook.
cv_model = cross_validate(
    model, X, Y, cv=RepeatedKFold(n_splits=5, n_repeats=5, random_state=42),
    return_estimator=True, n_jobs=-1
)

# Multiply each coefficient by its feature's standard deviation so features on
# different scales are comparable. coef_[0] is the single coefficient row of
# the fitted model.
feature_scale = x_train.std(axis=0)
coefs = pd.DataFrame(
    [est.coef_[0] * feature_scale for est in cv_model['estimator']],
    columns=X.columns
)

# One strip/box per feature: the spread shows how stable the coefficient is
# across folds.
plt.figure(figsize=(9, 30))
sns.stripplot(data=coefs, orient='h', alpha=0.5)
sns.boxplot(data=coefs, orient='h', saturation=0.5)
plt.axvline(x=0, color='.5')
plt.xlabel('Coefficient importance')
plt.title('Coefficient importance and its variability')

In [None]:
# Show feature coefficent values at top 20
coefs.apply(np.abs).mean().sort_values(ascending=False).head(20)

## With XGBClassifier 

[Feature importance based on feature permutation](https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html#sphx-glr-auto-examples-ensemble-plot-forest-importances-py)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestRegressor
from sklearn import tree
# NOTE(review): this rebinds the name `time` to the stdlib module, shadowing the
# `datetime.time` class imported in the first cell. Re-running the in/out-time
# cells after this one would fail on `time(10,1,1)`. The module is not used in
# this cell.
import time


# Columns to exclude from the importance analysis (currently none).
cols_for_drop = [] 
feature_names = x_train.drop(columns=cols_for_drop).columns
# XGBoost classifier with default parameters, fitted on the training split.
model = XGBClassifier()
model.fit(x_train.drop(columns=cols_for_drop),y_train)

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance of the fitted `model`, measured on the held-out test
# split with 10 repeats per feature.
result = permutation_importance(
    model, x_test.drop(columns=cols_for_drop), y_test, n_repeats=10, random_state=42, n_jobs=-1)

# Mean importance per feature, drawn with the std across repeats as error bars.
forest_importances = pd.Series(result.importances_mean, index=feature_names)
plt.figure(figsize=(30,10))
forest_importances.plot.bar(yerr=result.importances_std)

In [None]:
#result = permutation_importance(model, x_train.drop(columns=cols_for_drop), y_train, n_repeats=10,random_state=42)

# Feature positions ordered by ascending mean permutation importance.
perm_sorted_idx = result.importances_mean.argsort()

plt.figure(figsize=(15, 20))

# One horizontal box per feature showing the spread of its importance over the
# permutation repeats; rows follow the sorted order computed above.
plt.boxplot(result.importances[perm_sorted_idx].T, vert=False,
            labels=feature_names[perm_sorted_idx])
plt.tight_layout()
plt.show()

[Xgboost Plot Importance](https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.plot_importance)

In [None]:
from xgboost import plot_importance,plot_tree

# Sets the default figure size globally, so it also applies to later figures.
plt.rcParams["figure.figsize"] = (14, 14)
# XGBoost's built-in feature-importance plot for the fitted `model`.
plot_importance(model)


- [Understanding the decision tree structure](https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html#sphx-glr-auto-examples-tree-plot-unveil-tree-structure-py)
- [Xgboost Plot Tree](https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.plot_tree)

In [None]:
from matplotlib.pylab import rcParams

# NOTE(review): figsize is in inches, so (250, 500) requests a very large
# canvas, and setting it through rcParams changes the default for every figure
# created afterwards. The `rcParams` name imported above is not used here.
plt.rcParams["figure.figsize"] = (250, 500)
plot_tree(model)

## Feature Selection

In [None]:
# use feature importance for feature selection

from numpy import sort
from sklearn.feature_selection import SelectFromModel


# Reference model whose feature importances define the candidate thresholds.
model = XGBClassifier()
model.fit(x_train, y_train)

# Every importance value is tried as a cut-off, from smallest (keep all
# features) to largest (keep only the most important ones).
thresholds = sort(model.feature_importances_)

acc = []
roc = []
recall = []
precision = []
f1 = []
treshold = []
diff = []

for thresh in thresholds:
    # Keep only the features whose importance is >= thresh.
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_X_train = selection.transform(x_train)
    select_X_test = selection.transform(x_test)

    # Retrain on the reduced feature set.
    selection_model = XGBClassifier()
    selection_model.fit(select_X_train, y_train)

    # Predict BEFORE scoring. Previously the metrics were computed from the
    # previous iteration's predictions, so every row of the result table
    # described the wrong feature set (off by one).
    y_pred = selection_model.predict(select_X_test)
    predictions = y_pred

    accuracy = accuracy_score(y_test, predictions)
    acc.append(accuracy)
    roc.append(roc_auc_score(y_test, predictions))
    recall.append(recall_score(y_test, predictions))
    precision.append(precision_score(y_test, predictions))
    f1.append(f1_score(y_test, predictions))

    # Record which features were kept and which were dropped at this threshold.
    features = list(x_train.columns[selection.get_support()])
    treshold.append(features)
    diff.append( list(set(x_train.columns) - set(features)) )

    print("Thresh=%.3f, n=%d, Accuracy: %.2f%%" % (thresh, select_X_train.shape[1], accuracy*100.0))

# One row per threshold: kept features ('Model'), dropped features ('Diff') and
# the test-split scores of the model retrained on the kept features.
# NOTE: this rebinds `models`, replacing the classifier comparison table.
models = pd.DataFrame({
        'Model': treshold,
        'Diff':diff,
        'Accuracy': acc,
        'Recall': recall,
        'Precision':precision,
        'F1':f1,
        'ROC_AUC': roc
        })

In [None]:
models