In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file below the Kaggle input directory
# (in os.walk order), then print one path per line.
input_paths = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
df=pd.read_csv('/kaggle/input/employee-attrition/HR-Employee-Attrition.csv')

In [None]:
df[:5]

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.columns

In [None]:
df_category_columns = df.select_dtypes(exclude=np.number).columns
df_category_columns[:10]

In [None]:
df_number_columns = df.select_dtypes(include=np.number).columns
df_number_columns[:25]

In [None]:
# Age left as it is as it is a numerical column
df['Age'].nunique()

In [None]:
# Attrition is our target variable but the 'No' is very much higher than 'Yes'
### Need to fine tune later
df['Attrition'].value_counts()

In [None]:
# Label Encode the Attrition Column
df['Attrition'] = np.where(df['Attrition'] == 'No', 0, 1)

In [None]:
df[:5]

In [None]:
#One Hot Encoding of Business Travel , Department, Educational Field, Marital Status

#'BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'Over18', 'OverTime'

data_to_encode = df[df_category_columns].drop(columns = ['Gender', 'Over18', 'OverTime'] )
data_to_encode[:3]

In [None]:
df_category_encoded =  pd.get_dummies(data_to_encode)

In [None]:
df_category_encoded[:5]

In [None]:
# Concatenating One hot encoded values , numerical columns and other non-encoded columns
df_final = pd.concat([df_category_encoded, df[df_number_columns] ,    
                      df[['Gender', 'Over18', 'OverTime']]], axis = 1)

In [None]:
df_final.head()

In [None]:
df_final.info()

In [None]:
# Label Encoding of Gender
df_final['Gender'] = np.where(df_final['Gender'] == 'Male', 0, 1)

In [None]:
# Label Encoding of OverTime
df_final['OverTime'] = np.where(df_final['OverTime'] == 'Yes', 1, 0)

In [None]:
df_final[:5]

In [None]:
#Remove EmployeeCount Column as it adds no value
del df_final['EmployeeCount']

In [None]:
#Remove EmployeeNumber Column as its values are in a linear format 
del df_final['EmployeeNumber']

In [None]:
#Remove Over18 Column as its values are all 'Y'
del df_final['Over18']

In [None]:
# Remove StandardHours as its value is always '80'
del df_final['StandardHours']

In [None]:
df_final[:5]

In [None]:
df_final.info()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Pairwise correlation of the numeric columns only.
# df still contains string columns, and DataFrame.corr() on such a frame raises
# ValueError on pandas >= 2.0 (numeric_only now defaults to False), so restrict
# the frame to numeric dtypes explicitly; this also works on older pandas.
df.select_dtypes(include=np.number).corr()

In [None]:
sns.displot(df_final['MonthlyIncome'])
plt.show()

In [None]:
sns.displot(df['BusinessTravel'])
plt.show()

# # # **Modelling**

In [None]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix, recall_score, precision_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

In [None]:
y = df_final.pop('Attrition')

In [None]:
df_final[:5]

In [None]:
train_x, test_x, train_y, test_y = train_test_split(df_final, y, test_size = 0.25, random_state=43)

In [None]:
model = LogisticRegression(max_iter=5000)


In [None]:
model.fit(train_x, train_y)

In [None]:
model.coef_, model.intercept_

In [None]:
train_pred = model.predict(train_x)
test_pred = model.predict(test_x)

In [None]:
def metric_score(actual, pred, pred_score=None):
    """Print a set of binary-classification metrics for one set of predictions.

    Parameters
    ----------
    actual : array-like
        True class labels.
    pred : array-like
        Predicted class labels, aligned with ``actual``.
    pred_score : array-like, optional
        Continuous scores for the positive class (for example
        ``model.predict_proba(X)[:, 1]``).  When given, ROC AUC is computed
        from these scores.  When omitted, ROC AUC falls back to the hard
        labels in ``pred`` as before, which evaluates a single threshold only.

    Returns
    -------
    None
        All results are written to stdout.
    """
    # Multi-line results (matrix, report) start on their own line so their
    # rows and column headers are not shifted by the label.
    print('confusion matrix: ', confusion_matrix(actual, pred), sep='\n')
    print('accuracy score: ', accuracy_score(actual, pred))
    print('recall_score: ', recall_score(actual, pred))
    print('precision score: ', precision_score(actual, pred))
    print('f1 score: ', f1_score(actual, pred))
    auc_input = pred if pred_score is None else pred_score
    print('roc auc score: ', roc_auc_score(actual, auc_input))
    print('classification report : ', classification_report(actual, pred), sep='\n')

In [None]:
# Report metrics for the training split and for the held-out test split.
# test_pred is computed earlier in the notebook but was never scored, so only
# in-sample performance was being reported.
print('Train set')
metric_score(train_y, train_pred)
print('\nTest set')
metric_score(test_y, test_pred)

In [None]:
from sklearn.metrics import plot_roc_curve

In [None]:
plot_roc_curve(model, test_x, test_y)