In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**File Handling**

In [None]:
# Load the IBM HR attrition CSV into a dataframe and preview the first rows
csv_path = '../input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
#General info about the data set attributes: summary statistics produced by
#describe() with its default arguments
df.describe()

**Pre-processing & Correlation check**

In [None]:
#Remove the columns that are not wanted for the analysis
unwanted_columns = ['EmployeeNumber','EmployeeCount','StandardHours']
df = df.drop(columns=unwanted_columns)
df.isna().sum() #Per-column count of 'NaN' values

In [None]:
df.isnull().values.any() #True if at least one value anywhere in the dataframe is missing

In [None]:
#Creating a correlation matrix showing the pairwise correlation between the numeric attributes
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(30, 30))
# The text (object) columns are still un-encoded at this point. Correlation is only
# defined for numeric data, and pandas >= 2.0 raises instead of silently dropping
# non-numeric columns, so select the numeric columns explicitly.
numeric_df = df.select_dtypes(include="number")
sns.heatmap(numeric_df.corr(), annot=True, cmap="RdYlGn",annot_kws={"size":15})

In [None]:
#Collect the object-dtype (text) columns, which are treated as the categorical attributes
categorial_col = df.select_dtypes(include=["object"])
categorial_col.head()

In [None]:
#Replacing every categorical attribute with integer codes
from sklearn.preprocessing import LabelEncoder
lr = LabelEncoder()

# The same encoder object is re-fitted on each column in turn, so once the loop
# finishes it only holds the classes of the last column processed
for col_name in categorial_col.columns:
    encoded_values = lr.fit_transform(df[col_name])
    df[col_name] = encoded_values

df[list(categorial_col.columns)].head()

In [None]:
#Showing how each categorical attribute relates to the target feature
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(font_scale=1.1)
plt.figure(figsize=(30, 30))

# Attrition is the target itself; plotting it against itself carries no information
plot_columns = [column for column in categorial_col.columns if column != 'Attrition']

# Derive the grid height from the number of panels instead of hard-coding 3x3,
# which raises a ValueError as soon as there are more than nine columns to plot
n_grid_cols = 3
n_grid_rows = int(np.ceil(len(plot_columns) / n_grid_cols))

for i, column in enumerate(plot_columns, 1):
    plt.subplot(n_grid_rows, n_grid_cols, i)
    # barplot draws the mean of y per category; with Attrition label-encoded to 0/1
    # by the preceding cell, that mean is the attrition rate, not a count
    g = sns.barplot(x=column, y='Attrition', data=df)
    # Rotate the tick labels on the current axes without re-setting the labels
    plt.xticks(rotation=90)
    plt.ylabel('Attrition Rate')
    plt.xlabel(column)

In [None]:
#Encoding the target as binary category codes [No Attrition=0, Yes Attrition=1]
attrition_codes = df['Attrition'].astype("category").cat.codes
df['Attrition'] = attrition_codes
df['Attrition'].value_counts()

**Train-Test split**

In [None]:
#Separate the features from the target, then hold out 30% of the rows as the test set
from sklearn.model_selection import train_test_split

y = df['Attrition']
X = df.drop(columns='Attrition')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)
X_train.shape

**Algorithms**
Decision Tree Classifier

In [None]:
#Applying Decision Tree Classifier with default hyperparameters as a baseline
from sklearn.tree import DecisionTreeClassifier
# Seed the tree so that re-running the notebook reproduces the same model; 42 matches
# the seed used for the tuned tree in the grid-search cell
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(X_train,y_train)

In [None]:
#Checking the accuracy score for train
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve,confusion_matrix, f1_score
y_train_pred = dtc.predict(X_train)
# Take column 1 (the positive class) for EVERY row; the previous [0,1] index returned
# a single scalar, the probability for the first training row only
y_train_prob = dtc.predict_proba(X_train)[:, 1]
accuracy_score(y_train, y_train_pred)

In [None]:
#The untuned tree reaches a train accuracy of 1.0, which means it is overfitting.
#To avoid that we tune the hyperparameters with a cross-validated grid search before scoring.
from sklearn.model_selection import GridSearchCV

# Search space: every combination of these values is evaluated
params = {
    "criterion":("gini", "entropy"), 
    "splitter":("best", "random"), 
    "max_depth":(list(range(1, 20))), 
    "min_samples_split":[2, 3, 4], 
    "min_samples_leaf":list(range(1, 20)), 
}


dtc = DecisionTreeClassifier(random_state=42)
# 3-fold cross-validation scored by accuracy, using all available cores
tree_cv = GridSearchCV(dtc, params, scoring="accuracy", n_jobs=-1, verbose=1, cv=3)
tree_cv.fit(X_train, y_train)
best_params = tree_cv.best_params_
# The original message had a stray ")" after the dict and misspelled "parameters"
print(f"Best parameters: {best_params}")

In [None]:
#Building the model with the best params found by the grid search
# Reuse the seed the search estimator was created with: without it the refit tree can
# differ from the one that was actually tuned (e.g. when splitter="random" is selected)
dtc_best = DecisionTreeClassifier(random_state=42, **best_params)
dtc_best.fit(X_train, y_train)
#Accuracy score of train
y_train_pred = dtc_best.predict(X_train)
# Positive-class probability for every training row ([0,1] returned one scalar only)
y_train_prob = dtc_best.predict_proba(X_train)[:, 1]
print("Accuracy score for Train:",accuracy_score(y_train, y_train_pred))
#ROC_AUC score on the held-out test set
y_test_pred_DTC = dtc_best.predict(X_test)
y_test_prob_DTC = dtc_best.predict_proba(X_test)[:,1]
print("ROC_AUC score for Decision Tree Classifier: ",roc_auc_score(y_test, y_test_prob_DTC))
#Plotting confusion matrix (rows normalised over the true labels)
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; on newer versions
# use ConfusionMatrixDisplay.from_estimator with the same arguments. The import is kept
# here because the later model cells call plot_confusion_matrix as well.
from sklearn.metrics import plot_confusion_matrix
plot_confusion_matrix(dtc_best, X_test, y_test,normalize="true", cmap="Blues")

# Keep the ROC curve points for the final comparison plot
DTCfpr, DTCtpr, DTCthresholds = roc_curve(y_test, y_test_prob_DTC)

In [None]:
#Rendering the tuned decision tree as an image in order to get insights
from IPython.display import Image
from io import StringIO
from sklearn.tree import export_graphviz
import pydot

# Feature names: every dataframe column except the target
features = df.columns.tolist()
features.remove("Attrition")

# Export the tree in Graphviz DOT format into an in-memory text buffer
dot_data = StringIO()
export_graphviz(
    dtc_best,
    out_file=dot_data,
    feature_names=features,
    filled=True,
)

# Parse the DOT text and render the first graph to PNG bytes for inline display
graph = pydot.graph_from_dot_data(dot_data.getvalue())
png_bytes = graph[0].create_png()
Image(png_bytes)

Logistic Regression

In [None]:
#Applying Logistic Regression model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
log_reg_model = LogisticRegression(max_iter=1000, solver = "newton-cg")
log_reg_model.fit(X_train, y_train)
#Accuracy score of train
y_train_pred = log_reg_model.predict(X_train)
# Positive-class probability for every training row; the previous [0,1] index
# returned only the single value for the first row
y_train_prob = log_reg_model.predict_proba(X_train)[:, 1]
print("Accuracy score for Train:",accuracy_score(y_train, y_train_pred))
#ROC_AUC score on the held-out test set
y_test_pred_LR = log_reg_model.predict(X_test)
y_test_prob_LR = log_reg_model.predict_proba(X_test)[:,1]
print("ROC_AUC score for Logistic Regression: ",roc_auc_score(y_test, y_test_prob_LR))
print(classification_report(y_test, y_test_pred_LR))
#Plotting confusion matrix (rows normalised over the true labels)
plot_confusion_matrix(log_reg_model, X_test, y_test, normalize="true", cmap="Blues")
# Keep the ROC curve points for the final comparison plot
LRfpr, LRtpr, LRthresholds = roc_curve(y_test, y_test_prob_LR)


Random Forest Classifier

In [None]:
#Applying Random Forest Classifier model
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=700,max_depth=10,n_jobs=-1,random_state=123)
rfc.fit(X_train,y_train)
#Accuracy score of train
y_train_pred = rfc.predict(X_train)
# Positive-class probability for every training row; the previous [0,1] index
# returned only the single value for the first row
y_train_prob = rfc.predict_proba(X_train)[:, 1]
print("Accuracy score for Train:",accuracy_score(y_train, y_train_pred))
#ROC_AUC score on the held-out test set
y_test_pred_RFC = rfc.predict(X_test)
y_test_prob_RFC = rfc.predict_proba(X_test)[:,1]
print("ROC_AUC score for Random Forest Classifier: ",roc_auc_score(y_test, y_test_prob_RFC))
#Plotting confusion matrix (rows normalised over the true labels)
plot_confusion_matrix(rfc, X_test, y_test, normalize="true", cmap="Blues")
# Keep the ROC curve points for the final comparison plot
RFCfpr, RFCtpr, RFCthresholds = roc_curve(y_test, y_test_prob_RFC)

In [None]:
#Plotting importance of attributes that are responsible for attrition according to the RFC model
def plot_feature_importance(importance,names,model_type):
    """Draw a bar chart of feature importances, sorted from largest to smallest.

    Parameters
    ----------
    importance : array-like
        One importance value per feature.
    names : array-like
        Feature names, in the same order as ``importance``.
    model_type : str
        Text prepended to the chart title.
    """
    # Pair each name with its importance and rank the pairs, most important first
    ranked = pd.DataFrame(
        {
            'feature_names': np.array(names),
            'feature_importance': np.array(importance),
        }
    ).sort_values(by=['feature_importance'], ascending=False)

    plt.figure(figsize=(10,8))
    # Importance on the x axis, one bar per feature name on the y axis
    sns.barplot(x=ranked['feature_importance'], y=ranked['feature_names'])
    plt.title(model_type + ' FEATURE IMPORTANCE')
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')


plot_feature_importance(rfc.feature_importances_,X_train.columns,'RANDOM FOREST')

Results using ROC AUC score graph

In [None]:
# Overlay the ROC curves of the three models on a single figure
plt.figure(dpi=150)

# (false positive rates, true positive rates, line colour, legend label) per model
roc_curves = [
    (DTCfpr, DTCtpr, 'orange', 'Decision Tree Classifier ROC'),
    (LRfpr, LRtpr, 'blue', 'Logistic Regression ROC'),
    (RFCfpr, RFCtpr, 'red', 'Random Forest Classifier ROC'),
]
for fpr_values, tpr_values, line_colour, legend_label in roc_curves:
    plt.plot(fpr_values, tpr_values, color=line_colour, label=legend_label)

# Dashed diagonal as a visual reference line
plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')

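**Note: we apologize for the distracting grid lines drawn over the confusion-matrix plots above. We have not identified the cause (it is likely the seaborn style enabled earlier with `sns.set(...)`, which turns on axis grids); the grid does not appear in the final report.**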