In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the /kaggle/input tree and print the full path of every file found,
# so the dataset location can be copied into the read_csv call.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
df_company = pd.read_csv('../input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv')

In [None]:
df_company.info()


### Let us check for null values
- Let us see the null values
- We will have to treat them before training the model

In [None]:
# Show the number of missing values per column. The call is wrapped in
# display() because a notebook only renders the LAST expression of a cell.
display(df_company.isnull().sum())

# Encode the target as 0/1: 'No' (any casing) -> 0, every other label -> 1.
# The dtype guard keeps the cell re-runnable: once the column is numeric the
# string comparison is skipped instead of failing on a non-string value.
if not pd.api.types.is_numeric_dtype(df_company['Attrition']):
    df_company['Attrition'] = df_company['Attrition'].str.lower().ne('no').astype(int)
df_company['Attrition'].value_counts()

### Get categorical and numeric columns from the data set

In [None]:
def identify_cat(df):
    """Split the columns of ``df`` into numeric-like and categorical names.

    A column counts as numeric-like ("continuous") when every one of its
    values can be passed through ``float()``; otherwise it is categorical.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are classified. It is not modified.

    Returns
    -------
    tuple[list, list]
        ``(con, cat)`` - numeric-like column names and categorical column
        names, each in the original column order.
    """
    con = []
    cat = []
    for col in df.columns:
        try:
            df[col].apply(float)
        except (TypeError, ValueError):
            # Only conversion failures mean "categorical"; anything else
            # (e.g. KeyboardInterrupt) must propagate instead of being hidden.
            cat.append(col)
        else:
            con.append(col)
    return con, cat

In [None]:
con,cat = identify_cat(df_company)

In [None]:
con

In [None]:
cat

In [None]:
# One box plot per numeric-like column, split by the Attrition flag.
# The number of grid rows is derived from len(con) so the layout keeps
# working when the set of numeric columns changes (a fixed 9x3 grid raises
# as soon as there are more than 27 columns to draw).
N_PLOT_COLS = 3
n_plot_rows = -(-len(con) // N_PLOT_COLS)  # ceiling division without importing math
plt.figure(figsize=(30,60))
for i, col in enumerate(con, start=1):
    plt.subplot(n_plot_rows, N_PLOT_COLS, i)
    sns.boxplot(data=df_company, x='Attrition', y=col)

In [None]:
# Remove StandardHours and EmployeeCount from the frame in a single call.
# (Optional sanity check before dropping: df_company['StandardHours'].value_counts())
df_company = df_company.drop(columns=['StandardHours', 'EmployeeCount'])

In [None]:
def concatenate(df,cat_cols):
    """Replace each column named in ``cat_cols`` with its dummy indicators.

    For every listed column the indicators are produced with
    ``pd.get_dummies(..., drop_first=True, prefix=<column name>)``, appended
    on the right of the frame, and the source column is removed. The input
    frame is not modified; a new frame is returned.
    """
    encoded = df
    for name in cat_cols:
        indicators = pd.get_dummies(encoded[name], prefix=name, drop_first=True)
        remaining = encoded.drop(columns=name)
        encoded = pd.concat([remaining, indicators], axis=1)
    return encoded

In [None]:
df_company =  concatenate(df_company,cat)

### Let us check whether `EmployeeNumber` is unique for every row

In [None]:
# True when no EmployeeNumber value repeats: the number of rows equals the
# number of distinct values collected into a set.
len(df_company['EmployeeNumber'] ) == len(set(df_company['EmployeeNumber']))

### We have to drop the Employee Number

In [None]:
df_company = df_company.drop('EmployeeNumber',axis=1)

## Importing Libraries for model building

In [None]:
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE

In [None]:
y = df_company.pop('Attrition')

In [None]:
X_train,X_test,y_train,y_test = train_test_split(df_company,y,train_size =0.8, random_state=40)

In [None]:
# Fit the min-max scaler on the training split only and reuse it for the test
# split. Fresh DataFrames are built (same columns and index) instead of
# assigning the scaled array back into the frames returned by
# train_test_split, which avoids SettingWithCopyWarning and dtype clashes
# with the non-float dummy columns.
sc = MinMaxScaler()
X_train = pd.DataFrame(sc.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(sc.transform(X_test), columns=X_test.columns, index=X_test.index)

In [None]:
rfe = RFE(LogisticRegression(),n_features_to_select=20)

In [None]:

rfe = rfe.fit(X_train,y_train)
#X_train_sm = sm.add_constant(X_train)

In [None]:
# Keep only the features flagged by the fitted RFE selector (rfe.support_ is
# used as a mask over the training columns) in an independent copy.
columns = X_train.columns[rfe.support_]
X_train_sm = X_train[columns].copy()

### Add the intercept (constant) column to the RFE-selected training features

In [None]:
#X_test_sm = X_test[X_test_sm.columns[rfe.support_]]

X_train_sm = sm.add_constant(X_train_sm)

In [None]:
X_train_sm

In [None]:
lm = sm.GLM(y_train,X_train_sm,family=sm.families.Binomial()).fit()
lm.summary()

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

def check_vif(df):
    """Return a two-column frame with the variance inflation factor per column.

    ``Columns`` holds the column labels of ``df`` and ``VIF`` the value of
    ``variance_inflation_factor`` computed on ``df.values`` for the column at
    the same position.
    """
    design = df.values
    vif_values = [variance_inflation_factor(design, position)
                  for position in range(df.shape[1])]
    return pd.DataFrame({'Columns': df.columns, 'VIF': vif_values})

In [None]:
vif = check_vif(X_train_sm)
vif.sort_values(by='VIF',ascending=False)

## P-values and VIFs are stable, so we can now train the model
1. We use the same model 
2. Choose cutoff

In [None]:
y_train_pred = lm.predict(X_train_sm)

def matrix(cuttoffs,predictions,true_val):
    """Build one 2x2 confusion matrix per probability cutoff.

    Parameters
    ----------
    cuttoffs : sequence of float
        Thresholds to evaluate. (The misspelled name is kept so existing
        callers are unaffected; the body now really uses this argument
        instead of silently reading a global called ``cutoffs``.)
    predictions : pandas.Series
        Predicted probabilities of the positive class.
    true_val : array-like
        True 0/1 labels aligned with ``predictions``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(cuttoffs), 2, 2)``; slice ``i`` is the
        confusion matrix obtained by labelling ``predictions >= cuttoffs[i]``
        as 1 and everything else as 0.
    """
    # Size the result from the input instead of assuming exactly 10 cutoffs.
    result = np.zeros((len(cuttoffs), 2, 2))
    for i, c in enumerate(cuttoffs):
        predicted = (predictions >= c).astype(int)
        # labels=[0, 1] guarantees a 2x2 result even when a class is absent.
        result[i, :, :] = confusion_matrix(true_val, predicted, labels=[0, 1])
    return result
        

### Let us print metrics for different cutoffs

In [None]:
cutoffs = np.linspace(0.0,0.9,10)
cf = matrix(cutoffs,y_train_pred,y_train)

In [None]:
def metrics(cf, cutoffs=None):
    """Print accuracy, recall, precision and specificity for each matrix.

    Parameters
    ----------
    cf : numpy.ndarray
        Stack of 2x2 confusion matrices, shape ``(n, 2, 2)``, laid out the
        way ``sklearn.metrics.confusion_matrix`` returns them (index
        ``[0, 0]`` = true negatives, ``[1, 1]`` = true positives).
    cutoffs : sequence of float, optional
        Cutoff that produced each matrix, used only for the printed label.
        When omitted, the label falls back to ``0.1 * i`` as before, which is
        only correct for cutoffs spaced 0.1 apart starting at 0.

    Returns
    -------
    None
        The metrics are printed, one line per matrix.
    """
    for i in range(cf.shape[0]):
        tn, fp = cf[i,0,0], cf[i,0,1]
        fn, tp = cf[i,1,0], cf[i,1,1]
        acc = (tn + tp)/(tn + fp + fn + tp)
        recall = tp/(tp + fn)
        precision = tp/(tp + fp)
        specificity = tn/(tn + fp)
        cutoff = 0.1*i if cutoffs is None else float(cutoffs[i])
        print(f"cutoff = {round(cutoff,2)} -- acc={acc} -- recall={recall} -- precision={precision} -- specificity={specificity}\n\n")
        
metrics(cf)

### The data is imbalanced let us try SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class of the RFE-selected training data.
# `n_jobs` is not passed: it is deprecated on SMOTE in recent imbalanced-learn
# releases. No prediction is made here either, because `lm` is still the model
# fitted before resampling; it is refitted and used for prediction in the next cell.
smote = SMOTE(random_state=1)
X_train_sm,y_train = smote.fit_resample(X_train_sm,y_train)

In [None]:
lm = sm.GLM(y_train,X_train_sm,family=sm.families.Binomial()).fit()
y_train_pred = lm.predict(X_train_sm)
cf = matrix(cutoffs,y_train_pred,y_train)
cf

In [None]:
metrics(cf)

## cutoff 0.5 seems to be best by observing metrics

In [None]:
X_test_sm = X_test[X_test.columns[rfe.support_]]
X_test_sm = sm.add_constant(X_test_sm)

## Performing predictions

In [None]:
y_test_pred = lm.predict(X_test_sm)
y_test_pred = y_test_pred.apply(lambda x: 1 if x >= 0.5 else 0)

In [None]:
cf = confusion_matrix(y_test,y_test_pred)

In [None]:
# Show the test-set confusion matrix (a bare `cf` in the middle of a cell is
# not rendered), then derive the summary metrics from it.
print(cf)
acc = (cf[0,0] + cf[1,1])/(cf[0,0] + cf[0,1]+ cf[1,0] + cf[1,1])
recall = (cf[1,1])/(cf[1,1] + cf[1,0])
precision = (cf[1,1])/(cf[1,1] + cf[0,1])
specificity = (cf[0,0])/(cf[0,0] + cf[0,1])
# The label reports 0.5 because that is the threshold applied to the test
# probabilities when y_test_pred was binarised.
print(f"cutoff = 0.5 -- acc={acc} -- recall={recall} -- precision={precision} -- specificity={specificity}\n\n")

### Let us try Decision Trees

In [None]:
# Hyper-parameter grid for the decision tree: 4 x 4 x 3 = 48 combinations.
params = {
    'max_depth': [5,10,20,50],
    'min_samples_split': [10,20,50,100],
    'min_samples_leaf': [10,20,100],
    #'max_leaf_nodes': [10,50,100,300]
    
}
# Fixed random_state so the fitted tree is reproducible.
dt = DecisionTreeClassifier(random_state=100)

from sklearn.model_selection import GridSearchCV

# 5-fold cross-validated search scored by ROC AUC, using all available cores.
# The search is only configured here; it is fitted in a later cell.
dt_grid = GridSearchCV(estimator=dt,param_grid=params,cv=5,verbose=10,n_jobs=-1,scoring='roc_auc')


In [None]:
# Re-split the (unscaled) feature frame with the same seed as before, then
# oversample the training part only and run the decision-tree grid search.
X_train,X_test,y_train,y_test = train_test_split(df_company,y,train_size =0.8, random_state=40)
# The sampler is bound to `smote`, not `sm`: `sm` is the statsmodels alias
# used by earlier cells and must not be overwritten. A fixed random_state
# makes the synthetic samples, and so every downstream score, reproducible.
smote = SMOTE(random_state=1)
X_train,y_train = smote.fit_resample(X_train,y_train)
dt_grid = dt_grid.fit(X_train,y_train)

In [None]:
# GridSearchCV was created with its default refit=True, so best_estimator_
# is already fitted on the full (resampled) training data; fitting it again
# on the same data would only repeat that work.
dt = dt_grid.best_estimator_

In [None]:
y_test_pred = dt.predict(X_test)

cf = confusion_matrix(y_test,y_test_pred)

### Let us check the metrics on decision tree

In [None]:
from sklearn.metrics import accuracy_score,precision_score,recall_score
print(accuracy_score(y_test,y_test_pred), precision_score(y_test,y_test_pred), recall_score(y_test,y_test_pred))

### The decision tree didn't predict the positive class well. This is due to the imbalanced classes

In [None]:
# Random forest with a fixed seed; oob_score=True makes the fitted model
# expose an out-of-bag score (printed after prediction further down).
rf = RandomForestClassifier(random_state=500,oob_score=True)

# Hyper-parameter grid: 3 x 3 x 3 x 2 x 2 = 108 combinations.
params = {
    "max_depth": [5,10,20],
    'min_samples_split': [10,50,100],
    'min_samples_leaf': [10,50,100],
    "max_features": [10,20],
    "n_estimators": [100,200]
    #'max_leaf_nodes': [10,50,100,500]
    
}



# 5-fold cross-validated search scored by balanced accuracy, using all cores.
# The search is only configured here; it is fitted in the next cell.
rf_grid = GridSearchCV(estimator=rf,param_grid=params,cv=5,scoring='balanced_accuracy',verbose=10,n_jobs=-1)

In [None]:
rf_grid = rf_grid.fit(X_train,y_train)
rf = rf_grid.best_estimator_

In [None]:
y_test_pred = rf.predict(X_test)
print(accuracy_score(y_test,y_test_pred),recall_score(y_test,y_test_pred),precision_score(y_test,y_test_pred))
print(rf.oob_score_)

In [None]:
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier

In [None]:
# Hyper-parameter grid for AdaBoost (3 x 3 = 9 combinations). It has its own
# name so `ad_grid` always refers to the search object, never to a dict.
ad_params = {
    "n_estimators": [50,100,200],
    'learning_rate': [0.01,0.05,0.1]
}
ad = AdaBoostClassifier(random_state=500)

# 5-fold cross-validated search scored by balanced accuracy.
ad_grid = GridSearchCV(estimator=ad,param_grid=ad_params,cv=5,scoring='balanced_accuracy',n_jobs=-1,verbose=10)

ad_grid = ad_grid.fit(X_train,y_train)

# Keep the best configuration found by the search as the final AdaBoost model.
ad = ad_grid.best_estimator_

In [None]:
y_test_pred = ad.predict(X_test)
print(accuracy_score(y_test,y_test_pred),recall_score(y_test,y_test_pred),precision_score(y_test,y_test_pred))

## Boosting is giving better results as compared to random forest and decision tree

In [None]:
# Hyper-parameter grid for gradient boosting (2 x 3 x 3 = 18 combinations).
# It has its own name so `gb_grid` always refers to the search object.
gb_params = {
    "n_estimators": [50,100],
    'learning_rate': [0.01,0.05,0.1],
    'max_depth': [3,5,10],
}
gb = GradientBoostingClassifier(random_state=500)

# 5-fold cross-validated search scored by balanced accuracy.
gb_grid = GridSearchCV(estimator=gb,param_grid=gb_params,cv=5,scoring='balanced_accuracy',n_jobs=-1,verbose=10)

gb_grid = gb_grid.fit(X_train,y_train)

# Keep the best configuration found by the search as the final model.
gb = gb_grid.best_estimator_

In [None]:
y_test_pred = gb.predict(X_test)
print(accuracy_score(y_test,y_test_pred),recall_score(y_test,y_test_pred),precision_score(y_test,y_test_pred))

### Observation and Results:


- If accuracy is the key metric, then the tree models will work well
- Here we also want to give importance to both classes, so I would prefer logistic regression, which gave me a good recall score
- We can play with the cutoff value; I have taken 0.5
- If the focus is only on accuracy, the model will predict everything as `0` and the predictions will be biased
- Hence I have used SMOTE() for the imbalanced data
- The tree models (decision tree, random forest, boosting) seem to have overfitted because of the low volume of data; if the data set were larger, the trees would perform better
