In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation and data-conversion warnings
# emitted by pandas / scikit-learn further down — confirm that is intended.
warnings.filterwarnings('ignore')

In [None]:
import os

In [None]:
# Locate the dataset folder under ../input/. os.listdir() returns entries in
# arbitrary order, so the listing is sorted to make the choice deterministic,
# and a folder that actually contains the CSV is preferred over the first one.
input_folders = sorted(os.listdir('../input/'))
folder_name = next(
    (name for name in input_folders
     if os.path.isfile(os.path.join('../input/', name, 'HR_comma_sep.csv'))),
    input_folders[0],  # fall back to the first folder, as before
)
folder_name

In [None]:
# Read the HR dataset from the folder resolved above and display it.
csv_path = f'../input/{folder_name}/HR_comma_sep.csv'
df = pd.read_csv(csv_path)

df


In [None]:
df.describe(percentiles=[.05,.25,.5,.75,.90,.95,.98,1])


In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
df.count()

In [None]:
df.nunique()

In [None]:
# Pairwise correlations of the numeric columns. The string columns are
# excluded explicitly: pandas >= 2.0 no longer drops them silently and raises.
df.select_dtypes(include='number').corr()

In [None]:
# Annotated heatmap of the correlations between the numeric columns.
# Non-numeric columns are filtered out first because pandas >= 2.0 raises when
# corr() meets string data.
plt.figure(figsize=(10,10))

sns.heatmap(df.select_dtypes(include='number').corr(),annot=True)

In [None]:
pd.value_counts(df['salary'])

In [None]:
salary_value = np.array(pd.value_counts(df['salary']).index)
salary_value

In [None]:
df1  = df.copy()

In [None]:
df1['salary'] = df1['salary'].map({salary_value[0]:0,salary_value[1]:1,salary_value[2]:2})

In [None]:
df1

In [None]:
pd.value_counts(df['Department'])

In [None]:
department_values = np.array(pd.value_counts(df['Department']).index)
department_values

In [None]:
df.columns =[i.lower() for i in df.columns]
df1.columns =[i.lower() for i in df1.columns]

In [None]:
df.columns

In [None]:
df1.columns

In [None]:
dummy = pd.get_dummies(df1['department'],drop_first=True)

In [None]:
dummy.columns = [i.lower() for i in dummy.columns]

In [None]:
dummy.head(10)

In [None]:
df1 = pd.concat([df1,dummy],axis=1)

In [None]:
df1

In [None]:
df1 = df1.drop(['department'],axis=1)

In [None]:
df1

In [None]:
left_0 = df[df['left']==0]
left_0

In [None]:
left_1 = df[df['left']==1]
left_1

In [None]:
count_dic = {0 : pd.value_counts(left_0['salary']).to_numpy(),
             1 : pd.value_counts(left_1['salary']).to_numpy()}

In [None]:
count_dic['name'] =  salary_value
count_dic

In [None]:
# Grouped bar chart (log-scaled y axis) of the salary-level counts, one bar per
# value of `left`.
bar_width = 0.25
fig = plt.figure()
x = np.arange(0,len(salary_value))
ax = fig.add_axes([0,0,1,1])
ax.bar(x, count_dic[0], width=bar_width)
ax.bar(x + bar_width, count_dic[1], width=bar_width)
ax.set_yscale('log')
# Centre each tick between the two bars of its group. Deriving the positions
# from x keeps them correct for any number of salary levels.
ax.set_xticks(x + bar_width / 2)
ax.set_xticklabels(salary_value)
ax.set_xlabel('salary')
ax.set_ylabel('count')
ax.legend([0,1], title='left')
plt.show()

In [None]:
dummy['left'] = df['left']

In [None]:
plt.figure(figsize=(10,10))

sns.heatmap(dummy.corr(),annot=True)

In [None]:
dummy_salary = pd.get_dummies(df['salary'])
dummy_salary['left'] = df['left']
dummy_salary.corr()

In [None]:
plt.figure(figsize=(10,10))

sns.heatmap(dummy_salary.corr(),annot=True)

In [None]:
fig,plot = plt.subplots(1,2,figsize=(16,6))

# Correlation of each salary indicator with `left`, reordered to match the
# `salary_value` tick labels. get_dummies() orders its columns alphabetically
# while `salary_value` is frequency-ordered, so slicing the first three rows by
# position would pair the bars with the wrong labels.
salary_corr = dummy_salary.corr()['left'].reindex(salary_value)
salary_pos = np.arange(len(salary_corr))
plot[0].bar(salary_pos,salary_corr.values,color='c')
plot[0].set_xticks(salary_pos)
plot[0].set_xticklabels(salary_value)
plot[0].set_xlabel('salary')
plot[0].set_ylabel('correlation')
plot[0].set_title('left/salary relation')

# Correlation of each department indicator with `left`. The target's own row is
# dropped by name, so the number of departments is not hard-coded.
dept_corr = dummy.corr()['left'].drop('left')
dept_pos = np.arange(len(dept_corr))
plot[1].bar(dept_pos,dept_corr.values,color='m')
plot[1].set_xticks(dept_pos)
plot[1].set_xticklabels(dept_corr.index,rotation='vertical')
plot[1].set_xlabel('department')
plot[1].set_ylabel('correlation')
plot[1].set_title('left/department relation')
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# reproducible. The target is kept as a one-column DataFrame.
features = df1.drop(columns='left')
target = df1[['left']]
x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.3, random_state=100
)


In [None]:
import statsmodels.api as sm

In [None]:
x_train_constant = sm.add_constant(x_train)

In [None]:
model = sm.GLM(y_train, x_train_constant, family=sm.families.Binomial())
model=model.fit()
model.summary()

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [None]:
logr = RFE(LogisticRegression(),10)
logr = logr.fit(x_train,y_train)
# logr.support_

In [None]:
rfe_col = x_train.columns[logr.support_]

In [None]:
# Refit the binomial GLM on the RFE-selected columns plus an intercept.
selected_features = x_train[rfe_col]
x_train_constant = sm.add_constant(selected_features)
model = sm.GLM(endog=y_train, exog=x_train_constant, family=sm.families.Binomial()).fit()
model.summary()

In [None]:
def vif(data):
    """Compute the variance inflation factor (VIF) of every column in ``data``.

    Each column is regressed on all remaining columns with OLS and its VIF is
    ``1 / (1 - R^2)``, rounded to two decimals. No intercept is added here, so
    include a constant column in ``data`` if one is wanted.

    Parameters
    ----------
    data : pandas.DataFrame
        Predictor columns to check for multicollinearity.

    Returns
    -------
    pandas.DataFrame
        Columns ``col_name`` and ``vif``, sorted by ``vif`` in descending
        order. A column that the others reproduce exactly gets ``inf``.
    """
    records = []
    for col_name in data.columns:
        target = data[col_name]
        predictors = data.drop(col_name, axis=1)
        r2 = sm.OLS(target, predictors).fit().rsquared
        # R^2 == 1 means an exact linear combination; report inf instead of
        # dividing by zero.
        vif_value = float('inf') if r2 >= 1 else round(1 / (1 - r2), 2)
        records.append((col_name, vif_value))
    # Build the frame once rather than growing it row by row.
    result = pd.DataFrame(records, columns=['col_name', 'vif'])
    return result.sort_values(by='vif', ascending=False)

In [None]:
# Keep only the RFE-selected predictors, then check them for multicollinearity.
x_train = x_train.loc[:, rfe_col]
vif(x_train)

In [None]:
main_model = LogisticRegression()
main_model.fit(x_train,y_train)

In [None]:
x_test = x_test[rfe_col]

In [None]:
pred = main_model.predict(x_test)

In [None]:
pred_prob = main_model.predict_proba(x_test)

In [None]:
pred_prob

In [None]:
prediction = y_test
prediction.index = np.arange(0,len(y_test))

In [None]:
prediction['pred_prob']= pred_prob[:,0]

In [None]:
prediction['pred'] = prediction.pred_prob.apply(lambda x:0 if x>=0.5 else 1)

In [None]:
prediction

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
accuracy_score(prediction['left'],prediction['pred'])

In [None]:
accuracy_score(y_test['left'],pred)

In [None]:
confusion_matrix = confusion_matrix(prediction['left'],prediction['pred'])
confusion_matrix

In [None]:
TN = confusion_matrix[0,0]
FP = confusion_matrix[0,1]
TP = confusion_matrix[1,1]
FN = confusion_matrix[1,0]

In [None]:
# Recall / sensitivity: true positives over all actual positives (TP + FN).
# Dividing by TP + TN would mix in the true negatives.
TP/float(TP+FN) #recall, sensitivity

In [None]:
# Specificity: true negatives over all actual negatives (TN + FP).
TN / (TN + FP)