In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Importing Libraries**

In [None]:
# Packages / libraries
import matplotlib
# Point matplotlib at the ipykernel inline backend (the commented-out
# "%matplotlib inline" magic further down is the usual way to request this).
# NOTE(review): newer ipykernel releases ship this backend as
# matplotlib_inline.backend_inline - confirm this module path still resolves
# in the target environment.
matplotlib.rcParams['backend'] = 'module://ipykernel.pylab.backend_inline'
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, explained_variance_score, confusion_matrix, accuracy_score, classification_report, log_loss
from math import sqrt
#%matplotlib inline
# To install sklearn, run "pip install numpy scipy scikit-learn" in the Anaconda terminal
# Print numpy floats in fixed-point notation instead of scientific notation
np.set_printoptions(formatter={'float_kind':'{:f}'.format})
# Increases the default figure size of seaborn plots to 12 x 10
sns.set(rc={'figure.figsize':(12,10)})
# import sys
# !conda list Check the packages installed

In [None]:
# Load the HR analytics dataset from the Kaggle input directory
hr_csv_path = "../input/hr-analytics/HR_comma_sep.csv"
df = pd.read_csv(hr_csv_path)

# **Exploratory Data Analysis**

In [None]:
# Preview only the first rows instead of dumping the whole frame;
# the full dimensions are shown by the df.shape cell further down.
df.head()

In [None]:
# Summary statistics for the columns of df
df.describe()

In [None]:
# (number of rows, number of columns) of the raw dataset
df.shape

In [None]:
# Print the per-column summary (names, non-null counts, dtypes) of df
df.info()

In [None]:
# Count the missing values in every column
df.isna().sum()

In [None]:
# Distinct department names present in the data
df.Department.unique()

In [None]:
# Distinct salary bands present in the data
df.salary.unique()

**Average value of each numeric column, grouped by `left`**

In [None]:
# Mean of every numeric column, split by the `left` flag.
# numeric_only=True is needed on pandas >= 2.0, where mean() no longer silently
# drops the string columns (Department, salary) and raises TypeError instead.
df.groupby('left').mean(numeric_only=True)

***0 means the employee was retained, 1 means the employee left***

From the data above we can point out that:
1. If the **satisfaction level** is *low*, employees tend to leave.
2. If the **average monthly hours** are high, employees tend to leave.
3. **Promotion_last_5years**: employees who did not get a promotion tend to leave.


## **Impact of salary**

In [None]:
# Cross-tabulate salary band against the `left` flag and draw it as grouped bars
salary_vs_left = pd.crosstab(df["salary"], df["left"])
salary_vs_left.plot.bar()

**From the chart we can see that most employees with a high salary are retained.**

**Impact of Department**

In [None]:
# Cross-tabulate department against the `left` flag and draw it as grouped bars
department_vs_left = pd.crosstab(df["Department"], df["left"])
department_vs_left.plot.bar()

From the chart above, it is not clear how much impact the department has on whether an employee leaves.

# Making categorical variables into numeric representation

In [None]:
# One-hot encode the two categorical columns; all other columns are carried over
categorical_columns = ['salary', 'Department']
new_data = pd.get_dummies(data=df, columns=categorical_columns)
new_data

In [None]:
# Compare the dimensions before and after one-hot encoding
for frame in (df, new_data):
    print(frame.shape)

# Feature Selection:

- Steps for running feature importance:
- Split the data into X & y
- Run a tree-based estimator (e.g. decision trees or random forests)
- Run feature importance

In [None]:
# Split the data into X & y

# Features: every column except the target
X = new_data.drop(columns=["left"])
print(X.shape)

# Target: the `left` column
y = new_data["left"]
print(y.shape)

# Making sure y is stored as integers
y = y.astype(int)

## **Run a tree-based estimator**

In [None]:
# Entropy-based decision tree, depth capped at 10, fitted on all features;
# the following cells read its feature_importances_
dt = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=10,
    random_state=15,
)
dt.fit(X, y)

In [None]:
# Running Feature Importance
# Walk the feature names and the tree's importance scores side by side,
# printing each pair and collecting both into parallel lists.
feature_names = new_data.drop(columns=["left"]).columns
fi_col = []
fi = []

for column, importance in zip(feature_names, dt.feature_importances_):
    print('The feature importance for {} is : {}'.format(column, importance))

    fi_col.append(column)
    fi.append(importance)

**Notes: We want to collect the importances in a DataFrame and select the features with the highest importance.**

In [None]:
# Creating a Dataframe
# One row per feature: its name and its importance score

fi_df = pd.DataFrame({'Feature': fi_col, 'Feature Importance': fi})
fi_df

In [None]:
# Ordering the data: most important feature first.
# drop=True discards the old positional index instead of inserting it as an
# extra "index" column; without it, re-running this cell raises
# "ValueError: cannot insert index, already exists".
fi_df = fi_df.sort_values('Feature Importance', ascending = False).reset_index(drop=True)

fi_df


In [None]:
# Creating columns to keep
# fi_df is sorted by importance, so its first 13 rows are the top-13 features
columns_to_keep = fi_df['Feature'].iloc[:13]

columns_to_keep

# **Hold out Validation**

In [None]:
# Dimensions of the full encoded frame vs. the frame restricted to the kept columns
for frame in (new_data, new_data[columns_to_keep]):
    print(frame.shape)


In [None]:
# Feature matrix as a plain numpy array, restricted to the selected columns
X = new_data[columns_to_keep].to_numpy()
X

In [None]:
# Target vector: the `left` column cast to int
y = new_data["left"].astype(int)

# Row counts of X and y should match
print(X.shape)
print(y.shape)

In [None]:
# Hold-out validation, done in two stages

# Stage 1: keep 80% of the rows for training, reserve 20% as the test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    train_size=0.8, test_size=0.2,
    random_state=15,
)

# Stage 2: split the training rows again, 90% train / 10% validation
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train,
    train_size=0.9, test_size=0.1,
    random_state=15,
)

# Report the size of every split: features first, then targets
for part in (X_train, X_test, X_valid, y_train, y_test, y_valid):
    print(part.shape)

In [None]:
# Investigating the distribution of y in each split so that we know whether the data is balanced

ax = sns.countplot(x =y_train, palette = "Set3")
# Title the figure so it can be told apart from the test / validation plots
ax.set_title("Class distribution of `left` - training set");


In [None]:
ax = sns.countplot(x =y_test, palette = "Set3")
# Title the figure so it can be told apart from the training / validation plots
ax.set_title("Class distribution of `left` - test set");

In [None]:
ax = sns.countplot(x =y_valid, palette = "Set3")
# Title the figure so it can be told apart from the training / test plots
ax.set_title("Class distribution of `left` - validation set");

The data does not look balanced.

# **Running Model**

In [None]:
# Training my model

# NOTE(review): the features are passed in unscaled and max_iter is left at its
# default - check this cell's output for a ConvergenceWarning.
log_reg = LogisticRegression(
    solver='lbfgs',
    random_state=10,
)

log_reg.fit(X_train, y_train)

In [None]:
# Methods we can use in Logistic

# predict - Predict class labels for samples in X.
# Called once only: the earlier version ran predict() twice and threw the
# first result away.
y_pred = log_reg.predict(X_train)
y_pred

In [None]:
# predict_proba - Probability estimates for the training samples
# (one row per sample; per the scikit-learn docs the columns follow log_reg.classes_)
pred_proba = log_reg.predict_proba(X_train)
pred_proba

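**Notes:**

We calculate the probabilities because each sample is labelled as class 0 or 1 based on them.

Each row of `pred_proba` holds [P(class 0), P(class 1)]; the class with the larger probability is the prediction:
P(class 1) > 0.5 → class 1
P(class 1) < 0.5 → class 0. First row: P(class 0) = 0.63 > 0.5, so the class is 0. Check y_pred: it's 0.

**Now we have to look at the coefficients**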

In [None]:
# coef_ - Coefficients of the features in the decision function of the fitted model
log_reg.coef_

# score - Returns the mean accuracy on the given data and labels (used in the evaluation section below)

# **Evaluating Model**

In [None]:
# Mean accuracy of the fitted model on the training split and on the test split
train_accuracy = log_reg.score(X_train, y_train)
test_accuracy = log_reg.score(X_test, y_test)

print("The Training Accuracy is: ", train_accuracy)
print("The Testing Accuracy is: ", test_accuracy)

In [None]:
# Classification Report
# Compares the true training labels with the training-set predictions in y_pred
training_report = classification_report(y_true=y_train, y_pred=y_pred)
print(training_report)

In [None]:
# Confusion Matrix function

def plot_confusion_matrix(cm, classes=None, title='Confusion matrix'):
    """Draw a confusion matrix as a seaborn heatmap on the current axes.

    Args:
        cm: Matrix of values to plot; the colour scale is fixed to [0, 1].
        classes: Optional class labels. When given, they are used as tick
            labels on both axes and the cells are annotated with their
            values on a "YlGnBu" colour map. When None, a plain heatmap
            is drawn.
        title: Title placed above the plot.

    Returns:
        None. The figure is modified through seaborn / pyplot.
    """
    # Options shared by both variants: pin the colour scale to [0, 1]
    heatmap_options = {'vmin': 0., 'vmax': 1.}
    if classes is not None:
        # Labelled variant: tick labels, colour map and large cell annotations
        heatmap_options.update(
            cmap="YlGnBu",
            xticklabels=classes,
            yticklabels=classes,
            annot=True,
            annot_kws={'size':50},
        )
    sns.heatmap(cm, **heatmap_options)

    plt.title(title)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
# Visualizing cm
# Counts of true vs. predicted labels on the training split

cm = confusion_matrix(y_true=y_train, y_pred=y_pred)
cm

In [None]:
# Row sums of the confusion matrix (one total per row); the next cell divides each row by its total
cm.sum(axis=1)

In [None]:
# Divide every row by its own total so that each row sums to 1
row_totals = cm.sum(axis=1, keepdims=True)
cm_norm = cm / row_totals
cm_norm

In [None]:
# What are the classes: the class labels stored on the fitted model
log_reg.classes_

In [None]:
# Heatmap of the row-normalised confusion matrix, with the model's class labels on both axes
plot_confusion_matrix(cm_norm, classes = log_reg.classes_, title='Confusion matrix')

- Of the actual 0s, 93% were predicted correctly as 0, while 7.3% were wrongly predicted as 1.

- Of the actual 1s, 68% were wrongly predicted as 0 and only 32% were predicted correctly as 1.

In [None]:
# Calculating False Positives (FP), False Negatives (FN), True Positives (TP) & True Negatives (TN)
# Every quantity is an array with one entry per class, where that class is
# treated as the "positive" one and everything else as "negative".

FP = cm.sum(axis=0) - np.diag(cm)   # column total minus diagonal: predicted as the class but actually another
FN = cm.sum(axis=1) - np.diag(cm)   # row total minus diagonal: actually the class but predicted as another
TP = np.diag(cm)                    # diagonal: predicted and actual agree
TN = cm.sum() - (FP + FN + TP)      # everything that does not involve the class

# Sensitivity, hit rate, recall, or true positive rate
TPR = TP / (TP + FN)
print("The True Positive Rate is:", TPR)

# Precision or positive predictive value
PPV = TP / (TP + FP)
print("The Precision is:", PPV)

# False positive rate or False alarm rate
FPR = FP / (FP + TN)
print("The False positive rate is:", FPR)


# False negative rate or Miss Rate
FNR = FN / (FN + TP)
print("The False Negative Rate is: ", FNR)



## Total averages :
# .mean() averages over however many classes the matrix has, instead of
# dividing by a hard-coded 2.
print("")
print("The average TPR is:", TPR.mean())
print("The average Precision is:", PPV.mean())
print("The average False positive rate is:", FPR.mean())
print("The average False Negative Rate is:", FNR.mean())

**False Positive rate is quite high**

# Logarithmic loss - or Log Loss - or cross-entropy loss

In [None]:
# Log loss on the training split, from the probabilities in pred_proba
train_log_loss = log_loss(y_train, pred_proba)
print("The Log Loss on Training is: ", train_log_loss)

# Log loss on the test split, from freshly predicted probabilities
pred_proba_test = log_reg.predict_proba(X_test)
test_log_loss = log_loss(y_test, pred_proba_test)
print("The Log Loss on Testing Dataset is: ", test_log_loss)

**The log loss is about the same for the training and testing sets.**

# Hyper Parameter Tuning

- We will loop over parameter C (Inverse of regularization strength).
- Regularization helps to avoid overfitting by penalizing large parameter values; because C is the *inverse* of the regularization strength, smaller values of C mean stronger regularization
- It also helps to find Global Minimum by moving to better "solutions" from local minimum to global minimum
- The values of C to search should be n-equally-spaced values in log space ranging from 1e-5 to 1e5

In [None]:
# 20 candidate values for C between 1e-5 and 1e5, generated with np.geomspace
np.geomspace(1e-5, 1e5, num=20)

In [None]:
# Creating a range for C values
# Each grid is built once and reused; the earlier version recomputed the same
# arrays and had a bare expression statement that produced no output.
c_values_log = np.geomspace(1e-5, 1e5, num=20)     # uniformly distributed in log space
c_values_linear = np.linspace(1e-5, 1e5, num=20)   # uniformly distributed in linear space, instead of log space

# plotting it, with labels so the two curves can be told apart
plt.plot(c_values_log, label="geomspace (log-spaced)")
plt.plot(c_values_linear, label="linspace (linearly spaced)")
plt.xlabel("Position in the grid")
plt.ylabel("C value")
plt.legend();

In [None]:
# Looping over the parameters
# For every candidate C: fit a model on the training split, then record its
# accuracy and log loss on the test split.

C_List = np.geomspace(1e-5, 1e5, num=20)
CA = []
Logarithmic_Loss = []

for c in C_List:
    log_reg2 = LogisticRegression(C=c, solver='lbfgs', random_state=10).fit(X_train, y_train)

    # Both metrics are computed on the test split
    score = log_reg2.score(X_test, y_test)
    pred_proba_t = log_reg2.predict_proba(X_test)
    log_loss2 = log_loss(y_test, pred_proba_t)

    CA.append(score)
    Logarithmic_Loss.append(log_loss2)

    print("The CA of C parameter {} is {}:".format(c, score))
    print("The Logg Loss of C parameter {} is {}:".format(c, log_loss2))
    print("")

**We want to choose the C with the lowest logarithmic loss and the highest classification accuracy.**

In [None]:
# putting the outcomes in a Table

# CA and Logarithmic_Loss are flat lists with one entry per C value, so a plain
# array conversion is enough. The earlier reshape(20,) hard-coded the grid size
# and would fail as soon as the number of C candidates changed.
CA2 = np.asarray(CA)
Logarithmic_Loss2 = np.asarray(Logarithmic_Loss)

# One row per candidate C with its test accuracy and test log loss
df_outcomes = pd.DataFrame(
    {"C_List": C_List, "CA2": CA2, "Logarithmic_Loss2": Logarithmic_Loss2}
)

# Ordering the data (sort_values): lowest log loss first.
# The "index" column produced by reset_index() shows each row's position in the C grid.
df_outcomes.sort_values("Logarithmic_Loss2", ascending = True).reset_index()

In [None]:
# Another way we can do it: let LogisticRegressionCV search the C grid itself

from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import KFold
kf = KFold(n_splits=3, random_state=0, shuffle=True)

# Logistic Reg CV
# cv=kf wires in the shuffled 3-fold splitter built above; previously kf was
# created but never passed, so the estimator silently used its default CV.
Log_reg3 = LogisticRegressionCV(random_state=15, Cs = C_List, solver ='lbfgs', cv=kf)
Log_reg3.fit(X_train, y_train)

# Log loss of the cross-validated model on the test split
pred_proba_t = Log_reg3.predict_proba(X_test)
log_loss3 = log_loss(y_test, pred_proba_t)


In [None]:
# Test accuracy, test log loss and the C selected by the cross-validated model
cv_accuracy = Log_reg3.score(X_test, y_test)
print("The CA is:", cv_accuracy)

print("The Logistic Loss is: ", log_loss3)

print("The optimal C parameter is: ", Log_reg3.C_)

In [None]:
# Maybe we have a different metric we want to track

# Looping over the parameters
# Same search as before, but a row-normalised confusion matrix is also drawn
# for every candidate C.

C_List = np.geomspace(1e-5, 1e5, num=20)
CA = []
Logarithmic_Loss = []

for c in C_List:
    log_reg2 = LogisticRegression(random_state=10, solver = 'lbfgs', C=c)
    log_reg2.fit(X_train, y_train)
    score = log_reg2.score(X_test, y_test)
    CA.append(score)
    print("The CA of C parameter {} is {}:".format(c, score))
    pred_proba_t = log_reg2.predict_proba(X_test)
    log_loss2 = log_loss(y_test, pred_proba_t)
    Logarithmic_Loss.append(log_loss2)
    print("The Logg Loss of C parameter {} is {}:".format(c, log_loss2))
    print("")

    # Confusion matrix of this model on the training split, each row scaled to sum to 1
    y_pred = log_reg2.predict(X_train)
    cm = confusion_matrix(y_train, y_pred)
    cm_norm = cm / cm.sum(axis=1).reshape(-1,1)
    # Take the labels from the model fitted in this iteration (not the earlier
    # log_reg) and put C in the title so the 20 figures can be told apart.
    plot_confusion_matrix(cm_norm, classes = log_reg2.classes_, title='Confusion matrix (C = {:g})'.format(c))
    plt.show()

In [None]:
# Training a Dummy Classifier

from sklearn.dummy import DummyClassifier

# Baseline using the "most_frequent" strategy, fitted on the training split
dummy_clf = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)

# Accuracy and log loss of the baseline on the test split
pred_proba_t = dummy_clf.predict_proba(X_test)
score = dummy_clf.score(X_test, y_test)
log_loss2 = log_loss(y_test, pred_proba_t)

print("Testing Acc:", score)
print("Log Loss:", log_loss2)


In [None]:
# Final Model

# NOTE(review): C is hard-coded - presumably the value picked from the search
# above; confirm it still matches the outcomes table after any re-run.
final_c = 784.759970
log_reg3 = LogisticRegression(C=final_c, solver='lbfgs', random_state=10).fit(X_train, y_train)

# Evaluate on the validation split, which is not used for fitting or tuning in the cells above
pred_proba_t = log_reg3.predict_proba(X_valid)
score = log_reg3.score(X_valid, y_valid)
log_loss2 = log_loss(y_valid, pred_proba_t)

print("Testing Acc:", score)
print("Log Loss:", log_loss2)

**Notes: We tested the final model on totally unseen data (X_valid), and we have seen that the log loss went down.**