#### Import Libraries

In [None]:
import pandas as pd
import numpy as np

In [None]:
from subprocess import check_output

# Show which files are available under ../input.
print(
    check_output(["ls", "../input"])
    .decode("utf8")
)

In [None]:
#### Read Dataset

#### Read Dataset

In [None]:
# Load the HR employee attrition CSV into a DataFrame.
OriginalData = pd.read_csv('../input/WA_Fn-UseC_-HR-Employee-Attrition.csv')

In [None]:
# Preview the first six rows of the raw data.
OriginalData.head(6)

In [None]:
# Column names as a NumPy array.
OriginalData.columns.values

#### Exploratory Data Analysis (EDA)

In [None]:
# (number of rows, number of columns) of the raw data.
OriginalData.shape

In [None]:
# Print a per-column summary of the frame.
# NOTE(review): the positional 0 lands in info()'s first parameter (`verbose`);
# presumably verbose=False was intended — confirm.
OriginalData.info(0)

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) of the data.
OriginalData.describe()

In [None]:
#### Pandas Profiling

In [None]:
'''Generates profile reports from a pandas DataFrame. The pandas df.describe() function is great but a little basic for serious exploratory data analysis.<br>
pandas_profiling extends the pandas DataFrame with df.profile_report() for quick data analysis.

For each column the following statistics - if relevant for the column type - are presented in an interactive HTML report:

Essentials: type, unique values, missing values
Quantile statistics like minimum value, Q1, median, Q3, maximum, range, interquartile range
Descriptive statistics like mean, mode, standard deviation, sum, median absolute deviation, coefficient of variation, kurtosis, skewness
Most frequent values
Histogram
Correlations highlighting of highly correlated variables, Spearman, Pearson and Kendall matrices
Missing values matrix, count, heatmap and dendrogram of missing values
'''

In [None]:
#!pip install pandas-profiling

In [None]:
#import pandas_profiling

In [None]:
#OriginalData.profile_report()

#### Cleaning of dataset

In [None]:
# Single boolean: True if any cell anywhere in the frame is null.
OriginalData.isnull().values.any()

In [None]:
# Per-column flag: True where the column contains at least one missing value.
OriginalData.isna().any()

#### Import libraries for visualization

In [None]:
import seaborn as sns
%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import style

#### Attrition: Visualization

In [None]:
# Class balance of the target: number of employees per Attrition value.
# The column is named through x=/data= keywords because recent seaborn releases
# no longer treat the first positional argument as the x vector.
# saturation is a proportion of the original colour saturation, so 1 (full
# saturation) replaces the out-of-range value 10.
sns.countplot(x='Attrition', data=OriginalData, palette="Set2", saturation=1)

### Check for outliers

#### Visualization : Histograms

In [None]:
# Work on a copy of the raw data and drop the categorical columns listed
# below, leaving the columns that are plotted as histograms.
numericalData = OriginalData.copy().drop(
    columns=[
        'BusinessTravel',
        'Department',
        'EducationField',
        'Gender',
        'JobRole',
        'MaritalStatus',
        'OverTime',
        'Attrition',
        'Over18',
    ]
)

In [None]:
# (rows, columns) left after dropping the categorical columns.
numericalData.shape

In [None]:
# One histogram per column of numericalData, laid out as a grid of subplots.
# figsize is handed to DataFrame.hist directly: passing a single pre-made Axes
# for a multi-column frame makes pandas clear that figure and emit a
# UserWarning, so creating the figure/axes beforehand served no purpose.
hist_axes = numericalData.hist(bins=30, figsize=(20, 20))
# Keep `fig` bound to the figure that actually holds the histograms.
fig = hist_axes.flat[0].figure

### Handle Attrition Categorical Value

#### Attrition : Convert "No"->0 and "Yes"->1
'''
Employee leaving the company (0=No, 1=Yes)
'''

In [None]:
# Summary of the raw Attrition column before it is encoded numerically.
OriginalData['Attrition'].describe()

#### Create a copy of the original dataset

In [None]:
# Working copy; the preprocessing below is applied to this frame so that
# OriginalData keeps the raw values.
dataset = OriginalData.copy(deep=True)

In [None]:
# Encode the target as integers: "No" -> 0, "Yes" -> 1.
attritionStatus = {'No': 0, 'Yes': 1}
# Map from the raw OriginalData column instead of from dataset itself:
# re-running this cell would otherwise feed the already-encoded 0/1 values
# through the mapping, and since they are not keys of attritionStatus the
# whole column would silently become NaN.
dataset['Attrition'] = OriginalData['Attrition'].map(attritionStatus)

In [None]:
# Class counts of the encoded target, shown as a single-row table.
dataset['Attrition'].value_counts().to_frame().T

#### Baseline Model Accuracy

In [None]:
# Share of employees who stayed, i.e. the accuracy of always predicting
# "no attrition". Computed from the raw Attrition column rather than from
# hard-coded counts so it stays correct if the data changes.
print('Employee not leaving the company : ',
      round(float(OriginalData['Attrition'].eq('No').mean()) * 100, 2), '%')

In [None]:
# Share of employees who left, computed from the raw Attrition column rather
# than from hard-coded counts so it stays correct if the data changes.
print('Employee leaving the company : ',
      round(float(OriginalData['Attrition'].eq('Yes').mean()) * 100, 2), '%')

#### Feature Selection Using Correlation
'''
The correlation coefficient has values between -1 to 1 <br>
 - A value closer to 0 implies weaker correlation (exact 0 implying no correlation) <br>
 - A value closer to 1 implies stronger positive correlation <br>
 - A value closer to -1 implies stronger negative correlation <br>
 '''

In [None]:
# The data looks clean, with no apparent outliers.
# EmployeeCount and StandardHours are constant, so they contribute nothing to
# the model and are left out of the correlation analysis.
corrData = dataset.copy().drop(
    columns=['StandardHours', 'EmployeeCount'],
)
corrData.shape

In [None]:
# Pearson correlation heatmap.
# corrData still contains the string-valued categorical columns (they are only
# one-hot encoded further down), and DataFrame.corr() raises on non-numeric
# columns in pandas >= 2.0, whereas older versions silently skipped them.
# Selecting the numeric columns explicitly behaves the same on every version.
plt.figure(figsize=(20, 20))
cor = corrData.select_dtypes(include='number').corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.show()

#### Conclusion from correlation
'''
The statistical relationship between two variables is referred to as their correlation. The performance of some algorithms can deteriorate if two or more variables are tightly related, which is called multicollinearity. This is of special importance in regression. From the above correlation matrix, we find that most of the features are uncorrelated. However, there is a high correlation (0.95) between Monthly Income and Job Level.
'''

#### Removing Variables

In [None]:
# Employee number is kept aside; it will be used for display purposes.
empNo=dataset['EmployeeNumber']
# Target/response variable.
response=dataset['Attrition']

In [None]:
# EmployeeCount, Over18 and StandardHours never change, so their standard
# deviation is zero and they are not significant for the analysis.
# EmployeeNumber and Attrition are removed from the feature frame as well.
dataset = dataset.drop(
    columns=[
        'EmployeeNumber',
        'Attrition',
        'StandardHours',
        'EmployeeCount',
        'Over18',
    ]
)

#### Converting Categorical Features to numerical: One Hot Encoding

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each one so the dummy columns are not redundant.
dataset = pd.get_dummies(
    dataset,
    columns=[
        'BusinessTravel',
        'Department',
        'EducationField',
        'Gender',
        'JobRole',
        'MaritalStatus',
        'OverTime',
    ],
    drop_first=True,
)

In [None]:
# Column names after one-hot encoding.
dataset.columns

In [None]:
# Number of feature columns after one-hot encoding.
len(dataset.columns)

#### Feature Selection using SelectKbest-chi2
'''
Advantages:<br>
· Reduces Overfitting: Less redundant data means less opportunity to make decisions based on noise.<br>
· Improves Accuracy: Less misleading data means modeling accuracy improves.<br>
· Reduces Training Time: fewer features reduce algorithm complexity, so algorithms train faster.<br>
'''

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [None]:
# Score every feature against the target with the chi-squared test and keep
# the k highest-scoring ones.
# NOTE(review): the selector is fitted on the full dataset before the
# train/test split further down, so test rows influence which features are
# kept — consider fitting it on the training split only.
k = 35
select_feature = SelectKBest(score_func=chi2, k=k)
select_feature.fit(dataset, response)

# One row per feature with its chi-squared score, built in a single step
# instead of concatenating two one-column frames and renaming afterwards.
featureScores = pd.DataFrame(
    {'Features': dataset.columns, 'Score': select_feature.scores_}
)

# Print the k best features, highest score first.
print(featureScores.nlargest(k, 'Score'))

In [None]:
# Names of the columns that SelectKBest kept (boolean support mask applied to the column index).
dataset.columns[select_feature.get_support()]

In [None]:
# Keep exactly the columns chosen by SelectKBest. The selector's boolean
# support mask is used instead of a hand-copied list of column names, so the
# selection cannot drift out of sync when k or the encoded columns change.
impFeatures = dataset.loc[:, select_feature.get_support()].copy()

In [None]:
# (rows, selected feature columns).
impFeatures.shape

#### Splitting into Train and Test Set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    impFeatures,
    response,
    test_size=0.3,
    random_state=0,
)
# Shapes of the four resulting pieces.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

#### Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
'''
The main idea is to normalize/standardize (mean = 0 and standard deviation = 1) your features before applying machine learning techniques.
StandardScaler performs the task of Standardization. Usually a dataset contains variables that are different in scale. 
For e.g. an Employee dataset will contain AGE column with values on scale 20-70 and SALARY column with values on scale 10000-80000.
As these two columns are different in scale, they are Standardized to have common scale while building machine learning model.
'''

In [None]:
sc_X = StandardScaler()

# The scaler returns plain arrays without column names or row index, so both
# are re-attached while wrapping the results in DataFrames.
# It is fitted on the training split only and then reused for the test split.
X_train2 = pd.DataFrame(
    sc_X.fit_transform(X_train),
    index=X_train.index.values,
    columns=X_train.columns.values,
)
X_test2 = pd.DataFrame(
    sc_X.transform(X_test),
    index=X_test.index.values,
    columns=X_test.columns.values,
)

# From here on X_train / X_test refer to the scaled frames.
X_train, X_test = X_train2, X_test2

#### Model Building

#### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with default settings on the scaled training data.
# fit() returns the estimator itself, so the fitted model is bound directly
# and then shown as the cell output.
model_Reg = LogisticRegression().fit(X_train, y_train)
model_Reg

In [None]:
# (rows, columns) of the scaled test features.
X_test.shape

#### Test Data Accuracy

In [None]:
# Predicted class labels for the held-out test rows.
y_pred_reg = model_Reg.predict(X_test)

#### Model Performance Evaluation

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score

# Accuracy, precision, recall and F1 of the test-set predictions, each
# computed by calling the matching scikit-learn metric on (y_test, y_pred_reg).
acc, prec, rec, f1 = (
    metric(y_test, y_pred_reg)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

In [None]:
# Raw confusion matrix of true labels (y_test) against predictions (y_pred_reg).
confusion_matrix(y_test,y_pred_reg)

In [None]:
# Confusion matrix of the logistic-regression predictions on the test set.
rf_cm = confusion_matrix(y_test, y_pred_reg)

# Label rows (actual) and columns (predicted) in the order scikit-learn uses
# for the matrix: sorted class values, i.e. 0 ("No Attrition") first, then
# 1 ("Attrition"). Ordered lists are required here: the set literals used
# previously have no defined iteration order, so a label could end up on the
# wrong row or column, and the two axes were spelled inconsistently.
rf_cm_plot = pd.DataFrame(
    rf_cm,
    index=["No Attrition", "Attrition"],
    columns=["No Attrition", "Attrition"],
)
# seaborn uses the index/columns names as axis labels on the heatmap.
rf_cm_plot.index.name = "Actual"
rf_cm_plot.columns.name = "Predicted"

plt.figure(figsize=(6, 5))
# NOTE(review): vmin/vmax pin the colour scale to a hand-picked range —
# confirm it suits the actual counts.
sns.heatmap(rf_cm_plot, annot=True, vmin=5, vmax=90.5, cbar=False, fmt='g')

In [None]:
# One-row summary table holding the logistic-regression test metrics.
results = pd.DataFrame(
    {
        'Model': ['Logistic Regression'],
        'Accuracy': [acc],
        'Precision': [prec],
        'Recall': [rec],
        'F1 Score': [f1],
    }
)
results