In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# for data processing
#encoding
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

#To split training data into training and validation data
from sklearn.model_selection import train_test_split

# Load the raw training set; the first row of the CSV holds the column names.
train_data = pd.read_csv('/kaggle/input/train.csv', header=0)
print("Shape of dataframe is: {}".format(train_data.shape))

# Load the raw test set the same way.
test_data = pd.read_csv('/kaggle/input/test.csv', header=0)

# Working copies: the raw frames stay untouched so they can be re-used later
# (for example the 'ID' column of the test set when writing predictions).
train_data_copy, test_data_copy = train_data.copy(), test_data.copy()


In [None]:
# Remove the columns that are not used as model features.
# The working copies are rebuilt from the raw frames, so this cell can be re-run.
unwanted_columns = ['EmployeeCount', 'ID']
train_data_copy = train_data.drop(columns=unwanted_columns)
test_data_copy = test_data.drop(columns=unwanted_columns)

In [None]:
# Inspect the dtype and non-null count of every column
train_data_copy.info()

**Observation:** the data has (25-1) numeric columns and 7 string columns.
The string columns have to be converted to numeric values before modelling.

In [None]:
# Summary statistics of the data (shown as the cell output)
train_data_copy.describe()
# Optional: histogram of every numeric column
# train_data_copy.hist(figsize=(20,20))
# plt.show()

In [None]:
# Number of active and ex-employees in the training set
# Attrition encoding:
#   0 -> active employee
#   1 -> ex-employee
train_data_copy['Attrition'].value_counts()

In [None]:

# Percentage of active and ex-employees (Attrition: 0 -> active, 1 -> ex)
ex_emp = int((train_data_copy['Attrition'] == 1).sum())
active_emp = int((train_data_copy['Attrition'] == 0).sum())
total_emp = ex_emp + active_emp

# The first placeholder is labelled "Current Employees" and the second
# "Ex-employees", so the active share must be passed first.
print("Percentage of Current Employees is: {:.1f}% and of Ex-employees is: {:.1f}%".format((active_emp/total_emp)*100,(ex_emp/total_emp)*100))



    As the counts above show, this is an imbalanced class problem: the percentage of Current Employees in our dataset is 83.3% and the percentage of Ex-employees is 16.7%.

    Machine learning algorithms typically work best when the number of instances of each class is roughly equal. We will have to address this target feature imbalance prior to implementing our Machine Learning algorithms.



In [None]:
# Let's take a look at some of the most significant correlations.
# It is worth remembering that correlation coefficients only measure linear correlations.

# Work on a copy in which the label is moved to the end under the name 'Target'
df_HR_trans = train_data_copy.copy()
df_HR_trans['Target'] = df_HR_trans['Attrition']
df_HR_trans = df_HR_trans.drop('Attrition',axis=1)

# The categorical columns are still strings at this point. Older pandas
# versions silently ignored them in corr(); pandas >= 2.0 raises instead.
# Selecting the numeric/boolean columns explicitly works on every version.
numeric_columns = df_HR_trans.select_dtypes(include=['number', 'bool'])

# Correlation of every numeric column with the target, sorted ascending
# ('Target' itself appears last with a correlation of 1.0).
correlations = numeric_columns.corr()['Target'].sort_values()
print('Most Positive Correlations: \n', correlations.tail(6))
print('\nMost Negative Correlations: \n', correlations.head(6))

Conclusion based on correlation
1. The strongest positive correlations with the target feature are: PerformanceRating, PercentageSalaryHike, MonthlyRate, NumCompaniesWorked, DistanceFromHome.
2. The strongest negative correlations with the target features are: TotalWorkingYears, JobLevel, Age, MonthlyIncome, StockOptionLevel, YearsInCurrentRole.

Other observations
1. 52% Single employees 
2. About 16.2% of leavers after 1-year of work.
3. 29% of leavers have distance from home of 1,2,3. Quite strange
4. 66% of leavers travel rarely

5. 55% of leavers work overtime.
6. 27% of leavers are Laboratory Technician
7. 38% of leavers are from life science
8. 41% of leavers have worked for 1 company


Machine Learning algorithms can typically only have numerical values as their predictor variables. Hence Label Encoding becomes necessary, as it encodes categorical labels with numerical values. To avoid introducing feature importance for categorical features with large numbers of unique values, we will use both Label Encoding (for columns with at most two unique values) and One-Hot Encoding (for the remaining categorical columns).

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# String columns with at most two distinct values are label encoded in place;
# every other string column is one-hot encoded further down.
le = LabelEncoder()
le_count = 0
for col in train_data_copy.columns[1:]:
    column = train_data_copy[col]
    # Skip anything that is not a string column or has more than two values
    if column.dtype != 'object' or len(column.unique()) > 2:
        continue
    train_data_copy[col] = le.fit(column).transform(column)
    le_count += 1
print('{} columns were label encoded.'.format(le_count))

# One-hot encode the remaining string columns; drop_first=True leaves out the
# first category of each column.
train_data_copy = pd.get_dummies(train_data_copy, drop_first=True)



In [None]:
# Shape after encoding, followed by a preview of the first rows
print(train_data_copy.shape)
train_data_copy.head()

**Feature Scaling**

Feature Scaling using MinMaxScaler essentially shrinks the range such that the range is now between 0 and n. Machine Learning algorithms perform better when input numerical variables fall within a similar scale. In this case, we are scaling between 0 and 5.

In [None]:
# import MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature column to the range [0, 5]; the label is left out.
scaler = MinMaxScaler(feature_range=(0, 5))
HR_col = train_data_copy.columns.tolist()
HR_col.remove('Attrition')
for col in HR_col:
    # Cast first so the scaled values are written into a float column
    as_float = train_data_copy[col].astype(float)
    train_data_copy[col] = as_float
    # The scaler is re-fitted for each column in turn
    train_data_copy[[col]] = scaler.fit_transform(train_data_copy[[col]])

# Convert the label to a float dtype as well
attrition_as_float = pd.to_numeric(train_data_copy['Attrition'], downcast='float')
train_data_copy['Attrition'] = attrition_as_float
train_data_copy.head()

**Splitting data into training and testing sets**

Prior to implementing or applying any Machine Learning algorithms, we must decouple training and testing datasets from our master dataframe.

In [None]:
# Separate the label from the features: `target` keeps the 'Attrition'
# column, `train_data_copy` keeps everything else.
#df_target = df_HR[['Attrition']].copy()
target = train_data_copy['Attrition'].copy()
train_data_copy = train_data_copy.drop(columns=['Attrition'])
train_data_copy.head()


In [None]:
# The classes are imbalanced (more rows with Attrition=0 than Attrition=1), so
# the split is stratified on the label to keep the same class ratio in both parts.
# 25% of the training data is held out for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    train_data_copy,
    target,
    test_size=0.25,
    random_state=7,
    stratify=target,
)
print("Number transactions X_train dataset: ", X_train.shape)
print("Number transactions y_train dataset: ", y_train.shape)
print("Number transactions X_test dataset: ", X_test.shape)
print("Number transactions y_test dataset: ", y_test.shape)

**Algorithms**

The algorithms considered are: Logistic Regression, Random Forest, SVM.

In [None]:
# Common sklearn Model Helpers
from sklearn import model_selection
# sklearn modules for preprocessing
from sklearn.model_selection import KFold
# Libraries for data modelling
# from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


**Logistic Regression**

Let's take a closer look at using the Logistic Regression algorithm. We first fit a baseline model and report its accuracy on the held-out split; we then use 10-fold Cross-Validation with AUC scoring to tune the regularisation strength of the Logistic Regression model.



In [None]:
from sklearn.metrics import accuracy_score

# Baseline logistic regression fitted on the training part of the split
modelCV = LogisticRegression(solver='liblinear', random_state=7)
modelCV.fit(X_train, y_train)
# modelCV.fit(train_data_copy,target)

# Accuracy on the held-out part of the split
baseline_accuracy = accuracy_score(y_test, modelCV.predict(X_test))
print('Logistic regression accuracy: {:.3f}'.format(baseline_accuracy))

# from sklearn.metrics import classification_report
# print(classification_report(y_test, modelCV.predict(X_test)))


In [None]:
# Logistic Regression fine_tuned
from sklearn.model_selection import GridSearchCV

# Candidate values for the inverse regularisation strength C:
# 0.001, 0.011, 0.021, ... up to (but excluding) 2
param_grid = {'C': np.arange(1e-03, 2, 0.01)} # hyper-parameter list to fine-tune

# 10-fold cross-validated grid search, scored by ROC AUC.
# `iid` is intentionally not passed: it was deprecated in scikit-learn 0.22
# and removed in 0.24, where passing it raises a TypeError.
log_gs = GridSearchCV(LogisticRegression(solver='liblinear',
                                         random_state=7),
                      return_train_score=True,
                      param_grid=param_grid,
                      scoring='roc_auc',
                      cv=10)

log_grid = log_gs.fit(X_train, y_train)
# Best model found by the search (refitted on all of X_train by GridSearchCV)
log_opt = log_grid.best_estimator_
results = log_gs.cv_results_


print('Logistic regression accuracy: {:.3f}'.format(accuracy_score(y_test, log_opt.predict(X_test))))




**Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# A fixed seed makes the forest (and therefore the reported accuracy and the
# submitted predictions) reproducible; 7 matches the seed used for the split
# and for the logistic regression models.
rf = RandomForestClassifier(random_state=7)

rf.fit(X_train, y_train)

# Accuracy on the held-out part of the split
print('Random Forest Accuracy: {:.3f}'.format(accuracy_score(y_test, rf.predict(X_test))))
# from sklearn.metrics import classification_report
# print(classification_report(y_test, rf.predict(X_test)))

**Support Vector Machine**

In [None]:
from sklearn.svm import SVC

# Support vector classifier with scikit-learn's default settings
svc = SVC()
svc.fit(X_train, y_train)

# Accuracy on the held-out part of the split
svc_accuracy = accuracy_score(y_test, svc.predict(X_test))
print('Support vector machine accuracy: {:.3f}'.format(svc_accuracy))

# from sklearn.metrics import classification_report
# print(classification_report(y_test, svc.predict(X_test)))

In [None]:
#test data processing
# The test set has to go through exactly the transformation that was applied
# to the training features: same encodings, same columns, same scaling.
# All parameters are therefore learned from the training data and only
# applied to the test data (fitting them on the test set could produce
# different columns and a different scale than the models were trained on).

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

# Start again from the raw frames so that re-running this cell is safe.
train_features = train_data.drop(['EmployeeCount', 'ID', 'Attrition'], axis=1)
test_features = test_data.drop(['EmployeeCount', 'ID'], axis=1)

# A string column that kept its own name in X_train was label encoded during
# training; one-hot encoded columns are named '<column>_<category>' instead.
le = LabelEncoder()
le_count = 0
for col in train_features.columns:
    if train_features[col].dtype == 'object' and col in X_train.columns:
        le.fit(train_features[col])
        train_features[col] = le.transform(train_features[col])
        # Raises ValueError if the test set holds a category unseen in training
        test_features[col] = le.transform(test_features[col])
        le_count = le_count + 1

# One-hot encode the remaining string columns, then align both frames to the
# columns the models were trained on: categories that were dropped in training
# are dropped here too, and categories missing from a frame become all-zero.
train_features = pd.get_dummies(train_features).reindex(columns=X_train.columns, fill_value=0)
test_features = pd.get_dummies(test_features).reindex(columns=X_train.columns, fill_value=0)

# Learn the min/max of every feature from the training data and apply that
# scaling to the test data. Test values outside the training range will fall
# outside [0, 5], which is expected.
scaler = MinMaxScaler(feature_range=(0, 5))
scaler.fit(train_features.astype(float))
HR_col = list(test_features.columns)
test_data_copy = pd.DataFrame(scaler.transform(test_features.astype(float)),
                              columns=HR_col,
                              index=test_features.index)
test_data_copy.head()



In [None]:
#Evaluation on test data
# Every model was already fitted on X_train in its own cell, so it is used
# as-is here. Refitting would only repeat work and, for a model without a
# fixed seed, would submit a different model than the one evaluated above.

test_ID = test_data['ID']

# (fitted model, name of the CSV file its predictions are written to)
submission_models = [
    (modelCV, '183079037_LR.csv'),            #1) Logistic Regression
    (log_opt, '183079037_LR_finetune.csv'),   #2) Logistic Regression, fine tuned
    (rf, '183079037_RF.csv'),                 #3) Random Forest
    (svc, '183079037_SVM.csv'),               #4) Support Vector Machine
]

for model, filename in submission_models:
    ans = model.predict(test_data_copy)
    # The labels were trained as floats; the submission stores them as integers
    submission = pd.DataFrame(ans, columns=['Attrition'])
    submission = submission.astype(int)

    # Saving prediction in csv: one row per test employee with 'ID' and 'Attrition'
    output_file = pd.concat([test_ID, submission], axis=1, sort=False)
    output_file.to_csv(filename, index=False)

In [None]:
from sklearn.metrics import auc, roc_auc_score, roc_curve, recall_score, log_loss
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, make_scorer

# Both models were already fitted on X_train in earlier cells, so they are
# only evaluated here. The predicted probabilities are computed once per
# model and re-used for the AUC score and the ROC curve.

#modelCV (baseline logistic regression)
probs = modelCV.predict_proba(X_test)[:, 1] # probability of the employee leaving (class 1)
logit_roc_auc = roc_auc_score(y_test, probs) # AUC score on the held-out split
print('AUC score: %.3f' % logit_roc_auc)
fpr, tpr, thresholds = roc_curve(y_test, probs)

#finetune (logistic regression with the tuned C)
probs = log_opt.predict_proba(X_test)[:, 1] # probability of the employee leaving (class 1)
logit_roc_auc_fine = roc_auc_score(y_test, probs) # AUC score on the held-out split
print('AUC score: %.3f' % logit_roc_auc_fine)
fine_fpr, fine_tpr, fine_thresholds = roc_curve(y_test, probs)


# Create ROC Graph
plt.figure(figsize=(14, 6))

# Plot Logistic Regression ROC
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# Plot fine tune
plt.plot(fine_fpr, fine_tpr, label='Logistic Regression fine tune(area = %0.2f)' % logit_roc_auc_fine)

# Plot Base Rate ROC: the diagonal of a random classifier, as a black dashed
# line. The format string and the label are separate arguments; without the
# comma Python joins the two literals into the single label 'Base Ratek--'.
plt.plot([0,1], [0,1], 'k--', label='Base Rate')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Graph')
plt.legend(loc="lower right")
plt.show()