## Logistic regression

In [1]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
#for test train split
from sklearn.model_selection import train_test_split
#for Classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix

In [2]:
# setting the option to display all the columns
pd.set_option('display.max_columns', None)

In [3]:
data = pd.read_csv("attrition.csv")
data.head(10)

Unnamed: 0,Age,Attrition,BusinessTravel,Department,EducationField,EnvironmentSatisfaction,Gender,JobInvolvement,JobLevel,JobSatisfaction,MaritalStatus,MonthlyIncome,OverTime,TotalWorkingYears,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,DistanceFromHome
0,41,1,Travel_Rarely,Sales,Life Sciences,2,Female,3,2,4,Single,5993,Yes,8,1,6,4,0,5,1
1,49,0,Travel_Frequently,Research & Development,Life Sciences,3,Male,2,2,2,Married,5130,No,10,3,10,7,1,7,8
2,37,1,Travel_Rarely,Research & Development,Other,4,Male,2,1,3,Single,2090,Yes,7,3,0,0,0,0,2
3,33,0,Travel_Frequently,Research & Development,Life Sciences,4,Female,3,1,3,Married,2909,Yes,8,3,8,7,3,0,3
4,27,0,Travel_Rarely,Research & Development,Medical,1,Male,3,1,2,Married,3468,No,6,3,2,2,2,2,2
5,32,0,Travel_Frequently,Research & Development,Life Sciences,4,Male,3,1,4,Single,3068,No,8,2,7,7,3,6,2
6,59,0,Travel_Rarely,Research & Development,Medical,3,Female,4,1,1,Married,2670,Yes,12,2,1,0,0,0,3
7,30,0,Travel_Rarely,Research & Development,Life Sciences,4,Male,3,1,3,Divorced,2693,No,1,3,1,0,0,0,24
8,38,0,Travel_Frequently,Research & Development,Life Sciences,4,Male,2,3,3,Single,9526,No,10,3,9,7,1,8,23
9,36,0,Travel_Rarely,Research & Development,Medical,3,Male,3,2,3,Married,5237,No,17,2,7,7,7,7,27


In [4]:
# get summary and info of data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Age                      1470 non-null   int64 
 1   Attrition                1470 non-null   int64 
 2   BusinessTravel           1470 non-null   object
 3   Department               1470 non-null   object
 4   EducationField           1470 non-null   object
 5   EnvironmentSatisfaction  1470 non-null   int64 
 6   Gender                   1470 non-null   object
 7   JobInvolvement           1470 non-null   int64 
 8   JobLevel                 1470 non-null   int64 
 9   JobSatisfaction          1470 non-null   int64 
 10  MaritalStatus            1470 non-null   object
 11  MonthlyIncome            1470 non-null   int64 
 12  OverTime                 1470 non-null   object
 13  TotalWorkingYears        1470 non-null   int64 
 14  WorkLifeBalance          1470 non-null  

In [5]:
data.describe()

Unnamed: 0,Age,Attrition,EnvironmentSatisfaction,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,TotalWorkingYears,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,DistanceFromHome
count,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
mean,36.92381,0.161224,2.721769,2.729932,2.063946,2.728571,6502.931293,11.279592,2.761224,7.008163,4.229252,2.187755,4.123129,9.192517
std,9.135373,0.367863,1.093082,0.711561,1.10694,1.102846,4707.956783,7.780782,0.706476,6.126525,3.623137,3.22243,3.568136,8.106864
min,18.0,0.0,1.0,1.0,1.0,1.0,1009.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
25%,30.0,0.0,2.0,2.0,1.0,2.0,2911.0,6.0,2.0,3.0,2.0,0.0,2.0,2.0
50%,36.0,0.0,3.0,3.0,2.0,3.0,4919.0,10.0,3.0,5.0,3.0,1.0,3.0,7.0
75%,43.0,0.0,4.0,3.0,3.0,4.0,8379.0,15.0,3.0,9.0,7.0,3.0,7.0,14.0
max,60.0,1.0,4.0,4.0,5.0,4.0,19999.0,40.0,4.0,40.0,18.0,15.0,17.0,29.0


In [6]:
# check for nulls
display(data.isnull().any())


Age                        False
Attrition                  False
BusinessTravel             False
Department                 False
EducationField             False
EnvironmentSatisfaction    False
Gender                     False
JobInvolvement             False
JobLevel                   False
JobSatisfaction            False
MaritalStatus              False
MonthlyIncome              False
OverTime                   False
TotalWorkingYears          False
WorkLifeBalance            False
YearsAtCompany             False
YearsInCurrentRole         False
YearsSinceLastPromotion    False
YearsWithCurrManager       False
DistanceFromHome           False
dtype: bool

We can see there are no nulls in the data 

In [7]:
data = pd.get_dummies(data)
data.head()

Unnamed: 0,Age,Attrition,EnvironmentSatisfaction,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,TotalWorkingYears,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,DistanceFromHome,BusinessTravel_Non-Travel,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,Department_Human Resources,Department_Research & Development,Department_Sales,EducationField_Human Resources,EducationField_Life Sciences,EducationField_Marketing,EducationField_Medical,EducationField_Other,EducationField_Technical Degree,Gender_Female,Gender_Male,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,OverTime_No,OverTime_Yes
0,41,1,2,3,2,4,5993,8,1,6,4,0,5,1,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,1
1,49,0,3,2,2,2,5130,10,3,10,7,1,7,8,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,1,0,1,0
2,37,1,4,2,1,3,2090,7,3,0,0,0,0,2,0,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,1,0,1
3,33,0,4,3,1,3,2909,8,3,8,7,3,0,3,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1
4,27,0,1,3,1,2,3468,6,3,2,2,2,2,2,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,1,0,1,0


In [8]:
# get frequency of values and their percent in attrition column
attrition_freq = data[['Attrition']].apply(lambda x: x.value_counts())
attrition_freq['frequency_percent'] = round((100 * attrition_freq / attrition_freq.sum()), 2)

print(attrition_freq)

   Attrition  frequency_percent
0       1233              83.88
1        237              16.12


## 1. Model using all the features 

In [9]:
#Separating Features and Target variables
X = data.drop(['Attrition'], axis=1)
y = data['Attrition']

In [10]:
# rescale features using standardization. Must for Logistic regression

from sklearn.preprocessing import StandardScaler

scale = StandardScaler()
X = scale.fit_transform(X)

In [11]:
# split into train, test with stratification on target
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size = 0.3, stratify = y, random_state = 100)  

In [12]:
from sklearn.linear_model import SGDClassifier, LogisticRegression

In [13]:
model = LogisticRegression()

In [14]:
model.fit(X_train, y_train) #Train the Model


LogisticRegression()

In [15]:
y_pred = model.predict(X_test) #Use the Model for prediction


In [16]:
# form confusion matrix and find accuracy scores

c= confusion_matrix(y_test,y_pred)
c

array([[360,  10],
       [ 45,  26]], dtype=int64)

In [17]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.8752834467120182

In [18]:
# full report 
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.97      0.93       370
           1       0.72      0.37      0.49        71

    accuracy                           0.88       441
   macro avg       0.81      0.67      0.71       441
weighted avg       0.86      0.88      0.86       441



In [19]:
## precision for each class = tp/(tp+fp)   # how correct it is for positive
## recall = tp/(tp+fn)   # supposed to have been all positive
## accuracy = tp+tn/(tp+tn+fp+fn)

## Use features from EDA

In [20]:
data_cp = data[["Age", "JobLevel", "YearsAtCompany","Attrition"]].copy()
data_enc = pd.get_dummies(data_cp)
data_enc.head()

Unnamed: 0,Age,JobLevel,YearsAtCompany,Attrition
0,41,2,6,1
1,49,2,10,0
2,37,1,0,1
3,33,1,8,0
4,27,1,2,0


In [21]:
#Separating Features and Target variables
X = data_enc.drop(['Attrition'], axis=1)
y = data_enc['Attrition']

In [22]:
# rescale features using standardization. Must for Logistic regression

from sklearn.preprocessing import StandardScaler

scale = StandardScaler()
X = scale.fit_transform(X)

In [23]:
# split into train, test with stratification on target
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size = 0.3, stratify = y, random_state = 100)  

In [24]:
from sklearn.linear_model import SGDClassifier, LogisticRegression

In [25]:
model = LogisticRegression()

In [26]:
model.fit(X_train, y_train) #Train the Model


LogisticRegression()

In [34]:
X_test

array([[-1.41518107e+00, -9.61486392e-01, -8.17733947e-01],
       [ 8.34299820e-03,  8.45911301e-01,  1.46818898e+00],
       [ 8.84357809e-01, -5.77875453e-02,  4.88507728e-01],
       ...,
       [ 5.55852255e-01,  1.74961015e+00,  2.12130982e+00],
       [ 4.46350404e-01,  1.74961015e+00, -1.33289967e-03],
       [-6.48668110e-01,  8.45911301e-01, -9.81014156e-01]])

In [28]:
# form confusion matrix and find accuracy scores

c= confusion_matrix(y_test,y_pred)
c

array([[370,   0],
       [ 71,   0]], dtype=int64)

In [29]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.8390022675736961

In [30]:
# full report 
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      1.00      0.91       370
           1       0.00      0.00      0.00        71

    accuracy                           0.84       441
   macro avg       0.42      0.50      0.46       441
weighted avg       0.70      0.84      0.77       441



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [31]:
import pickle

with open("attrition.pkl", "wb") as file:
    pickle.dump(model, file)

In [35]:
import pickle

with open("attrition.pkl", "rb") as file:
    plm = pickle.load( file)

### We can see that Model 2 performs bit poorer than Model 1, but the diff is not large and Model 2 is much less complex 