In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
# Locations of the labelled training file and the unlabelled test file,
# relative to the notebook's working directory.
train_file_path, test_file_path = 'Train_Dataset.csv', 'Test_Dataset.csv'

# Read both CSVs into DataFrames using pandas' default parsing options.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (train_file_path, test_file_path)
)

In [3]:
# Frequency of each MaritalStatus label in the training frame.
# NOTE(review): the recorded output lists both 'Married' and 'M' (and the
# spelling 'Divorsed'); 'M' is presumably a variant of 'Married' — confirm
# with the data owner before merging labels.
train_data['MaritalStatus'].value_counts()

MaritalStatus
Single      1776
Married     1614
Divorsed    1016
M            774
Name: count, dtype: int64

In [4]:
# Number of missing values per column in the raw training frame.
# NOTE(review): in the recorded output every column is missing at least 2630
# values, which suggests the CSV contains fully empty rows — confirm.
train_data.isna().sum()

EmployeeID            2630
Attrition             2630
Age                   2946
TravelProfile         2630
Department            2754
HomeToWork            2885
EducationField        2630
Gender                2676
HourlnWeek            2917
Involvement           2630
WorkLifeBalance       2630
Designation           2668
JobSatisfaction       2630
ESOPs                 2630
NumCompaniesWorked    2630
OverTime              2630
SalaryHikelastYear    2799
WorkExperience        2817
LastPromotion         2700
CurrentProfile        2941
MaritalStatus         2630
MonthlyIncome         2723
dtype: int64

In [5]:
# Remove, in place, every row whose EmployeeID is missing. The subset holds a
# single column, so "any NA in the subset" selects exactly the rows to drop.
train_data.dropna(axis=0, how='any', subset=['EmployeeID'], inplace=True)

In [6]:
# Re-count missing values per column now that rows without an EmployeeID
# have been dropped.
train_data.isna().sum()

EmployeeID              0
Attrition               0
Age                   316
TravelProfile           0
Department            124
HomeToWork            255
EducationField          0
Gender                 46
HourlnWeek            287
Involvement             0
WorkLifeBalance         0
Designation            38
JobSatisfaction         0
ESOPs                   0
NumCompaniesWorked      0
OverTime                0
SalaryHikelastYear    169
WorkExperience        187
LastPromotion          70
CurrentProfile        311
MaritalStatus           0
MonthlyIncome          93
dtype: int64

In [7]:
# Summary statistics of the numeric columns, with extra low/high percentiles
# so extreme values stand out; transposed to show one row per column.
tail_percentiles = [.01, .25, .5, .75, 0.9, .95, .99]
train_data.describe(percentiles=tail_percentiles).transpose()

Unnamed: 0,count,mean,std,min,1%,25%,50%,75%,90%,95%,99%,max
EmployeeID,5180.0,5112590.0,1495.481528,5110001.0,5110052.79,5111295.75,5112590.5,5113885.25,5114662.1,5114921.05,5115128.21,5115180.0
Attrition,5180.0,0.2789575,0.44853,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0
Age,4864.0,37.10855,9.248647,18.0,20.0,30.0,36.0,43.0,51.0,55.0,59.0,61.0
HomeToWork,4925.0,11.10741,8.455577,1.0,1.0,5.0,9.0,16.0,25.0,28.0,31.0,121.0
HourlnWeek,4893.0,57.97977,12.996674,10.0,31.0,49.0,59.0,67.0,75.0,79.0,82.0,99.0
Involvement,5180.0,3.226641,0.872431,1.0,1.0,3.0,3.0,4.0,4.0,5.0,5.0,5.0
WorkLifeBalance,5180.0,3.012741,1.410602,1.0,1.0,2.0,3.0,4.0,5.0,5.0,5.0,5.0
JobSatisfaction,5180.0,3.144402,1.342776,1.0,1.0,2.0,3.0,4.0,5.0,5.0,5.0,5.0
ESOPs,5180.0,0.4905405,0.499959,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0
NumCompaniesWorked,5180.0,3.157336,2.606036,0.0,0.0,1.0,2.0,5.0,7.0,9.0,10.0,21.0


In [8]:
# Count / unique / most frequent value for every text (object) column,
# transposed to show one row per column.
object_summary = train_data.describe(include='object')
object_summary.transpose()

Unnamed: 0,count,unique,top,freq
TravelProfile,5180,3,Rarely,3637
Department,5056,3,Analytics,3219
EducationField,5180,6,Statistics,2129
Gender,5134,3,Male,3094
Designation,5142,5,Executive,2072
MaritalStatus,5180,4,Single,1776


In [9]:
# One row of label frequencies per text column; labels that do not belong to
# a given column show up as NaN in that row.
categorical_columns = [
    'TravelProfile',
    'Department',
    'EducationField',
    'Gender',
    'Designation',
    'MaritalStatus',
]
pd.DataFrame([train_data[name].value_counts() for name in categorical_columns])

Unnamed: 0,Rarely,Yes,No,Analytics,Sales,Marketing,Statistics,CA,Marketing Diploma,Engineer,...,F,Executive,Manager,Senior Manager,AVP,VP,Single,Married,Divorsed,M
count,3637.0,1051.0,492.0,,,,,,,,...,,,,,,,,,,
count,,,,3219.0,1615.0,222.0,,,,,...,,,,,,,,,,
count,,,,,,,2129.0,1560.0,603.0,487.0,...,,,,,,,,,,
count,,,,,,,,,,,...,702.0,,,,,,,,,
count,,,,,,,,,,,...,,2072.0,1756.0,763.0,328.0,223.0,,,,
count,,,,,,,,,,,...,,,,,,,1776.0,1614.0,1016.0,774.0


In [10]:
# Column dtypes and non-null counts of the training frame before encoding.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5180 entries, 0 to 5179
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   EmployeeID          5180 non-null   float64
 1   Attrition           5180 non-null   float64
 2   Age                 4864 non-null   float64
 3   TravelProfile       5180 non-null   object 
 4   Department          5056 non-null   object 
 5   HomeToWork          4925 non-null   float64
 6   EducationField      5180 non-null   object 
 7   Gender              5134 non-null   object 
 8   HourlnWeek          4893 non-null   float64
 9   Involvement         5180 non-null   float64
 10  WorkLifeBalance     5180 non-null   float64
 11  Designation         5142 non-null   object 
 12  JobSatisfaction     5180 non-null   float64
 13  ESOPs               5180 non-null   float64
 14  NumCompaniesWorked  5180 non-null   float64
 15  OverTime            5180 non-null   float64
 16  SalaryHikel

In [11]:
# Column dtypes and non-null counts of the test frame before encoding.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2630 entries, 0 to 2629
Data columns (total 21 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   EmployeeID          2630 non-null   int64  
 1   Age                 2488 non-null   float64
 2   TravelProfile       2630 non-null   object 
 3   Department          2572 non-null   object 
 4   HomeToWork          2504 non-null   float64
 5   EducationField      2630 non-null   object 
 6   Gender              2600 non-null   object 
 7   HourlnWeek          2494 non-null   float64
 8   Involvement         2630 non-null   int64  
 9   WorkLifeBalance     2630 non-null   int64  
 10  Designation         2600 non-null   object 
 11  JobSatisfaction     2630 non-null   int64  
 12  ESOPs               2630 non-null   int64  
 13  NumCompaniesWorked  2630 non-null   int64  
 14  OverTime            2630 non-null   int64  
 15  SalaryHikelastYear  2536 non-null   float64
 16  WorkEx

In [12]:
# Single encoder instance that the encoding loop re-fits for each text column,
# so after the loop it only holds the mapping of the last column it was fitted on.
le = LabelEncoder()

In [13]:
# Integer-encode every text column of the training frame and apply the SAME
# label -> code mapping to the matching test column. Fitting the encoder
# separately on each frame would let one string receive different codes in
# train and test whenever the two frames do not contain identical label sets,
# which silently corrupts the features the model sees at prediction time.
for col in train_data.select_dtypes(include=['object']).columns:
    # astype(str) turns missing values into the literal string 'nan', so a
    # missing entry becomes its own category instead of failing in the encoder.
    train_labels = train_data[col].astype(str)
    if col in test_data.columns:
        test_labels = test_data[col].astype(str)
        # Fit on the labels of both frames: codes are consistent across frames
        # and a label that only occurs in the test file cannot raise in transform.
        le.fit(list(train_labels) + list(test_labels))
        test_data[col] = le.transform(test_labels)
    else:
        le.fit(train_labels)
    train_data[col] = le.transform(train_labels)

In [14]:
# Check that the former text columns of the training frame are now integer-coded.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5180 entries, 0 to 5179
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   EmployeeID          5180 non-null   float64
 1   Attrition           5180 non-null   float64
 2   Age                 4864 non-null   float64
 3   TravelProfile       5180 non-null   int32  
 4   Department          5180 non-null   int32  
 5   HomeToWork          4925 non-null   float64
 6   EducationField      5180 non-null   int32  
 7   Gender              5180 non-null   int32  
 8   HourlnWeek          4893 non-null   float64
 9   Involvement         5180 non-null   float64
 10  WorkLifeBalance     5180 non-null   float64
 11  Designation         5180 non-null   int32  
 12  JobSatisfaction     5180 non-null   float64
 13  ESOPs               5180 non-null   float64
 14  NumCompaniesWorked  5180 non-null   float64
 15  OverTime            5180 non-null   float64
 16  SalaryHikel

In [15]:
# Preview the first five rows of the encoded training frame.
train_data.head()

Unnamed: 0,EmployeeID,Attrition,Age,TravelProfile,Department,HomeToWork,EducationField,Gender,HourlnWeek,Involvement,...,JobSatisfaction,ESOPs,NumCompaniesWorked,OverTime,SalaryHikelastYear,WorkExperience,LastPromotion,CurrentProfile,MaritalStatus,MonthlyIncome
0,5110001.0,0.0,35.0,1,0,5.0,0,2,69.0,1.0,...,1.0,1.0,1.0,1.0,20.0,7.0,2.0,,1,18932.0
1,5110002.0,1.0,32.0,2,2,5.0,5,1,62.0,4.0,...,2.0,0.0,8.0,0.0,20.0,4.0,1.0,,3,18785.0
2,5110003.0,0.0,31.0,1,0,5.0,5,0,45.0,5.0,...,2.0,1.0,3.0,0.0,26.0,12.0,1.0,3.0,3,22091.0
3,5110004.0,0.0,34.0,2,2,10.0,5,1,32.0,3.0,...,4.0,1.0,1.0,0.0,23.0,5.0,1.0,3.0,0,20302.0
4,5110005.0,0.0,37.0,0,0,27.0,5,1,49.0,3.0,...,4.0,1.0,8.0,0.0,21.0,12.0,1.0,9.0,0,21674.0


In [16]:
# (rows, columns) of the training frame at this point.
train_data.shape

(5180, 22)

In [17]:
# Missing values left in the training frame after encoding. Text columns were
# cast with astype(str) before encoding, so their missing entries became the
# category 'nan' and no longer count as NaN here.
train_data.isna().sum()

EmployeeID              0
Attrition               0
Age                   316
TravelProfile           0
Department              0
HomeToWork            255
EducationField          0
Gender                  0
HourlnWeek            287
Involvement             0
WorkLifeBalance         0
Designation             0
JobSatisfaction         0
ESOPs                   0
NumCompaniesWorked      0
OverTime                0
SalaryHikelastYear    169
WorkExperience        187
LastPromotion          70
CurrentProfile        311
MaritalStatus           0
MonthlyIncome          93
dtype: int64

In [18]:
# Missing values left in the test frame after encoding; as in the training
# frame, missing text entries were encoded as the category 'nan'.
test_data.isna().sum()

EmployeeID              0
Age                   142
TravelProfile           0
Department              0
HomeToWork            126
EducationField          0
Gender                  0
HourlnWeek            136
Involvement             0
WorkLifeBalance         0
Designation             0
JobSatisfaction         0
ESOPs                   0
NumCompaniesWorked      0
OverTime                0
SalaryHikelastYear     94
WorkExperience        122
LastPromotion          57
CurrentProfile        134
MaritalStatus           0
MonthlyIncome          33
dtype: int64

In [19]:
# Impute missing numeric values with per-column medians learned from the
# training frame only, and reuse those same medians for the test frame.
# Imputing the test frame with its own medians would preprocess the two
# frames with different statistics than the ones the model was trained on.
train_medians = train_data.median(numeric_only=True)
train_data = train_data.fillna(train_medians)
# Entries of train_medians without a matching test column (e.g. 'Attrition')
# are simply ignored by fillna.
test_data = test_data.fillna(train_medians)

In [20]:
# Verify that imputation left no missing values in the training frame.
train_data.isna().sum()

EmployeeID            0
Attrition             0
Age                   0
TravelProfile         0
Department            0
HomeToWork            0
EducationField        0
Gender                0
HourlnWeek            0
Involvement           0
WorkLifeBalance       0
Designation           0
JobSatisfaction       0
ESOPs                 0
NumCompaniesWorked    0
OverTime              0
SalaryHikelastYear    0
WorkExperience        0
LastPromotion         0
CurrentProfile        0
MaritalStatus         0
MonthlyIncome         0
dtype: int64

In [21]:
# Summary statistics after imputation, with extra upper percentiles to expose
# extreme values; transposed to show one row per column.
upper_percentiles = [.25, .5, .75, .8, .9, .95, .99]
train_data.describe(percentiles=upper_percentiles).transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,80%,90%,95%,99%,max
EmployeeID,5180.0,5112590.0,1495.481528,5110001.0,5111295.75,5112590.5,5113885.25,5114144.2,5114662.1,5114921.05,5115128.21,5115180.0
Attrition,5180.0,0.2789575,0.44853,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
Age,5180.0,37.04093,8.965977,18.0,31.0,36.0,42.0,45.0,50.0,54.0,59.0,61.0
TravelProfile,5180.0,1.107915,0.535057,0.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0
Department,5180.0,0.7382239,0.98011,0.0,0.0,0.0,2.0,2.0,2.0,2.0,3.0,3.0
HomeToWork,5180.0,11.00367,8.257384,1.0,5.0,9.0,15.0,18.0,25.0,28.0,31.0,121.0
EducationField,5180.0,2.762741,2.180032,0.0,0.0,3.0,5.0,5.0,5.0,5.0,5.0,5.0
Gender,5180.0,1.479537,0.733818,0.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0
HourlnWeek,5180.0,58.03629,12.633586,10.0,50.0,59.0,67.0,69.0,75.0,79.0,82.0,99.0
Involvement,5180.0,3.226641,0.872431,1.0,3.0,3.0,4.0,4.0,4.0,5.0,5.0,5.0


In [22]:
# Dropping the MonthlyIncome == 95000 row reduced the accuracy, so the row is kept and the drop below stays disabled.
#train_data.drop(train_data[train_data['MonthlyIncome']==95000].index[0], axis=0, inplace=True)

In [23]:
# Same summary as before the (disabled) outlier drop: statistics with extra
# upper percentiles, one row per column.
recheck_percentiles = [.25, .5, .75, .8, .9, .95, .99]
train_data.describe(percentiles=recheck_percentiles).transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,80%,90%,95%,99%,max
EmployeeID,5180.0,5112590.0,1495.481528,5110001.0,5111295.75,5112590.5,5113885.25,5114144.2,5114662.1,5114921.05,5115128.21,5115180.0
Attrition,5180.0,0.2789575,0.44853,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
Age,5180.0,37.04093,8.965977,18.0,31.0,36.0,42.0,45.0,50.0,54.0,59.0,61.0
TravelProfile,5180.0,1.107915,0.535057,0.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0
Department,5180.0,0.7382239,0.98011,0.0,0.0,0.0,2.0,2.0,2.0,2.0,3.0,3.0
HomeToWork,5180.0,11.00367,8.257384,1.0,5.0,9.0,15.0,18.0,25.0,28.0,31.0,121.0
EducationField,5180.0,2.762741,2.180032,0.0,0.0,3.0,5.0,5.0,5.0,5.0,5.0,5.0
Gender,5180.0,1.479537,0.733818,0.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0
HourlnWeek,5180.0,58.03629,12.633586,10.0,50.0,59.0,67.0,69.0,75.0,79.0,82.0,99.0
Involvement,5180.0,3.226641,0.872431,1.0,3.0,3.0,4.0,4.0,4.0,5.0,5.0,5.0


In [24]:
# Index label of the first row whose MonthlyIncome is exactly 95000
# (raises IndexError when no such row exists).
income_is_95000 = train_data['MonthlyIncome'] == 95000
train_data.loc[income_is_95000].index[0]

1528

In [25]:
# Per-column non-null counts of the rows with MonthlyIncome above 36000,
# i.e. how many rows sit in the extreme upper tail.
income_above_36000 = train_data['MonthlyIncome'] > 36000
train_data.loc[income_above_36000].count()

EmployeeID            1
Attrition             1
Age                   1
TravelProfile         1
Department            1
HomeToWork            1
EducationField        1
Gender                1
HourlnWeek            1
Involvement           1
WorkLifeBalance       1
Designation           1
JobSatisfaction       1
ESOPs                 1
NumCompaniesWorked    1
OverTime              1
SalaryHikelastYear    1
WorkExperience        1
LastPromotion         1
CurrentProfile        1
MaritalStatus         1
MonthlyIncome         1
dtype: int64

In [26]:
# Per-column non-null counts of the rows with NumCompaniesWorked above 19,
# i.e. how many rows sit in the extreme upper tail.
companies_above_19 = train_data['NumCompaniesWorked'] > 19
train_data.loc[companies_above_19].count()

EmployeeID            1
Attrition             1
Age                   1
TravelProfile         1
Department            1
HomeToWork            1
EducationField        1
Gender                1
HourlnWeek            1
Involvement           1
WorkLifeBalance       1
Designation           1
JobSatisfaction       1
ESOPs                 1
NumCompaniesWorked    1
OverTime              1
SalaryHikelastYear    1
WorkExperience        1
LastPromotion         1
CurrentProfile        1
MaritalStatus         1
MonthlyIncome         1
dtype: int64

In [27]:
# Skewness of every column of the (now fully numeric) training frame.
# NOTE(review): the values for EmployeeID and for the label-encoded text
# columns describe arbitrary codes, so only the genuinely numeric columns
# are worth interpreting here.
train_data.skew()

EmployeeID            0.000000
Attrition             0.986012
Age                   0.462018
TravelProfile         0.091400
Department            0.695080
HomeToWork            1.333220
EducationField       -0.209343
Gender               -0.891258
HourlnWeek           -0.234548
Involvement          -0.518513
WorkLifeBalance      -0.020089
Designation           0.620840
JobSatisfaction      -0.145593
ESOPs                 0.037856
NumCompaniesWorked    1.126677
OverTime              0.804191
SalaryHikelastYear    0.808023
WorkExperience        1.194552
LastPromotion         1.995481
CurrentProfile        0.883728
MaritalStatus        -0.457692
MonthlyIncome         2.025872
dtype: float64

In [28]:
# split features and target from train data
# Target is the Attrition column; features are every remaining column except
# the EmployeeID identifier.
y = train_data['Attrition']
X = train_data.drop(['EmployeeID', 'Attrition'], axis=1)

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build and fit the gradient boosting classifier on the training split.
gb_model = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
).fit(X_train, y_train)

# Predicted labels for the held-out validation rows.
y_val_pred_gb = gb_model.predict(X_val)

In [29]:
# Fraction of validation rows whose predicted label matches the true label.
val_accuracy_gb = accuracy_score(y_true=y_val, y_pred=y_val_pred_gb)

In [30]:
# Display the validation accuracy.
# NOTE(review): a value very close to 1.0 would be unusually high for this
# kind of problem; worth checking for duplicated rows shared between the
# training and validation splits before trusting it — TODO confirm.
val_accuracy_gb

0.9932432432432432

In [31]:
# Check dtypes and non-null counts of the test frame before predicting on it.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2630 entries, 0 to 2629
Data columns (total 21 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   EmployeeID          2630 non-null   int64  
 1   Age                 2630 non-null   float64
 2   TravelProfile       2630 non-null   int32  
 3   Department          2630 non-null   int32  
 4   HomeToWork          2630 non-null   float64
 5   EducationField      2630 non-null   int32  
 6   Gender              2630 non-null   int32  
 7   HourlnWeek          2630 non-null   float64
 8   Involvement         2630 non-null   int64  
 9   WorkLifeBalance     2630 non-null   int64  
 10  Designation         2630 non-null   int32  
 11  JobSatisfaction     2630 non-null   int64  
 12  ESOPs               2630 non-null   int64  
 13  NumCompaniesWorked  2630 non-null   int64  
 14  OverTime            2630 non-null   int64  
 15  SalaryHikelastYear  2630 non-null   float64
 16  WorkEx

In [32]:
# predictions on the test set using GBC
test_features = test_data.drop(columns=['EmployeeID'])
test_predictions_gb = gb_model.predict(test_features)

# submission dataframe for GBC
submission_df_gb = pd.DataFrame({
    'EmployeeID': test_data['EmployeeID'],
    'Attrition': test_predictions_gb
})

# Save the file for GBC
submission_file_path_gb = 'submission_attrition_model - '+str(val_accuracy_gb)[2:6]+'.csv'
submission_df_gb.to_csv(submission_file_path_gb, index=False)

val_accuracy_gb, submission_file_path_gb

(0.9932432432432432, 'submission_attrition_model.9932.csv')

In [33]:
# Scratch check: characters 1-5 of the accuracy's string form (the decimal
# point plus the following digits). The result is only displayed; it is not
# assigned or used in the visible cells.
str(val_accuracy_gb)[1:6]

'.9932'