In [25]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt


In [55]:
df = pd.read_csv('employee.csv')

In [27]:
#data head
df.head()

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,Bachelors,2017,Bangalore,3,34,Male,No,0,0
1,Bachelors,2013,Pune,1,28,Female,No,3,1
2,Bachelors,2014,New Delhi,3,38,Female,No,2,0
3,Masters,2016,Bangalore,3,27,Male,No,5,1
4,Masters,2017,Pune,3,24,Male,Yes,2,1


In [28]:
#Checking Data Dimensions
df.shape


(4653, 9)

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4653 entries, 0 to 4652
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Education                  4653 non-null   object
 1   JoiningYear                4653 non-null   int64 
 2   City                       4653 non-null   object
 3   PaymentTier                4653 non-null   int64 
 4   Age                        4653 non-null   int64 
 5   Gender                     4653 non-null   object
 6   EverBenched                4653 non-null   object
 7   ExperienceInCurrentDomain  4653 non-null   int64 
 8   LeaveOrNot                 4653 non-null   int64 
dtypes: int64(5), object(4)
memory usage: 327.3+ KB


In [30]:
df.duplicated().sum()

np.int64(1889)

In [31]:
#looking for possible category in data set
for i in df.columns:
    if df[i].dtype == 'O':
        print("Columns",i,"Contains: ",df[i].unique())

Columns Education Contains:  ['Bachelors' 'Masters' 'PHD']
Columns City Contains:  ['Bangalore' 'Pune' 'New Delhi']
Columns Gender Contains:  ['Male' 'Female']
Columns EverBenched Contains:  ['No' 'Yes']


In [32]:
#Checking Number of Employees for each Education and Gender 
df.groupby(['Education','Gender'])['Education'].count()

Education  Gender
Bachelors  Female    1435
           Male      2166
Masters    Female     371
           Male       502
PHD        Female      69
           Male       110
Name: Education, dtype: int64

In [33]:
df[df['LeaveOrNot']==1].groupby(['Education','Gender'])['LeaveOrNot'].count()

Education  Gender
Bachelors  Female    698
           Male      431
Masters    Female    168
           Male      258
PHD        Female     18
           Male       27
Name: LeaveOrNot, dtype: int64

In [34]:
from sklearn.preprocessing import OrdinalEncoder

In [35]:
ord_encoder = OrdinalEncoder(categories=[['Bachelors','Masters','PHD'],['No','Yes']])

In [36]:
encoded_cols = ord_encoder.fit_transform(df[['Education','EverBenched']])

In [37]:
df['Education'],df['EverBenched']=encoded_cols[:,[0]],encoded_cols[:,[1]]

In [38]:
df

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,0.0,2017,Bangalore,3,34,Male,0.0,0,0
1,0.0,2013,Pune,1,28,Female,0.0,3,1
2,0.0,2014,New Delhi,3,38,Female,0.0,2,0
3,1.0,2016,Bangalore,3,27,Male,0.0,5,1
4,1.0,2017,Pune,3,24,Male,1.0,2,1
...,...,...,...,...,...,...,...,...,...
4648,0.0,2013,Bangalore,3,26,Female,0.0,4,0
4649,1.0,2013,Pune,2,37,Male,0.0,2,1
4650,1.0,2018,New Delhi,3,27,Male,0.0,5,1
4651,0.0,2012,Bangalore,3,30,Male,1.0,2,0


In [39]:
#One Hot Encoding of Remaining Categorical Columns
df = pd.get_dummies(df,columns=['City','Gender'],drop_first=True)

In [40]:
df[['City_New Delhi','City_Pune','Gender_Male']] = df[['City_New Delhi','City_Pune','Gender_Male']].astype(float)

In [41]:
df

Unnamed: 0,Education,JoiningYear,PaymentTier,Age,EverBenched,ExperienceInCurrentDomain,LeaveOrNot,City_New Delhi,City_Pune,Gender_Male
0,0.0,2017,3,34,0.0,0,0,0.0,0.0,1.0
1,0.0,2013,1,28,0.0,3,1,0.0,1.0,0.0
2,0.0,2014,3,38,0.0,2,0,1.0,0.0,0.0
3,1.0,2016,3,27,0.0,5,1,0.0,0.0,1.0
4,1.0,2017,3,24,1.0,2,1,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...
4648,0.0,2013,3,26,0.0,4,0,0.0,0.0,0.0
4649,1.0,2013,2,37,0.0,2,1,0.0,1.0,1.0
4650,1.0,2018,3,27,0.0,5,1,1.0,0.0,1.0
4651,0.0,2012,3,30,1.0,2,0,0.0,0.0,1.0


In [42]:
#Train Test Split
from sklearn.model_selection import train_test_split as tts
X= df.drop('LeaveOrNot',axis=1)
Y = df['LeaveOrNot']
x_train,x_test,y_train,y_test = tts(X,Y,test_size=0.2,random_state=4)


In [43]:
from sklearn.preprocessing import MinMaxScaler

In [44]:
scaler = MinMaxScaler()
scaler.fit(X_train)

In [45]:
x_train.describe()

Unnamed: 0,Education,JoiningYear,PaymentTier,Age,EverBenched,ExperienceInCurrentDomain,City_New Delhi,City_Pune,Gender_Male
count,3722.0,3722.0,3722.0,3722.0,3722.0,3722.0,3722.0,3722.0,3722.0
mean,0.252821,2015.048092,2.699624,29.348737,0.101558,2.923966,0.240731,0.277539,0.602633
std,0.50875,1.864706,0.561264,4.822051,0.302107,1.561382,0.427585,0.447845,0.489419
min,0.0,2012.0,1.0,22.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,2013.0,3.0,26.0,0.0,2.0,0.0,0.0,0.0
50%,0.0,2015.0,3.0,28.0,0.0,3.0,0.0,0.0,1.0
75%,0.0,2017.0,3.0,32.0,0.0,4.0,0.0,1.0,1.0
max,2.0,2018.0,3.0,41.0,1.0,7.0,1.0,1.0,1.0


In [46]:
x_test.describe()

Unnamed: 0,Education,JoiningYear,PaymentTier,Age,EverBenched,ExperienceInCurrentDomain,City_New Delhi,City_Pune,Gender_Male
count,931.0,931.0,931.0,931.0,931.0,931.0,931.0,931.0,931.0
mean,0.311493,2015.122449,2.692803,29.571429,0.107411,2.832438,0.280344,0.252417,0.574651
std,0.565749,1.857864,0.562389,4.840688,0.309802,1.544279,0.449409,0.434633,0.494662
min,0.0,2012.0,1.0,22.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,2014.0,2.0,26.0,0.0,2.0,0.0,0.0,0.0
50%,0.0,2015.0,3.0,28.0,0.0,3.0,0.0,0.0,1.0
75%,1.0,2017.0,3.0,33.0,0.0,4.0,1.0,1.0,1.0
max,2.0,2018.0,3.0,41.0,1.0,7.0,1.0,1.0,1.0


In [47]:
x_train_scaled = scaler.transform(X_train)
x_test_scaled = scaler.transform(X_test)

In [48]:
x_train_scaled = pd.DataFrame(x_train_scaled,columns=x_train.columns).describe()
x_test_scaled = pd.DataFrame(x_test_scaled,columns=x_test.columns).describe()

In [49]:
from sklearn.linear_model import LogisticRegression

In [50]:
model_LR = LogisticRegression()
model_LR.fit(X_train,Y_train)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [51]:
LR_predicted = model_LR.predict(X_test)

In [52]:
from sklearn.metrics import confusion_matrix,accuracy_score
pd.DataFrame(confusion_matrix(y_test,LR_predicted))

Unnamed: 0,0,1
0,552,54
1,225,100


In [53]:
accuracy_score(y_test,LR_predicted)

0.7003222341568206

In [56]:
from sklearn.tree import DecisionTreeClassifier
DT_model = DecisionTreeClassifier()
DT_model.fit(x_train,y_train)
DT_predict = DT_model.predict(x_test)
pd.DataFrame(confusion_matrix(y_test,DT_predict))

Unnamed: 0,0,1
0,538,68
1,103,222


In [57]:
accuracy_score(y_test,DT_predict)

0.8163265306122449

In [58]:
from sklearn.ensemble import BaggingClassifier,RandomForestClassifier

In [59]:
bag_model = BaggingClassifier()
bag_model.fit(x_train,y_train)
bag_predict = bag_model.predict(x_test)
pd.DataFrame(confusion_matrix(y_test,bag_predict))

Unnamed: 0,0,1
0,547,59
1,108,217


In [60]:
accuracy_score(y_test,bag_predict)

0.8206229860365198

In [61]:
gb_model = GradientBoostingClassifier()
gb_model.fit(x_train,y_train)
gb_predict = gb_model.predict(x_test)
pd.DataFrame(confusion_matrix(y_test,gb_predict))

Unnamed: 0,0,1
0,573,33
1,121,204


In [63]:
accuracy_score(y_test,gb_predict)

0.8345864661654135