In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

In [2]:
# Read the HR dataset from the project's relative data directory and
# preview the first five rows to confirm the columns parsed as expected.
df = pd.read_csv(filepath_or_buffer="../data/HR_comma_sep.csv")
df.head(n=5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [3]:
# Summarise the frame: per-column dtype and non-null count, plus memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   left                   14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   Department             14999 non-null  object 
 9   salary                 14999 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


In [4]:
# Count how many rows fall into each Department value (most frequent first).
df['Department'].value_counts()

sales          4140
technical      2720
support        2229
IT             1227
product_mng     902
marketing       858
RandD           787
accounting      767
hr              739
management      630
Name: Department, dtype: int64

In [5]:
# Store the target column `left` as a pandas categorical rather than a raw integer.
df = df.astype({'left': 'category'})

In [6]:
# Count how many rows fall into each salary band (most frequent first).
df['salary'].value_counts()

low       7316
medium    6446
high      1237
Name: salary, dtype: int64

In [7]:
# One-hot encode Department, dropping the first category so it acts as the
# baseline level. The encoder returns a sparse matrix by default on every
# scikit-learn release, so the `sparse=` keyword (renamed to `sparse_output`
# in 1.2 and removed in 1.4) is simply omitted.
ohe = OneHotEncoder(drop = 'first')

# Fit on a plain array (not a DataFrame) so the generated column names keep
# the positional "x0_" prefix regardless of the scikit-learn version.
ohe_sparsematrix = ohe.fit_transform(df[['Department']].to_numpy())

# `get_feature_names()` was removed in scikit-learn 1.2; prefer its
# replacement and fall back only on releases that predate it.
if hasattr(ohe, 'get_feature_names_out'):
    ohe_columns = ohe.get_feature_names_out()
else:
    ohe_columns = ohe.get_feature_names()

# Reuse df's index so a later column-wise concat aligns row for row.
ohe_df = pd.DataFrame(data = ohe_sparsematrix.toarray(), columns = ohe_columns, index = df.index)
ohe_df.head()

Unnamed: 0,x0_RandD,x0_accounting,x0_hr,x0_management,x0_marketing,x0_product_mng,x0_sales,x0_support,x0_technical
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [8]:
# Append the one-hot Department columns and drop the original text column
# in a single expression (no in-place mutation).
df = pd.concat([df, ohe_df], axis = 1).drop(columns = ['Department'])

# Ordinal-encode salary. `.map` is used instead of `.replace` because
# replacing strings with ints relies on implicit downcasting, which pandas
# 2.2+ flags with a FutureWarning.
df['salary'] = df['salary'].map({'low':0, 'medium':1, 'high':2})

# `.map` yields NaN for any label missing from the mapping; fail loudly
# rather than passing NaNs on to the model.
assert df['salary'].notna().all(), "unexpected salary label"
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary,x0_RandD,x0_accounting,x0_hr,x0_management,x0_marketing,x0_product_mng,x0_sales,x0_support,x0_technical
0,0.38,0.53,2,157,3,0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.8,0.86,5,262,6,0,1,0,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.11,0.88,7,272,4,0,1,0,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.72,0.87,5,223,5,0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.37,0.52,2,159,3,0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [9]:
# Separate the target (`left`) from the feature matrix (every other column),
# then report the shapes of the full frame, the features and the target.
y = df['left']
X = df.drop('left', axis=1)
print(
    "df shape: {}".format(df.shape),
    "X shape: {}".format(X.shape),
    "y shape: {}".format(y.shape),
    sep="\n",
)

df shape: (14999, 18)
X shape: (14999, 17)
y shape: (14999,)


In [10]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=100,
)

In [11]:
# Baseline: logistic regression with the library-default solver.
clf = LogisticRegression(max_iter = 1000)
clf.fit(X_train, y_train)

# Predict once and reuse the result for both metrics.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_true = y_test, y_pred = y_pred)
print("accuracy: %.2f"%accuracy)
f1 = f1_score(y_true = y_test, y_pred = y_pred)
print("f1_score: %.2f"%f1)

accuracy: 0.79
f1_score: 0.45


In [13]:
# Same model, fitted with the 'liblinear' solver for comparison.
clf = LogisticRegression(solver = 'liblinear', max_iter = 1000)
clf.fit(X_train, y_train)

# Predict once and reuse the result for both metrics.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_true = y_test, y_pred = y_pred)
print("accuracy: %.2f"%accuracy)
f1 = f1_score(y_true = y_test, y_pred = y_pred)
print("f1_score: %.2f"%f1)

accuracy: 0.79
f1_score: 0.45


In [17]:
# Same model, fitted with the 'newton-cg' solver for comparison.
clf = LogisticRegression(solver = 'newton-cg', max_iter = 1000)
clf.fit(X_train, y_train)

# Predict once and reuse the result for both metrics.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_true = y_test, y_pred = y_pred)
print("accuracy: %.2f"%accuracy)
f1 = f1_score(y_true = y_test, y_pred = y_pred)
print("f1_score: %.2f"%f1)

accuracy: 0.79
f1_score: 0.44


In [18]:
# Default solver again, but with class_weight='balanced' so each class is
# weighted inversely to its frequency in the training labels.
clf = LogisticRegression(max_iter = 1000, class_weight='balanced')
clf.fit(X_train, y_train)

# Predict once and reuse the result for both metrics.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_true = y_test, y_pred = y_pred)
print("accuracy: %.2f"%accuracy)
f1 = f1_score(y_true = y_test, y_pred = y_pred)
print("f1_score: %.2f"%f1)

accuracy: 0.77
f1_score: 0.64
