In [21]:
import functools
import inspect
import time

import category_encoders as ce
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import (
    AdaBoostClassifier,
    BaggingClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    StackingClassifier,
    VotingClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [22]:
# Load the employee records from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='./Employee.csv')
# Display the first five rows as a quick sanity check of the parsed columns.
data.head(n=5)

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,Bachelors,2017,Bangalore,3,34,Male,No,0,0
1,Bachelors,2013,Pune,1,28,Female,No,3,1
2,Bachelors,2014,New Delhi,3,38,Female,No,2,0
3,Masters,2016,Bangalore,3,27,Male,No,5,1
4,Masters,2017,Pune,3,24,Male,Yes,2,1


In [23]:
# Print column names, non-null counts and dtypes to check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4653 entries, 0 to 4652
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Education                  4653 non-null   object
 1   JoiningYear                4653 non-null   int64 
 2   City                       4653 non-null   object
 3   PaymentTier                4653 non-null   int64 
 4   Age                        4653 non-null   int64 
 5   Gender                     4653 non-null   object
 6   EverBenched                4653 non-null   object
 7   ExperienceInCurrentDomain  4653 non-null   int64 
 8   LeaveOrNot                 4653 non-null   int64 
dtypes: int64(5), object(4)
memory usage: 327.3+ KB


In [24]:
# Replace the calendar joining year with the number of years between it and
# the most recent joining year present in the data.
# NOTE: the column is overwritten from itself, so running this cell a second
# time without reloading `data` produces different values.
latest_joining_year = data['JoiningYear'].max()
data['JoiningYear'] = latest_joining_year - data['JoiningYear']

In [25]:
# Cast the payment tier from integer to string.
# NOTE(review): presumably so the encoder applied later treats the tier as a
# categorical label rather than a numeric quantity -- confirm.
data = data.astype({'PaymentTier': str})

In [26]:
# Hold out 33% of the rows for testing; the fixed seed makes the split
# reproducible. Features are every column except the 'LeaveOrNot' target.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='LeaveOrNot'),
    data.loc[:, 'LeaveOrNot'],
    test_size=0.33,
    random_state=42,
)

In [27]:
# Target-encode the features. The encoder is fitted on the training split
# only (together with y_train) and then applied as-is to the test split, so
# the test labels are never seen by the encoder.
# NOTE: X_train / X_test are overwritten; re-running this cell on its own
# would feed already-encoded frames back into a fresh encoder.
encoder = ce.TargetEncoder()

X_train = encoder.fit_transform(X_train, y_train)
X_test = encoder.transform(X_test)

In [28]:
# Standardise the features. The scaler is fitted on the training split only
# and the same fitted scaler is reused for the test split.
scaler = StandardScaler()
# Ask the scaler to return DataFrames instead of bare arrays.
scaler.set_output(transform='pandas')

X_train = scaler.fit(X_train).transform(X_train)
X_test = scaler.transform(X_test)

In [29]:
# Class proportions of the training labels before any resampling.
y_train.value_counts(normalize=True)

LeaveOrNot
0    0.657363
1    0.342637
Name: proportion, dtype: float64

In [30]:
# Oversample the training split with SMOTE (seeded for reproducibility).
# Only the training data is resampled; the test split is left untouched.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X=X_train, y=y_train)

In [31]:
# Class proportions of the resampled training labels, to confirm the effect of SMOTE.
y_res.value_counts(normalize=True)

LeaveOrNot
0    0.5
1    0.5
Name: proportion, dtype: float64

In [32]:
# Registry of evaluation results: {model class name: [f1, inference seconds]}.
f1_scores = {}


def measure_f1_time_decorator(func):
    """Wrap a prediction function so each call records F1 and inference time.

    The wrapped function must take the fitted model as its first argument
    and the ground-truth labels as its last argument, and must return the
    predicted labels.

    On every call the wrapper:
      * times the call to ``func``;
      * computes ``f1_score(y_true, predictions)``;
      * stores ``[f1, seconds]`` in the module-level ``f1_scores`` dict under
        the model's class name and prints both figures.

    Note: entries are keyed by class name, so evaluating two different
    models of the same class overwrites the earlier entry.

    Returns:
        The decorated function; it returns whatever ``func`` returns.
    """
    signature = inspect.signature(func)

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # Resolve the call against func's signature so the model (first
        # argument) and the labels (last argument) are found whether they
        # were passed positionally or by keyword. Indexing ``args`` directly
        # raises IndexError as soon as the caller uses keywords.
        bound = signature.bind(*args, **kwargs)
        call_values = list(bound.arguments.values())
        model, y_true = call_values[0], call_values[-1]

        # perf_counter is the high-resolution monotonic clock intended for
        # measuring short intervals; time.time() follows the system clock.
        start_time = time.perf_counter()
        predictions = func(*args, **kwargs)
        execution_time = time.perf_counter() - start_time

        f1 = f1_score(y_true, predictions)
        model_name = model.__class__.__name__
        f1_scores[model_name] = [f1, execution_time]
        print(f'{model_name} F1 Metric: {f1:.4f}')
        print(f'{model_name} Inference: {execution_time:.4f} s')
        return predictions
    return wrapper


@measure_f1_time_decorator
def predict_with_measure(model, Xt, yt):
    """Predict labels for ``Xt`` with ``model``.

    ``yt`` is not used for prediction; the decorator reads it to compute the
    F1 score that is recorded in ``f1_scores``.
    """
    return model.predict(Xt)

In [33]:
# Logistic-regression baseline with default settings, trained on the
# SMOTE-resampled training data.
mod_log_reg = LogisticRegression()
mod_log_reg.fit(X_res, y_res)

# Evaluate on the held-out test split; F1 and inference time are recorded.
prd_log_reg = predict_with_measure(mod_log_reg, X_test, y_test)

LogisticRegression F1 Metric: 0.6376
LogisticRegression Inference: 0.0028 s


In [34]:
# Random forest with default hyper-parameters, seeded for reproducibility
# and trained on the SMOTE-resampled training data.
mod_rnd_frs = RandomForestClassifier(random_state=42)
mod_rnd_frs.fit(X_res, y_res)

# Evaluate on the held-out test split; F1 and inference time are recorded.
prd_rnd_frs = predict_with_measure(mod_rnd_frs, X_test, y_test)

RandomForestClassifier F1 Metric: 0.7638
RandomForestClassifier Inference: 0.0161 s


In [35]:
# Bagging ensemble of k-nearest-neighbour classifiers; each base learner is
# configured with max_samples=0.75 and max_features=0.75.
mod_bag_knn = BaggingClassifier(
    KNeighborsClassifier(),
    max_samples=0.75,
    max_features=0.75,
    random_state=42,
)
mod_bag_knn.fit(X_res, y_res)

# Evaluate on the held-out test split; F1 and inference time are recorded.
prd_bag_knn = predict_with_measure(mod_bag_knn, X_test, y_test)

BaggingClassifier F1 Metric: 0.7603
BaggingClassifier Inference: 0.0763 s


In [36]:
# AdaBoost using the 'SAMME' algorithm, seeded for reproducibility and
# trained on the SMOTE-resampled training data.
mod_ada_bst = AdaBoostClassifier(algorithm='SAMME', random_state=42)
mod_ada_bst.fit(X_res, y_res)

# Evaluate on the held-out test split; F1 and inference time are recorded.
prd_ada_bst = predict_with_measure(mod_ada_bst, X_test, y_test)

AdaBoostClassifier F1 Metric: 0.7273
AdaBoostClassifier Inference: 0.0035 s


In [37]:
# Gradient boosting with row subsampling (75%) and sqrt feature subsampling,
# a learning rate of 0.3, and a fixed seed.
mod_grd_bst = GradientBoostingClassifier(
    learning_rate=0.3,
    subsample=0.75,
    max_features='sqrt',
    random_state=42,
)
mod_grd_bst.fit(X_res, y_res)

# Evaluate on the held-out test split; F1 and inference time are recorded.
prd_grd_bst = predict_with_measure(mod_grd_bst, X_test, y_test)

GradientBoostingClassifier F1 Metric: 0.7893
GradientBoostingClassifier Inference: 0.0024 s


In [38]:
# Three heterogeneous base learners, all with default settings.
clf1, clf2, clf3 = LogisticRegression(), KNeighborsClassifier(), GaussianNB()

# (name, estimator) pairs; this list is reused by the stacking cell below.
estimators = list(zip(('lnr', 'knn', 'gnb'), (clf1, clf2, clf3)))

# Soft-voting ensemble over the three base learners.
mod_vot_clf = VotingClassifier(estimators=estimators, voting='soft')
mod_vot_clf.fit(X_res, y_res)

# Evaluate on the held-out test split; F1 and inference time are recorded.
prd_vot_clf = predict_with_measure(mod_vot_clf, X_test, y_test)

VotingClassifier F1 Metric: 0.6978
VotingClassifier Inference: 0.0248 s


In [39]:
# Meta-learner for the stack: gradient boosting with row subsampling (75%),
# sqrt feature subsampling and a fixed seed.
final_estimator = GradientBoostingClassifier(
    subsample=0.75,
    max_features='sqrt',
    random_state=42)

# Stack the base learners from `estimators` (defined in the voting-ensemble
# cell) under the gradient-boosting final estimator.
mod_stk_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=final_estimator).fit(X_res, y_res)

# Evaluate on the held-out test split; F1 and inference time are recorded.
prd_stk_clf = predict_with_measure(mod_stk_clf, X_test, y_test)

StackingClassifier F1 Metric: 0.7343
StackingClassifier Inference: 0.0185 s


In [40]:
# Turn the recorded {model name: [f1, seconds]} results into a table with
# one row per model.
scores = pd.DataFrame.from_dict(
    data=f1_scores, orient='index', columns=['f1', 'time'])

# Rank the models from best to worst F1.
scores.sort_values(by='f1', ascending=False)

Unnamed: 0,f1,time
GradientBoostingClassifier,0.789256,0.002427
RandomForestClassifier,0.76378,0.016095
BaggingClassifier,0.760281,0.076252
StackingClassifier,0.734345,0.018532
AdaBoostClassifier,0.727273,0.003525
VotingClassifier,0.697797,0.024803
LogisticRegression,0.637584,0.002777
