Hello fellow kagglers!

Today we will learn what to do to keep your employee from leaving. As target is "Attrition" and it means that one will quit his job if he is very exhausted.

In [10]:
!pip install lightgbm



In [11]:
import os
from datetime import datetime
from IPython.display import Audio, display
import numpy as np
import pandas as pd
import seaborn as sns
from category_encoders.leave_one_out import LeaveOneOutEncoder
import sklearn
from sklearn.manifold import TSNE
from sklearn.utils import shuffle
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_predict, KFold, StratifiedKFold, RepeatedStratifiedKFold, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression, Lasso, LogisticRegressionCV, LassoCV, ElasticNetCV, RidgeCV, LarsCV, Ridge
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, RandomForestRegressor, ExtraTreesRegressor
import matplotlib.pyplot as plt
import matplotlib as mpl
from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from lightgbm.sklearn import LGBMClassifier, LGBMRegressor
import tensorflow as tf
import tensorflow.keras.layers as layers
from scipy import special

%matplotlib inline






In [12]:
class conf:
    index = 'id'
    target = 'Attrition'
    random = 2023
    
    load_original = True
np.random.seed(conf.random)

# Load data and quick look

In [14]:
train_full = pd.read_csv("./dataset/IBM HR Analytics Employee Attrition & Performance/WA_Fn-UseC_-HR-Employee-Attrition.csv", index_col=conf.index)
if conf.load_original:
    original = pd.read_csv("/dataset/IBM HR Analytics Employee Attrition & Performance/WA_Fn-UseC_-HR-Employee-Attrition.csv")
    original.drop(["EmployeeNumber"], axis=1, inplace=True)
    original.Attrition = original.Attrition.apply(lambda x: 1 if x == "Yes" else 0)
    train_full = pd.concat([train_full, original], axis=0, ignore_index=True)
test_full = pd.read_csv("/kaggle/input/playground-series-s3e3/test.csv", index_col=conf.index)
print(f"Dataset size:     {len(train_full)}")
print(f"Columns with nan: {train_full.isna().any().sum()}")

ValueError: Index id invalid

In [None]:
train_full.info()

In [None]:
train_full.nunique()

In [None]:
train_full.nunique()[train_full.nunique() == 1]

In [None]:
train_full.drop(["EmployeeCount", "Over18", "StandardHours"], axis=1, inplace=True)
test_full.drop(["EmployeeCount", "Over18", "StandardHours"], axis=1, inplace=True)

num_cols = ["Age", "DailyRate", "DistanceFromHome", "HourlyRate", "MonthlyIncome", "MonthlyRate",
            "HourlyRate", "NumCompaniesWorked", "PercentSalaryHike", "TotalWorkingYears",
            "TrainingTimesLastYear", "YearsAtCompany", "YearsInCurrentRole", "YearsSinceLastPromotion",
            "YearsWithCurrManager"]
cat_cols = train_full.drop([conf.target], axis=1).columns.difference(num_cols)
cat_cols

In [None]:
train_full[num_cols].describe()

In [None]:
train_full[num_cols].head()

In [None]:
train_full[cat_cols].head()

Insights:
* Small dataset.
* No nan values
* 33 columns! (exclude target)
* 3 columns-constants with only 1 unique value: "EmployeeCount", "Over18", "StandardHours". I think we should not use them in models.


# Target distribution

In [None]:
plt.figure(figsize=(9,9),)
plt.title("Attrition")
plt.pie(train_full[conf.target].value_counts(), labels = ["No", "Yes"], autopct='%.0f%%', textprops={'fontsize': 14},)
plt.show()

# Numerical distribution

In [None]:
fig, ax = plt.subplots(5, 3, figsize = (20, 5*5))
for i, col in enumerate(num_cols):
    sns.kdeplot(data=train_full, x=col, fill = True, ax = ax[i // 3][i % 3], hue=conf.target)

# Categorical distribution
The left column shows the distribution of the feature. The right one shows the probability of the target at a given value

In [None]:
fig, ax = plt.subplots(16, 2, figsize = (20, 6*16))
plt.subplots_adjust(wspace=0.33, hspace=0.2)
for i, col in enumerate(cat_cols):
    t = (train_full.groupby(col)[conf.target].agg(['sum', 'count']).assign(percent = lambda x: 100 * x['sum'] / x['count'])).reset_index()
    sns.countplot(data=train_full, x=col, ax = ax[i][0], order=t[col])
    ax[i][1].set(xlim=(0, 40))
    sns.barplot(data=t, x="percent", y=col, ax = ax[i][1], orient = "h",)
    

Insights
* Classes are imbalanced. 14% vs 86%
* Traveling is exhausting.
* Environment must satisfy.
* Better education, better stability
* Men leaves a bit more often then women
* Job involvement should be high to retain employees
* Low job level? Low stok option level? No time for my hobbies after work? I'm leaving from here now!
* We need new Sales representative again this week
* Job must satisfy.
* If you are single - you search the best job. Married - a good one. Divorced - job doesn't matter
* No one likes overtime

# Dimensionality reduction
Lets reduce dimensions to 2 and try to find any clusters

In [None]:
tr = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", OrdinalEncoder(), cat_cols),
])
x_tn = tr.fit_transform(train_full.drop([conf.target], axis=1), train_full[conf.target])

tnse = TSNE(n_components=2, random_state=conf.random, perplexity=35, )
train_dis = tnse.fit_transform(x_tn)

In [None]:
plt.figure(figsize=(12,9))
sns.scatterplot(x=train_dis[:,0], y=train_dis[:,1], hue=train_full[conf.target], alpha=0.9, )
plt.show()

It's a butterfly! =)

We can see areas with a higher density of orange dots, but no clear clusters.

# Prepare data

In [None]:
x_train = train_full.copy()
y_train = x_train.pop(conf.target)

tr_clf = ColumnTransformer([
    ("num", SimpleImputer(), num_cols),
    ("cat", OneHotEncoder(handle_unknown='ignore'), cat_cols),
])
tr_reg = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(handle_unknown='ignore'), cat_cols),
])

print("train shape = ", x_train.shape)

In [None]:
logs = {}
def cross_val_auc(name, pipeline, x=x_train, y=y_train, regressor=False, use_reg=False):
    if regressor:
        x = tr_reg.fit_transform(x)
        p = cross_val_predict(pipeline, x, y, cv=10, n_jobs=5, verbose=False)
    else:
        if use_reg:
            x = tr_reg.fit_transform(x)
        else:
            x = tr_clf.fit_transform(x)
        p = cross_val_predict(pipeline, x, y, cv=10, n_jobs=5, method="predict_proba", verbose=False)[:, 1]
    auc = roc_auc_score(y, p)
    logs[name] = auc
    print(f"auc = {auc:.4f}")

# Linear models

LogisticRegression

In [None]:
lr = LogisticRegressionCV(Cs=1000, max_iter=1000)
cross_val_auc("LogisticRegression", lr, use_reg=True)

Lasso

In [None]:
lasso = LassoCV(alphas=np.linspace(0.0001, 100, 1000))
cross_val_auc("Lasso", lasso, regressor=True)

Ridge

In [None]:
ridge = RidgeCV(alphas=np.linspace(0.0001, 100, 1000))
cross_val_auc("Ridge", ridge, regressor=True)

ElasticNet

In [None]:
en = ElasticNetCV(alphas=np.linspace(0.0001, 100, 1000), max_iter=10000)
cross_val_auc("ElasticNet", en, regressor=True)

# Trees

DecisionTreeClassifier

In [None]:
cross_val_auc("DecisionTreeClassifier", DecisionTreeClassifier(random_state=conf.random))

DecisionTreeRegressor

In [None]:
cross_val_auc("DecisionTreeRegressor", DecisionTreeRegressor(random_state=conf.random), regressor=True)

RandomForestClassifier

In [None]:
cross_val_auc("RandomForestClassifier", RandomForestClassifier(random_state=conf.random))

RandomForestRegressor

In [None]:
cross_val_auc("RandomForestRegressor", RandomForestRegressor(random_state=conf.random), regressor=True)

ExtraTreesClassifier

In [None]:
cross_val_auc("ExtraTreesClassifier", ExtraTreesClassifier(random_state=conf.random))

ExtraTreesRegressor

In [None]:
cross_val_auc("ExtraTreesRegressor", ExtraTreesRegressor(random_state=conf.random), regressor=True)

# XGB

In [None]:
cross_val_auc("XGBClassifier", XGBClassifier(random_state=conf.random))

In [None]:
cross_val_auc("XGBRegressor", XGBRegressor(random_state=conf.random), regressor=True)

# CatBoost

In [None]:
cross_val_auc("CatBoostClassifier", CatBoostClassifier(verbose = False, random_state=conf.random))

In [None]:
cross_val_auc("CatBoostRegressor", CatBoostRegressor(verbose=False, random_state=conf.random), regressor=True)

# LGBM

In [None]:
cross_val_auc("LGBMClassifier", LGBMClassifier(random_state=conf.random))

In [None]:
cross_val_auc("LGBMRegressor", LGBMRegressor(random_state=conf.random), regressor=True)

# Models stats

In [None]:
logs_df = pd.DataFrame(logs.items(), columns=['Model', 'Auc']).sort_values(by="Auc", ascending=False)
logs_df.Auc = logs_df.Auc.apply(lambda x: int(x*10000)/10000.0)
logs_df["Label"] = logs_df.apply(lambda x: x["Model"] + " " + str(x["Auc"]), axis=1)

plt.figure(figsize=(14,9))
ax = sns.barplot(data=logs_df, x="Auc", y="Model",)
ax.bar_label(ax.containers[0])
ax.set(xlim=(0, 1))

Looks like CatBoost and some linear models are better. But we searched linear and no other models. Let's do random search a little

# Search

DecisionTreeClassifier

In [None]:
params = {
    'max_depth': np.linspace(5, 50, 10, dtype=int),
    'min_samples_split': np.linspace(5, 50, 10, dtype=int),
    'min_samples_leaf': np.linspace(5, 500, 10, dtype=int), # it or below
    # 'model__min_weight_fraction_leaf': np.linspace(0.001, 0.1, 5),
    'max_leaf_nodes': np.linspace(5, 100, 10, dtype=int),
    'max_features': np.linspace(5, 31, 10, dtype=int),
}
search_dt = RandomizedSearchCV(DecisionTreeClassifier(random_state=conf.random), n_iter=100, param_distributions=params, scoring="roc_auc", n_jobs=-1, random_state=conf.random)
search_dt.fit(tr_clf.fit_transform(x_train), y_train)
print(search_dt.best_params_)
print()
cross_val_auc("DecisionTreeClassifier searched", search_dt.best_estimator_)

DecisionTreeRegressor

In [None]:
search_dtr = RandomizedSearchCV(DecisionTreeRegressor(random_state=conf.random), n_iter=100, param_distributions=params, scoring="roc_auc", n_jobs=-1, random_state=conf.random)
search_dtr.fit(tr_reg.fit_transform(x_train), y_train)
print(search_dtr.best_params_)
print()
cross_val_auc("DecisionTreeRegressor searched", search_dtr.best_estimator_, regressor=True)

RandomForestClassifier

In [None]:
search_r = RandomizedSearchCV(RandomForestClassifier(random_state=conf.random), n_iter=50, param_distributions=params, scoring="roc_auc", n_jobs=-1, random_state=conf.random)
search_r.fit(tr_clf.fit_transform(x_train), y_train)
print(search_r.best_params_)
print()
cross_val_auc("RandomForestClassifier searched", search_r.best_estimator_)

RandomForestRegressor

In [None]:
search_rr = RandomizedSearchCV(RandomForestRegressor(random_state=conf.random), n_iter=50, param_distributions=params, scoring="roc_auc", n_jobs=-1, random_state=conf.random)
search_rr.fit(tr_reg.fit_transform(x_train), y_train)
print(search_rr.best_params_)
print()
cross_val_auc("RandomForestRegressor searched", search_rr.best_estimator_, regressor=True)

ExtraTreesClassifier

In [None]:
search_et = RandomizedSearchCV(ExtraTreesClassifier(random_state=conf.random), n_iter=50, param_distributions=params, scoring="roc_auc", n_jobs=-1, random_state=conf.random)
search_et.fit(tr_clf.fit_transform(x_train), y_train)
print(search_et.best_params_)
print()
cross_val_auc("ExtraTreesClassifier searched", search_et.best_estimator_)

ExtraTreesRegressor

In [None]:
search_etr = RandomizedSearchCV(ExtraTreesRegressor(random_state=conf.random), n_iter=50, param_distributions=params, scoring="roc_auc", n_jobs=-1, random_state=conf.random)
search_etr.fit(tr_reg.fit_transform(x_train), y_train)
print(search_etr.best_params_)
print()
cross_val_auc("ExtraTreesRegressor searched", search_etr.best_estimator_, regressor=True)

XGB

In [None]:
params = {
    'learning_rate': np.linspace(0.001, 0.2, 10),
    'max_depth': np.linspace(3, 50, 10, dtype=int),
    'colsample_bytree': np.linspace(0.01, 0.99, 10),
    'min_child_weight': np.linspace(1, 100, 10, dtype=int),
    'gamma': np.linspace(0.01, 0.99, 10),
    'subsample': np.linspace(0.01, 0.99, 10)
}
search_x = RandomizedSearchCV(XGBClassifier(random_state=conf.random), n_iter=50, param_distributions=params, scoring="roc_auc", n_jobs=-1, random_state=conf.random)
search_x.fit(tr_clf.fit_transform(x_train), y_train)
print(search_x.best_params_)
print()
cross_val_auc("XGBClassifier searched", search_x.best_estimator_)

In [None]:
search_xr = RandomizedSearchCV(XGBRegressor(random_state=conf.random), n_iter=50, param_distributions=params, scoring="roc_auc", n_jobs=-1, random_state=conf.random)
search_xr.fit(tr_reg.fit_transform(x_train), y_train)
print(search_xr.best_params_)
print()
cross_val_auc("XGBRegressor searched", search_xr.best_estimator_, regressor=True)

CatBoost

In [None]:
params = {
    'learning_rate': np.linspace(0.001, 0.2, 10),
    'l2_leaf_reg': np.linspace(0.001, 10, 10),
    'max_depth': np.linspace(1, 5, 10, dtype=int),
    'min_data_in_leaf': np.linspace(1, 250, 10, dtype=int),
    "colsample_bylevel": np.linspace(0.001, 0.99, 10)
}
search_cb = RandomizedSearchCV(CatBoostClassifier(random_state=conf.random, verbose=False), n_iter=50, param_distributions=params, scoring="roc_auc", n_jobs=-1, random_state=conf.random)
search_cb.fit(tr_clf.fit_transform(x_train), y_train)
print(search_cb.best_params_)
print()
cross_val_auc("CatBoostClassifier searched", search_cb.best_estimator_)

In [None]:
search_cbr = RandomizedSearchCV(CatBoostRegressor(random_state=conf.random, verbose=False), n_iter=50, param_distributions=params, scoring="roc_auc", n_jobs=-1, random_state=conf.random)
search_cbr.fit(tr_reg.fit_transform(x_train), y_train)
print(search_cbr.best_params_)
print()
cross_val_auc("CatBoostRegressor searched", search_cbr.best_estimator_, regressor=True)

LGBM. It produce a lot of warnings so I commented it. Warning appear even in LogisticRegressionCV cell. Auc score ~0.833

In [None]:
params = {
#     'num_rounds': np.linspace(100, 500, 10, dtype=int),
    'learning_rate': np.linspace(0.001, 0.2, 10),
    'num_leaves': np.linspace(10, 500, 10, dtype=int),
    'max_depth': np.linspace(1, 50, 10, dtype=int),
    'min_data_in_leaf': np.linspace(1, 250, 10, dtype=int),
    'lambda_l1': np.linspace(0.001, 0.99, 10),
    'lambda_l2': np.linspace(0.001, 0.99, 10),
    'bagging_fraction': np.linspace(0.001, 0.99, 10),
    'feature_fraction': np.linspace(0.001, 0.99, 10),
    'min_gain_to_split': np.linspace(1, 10, 10),
}
# search_l = RandomizedSearchCV(LGBMClassifier(random_state=conf.random, verbose=0), n_iter=20, param_distributions=params, scoring="roc_auc", n_jobs=-1, random_state=conf.random)
# search_l.fit(tr_clf.fit_transform(x_train), y_train)
# print(search_l.best_params_)
# print()

In [None]:
# cross_val_auc("LGBMClassifier searched", search_l.best_estimator_)

In [None]:
# search_lr = RandomizedSearchCV(LGBMRegressor(random_state=conf.random, verbose=0), n_iter=20, param_distributions=params, scoring="roc_auc", n_jobs=-1, random_state=conf.random)
# search_lr.fit(tr_reg.fit_transform(x_train), y_train)
# print(search_lr.best_params_)
# print()

In [None]:
# cross_val_auc("LGBMRegressor searched", search_lr.best_estimator_, regressor=True)

# Models stats after RandomizedSearchCV

In [None]:
logs_df = pd.DataFrame(logs.items(), columns=['Model', 'Auc']).sort_values(by="Auc", ascending=False)
logs_df.Auc = logs_df.Auc.apply(lambda x: int(x*10000)/10000.0)
logs_df["Label"] = logs_df.apply(lambda x: x["Model"] + " " + str(x["Auc"]), axis=1)

plt.figure(figsize=(14,12))
ax = sns.barplot(data=logs_df, x="Auc", y="Model",)
ax.bar_label(ax.containers[0])
ax.set(xlim=(0, 1))

# Ensemble

In [None]:
def train_model(m, x, y, te, reg=False, verbose=True, eval=False, spec=False, use_reg=False):
    preds = []
    scores = []
    kf = StratifiedKFold(n_splits=12, shuffle=True, random_state=conf.random)
    if reg or use_reg:
        tr_reg.fit(x)
        te = tr_reg.transform(te)
    else:
        tr_clf.fit(x)
        te = tr_clf.transform(te) 
    for train_index, val_index in kf.split(x, y):
        x_t, x_val = x.iloc[train_index], x.iloc[val_index]
        y_t, y_val = y[train_index], y[val_index]
        
        if reg or use_reg:
            x_t = tr_reg.transform(x_t)
            x_val = tr_reg.transform(x_val)
        else:
            x_t = tr_clf.transform(x_t)
            x_val = tr_clf.transform(x_val)  

        if verbose:
            m.fit(x_t, y_t)
        else:
            if eval:
                m.fit(x_t, y_t, eval_set=(x_val, y_val), verbose=False)
            else:
                m.fit(x_t, y_t, verbose=False)

        test_preds = m.predict(te) if reg else m.predict_proba(te)[:, 1]        
        val_preds = m.predict(x_val) if reg else m.predict_proba(x_val)[:, 1]
        
        if spec:
            test_preds = special.expit(test_preds)
            val_preds = special.expit(val_preds)
        
        preds.append(test_preds)
        scores.append(roc_auc_score(y_val, val_preds))
    print(f'mean score: {np.mean(scores):.4f}')
    preds = np.array(preds).mean(0)
    pd.DataFrame(preds).hist(bins=25)
    return preds

In [None]:
cb_preds = train_model(search_cb.best_estimator_, x_train, y_train.to_numpy(), test_full)

In [None]:
cbr_preds = train_model(search_cbr.best_estimator_, x_train, y_train.to_numpy(), test_full, reg=True)

In [None]:
ridge = RidgeCV(alphas=np.linspace(0.0001, 100, 1000))
ridge.fit(tr_reg.fit_transform(x_train), y_train)
print(ridge.alpha_)
                
r_preds = train_model(Ridge(alpha=ridge.alpha_), x_train, y_train.to_numpy(), test_full, reg=True, spec=True)

In [None]:
lr = LogisticRegressionCV(Cs=1000, max_iter=1000)
lr.fit(tr_reg.fit_transform(x_train), y_train)
print(lr.C_)
lr_preds = train_model(LogisticRegression(C=lr.C_[0]), x_train, y_train.to_numpy(), test_full, use_reg=True)

In [None]:
test_preds = cb_preds * 0.25 + cbr_preds * 0.25 + r_preds * 0.25 + lr_preds * 0.25
test_preds

In [None]:
pd.DataFrame(test_preds).hist(bins=25, figsize=(16,9))

# Submission

In [None]:
ss = pd.read_csv("/kaggle/input/playground-series-s3e3/sample_submission.csv", index_col=conf.index)
ss[conf.target] = test_preds
ss.to_csv("submission.csv")
ss.head()