# 1. Imports 

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score
from IPython.display import display
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import OneHotEncoder
from pathlib import Path

In [2]:
import warnings
warnings.filterwarnings('ignore')

# 2. Load data 

In [3]:
TARGET = 'Attrition'

# Training data: the `id` column is dropped as soon as the file is read.
train_df = pd.read_csv('../data/train.csv').drop(columns='id')

# Test data: the ids are set aside for the submission file before they are
# removed from the feature frame.
test_df = pd.read_csv('../data/test.csv')
test_idx = test_df.pop('id')

print(f'Train dataframe shape: {train_df.shape}')
print(f'Test dataframe shape: {test_df.shape}')

# Feature names as they exist right after loading (i.e. before any
# preprocessing or feature engineering is applied).
FEATURES = [name for name in train_df.columns if name != 'row_id' and name != TARGET]

# Other participants have reported that incorporating the original dataset
# improves public-leaderboard scores; it is loaded here and is not merged
# into train_df by this cell.
original_df = pd.read_csv('../data/WA_Fn-UseC_-HR-Employee-Attrition.csv').drop(columns='EmployeeNumber')

Train dataframe shape: (1677, 34)
Test dataframe shape: (1119, 33)


# 3. Preprocess data 

In [4]:
# Preprocessing: drop unnecessary features and add engineered ones

def preprocess_dataset(df):
    """Drop unused columns and append engineered features.

    The input frame is left untouched: all work happens on the new frame
    returned by ``DataFrame.drop``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns listed in ``unused_columns`` below as well
        as ``Age``, ``MonthlyIncome``, ``HourlyRate``, ``DistanceFromHome``,
        ``YearsAtCompany``, ``NumCompaniesWorked`` and ``TotalWorkingYears``.

    Returns
    -------
    pandas.DataFrame
        The remaining columns followed by ``MonthlyIncome/Age``, the four
        threshold flags, ``AverageTenure``, ``JobHopper`` and
        ``AttritionRisk`` (in that order). ``NumCompaniesWorked`` has its
        zeros replaced by ones.
    """
    unused_columns = ['EmployeeCount', 'Over18', 'StandardHours', 'JobRole', 'JobLevel', 'MaritalStatus','PerformanceRating']
    out = df.drop(columns=unused_columns)

    # Income relative to age.
    out['MonthlyIncome/Age'] = out['MonthlyIncome'].div(out['Age'])

    # Binary flags obtained by thresholding a single raw column each.
    threshold_flags = {
        "Age_risk": out["Age"].lt(34),
        "HourlyRate_risk": out["HourlyRate"].lt(60),
        "Distance_risk": out["DistanceFromHome"].ge(20),
        "YearsAtCo_risk": out["YearsAtCompany"].lt(4),
    }
    for flag_name, mask in threshold_flags.items():
        out[flag_name] = mask.astype(int)

    # Zero companies is replaced by one so the tenure ratio below never
    # divides by zero.
    companies = out['NumCompaniesWorked'].replace(0, 1)
    out['NumCompaniesWorked'] = companies

    tenure = out["TotalWorkingYears"].div(companies)
    out['AverageTenure'] = tenure

    # More than two employers combined with under two years per employer.
    out['JobHopper'] = (companies.gt(2) & tenure.lt(2.0)).astype(int)

    # Total number of raised flags, accumulated left to right.
    risk_columns = ["Age_risk", "HourlyRate_risk", "Distance_risk", "YearsAtCo_risk", 'JobHopper']
    out["AttritionRisk"] = sum((out[name] for name in risk_columns[1:]), out[risk_columns[0]])
    return out

# Apply the same preprocessing to both splits so their columns stay aligned.
train_df, test_df = (preprocess_dataset(frame) for frame in (train_df, test_df))

In [5]:
# Encode categorical features.
# NOTE: this cell overwrites train_df / test_df, so it is meant to run once
# after preprocessing; re-running it without re-running the earlier cells
# would apply the encodings to data that is already encoded.
travel_dict = {
    'Non-Travel': 0,
    'Travel_Rarely': 1,
    'Travel_Frequently': 2}
# BusinessTravel is treated as an ordinal feature and mapped to integers.
train_df['BusinessTravel'] = train_df['BusinessTravel'].map(travel_dict)
test_df['BusinessTravel'] = test_df['BusinessTravel'].map(travel_dict)

# Nominal string features are one-hot encoded with the first category of each
# feature dropped. A feature with k categories therefore produces k-1
# indicator columns, which cannot be written back into the single source
# column; the indicators are stored in separate, named columns instead.
NOMINAL_FEATURES = ['Department', 'EducationField', 'Gender', 'OverTime']
enc = OneHotEncoder(drop='first', categories='auto')
enc.fit(train_df[NOMINAL_FEATURES])  # categories are learned from the training split only

# With drop='first' the first entry of every categories_ array is the one
# that gets no indicator column, so the output columns correspond to the rest.
encoded_columns = [
    f'{feature}_{category}'
    for feature, categories in zip(NOMINAL_FEATURES, enc.categories_)
    for category in categories[1:]
]


def one_hot_encode(df):
    """Return `df` with NOMINAL_FEATURES replaced by their indicator columns.

    The indicator columns are appended after the remaining columns, in the
    order given by `encoded_columns`, and share the index of `df`.
    """
    indicators = pd.DataFrame(
        enc.transform(df[NOMINAL_FEATURES]).toarray(),
        columns=encoded_columns,
        index=df.index,
    )
    return pd.concat([df.drop(columns=NOMINAL_FEATURES), indicators], axis=1)


train_df = one_hot_encode(train_df)
test_df = one_hot_encode(test_df)

# 4. Train baseline model and create submission_csv

In [6]:
# Separate the label from the predictors.
y_train = train_df['Attrition']
X_train = train_df.drop(columns='Attrition')

# Baseline LightGBM classifier with a binary objective and AUC as its metric.
lgbm = LGBMClassifier(n_estimators=100, objective='binary', metric='auc')
lgbm.fit(X_train, y_train)

LGBMClassifier(metric='auc', objective='binary')

0       1677
1       1678
2       1679
3       1680
4       1681
        ... 
1114    2791
1115    2792
1116    2793
1117    2794
1118    2795
Name: id, Length: 1119, dtype: int64

In [52]:
# Build the submission from the ids that were set aside when the test file was
# loaded, so every id is paired with the prediction for its own row and no
# second file has to be read.
# The test frame is indexed by the training columns so that the features are
# passed in exactly the order the model was fitted on; a missing column raises
# a KeyError instead of silently shifting the inputs.
submission = pd.DataFrame({
    'id': test_idx.values,
    'Attrition': lgbm.predict_proba(test_df[X_train.columns])[:, 1],
})
submission.to_csv('../data/first_submission.csv', index=False)