In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from xgboost.sklearn import XGBClassifier
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [2]:
# Load the raw dataset; the relative path is resolved against the kernel's
# current working directory.
dataset_path = "Dataset 2.csv"
df = pd.read_csv(dataset_path)

In [3]:
# One shared LabelEncoder instance. encode_col() re-fits it on every column it
# encodes, so after several calls it only holds the mapping of the last column.
le = preprocessing.LabelEncoder()

In [4]:
def encode_col(col_name):
    """Label-encode one column of the global ``df`` into a new ``e_<col_name>`` column.

    The shared module-level encoder ``le`` is re-fitted on ``df[col_name]`` and
    the resulting integer codes are written to ``df["e_" + col_name]``. The
    original column is left untouched.

    Parameters
    ----------
    col_name : str
        Name of an existing column in ``df``.

    Returns
    -------
    None
        ``df`` is modified in place.
    """
    # fit_transform encodes the whole column in one vectorized call, instead of
    # calling transform() once per row and then unwrapping 1-element arrays.
    df["e_" + col_name] = le.fit_transform(df[col_name])
    return

In [5]:
# Label-encode each text column; every call adds an "e_<name>" column to df.
# The order is kept as-is because the shared encoder ends up fitted on the
# last column processed.
for categorical_col in (
    'Attrition',
    'BusinessTravel',
    'Department',
    'EducationField',
    'Gender',
    'JobRole',
    'MaritalStatus',
    'Over18',
    'OverTime',
):
    encode_col(categorical_col)

In [6]:
# Feature matrix: the numeric columns plus the label-encoded ("e_") versions of
# the text columns. The target column e_Attrition is deliberately not listed.
# NOTE(review): EmployeeNumber looks like a row identifier, and EmployeeCount /
# StandardHours / e_Over18 may be constant -- confirm they belong in the model.
feature_columns = [
    "Age",
    "e_BusinessTravel",
    "DailyRate",
    "e_Department",
    "DistanceFromHome",
    "Education",
    "e_EducationField",
    'EmployeeCount',
    "EmployeeNumber",
    "EnvironmentSatisfaction",
    "e_Gender",
    "HourlyRate",
    "JobInvolvement",
    "JobLevel",
    "e_JobRole",
    "JobSatisfaction",
    "e_MaritalStatus",
    "MonthlyIncome",
    "MonthlyRate",
    "NumCompaniesWorked",
    "e_Over18",
    "e_OverTime",
    "PercentSalaryHike",
    "PerformanceRating",
    "RelationshipSatisfaction",
    "StandardHours",
    "StockOptionLevel",
    "TotalWorkingYears",
    "TrainingTimesLastYear",
    "WorkLifeBalance",
    "YearsAtCompany",
    "YearsInCurrentRole",
    "YearsSinceLastPromotion",
    "YearsWithCurrManager",
]
X = df[feature_columns]

In [7]:
# Target vector: the label-encoded Attrition column produced by encode_col().
y = df.loc[:, "e_Attrition"]

In [8]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.20, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# XGBoost (lower learning rate)

In [293]:
# Conservative model: small learning rate, shallow trees, L1 and L2
# regularisation, and row/column subsampling.
# `seed` is the deprecated alias of `random_state` in the scikit-learn wrapper;
# both were set to 42, so only `random_state` is kept.
# NOTE(review): eval_metric presumably only matters when an eval_set is passed
# to fit(), which this notebook never does; "mae" is also an unusual choice for
# a binary classifier -- confirm whether "logloss" or "auc" was intended.
xgb1 = XGBClassifier(
    n_estimators=200,
    learning_rate=0.01,
    objective='binary:logistic',
    gamma=0.1,
    max_delta_step=0,
    subsample=0.8,
    colsample_bytree=0.9,
    random_state=42,
    reg_lambda=1,
    reg_alpha=1,
    min_child_weight=10,
    max_depth=4,
    eval_metric="mae",
)

In [294]:
# Train the low-learning-rate model on the training split. Left as the cell's
# last expression so the fitted estimator's repr is displayed.
xgb1.fit(X=X_train, y=y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.9, eval_metric='mae', gamma=0.1,
       learning_rate=0.01, max_delta_step=0.0, max_depth=4,
       min_child_weight=10, missing=None, n_estimators=200, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=42,
       reg_alpha=1, reg_lambda=1, scale_pos_weight=1, seed=42, silent=True,
       subsample=0.8)

In [295]:
# Class predictions of the low-learning-rate model on the held-out test rows.
y_pred1 = xgb1.predict(X_test)

In [296]:
# Fraction of held-out rows the low-learning-rate model classifies correctly.
test_accuracy1 = metrics.accuracy_score(y_test, y_pred1)
print(test_accuracy1)

0.87074829932


In [297]:
# xgb1 was already fitted on (X_train, y_train) in an earlier cell, and fit()
# returns the estimator itself. Alias it instead of training the same seeded
# model a second time.
trained_model1 = xgb1

In [298]:
# Accuracy on the training rows, to compare against the held-out accuracy.
train_pred1 = trained_model1.predict(X_train)
print(metrics.accuracy_score(y_train, train_pred1))

0.871598639456


In [255]:
# Confusion matrix for the low-learning-rate model on the test split.
# scikit-learn puts the true classes on the rows and the predicted classes on
# the columns.
matrix1 = confusion_matrix(y_true=y_test, y_pred=y_pred1)
print(matrix1)

[[252   3]
 [ 35   4]]


In [186]:
# On 0/1 class labels the squared error of a row is 1 exactly when the
# prediction is wrong, so this value equals 1 - accuracy on the test split.
mean_squared_error(y_true=y_test, y_pred=y_pred1)

0.12925170068027211

In [187]:
# NOTE(review): R^2 is a regression metric computed here on hard class labels;
# confirm it is wanted alongside accuracy and the confusion matrix.
r2_score(y_true=y_test, y_pred=y_pred1)

-0.12337858220211162

# XGBoost (higher learning rate)

In [18]:
# Aggressive model: 20x the learning rate of xgb1, five times as many trees,
# deeper trees, and no L1 penalty or minimum-child-weight constraint.
# `nthread` and `seed` are the deprecated spellings of `n_jobs` and
# `random_state` in the scikit-learn wrapper, so the current names are used
# with the same values.
xgb2 = XGBClassifier(
    learning_rate=0.2,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    n_jobs=4,
    scale_pos_weight=1,
    random_state=0,
    reg_lambda=1,
)

In [19]:
# Train the high-learning-rate model on the training split. Left as the cell's
# last expression so the fitted estimator's repr is displayed.
xgb2.fit(X=X_train, y=y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.2, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=4, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0, silent=True,
       subsample=0.8)

In [20]:
# Class predictions of the high-learning-rate model on the held-out test rows.
y_pred2 = xgb2.predict(X_test)

In [21]:
# Fraction of held-out rows the high-learning-rate model classifies correctly.
test_accuracy2 = metrics.accuracy_score(y_test, y_pred2)
print(test_accuracy2)

0.874149659864


In [22]:
# xgb2 was already fitted on (X_train, y_train) in an earlier cell, and fit()
# returns the estimator itself. Alias it instead of re-training all 1000 trees
# of the same seeded model.
trained_model2 = xgb2

In [23]:
# Accuracy on the training rows, to compare against the held-out accuracy.
train_pred2 = trained_model2.predict(X_train)
print(metrics.accuracy_score(y_train, train_pred2))

1.0


In [24]:
# Confusion matrix for the high-learning-rate model on the test split.
# scikit-learn puts the true classes on the rows and the predicted classes on
# the columns.
matrix2 = confusion_matrix(y_true=y_test, y_pred=y_pred2)
print(matrix2)

[[247   8]
 [ 29  10]]


In [25]:
# On 0/1 class labels the squared error of a row is 1 exactly when the
# prediction is wrong, so this value equals 1 - accuracy on the test split.
mean_squared_error(y_true=y_test, y_pred=y_pred2)

0.12585034013605442

In [26]:
# NOTE(review): R^2 is a regression metric computed here on hard class labels;
# confirm it is wanted alongside accuracy and the confusion matrix.
r2_score(y_true=y_test, y_pred=y_pred2)

-0.093815987933635103