In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [2]:
data = pd.read_csv('labeled_data.csv')

In [3]:
data.head()

Unnamed: 0,EmployeeID,Gender,MaritalStatus,Education,EducationField,Department,JobRole,JobLevel,MonthlyIncome,BusinessTravel,DistanceFromHome,StockOptionLevel,NumCompaniesWorked,YearsSinceLastPromotion,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,PerformanceRating,Attrition
0,1,1,1,1,0,0,0,1,131160,1,6,0,1.0,0,2,3,2,0,0
1,2,1,0,0,0,1,1,1,41890,2,10,1,0.0,1,2,1,3,1,1
2,3,0,1,3,1,1,2,4,193280,2,17,3,1.0,0,1,1,0,0,0
3,4,0,1,4,0,1,3,3,83210,0,2,3,3.0,7,3,3,1,0,0
4,5,0,0,0,2,1,2,1,23420,1,10,2,4.0,0,3,0,1,0,0


In [4]:
x = data.drop(['EmployeeID', 'Attrition'], axis = 1)
y = data['Attrition']

In [8]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size = 0.30, random_state = 101)

In [23]:
RS = 101
MODELS = [
    LogisticRegression(random_state = RS, max_iter = 1000),
    DecisionTreeClassifier(random_state = RS),
    AdaBoostClassifier(base_estimator = DecisionTreeClassifier(random_state = RS), algorithm = 'SAMME',random_state = RS),
    GradientBoostingClassifier(random_state = RS),
    XGBClassifier(random_state = RS),
    RandomForestClassifier(random_state = RS),
    AdaBoostClassifier(base_estimator = RandomForestClassifier(random_state = RS), algorithm='SAMME', random_state = RS),
    GaussianNB(),
    KNeighborsClassifier(),
    SVC(random_state = RS, probability = True),
    MLPClassifier(random_state = RS, max_iter = 1000)
]

In [24]:
NAMES = [
    'Logistic',
    'Decision Tree',
    'DT - AdaBoost',
    'Gradient Boost',
    'XGBoost',
    'Random Forest Classifier',
    'RFC - AdaBoost',
    'GaussianNB',
    'KNN',
    'SVC',
    'Multi-Layer Perceptron'
]

In [26]:
logreg_params= {
    "C": np.logspace(-1, 1, 10),
    "penalty": ["l1","l2"],
    "solver":['lbfgs', 'liblinear', 'sag', 'saga'],
    "max_iter":[1000]
}

dtree_params = {
    "min_samples_split" : range(10,500,20),
    "max_depth": np.linspace(1,20,4)
}

ada_param = {
    "n_estimators" : [10,20,40],
    "learning_rate" : np.linspace(0.01,0.1,5)
}

gb_param = {
    "n_estimators" : [10,20,40],
    "learning_rate" : np.linspace(0.01,0.1,5)
}
            
xgb_params = {
    "n_estimators": [10, 20, 40],
    "subsample": [ 0.6, 0.8, 1.0],
    "max_depth": [1,2,3,4],
    "learning_rate": [0.1,0.2, 0.3, 0.4, 0.5]
}

rfc_params = {
    "min_samples_split" : range(10,500,20),
    "max_depth": np.linspace(1,20,4)
}

ada_rfc_param = {
    "n_estimators" : [10,20,40],
    "learning_rate" : np.linspace(0.01,0.1,5)
}

NB_params = {
    "var_smoothing": np.logspace(0,-9, num=100)
}

knn_params= {
    "n_neighbors": np.linspace(1,19,10, dtype = int).tolist(),
    "weights": ["uniform","distance"],
    "metric": ["euclidean","manhattan"]}

svc_params= {
    "kernel" : ["rbf"],
    "gamma": [0.1, 1, 5, 10],
    "C": [1,10,50,100]}

mlpc_params = {
    "alpha": [0.1, 0.05, 0.01],
    "hidden_layer_sizes": [(10,10,10),(100,100,100),(100,100)],
    "solver" : ["lbfgs","adam","sgd"],
    "max_iter":[1000]
}

PARAMS = [
    logreg_params,
    dtree_params,
    ada_param,
    gb_param,
    xgb_params,
    rfc_params,
    ada_rfc_param,
    NB_params,
    knn_params,
    svc_params,
    mlpc_params
]

In [29]:
CV_TUNED = []
cv_result = {}
best_estimators = {}
for name, model,classifier_param in zip(NAMES, MODELS, PARAMS):
    clf = GridSearchCV(model, param_grid=classifier_param, cv = 10, scoring = "accuracy", n_jobs = -1,verbose = False)
    clf.fit(xtrain,ytrain)
    cv_result[name] = clf.best_score_
    CV_TUNED.append(cv_result[name])
    best_estimators[name] = clf.best_estimator_
    print(name + ' best score: ' + str(cv_result[name]))
print('\nHighest best score: ' + str(max(CV_TUNED)) + ' (' + NAMES[CV_TUNED.index(max(CV_TUNED))] + ')')



Logistic best score: 0.841860465116279
Decision Tree best score: 0.9056478405315614
DT - AdaBoost best score: 0.9634551495016612
Gradient Boost best score: 0.8534883720930233
XGBoost best score: 0.9308970099667775
Random Forest Classifier best score: 0.9186046511627908
RFC - AdaBoost best score: 0.9790697674418606
GaussianNB best score: 0.841860465116279
KNN best score: 0.9757475083056478
SVC best score: 0.9780730897009967
Multi-Layer Perceptron best score: 0.841860465116279

Highest best score: 0.9790697674418606 (RFC - AdaBoost)


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [31]:
y.value_counts()

0    3605
1     695
Name: Attrition, dtype: int64

In [30]:
sm = SMOTE(random_state = 101)
os_xtrain, os_ytrain = sm.fit_sample(xtrain, ytrain)
os_train = pd.concat([pd.DataFrame(os_ytrain), pd.DataFrame(os_xtrain)], axis=1)

In [32]:
pd.Series(os_ytrain).value_counts()

1    2534
0    2534
Name: Attrition, dtype: int64

In [36]:
model = AdaBoostClassifier(base_estimator = RandomForestClassifier(random_state = RS), algorithm='SAMME', random_state = RS)
classifier_param = {
    "n_estimators" : [10,20,40],
    "learning_rate" : np.linspace(0.01,0.1,5)
}

clf = GridSearchCV(model, param_grid=classifier_param, cv = 10, scoring = "accuracy", n_jobs = -1,verbose = False)
clf.fit(os_xtrain,os_ytrain)
print(' best score after oversampling: ' + str(clf.best_score_))

 best score after oversampling: 0.9806663236429122


In [1]:
import joblib

In [11]:
data = pd.read_csv('fit_data.csv')

In [12]:
data[data['Attrition'] == 1]

Unnamed: 0,MonthlyIncome,MaritalStatus,NumCompaniesWorked,DistanceFromHome,JobSatisfaction,Attrition
1,41890,0,0.0,10,1,1
6,58130,0,2.0,11,2,1
12,57620,1,1.0,1,1,1
26,103330,2,3.0,1,2,1
28,68540,2,2.0,4,1,1
...,...,...,...,...,...,...
4273,21800,0,1.0,7,3,1
4278,71400,1,5.0,11,3,1
4280,51470,1,7.0,1,0,1
4283,24680,0,0.0,23,2,1


In [2]:
svc = joblib.load('svc_final')

In [14]:
svc.predict([[41890, 0, 0, 10, 1]])

array([1])