In [1]:
import pandas as pd
pd.pandas.set_option('display.max_columns',None)

## ML

In [2]:
X = pd.read_csv('Data/X_train_enc.csv')
y = pd.read_csv('Data/y_train_enc.csv')

X_test = pd.read_csv('Data/X_test_enc.csv')
y_test = pd.read_csv('Data/y_test_enc.csv')

In [3]:
X.shape,y.shape,X_test.shape,y_test.shape

((1029, 43), (1029, 1), (441, 43), (441, 1))

In [4]:
from imblearn.over_sampling import ADASYN

In [5]:
from sklearn.ensemble import StackingClassifier, RandomForestClassifier,AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.tree import DecisionTreeClassifier

In [6]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,fbeta_score

In [7]:
import pickle

In [8]:
adasyn = pickle.load(open('../Test1/adasyn2','rb'))



In [10]:
rfc = RandomForestClassifier(random_state=1)
ada = AdaBoostClassifier(random_state=1)
gbc = GradientBoostingClassifier(random_state=1)
lr = LogisticRegression(random_state=1)
svc = SVC(random_state=1)
gnb = GaussianNB()
xgb = XGBClassifier(random_state=1)
dt = DecisionTreeClassifier(random_state=1)

In [11]:
estimators = [
    ('ada',ada),
    
    ('gbc',gbc),
    ('lr',lr),
    ('svc',svc),
    
    ('gnb',gnb),
    
    ('xgb',xgb),
    
    ('dt',dt)
]

In [12]:
stk_classifier = StackingClassifier(estimators=estimators, 
                                    final_estimator=RandomForestClassifier(n_estimators=100,random_state=1),
                                    passthrough=True,
                                    cv=5,
                                    n_jobs=-1,
                                    verbose=3)

In [12]:
from sklearn.model_selection import GridSearchCV

In [14]:
X_m,y_m = adasyn.fit_resample(X, y)

In [15]:
from sklearn.model_selection import GridSearchCV

## Adaboost

In [16]:
model = AdaBoostClassifier(random_state=1)
params = {
    'n_estimators':[5,10,20,50,100,150,300,500],
    'learning_rate':[0.1,0.5,1,2,5]
}

clf_ada = GridSearchCV(model,params,scoring='recall',n_jobs=-1,cv=3,verbose=4)
clf_ada.fit(X_m,y_m['Attrition'])
y_pred = clf_ada.predict(X_test)
print(accuracy_score(y_test,y_pred))
cm = confusion_matrix(y_test,y_pred)
val = cm[1][1]/(cm[1][0]+cm[1][1])
print(val)

Fitting 3 folds for each of 40 candidates, totalling 120 fits
0.8503401360544217
0.6197183098591549


In [17]:
print(clf_ada.best_params_)

{'learning_rate': 1, 'n_estimators': 300}


## Gradient Boosting

In [31]:
model = GradientBoostingClassifier(random_state=1)
params = {
    'n_estimators':[5,10,20,50,70,100,150,300,500],
    'learning_rate':[0.1,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,2,5]
}

clf_gb = GridSearchCV(model,params,scoring='recall',n_jobs=-1,cv=3,verbose=4)
clf_gb.fit(X_m,y_m['Attrition'])
y_pred = clf_gb.predict(X_test)
print(accuracy_score(y_test,y_pred))
cm = confusion_matrix(y_test,y_pred)
val = cm[1][1]/(cm[1][0]+cm[1][1])
print(val)

Fitting 3 folds for each of 99 candidates, totalling 297 fits
0.8707482993197279
0.5492957746478874


In [32]:
print(clf_gb.best_params_)

{'learning_rate': 1.2, 'n_estimators': 500}


## LogisticRegression

In [20]:
model = LogisticRegression(random_state=1,max_iter=10000)
params = {
    'C':[0.01,0.1,0.5,1,2,5,10,15,20,30,40,50,100]
}

clf_lr = GridSearchCV(model,params,scoring='recall',n_jobs=-1,cv=3,verbose=4)
clf_lr.fit(X_m,y_m['Attrition'])
y_pred = clf_lr.predict(X_test)
print(accuracy_score(y_test,y_pred))
cm = confusion_matrix(y_test,y_pred)
val = cm[1][1]/(cm[1][0]+cm[1][1])
print(val)

Fitting 3 folds for each of 13 candidates, totalling 39 fits
0.8344671201814059
0.5774647887323944


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
print(clf_lr.best_params_)

{'C': 50}


## SVC

In [22]:
model = SVC(random_state=1)
params = {
    'C':[0.01,0.1,0.5,1,2,5,10,15,20,30,40,50,100]
}

clf_svc = GridSearchCV(model,params,scoring='recall',n_jobs=-1,cv=3,verbose=4)
clf_svc.fit(X_m,y_m['Attrition'])
y_pred = clf_svc.predict(X_test)
print(accuracy_score(y_test,y_pred))
cm = confusion_matrix(y_test,y_pred)
val = cm[1][1]/(cm[1][0]+cm[1][1])
print(val)

Fitting 3 folds for each of 13 candidates, totalling 39 fits
0.7732426303854876
0.30985915492957744


In [23]:
print(clf_svc.best_params_)

{'C': 100}


## GaussianNB

In [24]:
model = GaussianNB()
params = {
    'var_smoothing':[0.01,0.1,0.5,1,2,5,10,15,20,30,40,50,100]
}

clf_gnb = GridSearchCV(model,params,scoring='recall',n_jobs=-1,cv=3,verbose=4)
clf_gnb.fit(X_m,y_m['Attrition'])
y_pred = clf_gnb.predict(X_test)
print(accuracy_score(y_test,y_pred))
cm = confusion_matrix(y_test,y_pred)
val = cm[1][1]/(cm[1][0]+cm[1][1])
print(val)

Fitting 3 folds for each of 13 candidates, totalling 39 fits
0.5374149659863946
0.7183098591549296


In [25]:
print(clf_gnb.best_params_)

{'var_smoothing': 0.01}


## XGBClassifier

In [26]:
model = XGBClassifier(random_state=1)
params = {
    'n_estimators':[5,10,20,50,100,150,300,500],
    'max_depth':[2,3,4,5,6,7,8],
    'learning_rate':[0.01,0.1,0.5,1,2,5,10,15]
}

clf_xgb = GridSearchCV(model,params,scoring='recall',n_jobs=-1,cv=3,verbose=4)
clf_xgb.fit(X_m,y_m['Attrition'])
y_pred = clf_xgb.predict(X_test)
print(accuracy_score(y_test,y_pred))
cm = confusion_matrix(y_test,y_pred)
val = cm[1][1]/(cm[1][0]+cm[1][1])
print(val)

Fitting 3 folds for each of 448 candidates, totalling 1344 fits




0.8662131519274376
0.49295774647887325


In [27]:
print(clf_xgb.best_params_)

{'learning_rate': 0.5, 'max_depth': 4, 'n_estimators': 300}


## DecisionTreeClassifier

In [28]:
model = DecisionTreeClassifier(random_state=1)
params = {
    'criterion':['gini','entropy'],
    'splitter':['best','random'],
    'max_depth':[2,3,4,5,6,7,8,9,10,11],
    'min_samples_split':[2,3,4,5,6],
    'max_leaf_nodes':[None,1,2,3,4,5,6,7,8,9,10]
}

clf_dt = GridSearchCV(model,params,scoring='recall',n_jobs=-1,cv=3,verbose=4)
clf_dt.fit(X_m,y_m['Attrition'])
y_pred = clf_dt.predict(X_test)
print(accuracy_score(y_test,y_pred))
cm = confusion_matrix(y_test,y_pred)
val = cm[1][1]/(cm[1][0]+cm[1][1])
print(val)

Fitting 3 folds for each of 2200 candidates, totalling 6600 fits
0.7573696145124716
0.29577464788732394




In [29]:
print(clf_dt.best_params_)

{'criterion': 'gini', 'max_depth': 11, 'max_leaf_nodes': None, 'min_samples_split': 3, 'splitter': 'random'}


In [33]:
ada = clf_ada.best_estimator_
gbc = clf_gb.best_estimator_
lr = clf_lr.best_estimator_
svc = clf_svc.best_estimator_
gnb = clf_gnb.best_estimator_
xgb = clf_xgb.best_estimator_
dt = clf_dt.best_estimator_

In [34]:
estimators = [
    ('ada',ada),
    
    ('gbc',gbc),
    ('lr',lr),
    ('svc',svc),
    
    ('gnb',gnb),
    
    ('xgb',xgb),
    
    ('dt',dt)
]

In [36]:
my_list = []
i = 0
for n in [50,100,200,300,500]:
    for max_d in [None,3,4,5,6,7,8]:
        for min_sam_sp in [2,3,4,5,6]:
            for max_leaf_n in [None,2,3,4,5,6,7,8]:
                meta_model = RandomForestClassifier(n_estimators=n,
                                                    max_depth=max_d,
                                                    min_samples_split=min_sam_sp,
                                                    max_leaf_nodes=max_leaf_n,
                                                    random_state=1)
                stk_classifier = StackingClassifier(estimators=estimators, 
                                                final_estimator=meta_model,
                                                passthrough=True,
                                                cv=5,
                                                n_jobs=-1,
                                                verbose=3)

                stk_classifier.fit(X_m,y_m['Attrition'])
                y_pred = stk_classifier.predict(X_test)

                acc = accuracy_score(y_test,y_pred)
                cm = confusion_matrix(y_test,y_pred)
                recall = cm[1][1]/(cm[1][0]+cm[1][1])

                item = (n,acc,recall,stk_classifier)

                my_list.append(item)
                print(i,recall)
                i = i+1

KeyboardInterrupt: 

In [43]:
best_recall = None
best_i=None
for i in range(len(my_list)):
    item = my_list[i]
    recall = item[2]
    if best_recall is None or recall > best_recall:
        best_recall = recall
        best_i = i

In [45]:
best_recall,best_i

(0.5492957746478874, 296)

In [48]:
my_stk_clr = my_list[296][3]

In [49]:
y_pred = my_stk_clr.predict(X_test)
acc = accuracy_score(y_test,y_pred)
cm = confusion_matrix(y_test,y_pred)
print(acc)
print(cm)

0.8775510204081632
[[348  22]
 [ 32  39]]


In [50]:
import pickle

In [52]:
pickle.dump(my_stk_clr,open('stk_final3','wb'))

In [53]:
y_prob_pred = my_stk_clr.predict_proba(X_test)

In [71]:
y_pred = [item[1]>0.5 for item in y_prob_pred]
acc = accuracy_score(y_test,y_pred)
cm = confusion_matrix(y_test,y_pred)
print(acc)
print(cm)

0.8775510204081632
[[348  22]
 [ 32  39]]


## Finding model with best accuracy

In [72]:
best_acc = None
best_i=None
for i in range(len(my_list)):
    item = my_list[i]
    acc = item[1]
    if best_acc is None or acc > best_acc:
        best_acc = acc
        best_i = i

In [73]:
best_acc,best_i

(0.891156462585034, 359)

In [74]:
my_stk_clr = my_list[359][3]

In [75]:
y_pred = my_stk_clr.predict(X_test)
acc = accuracy_score(y_test,y_pred)
cm = confusion_matrix(y_test,y_pred)
print(acc)
print(cm)
recall = cm[1][1]/(cm[1][0]+cm[1][1])
print(recall)

0.891156462585034
[[356  14]
 [ 34  37]]
0.5211267605633803


In [76]:
pickle.dump(my_stk_clr,open('stk_final3_best_accuracy','wb'))

In [38]:
import pickle,pandas as pd

In [39]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [40]:
X_test = pd.read_csv('Data/X_test_enc.csv')
y_test = pd.read_csv('Data/y_test_enc.csv')

In [41]:
stk_clr = pickle.load(open('stk_final3_best_accuracy','rb'))

In [42]:
type(stk_clr)

sklearn.ensemble._stacking.StackingClassifier

In [43]:
stk_clr

StackingClassifier(cv=5,
                   estimators=[('ada',
                                AdaBoostClassifier(learning_rate=1,
                                                   n_estimators=300,
                                                   random_state=1)),
                               ('gbc',
                                GradientBoostingClassifier(learning_rate=1,
                                                           n_estimators=300,
                                                           random_state=1)),
                               ('lr',
                                LogisticRegression(C=50, max_iter=10000,
                                                   random_state=1)),
                               ('svc', SVC(C=100, random_state=1)),
                               ('gnb', GaussianNB(var_smoothing=0.01)),
                               ('xgb',
                                XGBClassifie...
                                              random_s

In [9]:
y_pred = stk_clr.predict(X_test)
acc = accuracy_score(y_test,y_pred)
cm = confusion_matrix(y_test,y_pred)
cr = classification_report(y_test,y_pred)
print(acc)
print(cm)
print(cr)

0.891156462585034
[[356  14]
 [ 34  37]]
              precision    recall  f1-score   support

           0       0.91      0.96      0.94       370
           1       0.73      0.52      0.61        71

    accuracy                           0.89       441
   macro avg       0.82      0.74      0.77       441
weighted avg       0.88      0.89      0.88       441

