In [1]:
import pandas as pd
# Show every column when a DataFrame is displayed (None removes the column cap).
# Uses the public pd.set_option entry point instead of the undocumented
# self-referential `pd.pandas` attribute.
pd.set_option('display.max_columns', None)

## ML

In [2]:
# Encoded training split: feature frame and its label frame.
X, y = pd.read_csv('Data/X_train_enc.csv'), pd.read_csv('Data/y_train_enc.csv')

# Encoded hold-out split, read the same way.
X_test, y_test = pd.read_csv('Data/X_test_enc.csv'), pd.read_csv('Data/y_test_enc.csv')

In [3]:
# Sanity check: (rows, columns) of each loaded frame, train first, then test.
tuple(frame.shape for frame in (X, y, X_test, y_test))

((1029, 43), (1029, 1), (441, 43), (441, 1))

In [4]:
from imblearn.over_sampling import ADASYN

In [5]:
from sklearn.ensemble import StackingClassifier, RandomForestClassifier,AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.tree import DecisionTreeClassifier

In [6]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,fbeta_score

In [8]:
# Candidate base learners. Every estimator that is given a seed uses
# random_state=1; GaussianNB is constructed with its defaults.

# Tree-based models and ensembles
dt = DecisionTreeClassifier(random_state=1)
rfc = RandomForestClassifier(random_state=1)
ada = AdaBoostClassifier(random_state=1)
gbc = GradientBoostingClassifier(random_state=1)
xgb = XGBClassifier(random_state=1)

# Linear, kernel and probabilistic models
lr = LogisticRegression(random_state=1)
svc = SVC(random_state=1)
gnb = GaussianNB()

In [9]:
# (name, estimator) pairs handed to the StackingClassifier as its base learners.
estimators = list(zip(
    ['ada', 'gbc', 'lr', 'svc', 'gnb', 'xgb', 'dt'],
    [ada, gbc, lr, svc, gnb, xgb, dt],
))

In [11]:
# Stacked ensemble: the base learners from `estimators` feed a random-forest
# final estimator.
stk_classifier = StackingClassifier(
    estimators=estimators,
    final_estimator=RandomForestClassifier(random_state=1, n_estimators=100),
    cv=5,              # 5-fold CV when building the final estimator's inputs
    passthrough=True,  # pass the original features on alongside base predictions
    verbose=3,
    n_jobs=-1,         # use all available cores
)

In [12]:
# Sweep ADASYN's n_neighbors from 1 to 34: oversample the training data,
# refit the stacked classifier, and record accuracy and confusion matrix
# on the test split. accs[k] / cms[k] hold the run for n_neighbors = k + 1.
# NOTE(review): the test split is used to compare settings here, so the
# reported scores are optimistic as estimates of generalisation.
accs = []
cms = []
for n in range(1,35):
    adasyn = ADASYN(n_neighbors=n,random_state=1)
    X_m, y_m = adasyn.fit_resample(X, y)
    stk_classifier.fit(X_m,y_m['Attrition'])

    y_pred = stk_classifier.predict(X_test)
    # Score once and reuse the value for both the history and the log line.
    acc = accuracy_score(y_test,y_pred)
    accs.append(acc)
    cms.append(confusion_matrix(y_test,y_pred))

    print(n,"  --  ",acc)

1   --   0.8707482993197279
2   --   0.8639455782312925
3   --   0.8684807256235828
4   --   0.8752834467120182
5   --   0.891156462585034
6   --   0.8843537414965986
7   --   0.8798185941043084
8   --   0.8888888888888888
9   --   0.8798185941043084
10   --   0.8752834467120182
11   --   0.8775510204081632
12   --   0.8775510204081632
13   --   0.8684807256235828
14   --   0.8752834467120182
15   --   0.8820861678004536
16   --   0.8866213151927438
17   --   0.8707482993197279
18   --   0.891156462585034
19   --   0.8775510204081632
20   --   0.8775510204081632
21   --   0.8707482993197279
22   --   0.8775510204081632
23   --   0.873015873015873
24   --   0.873015873015873
25   --   0.8639455782312925
26   --   0.8820861678004536
27   --   0.8866213151927438
28   --   0.8752834467120182
29   --   0.8752834467120182
30   --   0.8956916099773242
31   --   0.8798185941043084
32   --   0.8775510204081632
33   --   0.8843537414965986
34   --   0.873015873015873


In [None]:
# Recall (in percent) for the class at row index 1 of each confusion matrix:
# cm[1][1] / (cm[1][0] + cm[1][1]). Iterating over `cms` directly keeps this
# cell in step with however many runs the sweep actually stored; `n` is the
# n_neighbors value of the run (entries were appended starting at n = 1).
for n, cm in enumerate(cms, start=1):
    val = (cm[1][1]/(cm[1][0]+cm[1][1]))*100
    print(n,"   ---   ",val)

In [None]:
# cms is appended in order for n_neighbors = 1, 2, ..., so index 28 is the
# confusion matrix of the n_neighbors = 29 run.
cms[28]

In [16]:
# Grid sweep over ADASYN sampling_strategy (ratio) and n_neighbors.
# perf_dict maps each ratio to a list with one [accuracy, recall, sampler]
# record per n_neighbors value, stored in order (list index = n_neighbors - 1).
perf_dict = {}
i = 0
for ratio in [0.5, 0.6, 0.7, 0.8, 0.9, 1]:
    arr = []
    for n in range(1, 35):
        # Running counter printed as a progress indicator.
        print(i)
        i += 1

        adasyn = ADASYN(sampling_strategy=ratio, n_neighbors=n, random_state=1)
        X_m, y_m = adasyn.fit_resample(X, y)
        stk_classifier.fit(X_m, y_m['Attrition'])
        y_pred = stk_classifier.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        cm = confusion_matrix(y_test, y_pred)
        # Recall for the class at row index 1 of the confusion matrix.
        val = cm[1, 1] / (cm[1, 0] + cm[1, 1])

        arr.append([acc, val, adasyn])
    perf_dict[ratio] = arr

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203


In [17]:
# For every sampling ratio, keep the run with the highest recall (record[1]).
# Each entry is (ratio, index, record) where `index` is the 0-based position
# in perf_dict[ratio] -- i.e. n_neighbors - 1, NOT n_neighbors itself -- and
# record is [accuracy, recall, sampler].
# max() returns the first maximal element, so ties resolve to the lowest
# index, matching a strict "greater than" scan. Iterating perf_dict avoids
# hard-coding the ratio list and the sweep length a second time.
my_list = []

for ratio, runs in perf_dict.items():
    best_index, best_run = max(enumerate(runs), key=lambda pair: pair[1][1])
    my_list.append((ratio, best_index, best_run))

In [18]:
# One entry per sampling ratio: (ratio, 0-based index of the best-recall run,
# [accuracy, recall, ADASYN sampler]). The sampler's n_neighbors is index + 1.
my_list

[(0.5,
  29,
  [0.8775510204081632,
   0.5070422535211268,
   ADASYN(n_neighbors=30, random_state=1, sampling_strategy=0.5)]),
 (0.6,
  33,
  [0.8820861678004536,
   0.4788732394366197,
   ADASYN(n_neighbors=34, random_state=1, sampling_strategy=0.6)]),
 (0.7,
  20,
  [0.891156462585034,
   0.5352112676056338,
   ADASYN(n_neighbors=21, random_state=1, sampling_strategy=0.7)]),
 (0.8,
  27,
  [0.8820861678004536,
   0.5070422535211268,
   ADASYN(n_neighbors=28, random_state=1, sampling_strategy=0.8)]),
 (0.9,
  24,
  [0.8866213151927438,
   0.4647887323943662,
   ADASYN(n_neighbors=25, random_state=1, sampling_strategy=0.9)]),
 (1,
  8,
  [0.8798185941043084,
   0.4647887323943662,
   ADASYN(n_neighbors=9, random_state=1, sampling_strategy=1)])]

In [None]:
## sampling_strategy = 0.7 with n_neighbors = 21 (index 20 in the sweep) got the best recall score and accuracy

# The sweep stores 0-based indices, so index 20 corresponds to n_neighbors=21;
# the ratio is 0.7 as stated above (the cell previously used 0.8 / 20).
adasyn = ADASYN(n_neighbors=21,random_state=1,sampling_strategy=0.7)
X_m, y_m = adasyn.fit_resample(X, y)
stk_classifier.fit(X_m,y_m['Attrition'])

# Evaluate the refitted stack on the test split.
y_pred = stk_classifier.predict(X_test)
acc = accuracy_score(y_test,y_pred)
cm = confusion_matrix(y_test,y_pred)
print(acc)
print(cm)

In [23]:
# Third entry of my_list: the best-recall run for the third ratio in the sweep.
my_list[2]

(0.7,
 20,
 [0.891156462585034,
  0.5352112676056338,
  ADASYN(n_neighbors=21, random_state=1, sampling_strategy=0.7)])

In [25]:
# Take the sampler stored with the third entry of my_list
# (entry -> [accuracy, recall, sampler] record -> sampler), resample the
# training data with it and refit the stacked classifier.
adasyn = my_list[2][-1][-1]
X_m, y_m = adasyn.fit_resample(X, y)
stk_classifier.fit(X_m, y_m['Attrition'])

# Accuracy and confusion matrix on the test split, one per line.
y_pred = stk_classifier.predict(X_test)
acc, cm = accuracy_score(y_test, y_pred), confusion_matrix(y_test, y_pred)
print(acc, cm, sep="\n")

0.891156462585034
[[355  15]
 [ 33  38]]


In [26]:
import pickle

In [27]:
# Persist the fitted stacked classifier and the ADASYN sampler used to train it.
# `with` guarantees each file is flushed and closed even if pickling fails.
# NOTE: only unpickle these files from trusted locations -- pickle.load can
# execute arbitrary code.
with open('stk_final2', 'wb') as model_file:
    pickle.dump(stk_classifier, model_file)
with open('adasyn2', 'wb') as sampler_file:
    pickle.dump(adasyn, sampler_file)