In [49]:
from sklearn.metrics import f1_score
import pandas as pd
from sklearn.preprocessing import RobustScaler
import numpy as np
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split
import lightgbm
import xgboost
import sklearn
import catboost
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, ExtraTreesClassifier
from pyod.models.ecod import ECOD
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import LinearSVC
from sklearn.utils import class_weight

In [42]:
df_X = pd.read_csv("data/X_train_features_knn_150.csv", index_col=0)
df_Y = pd.read_csv("data/y_train.csv", index_col="id")
Y = df_Y.to_numpy().squeeze()

Remove highly correlated features

In [43]:
def rm_corr(X):
    corr_matrix = X.corr().abs()

    # Select upper triangle of correlation matrix
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

    # Find features with correlation greater than 0.9
    to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]
    print("Removed columns: ", len(to_drop))
    # Drop features 
    X.drop(to_drop, axis=1, inplace=True)
    # X_test.drop(to_drop, axis=1, inplace=True)
    
    return X

In [44]:
df_X = rm_corr(df_X)

Removed columns:  38


Normalize

In [45]:
df_X.replace([np.inf, -np.inf], np.nan, inplace=True)
transformer = RobustScaler()
X = transformer.fit_transform(df_X)

Simple Outlier Removal

In [46]:
for i in range(3):
    class_i = np.squeeze(Y==i)
    ids = np.where(class_i)[0]
    X_c = X[class_i]
    # preds = IsolationForest(random_state=0, contamination = 0.06).fit_predict(X_c)
    clf = ECOD(contamination=0.01)
    clf.fit(X_c)
    preds = clf.predict(X_c)
    outlier_ids = ids[preds == 1] # minus 1 for isolation forest
    Y = np.delete(Y, outlier_ids, axis=0)
    X = np.delete(X, outlier_ids, axis=0)
    print(outlier_ids.shape[0])

31
5
15


In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, stratify=Y,test_size=.5)

classifiers = [("LGBMClassifier", lightgbm.LGBMClassifier), ("XGBClassifier", xgboost.XGBClassifier), \
               ("RandomForestClassifier", sklearn.ensemble.RandomForestClassifier), \
                ("ExtraTreesClassifier", sklearn.ensemble.ExtraTreesClassifier)]
# catboost classifier takes very long but is similar in performance to these other classifiers
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None) # classifiers=classifiers
models,predictions = clf.fit(X_train, X_test, y_train, y_test)
models

100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [00:39<00:00,  1.38s/it]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LinearDiscriminantAnalysis,0.84,0.76,,0.83,0.19
XGBClassifier,0.86,0.76,,0.86,6.13
LogisticRegression,0.84,0.76,,0.84,0.25
GaussianNB,0.76,0.76,,0.76,0.06
LinearSVC,0.84,0.75,,0.84,3.19
BernoulliNB,0.74,0.75,,0.74,0.06
LGBMClassifier,0.86,0.74,,0.85,3.43
Perceptron,0.8,0.74,,0.8,0.09
SGDClassifier,0.82,0.73,,0.81,0.34
SVC,0.85,0.73,,0.85,1.83


In [50]:
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(Y), y=Y)
weights = dict()
for i in range(4):
    weights[i] = class_weights[i]

In [52]:
skf = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)
f1_voting = []
f1_xgb = []

clf1 = xgboost.XGBClassifier()
clf2 = RandomForestClassifier(n_estimators=200, min_samples_split=5, bootstrap=False, class_weight=weights)
clf3 = lightgbm.LGBMClassifier()
clf4 = ExtraTreesClassifier()
clf5 = catboost.CatBoostClassifier()

for train_index, test_index in skf.split(X, Y):
    clf = xgboost.XGBClassifier()
    
    eclf1 = VotingClassifier(estimators=[
    ('xgb', clf1), ('rf', clf2), ('lgbm', clf3), ('extra_trees', clf4), ('cat', clf5)], voting='hard')
    
    eclf1.fit(X[train_index], Y[train_index])
    y_pred = eclf1.predict(X[test_index])
    f1_voting.append(f1_score(Y[test_index], y_pred, average='micro'))
    
    clf.fit(X[train_index], Y[train_index])
    y_pred = clf.predict(X[test_index])
    f1_xgb.append(f1_score(Y[test_index], y_pred, average='micro'))

Learning rate set to 0.08498
0:	learn: 1.2592914	total: 228ms	remaining: 3m 47s
1:	learn: 1.1640907	total: 309ms	remaining: 2m 34s
2:	learn: 1.0858235	total: 387ms	remaining: 2m 8s
3:	learn: 1.0199770	total: 458ms	remaining: 1m 54s
4:	learn: 0.9578884	total: 518ms	remaining: 1m 43s
5:	learn: 0.9103258	total: 580ms	remaining: 1m 36s
6:	learn: 0.8717111	total: 650ms	remaining: 1m 32s
7:	learn: 0.8331329	total: 709ms	remaining: 1m 27s
8:	learn: 0.7974592	total: 772ms	remaining: 1m 25s
9:	learn: 0.7681228	total: 834ms	remaining: 1m 22s
10:	learn: 0.7407277	total: 894ms	remaining: 1m 20s
11:	learn: 0.7150302	total: 951ms	remaining: 1m 18s
12:	learn: 0.6933765	total: 1s	remaining: 1m 16s
13:	learn: 0.6719452	total: 1.06s	remaining: 1m 14s
14:	learn: 0.6527331	total: 1.12s	remaining: 1m 13s
15:	learn: 0.6359532	total: 1.18s	remaining: 1m 12s
16:	learn: 0.6191035	total: 1.24s	remaining: 1m 11s
17:	learn: 0.6048023	total: 1.3s	remaining: 1m 10s
18:	learn: 0.5911021	total: 1.36s	remaining: 1m 10

162:	learn: 0.2723739	total: 10.3s	remaining: 52.8s
163:	learn: 0.2713895	total: 10.3s	remaining: 52.7s
164:	learn: 0.2707324	total: 10.4s	remaining: 52.6s
165:	learn: 0.2697375	total: 10.4s	remaining: 52.5s
166:	learn: 0.2685829	total: 10.5s	remaining: 52.4s
167:	learn: 0.2678040	total: 10.6s	remaining: 52.3s
168:	learn: 0.2673438	total: 10.6s	remaining: 52.2s
169:	learn: 0.2668786	total: 10.7s	remaining: 52.1s
170:	learn: 0.2665425	total: 10.7s	remaining: 52s
171:	learn: 0.2660300	total: 10.8s	remaining: 51.9s
172:	learn: 0.2653990	total: 10.8s	remaining: 51.8s
173:	learn: 0.2645632	total: 10.9s	remaining: 51.7s
174:	learn: 0.2642427	total: 11s	remaining: 51.6s
175:	learn: 0.2633785	total: 11s	remaining: 51.5s
176:	learn: 0.2628056	total: 11.1s	remaining: 51.4s
177:	learn: 0.2621563	total: 11.1s	remaining: 51.4s
178:	learn: 0.2617820	total: 11.2s	remaining: 51.3s
179:	learn: 0.2610830	total: 11.2s	remaining: 51.2s
180:	learn: 0.2603841	total: 11.3s	remaining: 51.1s
181:	learn: 0.2592

322:	learn: 0.1904034	total: 20.4s	remaining: 42.9s
323:	learn: 0.1897386	total: 20.5s	remaining: 42.8s
324:	learn: 0.1893284	total: 20.6s	remaining: 42.7s
325:	learn: 0.1889315	total: 20.6s	remaining: 42.6s
326:	learn: 0.1885681	total: 20.7s	remaining: 42.6s
327:	learn: 0.1881047	total: 20.7s	remaining: 42.5s
328:	learn: 0.1878188	total: 20.8s	remaining: 42.4s
329:	learn: 0.1874020	total: 20.9s	remaining: 42.4s
330:	learn: 0.1870796	total: 20.9s	remaining: 42.3s
331:	learn: 0.1867699	total: 21s	remaining: 42.2s
332:	learn: 0.1862253	total: 21s	remaining: 42.1s
333:	learn: 0.1860167	total: 21.1s	remaining: 42.1s
334:	learn: 0.1857542	total: 21.2s	remaining: 42s
335:	learn: 0.1853758	total: 21.2s	remaining: 41.9s
336:	learn: 0.1850339	total: 21.3s	remaining: 41.9s
337:	learn: 0.1844942	total: 21.3s	remaining: 41.8s
338:	learn: 0.1840197	total: 21.4s	remaining: 41.7s
339:	learn: 0.1836125	total: 21.5s	remaining: 41.7s
340:	learn: 0.1832834	total: 21.5s	remaining: 41.6s
341:	learn: 0.1831

482:	learn: 0.1408360	total: 30s	remaining: 32.2s
483:	learn: 0.1404669	total: 30.1s	remaining: 32.1s
484:	learn: 0.1402061	total: 30.2s	remaining: 32s
485:	learn: 0.1397579	total: 30.2s	remaining: 32s
486:	learn: 0.1394973	total: 30.3s	remaining: 31.9s
487:	learn: 0.1391987	total: 30.3s	remaining: 31.8s
488:	learn: 0.1389170	total: 30.4s	remaining: 31.8s
489:	learn: 0.1387638	total: 30.5s	remaining: 31.7s
490:	learn: 0.1385856	total: 30.5s	remaining: 31.6s
491:	learn: 0.1382957	total: 30.6s	remaining: 31.6s
492:	learn: 0.1381248	total: 30.6s	remaining: 31.5s
493:	learn: 0.1380357	total: 30.7s	remaining: 31.4s
494:	learn: 0.1378250	total: 30.8s	remaining: 31.4s
495:	learn: 0.1377103	total: 30.8s	remaining: 31.3s
496:	learn: 0.1374422	total: 30.9s	remaining: 31.2s
497:	learn: 0.1372330	total: 30.9s	remaining: 31.2s
498:	learn: 0.1369927	total: 31s	remaining: 31.1s
499:	learn: 0.1368950	total: 31s	remaining: 31s
500:	learn: 0.1367423	total: 31.1s	remaining: 31s
501:	learn: 0.1364639	tota

643:	learn: 0.1083281	total: 40s	remaining: 22.1s
644:	learn: 0.1079582	total: 40s	remaining: 22s
645:	learn: 0.1078163	total: 40.1s	remaining: 22s
646:	learn: 0.1077366	total: 40.2s	remaining: 21.9s
647:	learn: 0.1075200	total: 40.3s	remaining: 21.9s
648:	learn: 0.1073811	total: 40.3s	remaining: 21.8s
649:	learn: 0.1072795	total: 40.4s	remaining: 21.7s
650:	learn: 0.1071527	total: 40.4s	remaining: 21.7s
651:	learn: 0.1069627	total: 40.5s	remaining: 21.6s
652:	learn: 0.1068469	total: 40.6s	remaining: 21.6s
653:	learn: 0.1067426	total: 40.6s	remaining: 21.5s
654:	learn: 0.1065760	total: 40.7s	remaining: 21.4s
655:	learn: 0.1063960	total: 40.8s	remaining: 21.4s
656:	learn: 0.1061268	total: 40.8s	remaining: 21.3s
657:	learn: 0.1057686	total: 40.9s	remaining: 21.3s
658:	learn: 0.1057236	total: 40.9s	remaining: 21.2s
659:	learn: 0.1055336	total: 41s	remaining: 21.1s
660:	learn: 0.1053316	total: 41.1s	remaining: 21.1s
661:	learn: 0.1052580	total: 41.1s	remaining: 21s
662:	learn: 0.1050139	to

803:	learn: 0.0856478	total: 49.7s	remaining: 12.1s
804:	learn: 0.0855258	total: 49.7s	remaining: 12s
805:	learn: 0.0853535	total: 49.8s	remaining: 12s
806:	learn: 0.0852065	total: 49.8s	remaining: 11.9s
807:	learn: 0.0850757	total: 49.9s	remaining: 11.9s
808:	learn: 0.0849332	total: 50s	remaining: 11.8s
809:	learn: 0.0848326	total: 50s	remaining: 11.7s
810:	learn: 0.0847207	total: 50.1s	remaining: 11.7s
811:	learn: 0.0845876	total: 50.1s	remaining: 11.6s
812:	learn: 0.0844881	total: 50.2s	remaining: 11.5s
813:	learn: 0.0843576	total: 50.3s	remaining: 11.5s
814:	learn: 0.0841561	total: 50.3s	remaining: 11.4s
815:	learn: 0.0839965	total: 50.4s	remaining: 11.4s
816:	learn: 0.0839145	total: 50.4s	remaining: 11.3s
817:	learn: 0.0838608	total: 50.5s	remaining: 11.2s
818:	learn: 0.0837177	total: 50.6s	remaining: 11.2s
819:	learn: 0.0836318	total: 50.6s	remaining: 11.1s
820:	learn: 0.0834710	total: 50.7s	remaining: 11s
821:	learn: 0.0833775	total: 50.7s	remaining: 11s
822:	learn: 0.0833140	to

962:	learn: 0.0685120	total: 59.2s	remaining: 2.27s
963:	learn: 0.0684380	total: 59.2s	remaining: 2.21s
964:	learn: 0.0683682	total: 59.3s	remaining: 2.15s
965:	learn: 0.0683066	total: 59.3s	remaining: 2.09s
966:	learn: 0.0682128	total: 59.4s	remaining: 2.03s
967:	learn: 0.0680916	total: 59.5s	remaining: 1.97s
968:	learn: 0.0679818	total: 59.5s	remaining: 1.9s
969:	learn: 0.0678835	total: 59.6s	remaining: 1.84s
970:	learn: 0.0678211	total: 59.6s	remaining: 1.78s
971:	learn: 0.0677288	total: 59.7s	remaining: 1.72s
972:	learn: 0.0676406	total: 59.7s	remaining: 1.66s
973:	learn: 0.0675762	total: 59.8s	remaining: 1.6s
974:	learn: 0.0675034	total: 59.9s	remaining: 1.53s
975:	learn: 0.0673579	total: 59.9s	remaining: 1.47s
976:	learn: 0.0672578	total: 60s	remaining: 1.41s
977:	learn: 0.0671858	total: 1m	remaining: 1.35s
978:	learn: 0.0670967	total: 1m	remaining: 1.29s
979:	learn: 0.0670014	total: 1m	remaining: 1.23s
980:	learn: 0.0669277	total: 1m	remaining: 1.17s
981:	learn: 0.0668419	total:

ValueError: could not broadcast input array from shape (1014,1) into shape (1014,)

In [25]:
# voting classifier against default xgb
print("Ours | Average: ", np.mean(f1_voting), "Std: ", np.std(f1_voting))
print("Theirs | Average: ", np.mean(f1_xgb), "Std: ", np.std(f1_xgb))

Ours | Average:  0.8560996980087268 Std:  0.004425082193263558
Theirs | Average:  0.8553105486661566 Std:  0.004285067992494936


In [53]:
Y.shape

(5066,)