In [42]:
from sklearn.metrics import f1_score
import pandas as pd
from sklearn.preprocessing import RobustScaler
import numpy as np
import xgboost
import lightgbm
import catboost
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, ExtraTreesClassifier
from sklearn.svm import LinearSVC
from pyod.models.ecod import ECOD
from sklearn.utils import class_weight
from sklearn.model_selection import StratifiedKFold

In [43]:
X = pd.read_csv("data/X_train_features_final.csv", index_col=0).to_numpy()
X_last = pd.read_csv("data/X_train_features.csv", index_col=0).to_numpy()
df_X_test = pd.read_csv("data/X_test_features_final.csv", index_col=0)
Y = pd.read_csv("data/y_train.csv", index_col="id").to_numpy().squeeze()

Class Weights

In [44]:
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(Y), y=Y)

In [45]:
weights = dict()
for i in range(4):
    weights[i] = class_weights[i]

Simple Outlier Removal

In [39]:
for i in range(3):
    class_i = np.squeeze(Y==i)
    ids = np.where(class_i)[0]
    X_c = X[class_i]
    clf = ECOD(contamination=0.01)
    clf.fit(X_c)
    preds = clf.predict(X_c)
    outlier_ids = ids[preds == 1] # minus 1 for isolation forest
    Y = np.delete(Y, outlier_ids, axis=0)
    X = np.delete(X, outlier_ids, axis=0)
    print(outlier_ids.shape[0])

31
5
15


In [46]:
clf1 = xgboost.XGBClassifier(use_label_encoder=False,n_estimators=700, max_depth=8, learning_rate=0.05)
clf2 = RandomForestClassifier(n_estimators=270, min_samples_split=8, bootstrap=False, class_weight=weights)
clf3 = lightgbm.LGBMClassifier(n_estimators=200, max_depth=8, learning_rate=0.1)
clf4 = ExtraTreesClassifier()
# clf5 = catboost.CatBoostClassifier(logging_level='Silent')

eclf1 = VotingClassifier(estimators=[
    ('xgb', clf1), ('rf', clf2), ('lgbm', clf3), ('extra_trees', clf4)], voting='hard')



In [47]:
skf = StratifiedKFold(n_splits=5)
f1_base = []
f1_optim = []

for train_index, test_index in skf.split(X, Y):
    clf = xgboost.XGBClassifier()
    
    clf.fit(X_last[train_index], Y[train_index])
    y_pred = clf.predict(X_last[test_index])
    f1_base.append(f1_score(Y[test_index], y_pred, average='micro'))
    
    eclf1.fit(X[train_index], Y[train_index])
    y_pred = eclf1.predict(X[test_index])
    f1_optim.append(f1_score(Y[test_index], y_pred, average='micro'))



In [48]:
print(f1_base)

[0.8564453125, 0.8369140625, 0.8563049853372434, 0.8299120234604106, 0.8494623655913978]


In [49]:
print(f1_optim)

[0.8486328125, 0.8544921875, 0.8543499511241447, 0.8514173998044967, 0.855327468230694]


In [52]:
print(np.std(f1_optim))

0.0024868177877665734


In [77]:
eclf1.fit(X, Y)
y_pred = eclf1.predict(X_test)

In [79]:
y_pred.shape

(3411,)

In [80]:
ids = np.arange(y_pred.shape[0])

In [81]:
pd.DataFrame({"id": ids, "y": y_pred}).to_csv("sub_voting_class.csv", index=False)