In [18]:
from sklearn.metrics import f1_score
import lazypredict
import pandas as pd
from sklearn.preprocessing import RobustScaler
import numpy as np
from sklearn.impute import SimpleImputer
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split
import lightgbm
import xgboost
import sklearn
import catboost
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import IsolationForest
#from pyod.models.ecod import ECOD
from sklearn.model_selection import StratifiedKFold
from sklearn.impute import KNNImputer

In [20]:
df_X_last = pd.read_csv("data/X_train_features.csv", index_col=0)
# df_X = pd.read_csv("data/X_train_features_new.csv", index_col=0)
df_X = pd.read_csv("data/X_train_features_knn_200.csv", index_col=0)
df_Y = pd.read_csv("data/y_train.csv", index_col="id")

Remove highly correlated features

In [21]:
def rm_corr(X):
    corr_matrix = X.corr().abs()

    # Select upper triangle of correlation matrix
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

    # Find features with correlation greater than 0.9
    to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
    print("Removed columns: ", len(to_drop))
    # Drop features 
    X.drop(to_drop, axis=1, inplace=True)
    # X_test.drop(to_drop, axis=1, inplace=True)
    
    return X

In [7]:
df_X = rm_corr(df_X)

Removed columns:  19


Normalize

In [10]:
df_X.replace([np.inf, -np.inf], np.nan, inplace=True)
transformer = RobustScaler()
X = transformer.fit_transform(df_X)

Impute missing values

In [7]:
imputer = KNNImputer(n_neighbors=200)
#imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
X = imputer.fit_transform(X)

In [8]:
X.shape

(5117, 204)

Simple Outlier Removal

In [311]:
Y = df_Y.to_numpy()
for i in range(3):
    class_i = np.squeeze(Y==i)
    ids = np.where(class_i)[0]
    X_c = X[class_i]
    # preds = IsolationForest(random_state=0, contamination = 0.06).fit_predict(X_c)
    clf = ECOD()
    clf.fit(X_c)
    preds = clf.predict(X_c)
    outlier_ids = ids[preds == 1] # minus 1 for isolation forest
    Y = np.delete(Y, outlier_ids, axis=0)
    X = np.delete(X, outlier_ids, axis=0)
    print(outlier_ids.shape[0])

303
45
148


In [9]:
X.shape

(5117, 111)

In [53]:
X = SelectKBest(f_classif, k=170).fit_transform(X, df_Y)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, df_Y,test_size=.5)

classifiers = [("LGBMClassifier", lightgbm.LGBMClassifier), ("XGBClassifier", xgboost.XGBClassifier), \
               ("RandomForestClassifier", sklearn.ensemble.RandomForestClassifier), \
                ("ExtraTreesClassifier", sklearn.ensemble.ExtraTreesClassifier)]
# catboost classifier takes very long but is similar in performance to these other classifiers
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None) # classifiers=classifiers
models,predictions = clf.fit(X_train, X_test, y_train, y_test)
models

100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [00:36<00:00,  1.25s/it]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LinearDiscriminantAnalysis,0.82,0.76,,0.82,0.13
XGBClassifier,0.84,0.74,,0.83,4.97
LGBMClassifier,0.84,0.74,,0.84,2.93
GaussianNB,0.76,0.73,,0.75,0.07
LinearSVC,0.82,0.72,,0.82,2.5
LogisticRegression,0.82,0.72,,0.82,0.19
NearestCentroid,0.75,0.72,,0.74,0.04
RandomForestClassifier,0.84,0.72,,0.83,2.98
SVC,0.83,0.72,,0.83,1.17
BernoulliNB,0.72,0.72,,0.72,0.05


In [15]:
Y = df_Y.to_numpy()
last = df_X_last.to_numpy()

In [16]:
skf = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)
f1_last = []
f1_ours = []

clf1 = xgboost.XGBClassifier()
clf2 = RandomForestClassifier()
clf3 = lightgbm.LGBMClassifier()

for train_index, test_index in skf.split(X, Y):
    clf = xgboost.XGBClassifier()
    
    eclf1 = VotingClassifier(estimators=[
    ('xgb', clf1), ('rf', clf2), ('lgbm', clf3)], voting='hard')
    
    eclf1.fit(X[train_index], Y[train_index])
    y_pred = eclf1.predict(X[test_index])
    f1_ours.append(f1_score(Y[test_index], y_pred, average='micro'))
    
    clf.fit(last[train_index], Y[train_index])
    y_pred = clf.predict(last[test_index])
    f1_last.append(f1_score(Y[test_index], y_pred, average='micro'))

In [17]:
# voting classifier
print("Ours | Average: ", np.mean(f1_ours), "Std: ", np.std(f1_ours))
print("Theirs | Average: ", np.mean(f1_last), "Std: ", np.std(f1_last))

Ours | Average:  0.8465907181695993 Std:  0.0045519911937564415
Theirs | Average:  0.8458077498778105 Std:  0.01066281107735506


In [65]:
# xgb
print("Ours | Average: ", np.mean(f1_ours), "Std: ", np.std(f1_ours))
print("Theirs | Average: ", np.mean(f1_last), "Std: ", np.std(f1_last))

Ours | Average:  0.840141396322092 Std:  0.006176959267631727
Theirs | Average:  0.8458077498778105 Std:  0.01066281107735506
