In [1]:
import pandas as pd
from sklearn.preprocessing import RobustScaler
import numpy as np
import xgboost
import lightgbm
import catboost
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, ExtraTreesClassifier
from sklearn.svm import LinearSVC
from pyod.models.ecod import ECOD
from sklearn.utils import class_weight

In [2]:
# Labels are read first; the table is indexed by its "id" column.
df_Y = pd.read_csv("data/y_train.csv", index_col="id")
# Drop length-1 axes to get a plain label array (presumably one label column — verify).
Y = np.squeeze(df_Y.to_numpy())

# Pre-computed feature tables; the first CSV column is used as the index.
df_X = pd.read_csv("data/X_train_features_knn_150.csv", index_col=0)
df_X_test = pd.read_csv("data/X_test_features_knn_150.csv", index_col=0)

Remove highly correlated features

In [3]:
def rm_corr(X, X_test, threshold=0.9):
    """Drop features that are highly correlated with an earlier feature.

    The absolute pairwise correlation matrix is computed on ``X`` only. A
    column is dropped when its absolute correlation with any column that
    appears before it exceeds ``threshold``. The same columns are removed
    from ``X_test`` so both frames keep the same feature set.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features used to compute the correlations.
    X_test : pandas.DataFrame
        Test features; must contain every column selected for removal.
    threshold : float, default 0.9
        Absolute correlation above which a column is dropped.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_reduced, X_test_reduced)``. New frames are returned; the inputs
        are left unmodified so the calling cell can be re-run safely.

    Raises
    ------
    KeyError
        If ``X_test`` lacks one of the columns selected for removal.
    """
    corr_matrix = X.corr().abs()

    # Keep only the strict upper triangle: each pair is inspected once and the
    # first feature of a correlated pair is the one that survives.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)

    # NaN entries (masked cells) compare as False, so they never trigger a drop.
    exceeds = (upper > threshold).any(axis=0).to_numpy()
    to_drop = upper.columns[exceeds].tolist()
    print("Removed columns: ", len(to_drop))

    # Return fresh frames instead of dropping in place on the caller's objects.
    return X.drop(columns=to_drop), X_test.drop(columns=to_drop)

In [4]:
# Correlations are computed on the training frame; the same columns are dropped from both.
df_X, df_X_test = rm_corr(X=df_X, X_test=df_X_test)

Removed columns:  38


Normalize

In [5]:
# Fit the scaler on the training features only, then apply that same
# fitted scaling to both the training and the test features.
transformer = RobustScaler().fit(df_X)
X = transformer.transform(df_X)
X_test = transformer.transform(df_X_test)

In [6]:
# Persist the scaled feature matrices. They are wrapped in fresh DataFrames,
# so the files carry a default integer index and integer column labels.
train_features_final = pd.DataFrame(X)
train_features_final.to_csv("data/X_train_features_final.csv")

test_features_final = pd.DataFrame(X_test)
test_features_final.to_csv("data/X_test_features_final.csv")

Simple Outlier Removal

In [74]:
# Per-class outlier removal: fit one ECOD detector on the rows of each class
# (labels 0, 1 and 2) and delete the rows it flags from both X and Y.
# NOTE: X and Y are overwritten here, so re-running this cell removes
# additional rows each time.
for label in range(3):
    # Row positions of this class in the *current* X / Y (earlier deletions included).
    member_rows = np.flatnonzero(Y == label)
    class_features = X[member_rows]

    detector = ECOD(contamination=0.01)
    detector.fit(class_features)

    # ECOD marks outliers with 1 (an isolation forest would use -1 instead).
    flagged = detector.predict(class_features) == 1
    outlier_rows = member_rows[flagged]

    Y = np.delete(Y, outlier_rows, axis=0)
    X = np.delete(X, outlier_rows, axis=0)
    print(outlier_rows.shape[0])

31
5
15


In [75]:
# xgb best params {'learning_rate': 0.1, 'max_depth': 8, 'n_estimators': 600}

Class weights

In [76]:
# Balanced class weights, keyed by the class labels actually present in Y.
# Building the mapping from np.unique(Y) avoids assuming exactly four classes
# labelled 0..3: a fixed range(4) raises IndexError with fewer classes and
# silently ignores any additional ones.
classes = np.unique(Y)
class_weights = class_weight.compute_class_weight('balanced', classes=classes, y=Y)
# .tolist() turns the labels into plain Python scalars for use as dict keys.
weights = dict(zip(classes.tolist(), class_weights))

In [77]:
# Hard-voting ensemble of four tree-based classifiers.
# Fixes the unbalanced extra ")" on the XGBClassifier line, which made this
# cell a SyntaxError, and seeds every learner so a fresh
# Restart & Run All reproduces the same predictions.
SEED = 42

clf1 = xgboost.XGBClassifier(
    use_label_encoder=False,
    n_estimators=700,
    max_depth=8,
    learning_rate=0.05,
    random_state=SEED,
)
# Only the random forest receives the balanced class weights.
clf2 = RandomForestClassifier(
    n_estimators=200,
    min_samples_split=5,
    bootstrap=False,
    class_weight=weights,
    random_state=SEED,
)
clf3 = lightgbm.LGBMClassifier(
    n_estimators=200,
    max_depth=8,
    learning_rate=0.1,
    random_state=SEED,
)
clf4 = ExtraTreesClassifier(random_state=SEED)
# clf5 = catboost.CatBoostClassifier(logging_level='Silent')

# voting='hard': the predicted label is the majority vote over the four models.
eclf1 = VotingClassifier(
    estimators=[('xgb', clf1), ('rf', clf2), ('lgbm', clf3), ('extra_trees', clf4)],
    voting='hard',
)

eclf1.fit(X, Y)
y_pred = eclf1.predict(X_test)

In [79]:
# Sanity check: display the shape of the prediction array.
np.shape(y_pred)

(3411,)

In [80]:
# Submission ids are the consecutive integers 0 .. n_predictions - 1.
ids = np.arange(len(y_pred))

In [81]:
# Write the two-column submission file (id, predicted label) without the DataFrame index.
submission = pd.DataFrame({"id": ids, "y": y_pred})
submission.to_csv("sub_voting_class.csv", index=False)