In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Show every column when rendering a DataFrame, and up to 1000 rows.
# Option names are given in full: pandas treats the name as a pattern, and the
# bare 'max_columns' matches more than one option on recent pandas versions
# (display.max_columns and styler.render.max_columns), which raises OptionError.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 1000)

# Introduction

This notebook evaluates a dataset of more than 2,500 features with a simple XGBoost classifier, using 5-fold cross-validation grouped by subject.

# Loading data

In [None]:
%%time

# Read, in order: the training feature table, the test feature table and the
# training labels. `map` is consumed left to right by the tuple unpacking, so
# the files are read in the same order as they are listed.
X, X_test, labels = map(pd.read_csv, (
    '../input/tps042022-2500-features/tps042022_train.csv',
    '../input/tps042022-2500-features/tps042022_test.csv',
    '../input/tabular-playground-series-apr-2022/train_labels.csv',
))

In [None]:
# Display the training feature table as the cell output.
X

In [None]:
# Display the test feature table as the cell output.
X_test

In [None]:
# Features and labels are paired purely by row position further down, so both
# tables must list the same sequences in the same order. The length check comes
# first so that a row-count mismatch fails as a clear assertion instead of
# depending on how NumPy compares arrays of different shapes.
assert (
    len(X) == len(labels)
    and np.all(X.sequence.to_numpy() == labels.sequence.to_numpy())
), "Rows of X and labels are not aligned on 'sequence'"

In [None]:
# Target vector: the `state` column of the labels table (row-aligned with X by the check above).
y = labels.state

# Group k-fold Split

In [None]:
from sklearn.model_selection import GroupKFold

# Materialise the 5 (train_index, validation_index) pairs up front. Grouping by
# `subject` keeps all rows of one subject on the same side of each split.
splits = list(GroupKFold(5).split(X, y, groups=X.subject))

# Training

In [None]:
import xgboost as xgb
from xgboost import XGBClassifier

# Display the installed XGBoost version in the cell output.
xgb.__version__

We use some reasonable but untuned hyperparameters.

In [None]:
# Keyword arguments forwarded to XGBClassifier for every fold (untuned).
params = dict(
    # Model / boosting settings
    colsample_bytree=0.5,
    learning_rate=0.05,
    # Logging
    verbosity=2,
    # GPU training and prediction
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    sampling_method='gradient_based',
    # Runtime and reproducibility
    n_jobs=-1,
    random_state=42,
    # Large cap on the number of trees; the training cell passes
    # early_stopping_rounds to fit().
    n_estimators=50000,
)

In [None]:
from sklearn.metrics import roc_auc_score

# Identifier columns: used for bookkeeping and grouping, never passed to the model.
id_cols = ['sequence', 'subject']

importance_scores = []  # one Series of total-gain feature importances per fold
aucs = []               # one validation ROC AUC per fold
test_preds = []         # one array of test-set probabilities (column 1 of predict_proba) per fold

# The test feature matrix does not depend on the fold, so build it once here
# instead of re-dropping the id columns from the full frame on every iteration.
X_test_features = X_test.drop(id_cols, axis=1)

for i, (train_index, test_index) in enumerate(splits):
    # Training and validation partitions for this fold.
    X_t = X.iloc[train_index].drop(id_cols, axis=1)
    y_t = y.iloc[train_index]
    X_v = X.iloc[test_index].drop(id_cols, axis=1)
    y_v = y.iloc[test_index]

    clf = XGBClassifier(**params, use_label_encoder=False)
    print(F'Split {i}:')
    # NOTE(review): with several eval sets and metrics, XGBoost is documented to
    # use the last of each for early stopping, i.e. 'auc' on the validation
    # fold here. Confirm for the installed version.
    clf.fit(X_t, y_t,
            eval_set=[(X_t, y_t), (X_v, y_v)], eval_metric=['logloss', 'error', 'auc'],
            early_stopping_rounds=200, verbose=1000)

    # Per-fold outputs: feature importances, validation AUC, test probabilities.
    importance_scores.append(pd.Series(clf.get_booster().get_score(importance_type='total_gain')))
    aucs.append(roc_auc_score(y_v, clf.predict_proba(X_v)[:, 1]))
    test_preds.append(clf.predict_proba(X_test_features)[:, 1])

Validation AUC across the folds (mean ± standard deviation):

In [None]:
# Report the cross-validated AUC as "mean ± standard deviation" over the folds.
auc_mean, auc_std = np.mean(aucs), np.std(aucs)
print(f'{auc_mean} \u00B1 {auc_std}')

Top 100 features ranked by XGBoost "total gain", averaged over the folds:

In [None]:
# Average the total gain of each feature over the folds.
#
# `get_score` only reports features that were used in at least one split, so the
# per-fold Series generally have different indexes. Aligning them into one frame
# and treating "not reported in this fold" as zero gain keeps every feature in
# the ranking; adding the Series directly would turn such features into NaN.
# Building a new object also leaves `importance_scores` unmodified, so this cell
# gives the same result when it is re-run.
fscores = pd.concat(importance_scores, axis=1).fillna(0).mean(axis=1)

N = 100  # number of top-ranked features to plot
top_features = fscores.sort_values(ascending=False).iloc[:N]

# 12 units of figure height per 50 bars, with a floor so that N < 50 still plots.
ax = top_features.plot(kind='barh', figsize=(18, max(N // 50, 1) * 12))
ax.invert_yaxis()  # most important feature at the top
ax.set(xlabel='mean total gain over folds', title=f'Top {N} features by total gain');

In [None]:
# Ensemble the folds by averaging their test-set probabilities element-wise,
# then write one row per test sequence to the submission file.
submission = pd.DataFrame({
    'sequence': X_test.sequence,
    'state': np.array(test_preds).mean(axis=0),
})
submission.to_csv('submission.csv', index=False)
print("Submission saved!")