In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import shap
import gc
from random import shuffle
from sklearn.preprocessing import LabelEncoder

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('../input/')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# Display the installed xgboost version as this cell's output.
import xgboost
xgboost.__version__

In [None]:
# Read the competition's train and test tables into DataFrames.
train_csv = '../input/tabular-playground-series-jul-2021/train.csv'
test_csv = '../input/tabular-playground-series-jul-2021/test.csv'
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Preview the first rows of the test table.
test.head()

In [None]:
# Feature names used for both tables: every test column except the first one
# (presumably an id/timestamp column — TODO confirm). Because the list comes
# from `test`, columns that exist only in `train` are never selected.
columns = test.columns[1:]
columns

In [None]:
# Source label for each row: 1.0 for rows taken from `train`, 0.0 for rows
# taken from `test`, laid out in that order (train block first).
n_train_rows = train.shape[0]
n_test_rows = test.shape[0]
target = np.concatenate([np.ones(n_train_rows), np.zeros(n_test_rows)])

In [None]:
# Stack the feature columns of both tables row-wise, train rows first, so the
# row order matches the train-then-test layout of `target`.
train_features = train[columns].to_numpy()
test_features = test[columns].to_numpy()
train_test = np.concatenate([train_features, test_features], axis=0)

In [None]:
# Shuffle the stacked rows and their source labels with one shared permutation
# so that features and labels stay aligned.
# The permutation is seeded so the notebook gives the same row order on every
# fresh run (the previous `random.shuffle` call was unseeded).
SHUFFLE_SEED = 42
index = np.random.default_rng(SHUFFLE_SEED).permutation(train_test.shape[0])
train_test = train_test[index, :]
target = target[index]
# `np.float` was only an alias of the builtin `float`; it was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
# `np.float64` is the same dtype and works on every NumPy version.
train_test = train_test.astype(np.float64)

In [None]:
# Hold out 33% of the rows for validation (fixed random_state), then wrap both
# parts in DMatrix objects for xgboost.
# NOTE: this rebinds `train` and `test`. From here on `test` is the validation
# feature array and `train` is a DMatrix; neither is the DataFrame loaded from
# CSV any more, so earlier cells that expect DataFrames cannot be re-run
# without reloading the data first.
train, test, y_train, y_test = train_test_split(train_test, target, test_size=0.33, random_state=42)
train = xgboost.DMatrix(train, label=y_train)
val = xgboost.DMatrix(test, label=y_test)

In [None]:
%%time
param = {
    'eta': 0.02,
    'max_depth': 5,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'reg:logistic',
    'eval_metric': 'auc',
    'tree_method': 'hist', 
    'predictor': 'cpu_predictor'
}
clf = xgboost.train(param, train, 2500)

In [None]:
# Score the held-out rows. The label is 1 for rows that came from the original
# train table and 0 for rows from the test table, so this AUC measures how
# well the model can tell the two sources apart.
preds = clf.predict(val)
roc_auc_score(y_test, preds)


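This is a rather high AUC: a score near 0.5 would mean the classifier cannot tell train rows from test rows, so a score well above that indicates a significant distribution shift in at least some of the variables. Let us look at which variables differ the most between the train and test sets.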

In [None]:
%%time
# Ask the booster for per-row feature contributions (SHAP values) on the
# validation rows instead of plain predictions. Per the XGBoost docs the
# result has one column per feature plus a final bias column.
shap_preds = clf.predict(val, pred_contribs=True) 

In [None]:
# SHAP summary plot of the feature contributions on the validation rows.
# The last column of `shap_preds` is left out (per the XGBoost docs it is the
# bias term, not a feature). At this point `test` is the validation feature
# array produced by train_test_split, so its rows line up with `shap_preds`.
feature_contribs = shap_preds[:, :-1]
val_features = pd.DataFrame(test, columns=columns)
shap.summary_plot(feature_contribs, val_features)

It looks like the Sensor 3 data differs the most between the train and test sets.