In [1]:
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

sns.set()

%matplotlib inline

In [2]:
def reduce_mem_usage(df):
    NAlist = [] # Keeps track of columns that have missing values filled in. 
    for col in tqdm(df.columns):
        if df[col].dtype != object:  # Exclude strings

            # make variables for Int, max and min
            IsInt = False
            col_max_value = df[col].max()
            col_min_value = df[col].min()

            # Integer does not support NA, therefore, NA needs to be filled
            if not np.isfinite(df[col]).all(): 
                NAlist.append(col)
                df[col].fillna(col_min_value - 1, inplace=True)

            # test if column can be converted to an integer
            col_as_int = df[col].fillna(0).astype(np.int64)
            diff = (df[col] - col_as_int)
            diff = diff.sum()
            if np.abs(diff) < 0.01:
                IsInt = True

            # Make Integer/unsigned Integer datatypes
            if IsInt:
                if col_min_value >= 0:
                    if col_max_value < 255:
                        df[col] = df[col].astype(np.uint8)
                    elif col_max_value < 65535:
                        df[col] = df[col].astype(np.uint16)
                    elif col_max_value < 4294967295:
                        df[col] = df[col].astype(np.uint32)
                    else:
                        df[col] = df[col].astype(np.uint64)
                else:
                    if col_min_value > np.iinfo(np.int8).min and col_max_value < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif col_min_value > np.iinfo(np.int16).min and col_max_value < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif col_min_value > np.iinfo(np.int32).min and col_max_value < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif col_min_value > np.iinfo(np.int64).min and col_max_value < np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)    

            # Make float datatypes 32 bit
            else:
                df[col] = df[col].astype(np.float32)

    return df, NAlist

In [3]:
INPUT_DIR = '.'

train_transaction = pd.read_csv('train_transaction.csv')
train_identity = pd.read_csv('train_identity.csv')
test_transaction = pd.read_csv('test_transaction.csv')
test_identity = pd.read_csv('test_identity.csv')
sample_submission = pd.read_csv('sample_submission.csv')

df_train = train_transaction.merge(train_identity, how='left', on='TransactionID')
del train_transaction, train_identity
df_train, df_train_NAlist = reduce_mem_usage(df_train)

df_test = test_transaction.merge(test_identity, how='left', on='TransactionID')
del test_transaction, test_identity
df_test, df_test_NAlist = reduce_mem_usage(df_test)

100%|██████████| 434/434 [02:19<00:00,  3.11it/s]
100%|██████████| 433/433 [00:54<00:00,  7.98it/s]


В данных есть пропуски:

In [4]:
print('Missing data in train: {:.5f}%'.format(df_train.isnull().sum().sum() / (df_train.shape[0] * df_train.shape[1]) * 100))
print('Missing data in test: {:.5f}%'.format(df_test.isnull().sum().sum() / (df_test.shape[0] * df_test.shape[1]) * 100))

Missing data in train: 4.47002%
Missing data in test: 4.33051%


Заполним пропуски в столбцах, где значения выражаются числами - `-1`, а где строками - `'unseen_category'`.

In [5]:
for col in df_train.columns.drop('isFraud'):
    if df_train[col].dtype == 'O':
        df_train[col] = df_train[col].fillna('unseen_category')
        df_test[col] = df_test[col].fillna('unseen_category')
    else:
        df_train[col] = df_train[col].fillna(-1)
        df_test[col] = df_test[col].fillna(-1)

print('Missing data in train: {:.5f}%'.format(df_train.isnull().sum().sum() / (df_train.shape[0] * df_train.shape[1]) * 100))
print('Missing data in test: {:.5f}%'.format(df_test.isnull().sum().sum() / (df_test.shape[0] * df_test.shape[1]) * 100))

Missing data in train: 0.00000%
Missing data in test: 0.00000%


In [6]:
df_train.info(), df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 417559 entries, 0 to 417558
Columns: 434 entries, TransactionID to DeviceInfo
dtypes: float32(80), int16(7), int8(9), object(31), uint16(40), uint32(3), uint8(264)
memory usage: 380.3+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 172981 entries, 0 to 172980
Columns: 433 entries, TransactionID to DeviceInfo
dtypes: float32(78), int16(6), int8(9), object(31), uint16(57), uint32(3), uint8(249)
memory usage: 159.0+ MB


(None, None)

Закодируем категориальные признаки с помощью [`LabelEncoder`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html) и сконвертируем их в [`category`](https://pandas.pydata.org/pandas-docs/version/0.23.4/categorical.html).

In [7]:
for col in tqdm(df_train.columns.drop('isFraud')):
    if df_train[col].dtype == 'O':
        le = LabelEncoder()
        le.fit(list(df_train[col]) + list(df_test[col]))
        df_train[col] = le.transform(df_train[col])
        df_test[col] = le.transform(df_test[col])
        
        df_train[col] = df_train[col].astype('category')
        df_test[col] = df_test[col].astype('category')

df_train.info(), df_test.info()

100%|██████████| 433/433 [00:13<00:00, 33.12it/s]

<class 'pandas.core.frame.DataFrame'>
Int64Index: 417559 entries, 0 to 417558
Columns: 434 entries, TransactionID to DeviceInfo
dtypes: category(31), float32(80), int16(7), int8(9), uint16(40), uint32(3), uint8(264)
memory usage: 294.8 MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 172981 entries, 0 to 172980
Columns: 433 entries, TransactionID to DeviceInfo
dtypes: category(31), float32(78), int16(6), int8(9), uint16(57), uint32(3), uint8(249)
memory usage: 123.6 MB





(None, None)

Перед тем, как обучать какие-либо модели, нужно настроить валидацию - для того, чтобы оценивать обобщающую способность без использования тестовой выборки. Несмотря на то, что модель можно применить к тестовой части и получить результат на лидерборде, отнюдь не факт, что такому результату можно доверять. Дело в том, что он рассчитывается по публичной части тестовой выборки - однако итоговый результат будет рассчитываться по приватной части после окончания соревнования. Распределение данных в публичной и приватной частях может различаться, и если вы будете оценивать качество только по результату на публичной части, может получиться так, что ваше итоговое место в соревновании будет сильно ниже того, что вы имели по ходу.

Однако бывают случаи, когда можно занять топовые места в соревновании и без настройки валидации, доверяя лишь результату на публичной части тестовой выборки. Пример: [описание решения победителя соревнования по детекции диабетической ретинопатии](https://www.kaggle.com/c/aptos2019-blindness-detection/discussion/108065).

Техники валидации могут быть очень разными - от разбиения на обучающую/валидационную часть до [разбиения на группы с перемешиванием](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GroupShuffleSplit.html#sklearn.model_selection.GroupShuffleSplit). Для кросс-валидации может быть полезно воспользоваться модулем [cross_validation](https://scikit-learn.org/stable/modules/cross_validation.html) из библиотеки `scikit-learn`.

Настройка корректной валидации очень зависит от поставленной задачи. Очевидно, в идеале нужно выбрать такой способ, который сможет отразить то, насколько хорошо модель справится с приватной частью тестовой выборки.

В ноутбуке с EDA было выдвинуто следующее предположение: согласно поведению признака `TransactionDT`, в обучающей части выборки содержатся данные за 4 месяца. Давайте настроим кросс-валидацию с 4 фолдами, где в каждом случае будем брать в качестве валидационной части тот или иной месяц.

In [8]:
(df_train['TransactionDT'].max() - df_train['TransactionDT'].min()) / (3600 * 24)

120.99996527777778

In [9]:
month_length = 3600 * 24 * 30
df_train['TransactionDT'].shape

(417559,)

In [10]:
fold0_idx = df_train[df_train['TransactionDT'] < df_train['TransactionDT'].min() + month_length].index
fold1_idx = df_train[(df_train['TransactionDT'].min() + month_length <= df_train['TransactionDT']) & (df_train['TransactionDT'] < df_train['TransactionDT'].min() + 2 * month_length)].index
fold2_idx = df_train[(df_train['TransactionDT'].min() + 2 * month_length <= df_train['TransactionDT']) & (df_train['TransactionDT'] < df_train['TransactionDT'].min() + 3 * month_length)].index
fold3_idx = df_train[df_train['TransactionDT'].min() + 3 * month_length <= df_train['TransactionDT']].index
print('Validation set 0 length:', len(fold0_idx))
print('Validation set 1 length:', len(fold1_idx))
print('Validation set 2 length:', len(fold2_idx))
print('Validation set 3 length:', len(fold3_idx))

Validation set 0 length: 134339
Validation set 1 length: 89399
Validation set 2 length: 92189
Validation set 3 length: 101632


In [11]:
folds_idx = [fold0_idx, fold1_idx, fold2_idx, fold3_idx]

In [12]:
pd.unique(df_train['ProductCD'])

[4, 1, 0, 3, 2]
Categories (5, int64): [4, 1, 0, 3, 2]

В данных есть признак-идентификатор объекта - `'TransactionID'`. Заметим, что его значения в обучающей и тестовых выборках не пересекаются:

In [13]:
set(df_train['TransactionID']).intersection(set(df_test['TransactionID']))

set()

Также не пересекаются значения признака, отвечающего за момент времени - `'TransactionDT'`:

In [14]:
set(df_train['TransactionDT']).intersection(set(df_test['TransactionDT']))

set()

В связи с этим удалим эти признаки, чтобы модель их не учитывала.

In [15]:
df_train.drop(['TransactionID', 'TransactionDT'], axis=1, inplace=True)
df_test.drop(['TransactionID', 'TransactionDT'], axis=1, inplace=True)
df_train.shape, df_test.shape

((417559, 432), (172981, 431))

Обучимся с помощью [`lightgbm`](https://lightgbm.readthedocs.io/en/latest/), и для каждой модели сделаем предсказание на тестовой выборке. Также будем сохранять важности признаков на каждом фолде.


In [111]:
%%time

params = {
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.01,
    'num_leaves': 2**8,
    'max_bin': 255,
    'max_depth': -1,
    'bagging_freq': 5,
    'bagging_fraction': 0.7,
    'feature_fraction': 0.7,
    'first_metric_only': True,
    'verbose': 100,
    'n_jobs': -1}

scores = []

feature_importances = pd.DataFrame()
feature_importances['feature'] = df_train.columns.drop('isFraud')

test_preds = []
for i in range(len(folds_idx)):
    X_train = df_train.drop(folds_idx[i], axis=0)
    y_train = X_train['isFraud'].values
    X_val = df_train.iloc[folds_idx[i]]
    y_val = X_val['isFraud'].values
    X_train = X_train.drop('isFraud', axis=1)
    X_val = X_val.drop('isFraud', axis=1)

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)
    lgb_model = lgb.train(params, lgb_train, valid_sets=lgb_eval, verbose_eval=100)
    
    feature_importances['fold_{}'.format(i)] = lgb_model.feature_importance()

    y_pred = lgb_model.predict(X_val)
    score_fold = roc_auc_score(y_val, y_pred)
    scores.append(score_fold)
    y_test_pred = lgb_model.predict(df_test)
    test_preds.append(y_test_pred)
    

for i in range(len(scores)):
    print('Fold {}, AUC-ROC: {:.5f}'.format(i, scores[i]))
print('CV AUC-ROC: {:.5f}'.format(np.mean(scores)))

[LightGBM] [Info] Number of positive: 11320, number of negative: 271900
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.859442
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.764757
[LightGBM] [Debug] init for col-wise cost 0.285141 seconds, init for row-wise cost 0.316950 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 31468
[LightGBM] [Info] Number of data points in the train set: 283220, number of used features: 429
[LightGBM] [Debug] Use subset for bagging




[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.039969 -> initscore=-3.178863
[LightGBM] [Info] Start training from score -3.178863
[LightGBM] [Debug] Re-bagging, using 198290 data to train
[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 21
[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 21
[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 20
[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 19
[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 20
[LightGBM] [Debug] Re-bagging, using 198503 data to train
[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 22
[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 20
[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 20
[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 19
[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 20
[LightGBM] [Debug] Re-bagging, using 198

[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 23
[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 23
[100]	valid_0's auc: 0.869001
[LightGBM] [Info] Number of positive: 11144, number of negative: 317016
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.798186
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.683473
[LightGBM] [Debug] init for col-wise cost 0.298739 seconds, init for row-wise cost 0.601440 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 34278
[LightGBM] [Info] Number of data points in the train set: 328160, number of used features: 429
[LightGBM] [Debug] Use subset for bagging




[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.033959 -> initscore=-3.348051
[LightGBM] [Info] Start training from score -3.348051
[LightGBM] [Debug] Re-bagging, using 229717 data to train
[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 17
[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 21
[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 16
[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 19
[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 18
[LightGBM] [Debug] Re-bagging, using 229724 data to train
[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 17
[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 21
[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 18
[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 18
[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 21
[LightGBM] [Debug] Re-bagging, using 229

[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 21
[100]	valid_0's auc: 0.892472
[LightGBM] [Info] Number of positive: 10997, number of negative: 314373
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.798412
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.683394
[LightGBM] [Debug] init for col-wise cost 0.393850 seconds, init for row-wise cost 0.521597 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 34304
[LightGBM] [Info] Number of data points in the train set: 325370, number of used features: 429
[LightGBM] [Debug] Use subset for bagging




[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.033798 -> initscore=-3.352958
[LightGBM] [Info] Start training from score -3.352958
[LightGBM] [Debug] Re-bagging, using 227769 data to train
[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 18
[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 22
[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 16
[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 18
[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 19
[LightGBM] [Debug] Re-bagging, using 227759 data to train
[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 17
[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 19
[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 18
[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 16
[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 20
[LightGBM] [Debug] Re-bagging, using 228

[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 20
[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 22
[100]	valid_0's auc: 0.906273
[LightGBM] [Info] Number of positive: 10702, number of negative: 305225
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.794512
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.679682
[LightGBM] [Debug] init for col-wise cost 0.357781 seconds, init for row-wise cost 0.890425 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 34373
[LightGBM] [Info] Number of data points in the train set: 315927, number of used features: 429
[LightGBM] [Debug] Use subset for bagging




[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.033875 -> initscore=-3.350619
[LightGBM] [Info] Start training from score -3.350619
[LightGBM] [Debug] Re-bagging, using 221173 data to train
[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 22
[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 25
[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 21
[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 22
[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 19
[LightGBM] [Debug] Re-bagging, using 221209 data to train
[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 22
[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 21
[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 24
[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 21
[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 19
[LightGBM] [Debug] Re-bagging, using 221

[LightGBM] [Debug] Trained a tree with leaves = 256 and max_depth = 20
[100]	valid_0's auc: 0.889044
Fold 0, AUC-ROC: 0.86900
Fold 1, AUC-ROC: 0.89247
Fold 2, AUC-ROC: 0.90627
Fold 3, AUC-ROC: 0.88904
CV AUC-ROC: 0.88920
CPU times: user 9min 58s, sys: 26.1 s, total: 10min 24s
Wall time: 2min 3s


In [113]:
test_preds

[array([0.11373135, 0.12434962, 0.27214846, ..., 0.11739914, 0.12829628,
        0.14941736]),
 array([0.1118137 , 0.10021471, 0.25503352, ..., 0.07368692, 0.09892523,
        0.08978063]),
 array([0.092617  , 0.0765542 , 0.19079323, ..., 0.06326261, 0.06008122,
        0.07328543]),
 array([0.23238787, 0.12270289, 0.33467485, ..., 0.10666832, 0.10565928,
        0.13919368])]

In [83]:
import xgboost as xgb
dtrain = xgb.DMatrix(X_train, y_train, enable_categorical = True)
dtest = xgb.DMatrix(X_val, y_val, enable_categorical = True)
# specify parameters via map
param = {'max_depth':12, 'eval_metric':'auc', 'learning_rate':0.02, 'n_estimators':5000}
num_round = 2
bst = xgb.train(param, dtrain, num_round)

Parameters: { missing, n_estimators, subsample=0.8 } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [114]:
# make prediction
xgb_preds = bst.predict(xgb.DMatrix(df_test, enable_categorical = True))

Для получения итогового предсказания на тестовой выборке усредним предсказания моделей с разных фолдов.

In [115]:
np.array(test_preds).shape

(4, 172981)

In [116]:
np.vstack([np.array(test_preds), xgb_preds]).shape

(5, 172981)

In [131]:
final_pred = 1*np.average(test_preds, axis=0)+0*xgb_preds
final_pred

array([0.13763748, 0.10595536, 0.26316252, ..., 0.09025425, 0.0982405 ,
       0.11291928])

In [123]:
sub = pd.DataFrame({'TransactionID': sample_submission['TransactionID'], 'isFraud': final_pred})
sub.head()

Unnamed: 0,TransactionID,isFraud
0,3404559,0.149195
1,3404560,0.117672
2,3404561,0.270899
3,3404562,0.184097
4,3404563,0.126944


In [124]:
sub.tail()

Unnamed: 0,TransactionID,isFraud
172976,3577535,0.103903
172977,3577536,0.109705
172978,3577537,0.102488
172979,3577538,0.109888
172980,3577539,0.124211


Сохраняем файл с предсказаниями - теперь его можно отправить в соревнование и посмотреть результат на публичной части.

In [125]:
sub.to_csv('submission_xgb4.csv', index=False)

Наконец, построим распределение предсказаний для целевой переменной на тестовой выборке.

In [None]:
plt.figure(figsize=(11, 8))
plt.hist(sub['isFraud'], bins=30)
plt.title('Distribution of isFraud prediction on test')
plt.show()

In [61]:
X_val.select_dtypes(include=['category']).columns

Index(['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain', 'M1',
       'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'id_12', 'id_15',
       'id_16', 'id_23', 'id_27', 'id_28', 'id_29', 'id_30', 'id_31', 'id_33',
       'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType',
       'DeviceInfo'],
      dtype='object')

In [60]:
X_train.select_dtypes(include=['category']).columns

Index(['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain', 'M1',
       'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'id_12', 'id_15',
       'id_16', 'id_23', 'id_27', 'id_28', 'id_29', 'id_30', 'id_31', 'id_33',
       'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType',
       'DeviceInfo'],
      dtype='object')

In [43]:
cat_preds = model.predict(eval_data)

In [44]:
cat_preds.shape

(92189,)

In [46]:
roc_auc_score(np.array(y_val), cat_preds)

0.526616968437031