In [1]:
import numpy as np
import pandas as pd
from xgboost import XGBRegressor

from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split

In [2]:
# Load the competition data; the first CSV column becomes the index.
train = pd.read_csv('../input/tabular-playground-series-may-2022/train.csv', index_col=0)
test = pd.read_csv('../input/tabular-playground-series-may-2022/test.csv', index_col=0)

for frame in (train, test):
    f27 = frame['f_27']
    # One integer column per character position 0..9 of f_27, expressed as the
    # character's offset from 'A' (so 'A' -> 0, 'B' -> 1, ...).
    # NOTE(review): assumes every f_27 string has at least 10 characters -- confirm.
    for pos in range(10):
        frame[f'ch{pos}'] = f27.str.get(pos).apply(ord) - ord('A')
    # Next feature is from https://www.kaggle.com/code/cabaxiom/tps-may-22-eda-lgbm-model
    # Number of distinct characters that occur in the f_27 string.
    frame["unique_characters"] = f27.apply(lambda text: len(set(text)))

# Model inputs: every test column except the raw string (and 'id', should it
# ever appear as a regular column instead of the index).
features = [col for col in test.columns if col not in ('id', 'f_27')]
test[features].head(2)

Unnamed: 0_level_0,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,...,ch1,ch2,ch3,ch4,ch5,ch6,ch7,ch8,ch9,unique_characters
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
900000,0.442517,0.17438,-0.999816,0.762741,0.186778,-1.074775,0.501888,6,6,0,...,0,0,0,1,0,3,11,0,2,5
900001,-0.605598,-0.305715,0.627667,-0.578898,-1.750931,1.35555,-0.190911,1,3,4,...,5,0,1,1,0,4,6,2,1,6


In [3]:
def reduce_mem_usage(df, verbose=True, allow_float16=True):
    """Downcast the columns of ``df`` in place to smaller dtypes.

    Columns are processed one by one:

    * NumPy integer columns (signed or unsigned, any width) are cast to the
      narrowest of int8/int16/int32/int64 whose range contains the column's
      min and max. If none fits (e.g. very large uint64 values) or the
      column is empty, the column is left unchanged.
    * NumPy float columns are cast to the narrowest of float16/float32 whose
      range contains the column's min and max, otherwise to float64. Only
      the *range* is checked, so the cast can lose precision.
    * Every other column (strings, booleans, extension dtypes, ...) is cast
      to ``category``.

    The function is idempotent: calling it again on its own output keeps the
    already-downcast numeric columns numeric.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        Print the final memory footprint and the relative reduction.
    allow_float16 : bool, default True
        When False, float columns are never narrowed below float32, which
        avoids the large rounding error of half precision.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if allow_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        # Dispatch on the NumPy kind code rather than on a whitelist of dtype
        # names, so that int8 / uint* columns are recognised as numeric too.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None

        if kind in ('i', 'u'):
            c_min = df[col].min()
            c_max = df[col].max()
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type's limit still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif kind == 'f':
            c_min = df[col].min()
            c_max = df[col].max()
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, infinite, or all-NaN (comparisons are False).
                df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero-byte starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [4]:
# Downcast both frames. reduce_mem_usage works in place and hands back the
# very same object, so rebinding the names only makes the data flow explicit.
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)
test

Mem. usage decreased to 81.86 Mb (72.9% reduction)
Mem. usage decreased to 66.75 Mb (70.9% reduction)


Unnamed: 0_level_0,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,...,ch1,ch2,ch3,ch4,ch5,ch6,ch7,ch8,ch9,unique_characters
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
900000,0.442627,0.174438,-1.000000,0.762695,0.186768,-1.075195,0.501953,6,6,0,...,0,0,0,1,0,3,11,0,2,5
900001,-0.605469,-0.305664,0.627441,-0.579102,-1.750977,1.355469,-0.190918,1,3,4,...,5,0,1,1,0,4,6,2,1,6
900002,0.303955,2.445312,0.246460,0.818359,0.359619,-1.332031,1.358398,3,3,4,...,1,0,2,0,1,1,10,4,4,5
900003,0.154053,0.260010,-1.367188,-0.093201,-1.111328,-0.948242,1.119141,0,0,4,...,4,1,4,0,0,2,16,2,2,5
900004,-1.652344,-0.424316,-0.667480,-0.322021,-0.089478,0.181763,1.785156,2,2,2,...,4,1,1,1,1,3,0,1,5,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1599995,0.640137,0.897949,-0.523926,1.563477,-0.092285,-0.610840,0.535645,0,1,6,...,2,1,2,4,1,7,12,2,3,6
1599996,-0.191772,-0.035248,-0.118530,0.584961,2.126953,0.568848,-0.052673,4,3,4,...,0,0,1,2,0,3,16,5,2,6
1599997,-0.331787,-0.328857,-1.185547,1.022461,-0.483154,-0.107117,-0.968262,1,1,2,...,0,0,9,2,1,6,16,1,0,6
1599998,-2.031250,-1.238281,0.964844,-1.045898,0.906250,0.634277,-0.707520,5,1,1,...,2,1,1,2,0,1,13,3,4,6


In [5]:
# Show the training frame as it stands after the memory-reduction cell above
# (bare last expression -> rich, truncated DataFrame rendering).
train

Unnamed: 0_level_0,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,...,ch1,ch2,ch3,ch4,ch5,ch6,ch7,ch8,ch9,unique_characters
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-1.373047,0.238892,-0.243408,0.567383,-0.647949,0.839355,0.113159,1,5,1,...,1,0,1,3,0,3,1,0,1,3
1,1.697266,-1.709961,-2.230469,-0.545898,1.113281,-1.551758,0.447754,1,3,4,...,2,0,2,2,0,3,2,4,1,5
2,1.681641,0.616699,-1.027344,0.810547,-0.608887,0.113953,-0.708496,1,0,2,...,0,0,4,0,1,2,10,0,3,6
3,-0.118164,-0.587891,-0.804688,2.085938,0.371094,-0.128784,-0.282471,3,2,1,...,3,1,1,0,0,2,1,2,1,4
4,1.148438,-0.176514,-0.665039,-1.101562,0.467773,0.500000,0.407471,3,3,0,...,3,1,2,1,1,2,7,5,4,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
899995,1.379883,-0.038879,0.597168,0.854492,0.684082,-1.058594,1.310547,2,1,2,...,0,1,1,2,1,1,1,4,3,5
899996,-1.370117,0.044830,0.015457,0.376465,-0.380615,-0.831055,-1.798828,4,1,2,...,1,1,6,1,1,3,16,1,4,5
899997,1.385742,-0.960938,0.726074,-0.132812,0.874023,-0.245361,-1.045898,0,0,6,...,4,1,4,3,1,1,7,1,0,5
899998,-1.590820,-0.509766,-1.715820,-0.250000,1.360352,1.650391,-0.058594,0,2,2,...,3,1,0,0,0,3,3,0,4,4


In [6]:
# Keyword arguments that the training cell unpacks into XGBRegressor(**params).
params = dict(
    tree_method='gpu_hist',
    n_estimators=10000,
    alpha=9.826088526413647,
    colsample_bytree=0.5,
    subsample=0.5,
    learning_rate=0.02,
    max_depth=11,
    random_state=2020,
    min_child_weight=135,
)
# 'lambda' is a reserved word in Python, so it cannot be written as a keyword
# argument above and is added by key instead.
params['lambda'] = 0.001583005792255653

In [7]:
# 5-fold cross-validation.
# For every fold a fresh XGBRegressor is fitted on the training part, scored
# with ROC AUC on the held-out part, and its test-set prediction is added to
# `preds` with weight 1 / n_splits, so `preds` ends up as the fold average.
preds = np.zeros(test.shape[0])
kf = KFold(n_splits=5, random_state=48, shuffle=True)
auc = []  # ROC AUC on the held-out part of each fold

# Loop-invariant column selections, sliced once instead of once per fold.
X_all, y_all = train[features], train['target']
X_test = test[features]

for fold, (trn_idx, val_idx) in enumerate(kf.split(X_all, y_all), start=1):
    X_tr, X_val = X_all.iloc[trn_idx], X_all.iloc[val_idx]
    y_tr, y_val = y_all.iloc[trn_idx], y_all.iloc[val_idx]

    # early_stopping_rounds is set on the estimator: passing it to fit() has
    # been deprecated since XGBoost 1.6 and is rejected by later releases.
    # `params` sets no eval_metric, so early stopping monitors XGBoost's
    # default metric for the objective on the eval_set, not the AUC below.
    model = XGBRegressor(**params, early_stopping_rounds=100)
    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)

    preds += model.predict(X_test) / kf.n_splits
    auc.append(roc_auc_score(y_val, model.predict(X_val)))
    # The reported score is ROC AUC (the old label wrongly said "rmse").
    print(f"fold: {fold} ==> auc: {auc[-1]}")

print(f"mean auc over {kf.n_splits} folds: {np.mean(auc)}")



fold: 1 ==> rmse: 0.9839812288447011




fold: 2 ==> rmse: 0.9843010532187587




fold: 3 ==> rmse: 0.9846171657249944




fold: 4 ==> rmse: 0.9840543467710807




fold: 5 ==> rmse: 0.9845673550582186


In [8]:
# Fill the sample submission's 'target' column with the fold-averaged
# predictions and write the result without the positional index.
# NOTE(review): relies on sample_submission.csv listing rows in the same order
# as test.csv -- confirm.
sub = pd.read_csv('../input/tabular-playground-series-may-2022/sample_submission.csv').assign(target=preds)
sub.to_csv('sub_tpsmay22_xgbregressor_v1.csv', index=False)