In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

In [3]:
import cudf, cuml, cupy
from cudf.core.dataframe import DataFrame as cu_df
from cudf.core.series import Series as cu_series



In [4]:
import numpy as np
import pandas as pd
import os
from functools import partial
import scipy as sp
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import random

In [5]:
SEED = 29
NUM_FOLDS = 10

In [6]:
def seed_everything(seed=SEED):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(seed=SEED)

In [7]:
train = pd.read_csv('/content/drive/MyDrive/Colab/2304_PGS3E7/train.csv')
test = pd.read_csv('/content/drive/MyDrive/Colab/2304_PGS3E7/test.csv')
sample = pd.read_csv('/content/drive/MyDrive/Colab/2304_PGS3E7/sample_submission.csv')

In [None]:
train.describe()

Unnamed: 0,id,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
count,42100.0,42100.0,42100.0,42100.0,42100.0,42100.0,42100.0,42100.0,42100.0,42100.0,42100.0,42100.0,42100.0,42100.0,42100.0,42100.0,42100.0,42100.0,42100.0
mean,21049.5,1.920713,0.141093,0.884632,2.398005,0.239192,0.025249,0.428931,103.888029,2017.856295,7.593539,15.902945,0.728504,0.029192,0.019715,0.175772,104.566377,0.571734,0.392019
std,12153.367503,0.52495,0.450128,0.885693,1.42733,0.587674,0.156884,0.832605,81.069343,0.350795,2.829395,8.888582,0.633529,0.168347,0.325837,1.732121,37.139165,0.775041,0.488207
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,10524.75,2.0,0.0,0.0,1.0,0.0,0.0,0.0,37.0,2018.0,6.0,8.0,0.0,0.0,0.0,0.0,80.0,0.0,0.0
50%,21049.5,2.0,0.0,1.0,2.0,0.0,0.0,0.0,93.0,2018.0,8.0,16.0,1.0,0.0,0.0,0.0,99.45,0.0,0.0
75%,31574.25,2.0,0.0,2.0,3.0,0.0,0.0,1.0,155.0,2018.0,10.0,24.0,1.0,0.0,0.0,0.0,123.3,1.0,1.0
max,42099.0,4.0,9.0,7.0,17.0,3.0,1.0,6.0,443.0,2018.0,12.0,31.0,4.0,1.0,13.0,58.0,540.0,5.0,1.0


In [8]:
features = train.drop(columns=['booking_status', 'id']).columns
#Duplicates in train set
train_dup = train.duplicated(subset = features, keep = False)
train_dup_idx = [i for i, x in enumerate(train_dup) if x]
print('Duplicates in train set: ', len(train_dup_idx))
#Duplicates in test set
test_dup = test.duplicated(subset = features, keep = False)
test_dup_idx = [i for i, x in enumerate(test_dup) if x]
print('Duplicates in test set: ', len(test_dup_idx))
#Data that exists in both train and test set
train_drop_dup = train.drop(train_dup_idx, axis=0).reset_index(drop=True)
test_drop_dup = test.drop(test_dup_idx, axis=0)
traintest_dup = pd.concat([train_drop_dup, test_drop_dup]).duplicated(subset = features, keep = False)
Traintest_dup_idx = [i for i, x in enumerate(traintest_dup[0:len(train_drop_dup)]) if x]
trainTest_dup_idx = [i for i, x in enumerate(traintest_dup[len(train_drop_dup):]) if x]
print('Duplicates in both set: ', traintest_dup.sum())

Duplicates in train set:  1124
Duplicates in test set:  506
Duplicates in both set:  1432


In [9]:
X = train_drop_dup.drop(columns=['booking_status', 'id'])
features = X.columns
X_test = test.drop(columns=['id'])
y = train_drop_dup.booking_status

In [10]:
def cross_valid(model, train, target, test, num_folds=10, random_state=29):

    train_oof = np.zeros((len(train)))
    test_preds = 0

    kf = StratifiedKFold(n_splits=num_folds, random_state=SEED, shuffle=True)
    scores = []
    
    params = {
        'random_state': SEED,
        'nthread': -1,
        'objective': 'binary:logistic',
        
        'lambda': 1.8953422656880643, 'alpha': 0.6425917192776625, 'max_depth': 24, 'eta': 0.05032169207297668, 'gamma': 5.119269492739059, 'min_child_weight': 6, 'subsample': 0.8191225766601575, 'colsample_bytree': 0.35823009521045057, 'max_delta_step': 42.72481438766471, 'grow_policy': 'lossguide'
     }
    
    num_rounds = 2000
    
    # xgb_train_preds = np.zeros(len(train.index), )
    

    for f, (train_ind, val_ind) in enumerate(kf.split(train, target)):

        train_df, val_df = train.iloc[train_ind], train.iloc[val_ind]
        
        train_target, val_target = target[train_ind], target[val_ind]
        

        xgb_x_train = pd.DataFrame(train_df)
        xgb_x_valid = pd.DataFrame(val_df)

        xgb_x_train_cudf = cu_df(xgb_x_train)
        y_train_cudf = cu_series(train_target)
        xgb_x_valid_cudf = cu_df(xgb_x_valid)
        y_valid_cudf = cu_series(val_target)

        trn_data = xgb.DMatrix(xgb_x_train_cudf, label=y_train_cudf)
        val_data = xgb.DMatrix(xgb_x_valid_cudf, label=y_valid_cudf)


        model = xgb.train(params, 
        trn_data,
        num_rounds,
        evals = [(val_data, "val_data")], 
        verbose_eval=False, 
        early_stopping_rounds=50
        )
        

        temp_oof = model.predict(xgb.DMatrix(xgb_x_valid_cudf), iteration_range = (0, model.best_iteration+1))
        
        train_oof[val_ind] = temp_oof
    
        test_oof_preds = model.predict(xgb.DMatrix(test), iteration_range = (0, model.best_iteration+1))
        
        test_preds += test_oof_preds/num_folds
        
        scores.append(roc_auc_score(val_target, temp_oof))
        
        print("Fold " , f, " ", roc_auc_score(val_target, temp_oof))
        
                                
    print("Mean Kappa Score: ", np.mean(scores))
    print("Kappa Score OOF: ", roc_auc_score(y, train_oof))

    return train_oof, test_preds, np.mean(scores)

In [11]:
## Here I can select which features I want to use
columns = features

## If you want to do ensemble is good to save the oof train / preds
train_oof_1, test_preds_1, score_oof_1 = cross_valid(None, X, y, X_test, num_folds=NUM_FOLDS, random_state=SEED)

Fold  0   0.9055977186814773
Fold  1   0.9054829712860928
Fold  2   0.9100490657858343
Fold  3   0.918859811754142
Fold  4   0.9195593500964986
Fold  5   0.919043354870707
Fold  6   0.909837429714205
Fold  7   0.9168846819199645
Fold  8   0.9100048574082493
Fold  9   0.9116575792862869
Mean Kappa Score:  0.9126976820803459
Kappa Score OOF:  0.912688901167427


In [12]:
traintest_dup_group = pd.concat([train_drop_dup.drop(columns=['booking_status', 'id']), test_drop_dup.drop(columns=['id'])])
traintest_dup_group = traintest_dup_group[traintest_dup_group.duplicated(keep=False)]
traintest_dup_group = traintest_dup_group.groupby(list(traintest_dup_group)).apply(lambda x: tuple(x.index)).to_list()

In [13]:
rev_train = train_drop_dup['booking_status'][Traintest_dup_idx].apply(lambda x: 0 if x == 1 else 1).reset_index()
rev_train

Unnamed: 0,index,booking_status
0,88,1
1,194,1
2,228,0
3,277,0
4,333,0
...,...,...
711,40661,0
712,40724,0
713,40728,0
714,40797,1


In [14]:
sample['booking_status'] = test_preds_1
sample['booking_status'][test_dup_idx] = 0.5

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample['booking_status'][test_dup_idx] = 0.5


In [15]:
for i in range(len(rev_train)):
 for j in range(len(rev_train)):
   if traintest_dup_group[i][0] == rev_train['index'][j]:
     sample['booking_status'][traintest_dup_group[i][1]] = rev_train['booking_status'][j]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample['booking_status'][traintest_dup_group[i][1]] = rev_train['booking_status'][j]


In [16]:
sample.to_csv('/content/drive/MyDrive/Colab/2304_PGS3E7/submission.csv', index=False)
sample

Unnamed: 0,id,booking_status
0,42100,0.131403
1,42101,0.066645
2,42102,0.293274
3,42103,0.032227
4,42104,0.397114
...,...,...
28063,70163,0.500000
28064,70164,0.027254
28065,70165,0.063822
28066,70166,0.513025


# Xgb params tuning

In [None]:
!pip -q install optuna

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.3/365.3 KB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 KB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 KB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import optuna
from sklearn.model_selection import train_test_split

def objective(trial, train=X, target=y):

    params = {
        'random_state': SEED,
        'nthread': -1,
        'objective': 'binary:logistic', 

        'lambda': trial.suggest_float('lambda', 1e-8, 10),
        'alpha': trial.suggest_float('alpha', 1e-8, 10),
        'max_depth': trial.suggest_int('max_depth',2, 50),
        'eta': trial.suggest_float("eta", 1e-8, 1.0),
        'gamma': trial.suggest_float("gamma", 1e-8, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 50),
        "subsample": trial.suggest_float("subsample", 0, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        'max_delta_step': trial.suggest_float("max_delta_step", 0, 100),
        'grow_policy': trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
        
    }
    num_rounds = 2000

    X_train, X_valid, y_train, y_valid = train_test_split(train, target, test_size=0.4)

    xgb_x_train_cudf = cu_df(X_train)
    y_train_cudf = cu_series(y_train)
    xgb_x_valid_cudf = cu_df(X_valid)
    y_valid_cudf = cu_series(y_valid)

    trn_data = xgb.DMatrix(xgb_x_train_cudf, label=y_train_cudf)
    val_data = xgb.DMatrix(xgb_x_valid_cudf, label=y_valid_cudf)

    model = xgb.train(params, 
    trn_data,
    num_rounds,
    evals = [(val_data, "val_data")], 
    verbose_eval=False, 
    early_stopping_rounds=50
    )

    xgb_valid_preds = model.predict(xgb.DMatrix(xgb_x_valid_cudf), iteration_range = (0, model.best_iteration+1))
        
    return roc_auc_score(y_valid, xgb_valid_preds)

# Creating Optuna object and defining its parameters
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials = 200)

# Showing optimization results
print('Number of finished trials:', len(study.trials))
print('Best trial parameters:', study.best_trial.params)
print('Best score:', study.best_value)

[32m[I 2023-03-31 03:45:49,101][0m A new study created in memory with name: no-name-c4d99463-b0eb-496f-ae95-f43c0a7341d2[0m
[32m[I 2023-03-31 03:46:22,466][0m Trial 0 finished with value: 0.8897919415378156 and parameters: {'lambda': 1.2802116160272614, 'alpha': 0.790413930838317, 'max_depth': 21, 'eta': 0.24725987887310985, 'gamma': 3.8324340507402437, 'min_child_weight': 47, 'subsample': 0.6898644657358111, 'colsample_bytree': 0.9100956492957788, 'max_delta_step': 36.16636898881498, 'grow_policy': 'lossguide'}. Best is trial 0 with value: 0.8897919415378156.[0m
[32m[I 2023-03-31 03:46:24,415][0m Trial 1 finished with value: 0.8918997834563578 and parameters: {'lambda': 2.2503900505987104, 'alpha': 1.9498891170885424, 'max_depth': 15, 'eta': 0.7260191477868199, 'gamma': 5.811473891224876, 'min_child_weight': 2, 'subsample': 0.6055154688165508, 'colsample_bytree': 0.2744831419685871, 'max_delta_step': 81.03514998520917, 'grow_policy': 'lossguide'}. Best is trial 1 with value: 0

Number of finished trials: 200
Best trial parameters: {'lambda': 8.977457090157136, 'alpha': 6.769029687875978, 'max_depth': 45, 'eta': 0.05961575009750307, 'gamma': 2.456975526632567, 'min_child_weight': 7, 'subsample': 0.48846255138108274, 'colsample_bytree': 0.9118497187367811, 'max_delta_step': 82.1488503572922, 'grow_policy': 'lossguide'}
Best score: 0.9013223986343586


In [None]:
from optuna.visualization import plot_optimization_history

plotly_config = {"staticPlot": True}

fig = plot_optimization_history(study)
fig.show(config=plotly_config)

In [None]:
from optuna.visualization import plot_param_importances

fig = plot_param_importances(study)
fig.show(config=plotly_config)