In [1]:
%load_ext cudf.pandas
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import warnings

# Recursively enumerate the Kaggle input directory and echo each file's full path.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Suppress every warning category for the remainder of the session.
warnings.filterwarnings("ignore")

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


/kaggle/input/playground-series-s5e2/sample_submission.csv
/kaggle/input/playground-series-s5e2/train.csv
/kaggle/input/playground-series-s5e2/test.csv
/kaggle/input/playground-series-s5e2/training_extra.csv


In [2]:
# Read the three competition tables: the test set, the supplementary training rows,
# and the base training set (the statements are independent of each other).
test = pd.read_csv('/kaggle/input/playground-series-s5e2/test.csv')
train_extra = pd.read_csv('/kaggle/input/playground-series-s5e2/training_extra.csv')
train = pd.read_csv('/kaggle/input/playground-series-s5e2/train.csv')

In [3]:
# Append the supplementary rows below the base training rows and renumber the index from 0.
# NOTE: re-running this cell without re-running the load cell appends `train_extra` again.
train = pd.concat((train, train_extra), ignore_index=True)

## Data Understanding

In [4]:
# Summarize the combined training frame: row count, column names, dtypes and memory usage.
train.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 3994318 entries, 0 to 3994317
Data columns (total 11 columns):
 #   Column                Dtype
---  ------                -----
 0   id                    int64
 1   Brand                 object
 2   Material              object
 3   Size                  object
 4   Compartments          float64
 5   Laptop Compartment    object
 6   Waterproof            object
 7   Style                 object
 8   Color                 object
 9   Weight Capacity (kg)  float64
 10  Price                 float64
dtypes: float64(3), int64(1), object(7)
memory usage: 362.7+ MB


In [5]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
train.describe()

Unnamed: 0,id,Compartments,Weight Capacity (kg),Price
count,3994318.0,3994318.0,3992510.0,3994318.0
mean,2182137.0,5.43474,18.01042,81.36217
std,1178058.0,2.893043,6.973969,38.93868
min,0.0,1.0,5.0,15.0
25%,1198579.0,3.0,12.06896,47.47002
50%,2197158.0,5.0,18.05436,80.98495
75%,3195738.0,8.0,23.9875,114.855
max,4194317.0,10.0,30.0,150.0


In [6]:
# Print the per-column count of missing values, first for the training frame, then for the test frame.
for heading, frame in (('Any missing for train df', train),
                       ('\nAny missing for test df', test)):
    print(heading)
    print(frame.isnull().sum())

Any missing for train df
id                           0
Brand                   126758
Material                110962
Size                     87785
Compartments                 0
Laptop Compartment       98533
Waterproof               94324
Style                   104180
Color                   133617
Weight Capacity (kg)      1808
Price                        0
dtype: int64

Any missing for test df
id                         0
Brand                   6227
Material                5613
Size                    4381
Compartments               0
Laptop Compartment      4962
Waterproof              4811
Style                   5153
Color                   6785
Weight Capacity (kg)      77
dtype: int64


In [7]:
# Eyeball ten randomly drawn training rows (unseeded, so the rows differ on every run).
train.sample(n=10)

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
2872222,3072222,Puma,Canvas,Large,3.0,Yes,No,Messenger,Black,8.901601,77.03475
3343621,3543621,Jansport,Nylon,Small,3.0,No,No,Messenger,Red,26.822771,103.6933
3672018,3872018,Adidas,Polyester,,8.0,No,No,Tote,Pink,8.547665,16.31408
1872779,2072779,Adidas,Canvas,Large,1.0,Yes,Yes,Tote,Black,18.924587,118.01957
2414499,2614499,Under Armour,Polyester,Large,1.0,No,Yes,Messenger,Gray,28.075813,134.22741
3417038,3617038,Puma,,Large,6.0,Yes,No,Messenger,Black,24.203744,65.73514
3366859,3566859,Under Armour,Leather,Small,5.0,Yes,Yes,Backpack,Green,18.039189,105.69786
2204959,2404959,Nike,Canvas,Medium,6.0,Yes,No,Messenger,Gray,9.26744,48.43761
1652492,1852492,Adidas,Leather,Medium,5.0,No,No,Messenger,Blue,8.077984,112.20473
1679935,1879935,Nike,Canvas,Medium,9.0,Yes,No,Messenger,Gray,17.928655,97.64603


## Data Preparation

In [8]:
# Give the two columns whose names contain spaces/units identifier-friendly names,
# using one shared mapping so train and test stay in sync.
column_renames = {
    'Laptop Compartment': 'Laptop_Compartment',
    'Weight Capacity (kg)': 'Weight_Capacity',
}
train = train.rename(columns=column_renames)
test = test.rename(columns=column_renames)

In [9]:
from sklearn.impute import SimpleImputer

# The float64 columns are selected from `test`, which has no `Price` column,
# so the target is never touched by the imputer.
numerical_cols = test.select_dtypes(include=['float64']).columns
impute = SimpleImputer(strategy='median')
# Learn the per-column medians from the training data only ...
train[numerical_cols] = impute.fit_transform(train[numerical_cols])
# ... and reuse those same medians for the test set. Re-fitting on `test`
# (the previous `fit_transform`) would fill train and test with different
# values and derive preprocessing statistics from the test distribution.
test[numerical_cols] = impute.transform(test[numerical_cols])

In [10]:
# Every object-typed (string) column of the training frame is treated as categorical;
# missing entries in those columns become the explicit level 'unknown' in both frames.
cat_cols = train.select_dtypes(include=['object']).columns
for frame in (train, test):
    frame[cat_cols] = frame[cat_cols].fillna('unknown')

## Feature Engineering

In [11]:
# Integer-encode each categorical column over train+test together (so both frames share
# the same codes), then derive a "<column>_wc" feature: code * 100 + Weight_Capacity.
n_train_rows = len(train)  # row count is unchanged by the column assignments below
for col in cat_cols:
    codes, _ = pd.factorize(pd.concat([train[col], test[col]], axis=0))
    train[col] = codes[:n_train_rows].astype('float32')
    test[col] = codes[n_train_rows:].astype('float32')

    combo_name = f'{col}_wc'
    train[combo_name] = (train[col] * 100 + train['Weight_Capacity']).astype('float32')
    test[combo_name] = (test[col] * 100 + test['Weight_Capacity']).astype('float32')

# Check that the newly created features have dtype float32

In [12]:
# Re-inspect the training frame: the encoded categorical columns and the new *_wc columns
# were cast to float32 in the previous cell.
train.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 3994318 entries, 0 to 3994317
Data columns (total 18 columns):
 #   Column                 Dtype
---  ------                 -----
 0   id                     int64
 1   Brand                  float32
 2   Material               float32
 3   Size                   float32
 4   Compartments           float64
 5   Laptop_Compartment     float32
 6   Waterproof             float32
 7   Style                  float32
 8   Color                  float32
 9   Weight_Capacity        float64
 10  Price                  float64
 11  Brand_wc               float32
 12  Material_wc            float32
 13  Size_wc                float32
 14  Laptop_Compartment_wc  float32
 15  Waterproof_wc          float32
 16  Style_wc               float32
 17  Color_wc               float32
dtypes: float32(14), float64(3), int64(1)
memory usage: 335.2 MB


In [13]:
# Same inspection for the test frame, which received the same encoded and *_wc columns.
test.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype
---  ------                 --------------   -----
 0   id                     200000 non-null  int64
 1   Brand                  200000 non-null  float32
 2   Material               200000 non-null  float32
 3   Size                   200000 non-null  float32
 4   Compartments           200000 non-null  float64
 5   Laptop_Compartment     200000 non-null  float32
 6   Waterproof             200000 non-null  float32
 7   Style                  200000 non-null  float32
 8   Color                  200000 non-null  float32
 9   Weight_Capacity        200000 non-null  float64
 10  Brand_wc               200000 non-null  float32
 11  Material_wc            200000 non-null  float32
 12  Size_wc                200000 non-null  float32
 13  Laptop_Compartment_wc  200000 non-null  float32
 14  Waterproof_wc          200000 non-null  

In [14]:
# Preview the first five engineered test rows.
test.head(5)

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop_Compartment,Waterproof,Style,Color,Weight_Capacity,Brand_wc,Material_wc,Size_wc,Laptop_Compartment_wc,Waterproof_wc,Style_wc,Color_wc
0,300000,4.0,0.0,1.0,2.0,1.0,0.0,0.0,1.0,20.671147,420.671143,20.671146,120.67115,120.67115,20.671146,20.671146,120.67115
1,300001,2.0,1.0,0.0,7.0,1.0,1.0,3.0,1.0,13.564105,213.564102,113.564102,13.564105,113.564102,113.564102,313.564117,113.564102
2,300002,3.0,1.0,2.0,9.0,1.0,1.0,1.0,3.0,11.809799,311.809784,111.809799,211.809799,111.809799,111.809799,111.809799,311.809784
3,300003,3.0,2.0,2.0,1.0,0.0,0.0,1.0,1.0,18.477036,318.477051,218.477036,218.477036,18.477036,18.477036,118.477036,118.477036
4,300004,5.0,2.0,2.0,2.0,0.0,1.0,0.0,0.0,9.907953,509.907959,209.907959,209.907959,9.907953,109.907951,9.907953,9.907953


In [15]:
from sklearn.model_selection import KFold, cross_val_score
from cuml.metrics import mean_squared_error

X = train.copy()
y = X.pop('Price')
X_test = test.copy()

n_folds=8

kf = KFold(n_splits=n_folds, shuffle=True, random_state=600)

for train_index, valid_index in kf.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

## Target encoding with quantile

In [16]:
train_with_target = X_train.copy()
train_with_target['Price'] = y_train

quants = [5, 10, 25, 40, 50, 60, 75, 95]

for q in quants:
    result = train_with_target.groupby('Weight_Capacity')['Price'] \
        .quantile(q/100).reset_index().fillna(0.0)
    result.rename(columns={'Price': f'quantile_{q}'}, inplace=True)
    X_train = X_train.merge(result, on='Weight_Capacity', how='left')
    X_valid = X_valid.merge(result, on='Weight_Capacity', how='left')
    X_test = X_test.merge(result, on='Weight_Capacity', how='left')

## Target interaction with count encoding

In [17]:
category_counts = train_with_target['Weight_Capacity'].value_counts().reset_index()
category_counts.columns = ['Weight_Capacity', 'category_frequency']

category_means = train_with_target.groupby('Weight_Capacity')['Price'].mean().reset_index()
category_means.columns = ['Weight_Capacity', 'category_mean_price']

category_stats = category_counts.merge(category_means, on='Weight_Capacity', how='left')
category_stats['Count_Target_Interaction'] = category_stats['category_frequency'] * category_stats['category_mean_price']

X_train = X_train.merge(category_stats, on='Weight_Capacity', how='left')
X_valid = X_valid.merge(category_stats, on='Weight_Capacity', how='left')
X_test = X_test.merge(category_stats, on='Weight_Capacity', how='left')

In [18]:
# Initializing Prediction Arrays
oof_xgb = np.zeros(len(y))
oof_cat = np.zeros(len(y))
oof_lgm = np.zeros(len(y))
test_preds_xgb = np.zeros(len(X_test))
test_preds_cat = np.zeros(len(X_test))
test_preds_lgm = np.zeros(len(X_test))

In [19]:
from xgboost import XGBRegressor

xgb = XGBRegressor(
    n_estimators=5000, 
    early_stopping_rounds=200, 
    learning_rate=0.05, 
    eval_metric='rmse', 
    max_depth=5, 
    reg_lambda=8, 
    device='cuda',
    tree_method='gpu_hist',
    random_state=600,
)
xgb.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    verbose=False,
)
xgb_pred = xgb.predict(X_valid)
oof_xgb[valid_index] = xgb_pred
test_preds_xgb += xgb.predict(X_test) / n_folds

In [20]:
from lightgbm import LGBMRegressor

lgm = LGBMRegressor(
    n_estimators=5000,
    learning_rate=0.05,
    reg_lambda=1.05,
    min_split_gain=3,
    random_state=600,
    device='gpu',
    early_stopping_rounds=50,
    verbose=-1,
).fit(X_train, y_train, eval_set=[(X_valid, y_valid)])

lgm_pred = lgm.predict(X_valid)
oof_lgm[valid_index] = lgm_pred
test_preds_lgm += lgm.predict(X_test) / n_folds

In [21]:
from catboost import CatBoostRegressor
from sklearn.model_selection import cross_val_score

cat = CatBoostRegressor(
    iterations=5000, 
    learning_rate=0.05,
    depth=7, 
    l2_leaf_reg=6.1, 
    task_type='GPU',
    devices='0',
    early_stopping_rounds=50,
    verbose=0,
    random_state=600,
    ).fit(X_train, y_train, eval_set=[(X_valid, y_valid)])

cat_pred = cat.predict(X_valid)
oof_cat[valid_index] = cat_pred
test_preds_cat += cat.predict(X_test) / n_folds



In [22]:
# Root-mean-squared error of each base model's out-of-fold predictions against the full target vector.
for model_label, oof_values in (('XGBoost', oof_xgb),
                                ('LGMBoost', oof_lgm),
                                ('CatBoost', oof_cat)):
    print(f'{model_label} OOF RMSE: {np.sqrt(mean_squared_error(y, oof_values))}')

XGBoost OOF RMSE: 85.49794886989712
LGMBoost OOF RMSE: 85.49841197539284
CatBoost OOF RMSE: 85.49772378158971


## Meta Model using Bayesian Ridge

In [23]:
# Arrange the three base models' predictions as columns (order: XGBoost, LightGBM, CatBoost):
# out-of-fold predictions form the meta-model's training matrix, averaged test predictions its test matrix.
meta_train = np.stack([oof_xgb, oof_lgm, oof_cat], axis=1)
meta_test = np.stack([test_preds_xgb, test_preds_lgm, test_preds_cat], axis=1)

In [24]:
from sklearn.linear_model import BayesianRidge

# Stacking step: fit a Bayesian ridge regression that maps the base models'
# out-of-fold predictions to the true target, then apply it to their test predictions.
meta_model = BayesianRidge().fit(meta_train, y)

final_preds = meta_model.predict(meta_test)
# The prediction column carries the same name as the training target ('Price'); it was
# previously written as lowercase 'price'.
# NOTE(review): sample_submission.csv is expected to use this exact header -- verify.
output = pd.DataFrame({'id': test['id'], 'Price': final_preds})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
