In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
#%load_ext cudf.pandas
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import optuna

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/prediction-of-e-commerce-users/submission.csv
/kaggle/input/prediction-of-e-commerce-users/train_df.csv
/kaggle/input/prediction-of-e-commerce-users/test_df.csv


In [2]:
train = pd.read_csv('/kaggle/input/prediction-of-e-commerce-users/train_df.csv', parse_dates=['datetime'])
test = pd.read_csv('/kaggle/input/prediction-of-e-commerce-users/test_df.csv', parse_dates=['datetime'])

In [3]:
train.shape

(23376, 5)

In [4]:
test.shape

(2928, 4)

In [5]:
train.sample(8)

Unnamed: 0,datetime,e_users,promotion_1,promotion_2,promotion_3
2333,2022-04-08 05:00:00,39385,11.7,89.19,80.79
18979,2024-03-01 19:00:00,63680,12.27,89.76,77.54
597,2022-01-25 21:00:00,65202,11.44,94.44,84.88
9424,2023-01-28 16:00:00,73756,10.91,85.08,71.03
20685,2024-05-11 21:00:00,44808,12.57,94.56,79.91
18789,2024-02-22 21:00:00,57758,12.65,90.73,78.78
4176,2022-06-24 00:00:00,43064,14.92,80.73,72.64
12223,2023-05-25 07:00:00,36971,16.1,89.86,78.97


In [6]:
train.isnull().sum()

datetime       0
e_users        0
promotion_1    0
promotion_2    0
promotion_3    0
dtype: int64

In [7]:
test.isnull().sum()

datetime       0
promotion_1    0
promotion_2    0
promotion_3    0
dtype: int64

In [8]:
train.describe()

Unnamed: 0,datetime,e_users,promotion_1,promotion_2,promotion_3
count,23376,23376.0,23376.0,23376.0,23376.0
mean,2023-05-02 23:29:59.999999744,52846.660207,15.598461,81.392785,74.453931
min,2022-01-01 00:00:00,29365.0,1.05,15.93,41.6
25%,2022-09-01 11:45:00,43573.75,11.89,75.2,67.88
50%,2023-05-02 23:30:00,51071.5,15.13,86.31,76.715
75%,2024-01-01 11:15:00,61238.25,18.9725,91.64,82.36
max,2024-08-31 23:00:00,96130.0,31.48,100.05,92.48
std,,11911.319411,4.827803,14.24688,9.991424


In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23376 entries, 0 to 23375
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   datetime     23376 non-null  datetime64[ns]
 1   e_users      23376 non-null  int64         
 2   promotion_1  23376 non-null  float64       
 3   promotion_2  23376 non-null  float64       
 4   promotion_3  23376 non-null  float64       
dtypes: datetime64[ns](1), float64(3), int64(1)
memory usage: 913.3 KB


In [10]:
def fe(df):
    df['day_of_year'] = df['datetime'].dt.dayofyear
    df['year_sin'] = np.sin(2 * np.pi * df['day_of_year'] / 365)
    df['year_cos'] = np.cos(2 * np.pi * df['day_of_year'] / 365)
    
    df['month'] = df['datetime'].dt.month
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    
    df['hour'] = df['datetime'].dt.hour
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)

    df['week'] = df['datetime'].dt.isocalendar().week
    df['sin_week'] = np.sin(2 * np.pi * df['week'] / 52).astype(float)
    df['cos_week'] = np.cos(2 * np.pi * df['week'] / 52).astype(float)
    del df['week']

    df['day'] = df['datetime'].dt.day
    df['year'] = df['datetime'].dt.year

    df['on_season'] = np.where(df['datetime'].dt.month.isin([11, 12, 1, 2, 3, 4]), 1, 0)

    df['is_off_week'] = np.where(df['datetime'].dt.dayofweek.isin([2,3,4]), 1, 0)
    
    df.drop(['datetime'], axis=1, inplace=True)

    return df

In [11]:
train.columns

Index(['datetime', 'e_users', 'promotion_1', 'promotion_2', 'promotion_3'], dtype='object')

In [12]:
features = ['promotion_1', 'promotion_2', 'promotion_3']

In [13]:
def add_interact(df):
    for i, c1 in enumerate(features):
        for c2 in (features[i+1:]):
            m = f'{c1}*{c2}'
            df[m] = df[c1] * df[c2]
            a = f'{c1}+{c2}'
            df[a] = df[c1] + df[c2]
            d = f'{c1}/{c2}'
            df[d] = df[c1] / (df[c2] + 1e-3)
            r = f'{c2}/{c1}'
            df[r] = df[c2] / (df[c1] + 1e-3)
    return df

In [14]:
X = train.copy()
y = X.pop('e_users')
X = fe(X)
X = add_interact(X)

X_test = test.copy()
X_test = fe(X_test)
X_test = add_interact(X_test)

oof_preds = {}
test_preds = {}

In [15]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23376 entries, 0 to 23375
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   promotion_1              23376 non-null  float64
 1   promotion_2              23376 non-null  float64
 2   promotion_3              23376 non-null  float64
 3   day_of_year              23376 non-null  int32  
 4   year_sin                 23376 non-null  float64
 5   year_cos                 23376 non-null  float64
 6   month                    23376 non-null  int32  
 7   month_sin                23376 non-null  float64
 8   month_cos                23376 non-null  float64
 9   hour                     23376 non-null  int32  
 10  hour_sin                 23376 non-null  float64
 11  hour_cos                 23376 non-null  float64
 12  sin_week                 23376 non-null  float64
 13  cos_week                 23376 non-null  float64
 14  day                   

In [16]:
X.head()

Unnamed: 0,promotion_1,promotion_2,promotion_3,day_of_year,year_sin,year_cos,month,month_sin,month_cos,hour,...,promotion_1/promotion_2,promotion_2/promotion_1,promotion_1*promotion_3,promotion_1+promotion_3,promotion_1/promotion_3,promotion_3/promotion_1,promotion_2*promotion_3,promotion_2+promotion_3,promotion_2/promotion_3,promotion_3/promotion_2
0,11.67,95.28,80.76,1,0.017213,0.999852,1,0.5,0.866025,0,...,0.12248,8.163825,942.4692,92.43,0.1445,6.919716,7694.8128,176.04,1.179777,0.847598
1,11.33,93.25,80.81,1,0.017213,0.999852,1,0.5,0.866025,1,...,0.1215,8.229636,915.5773,92.14,0.140204,7.131762,7535.5325,174.06,1.153927,0.866586
2,11.01,91.23,80.85,1,0.017213,0.999852,1,0.5,0.866025,2,...,0.120683,8.285351,890.1585,91.86,0.136176,7.342657,7375.9455,172.08,1.128372,0.886212
3,10.82,89.4,80.89,1,0.017213,0.999852,1,0.5,0.866025,3,...,0.121028,8.261713,875.2298,91.71,0.13376,7.47528,7231.566,170.29,1.105191,0.9048
4,10.8,88.14,80.94,1,0.017213,0.999852,1,0.5,0.866025,4,...,0.122531,8.160356,874.152,91.74,0.133431,7.493751,7134.0516,169.08,1.088941,0.918301


In [17]:
from sklearn.model_selection import TimeSeriesSplit
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import RobustScaler

tscv = TimeSeriesSplit(n_splits=3)
oof_xgb = np.zeros(len(y))
test_xgb = np.zeros(len(X_test))

def objective(trial):
    params = {
        'objective': 'reg:squarederror', 'eval_metric': 'rmse',
        'n_estimators': trial.suggest_int('n_estimators', 500, 10000),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'alpha': trial.suggest_float('alpha', 1e-3, 10, log=True),
        'lambda': trial.suggest_float('lambda', 1e-3, 10, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'eval_metric': 'rmse',
        'seed': 100
    }

    scores = []
    
    for fold, (train_index, valid_index) in enumerate(tscv.split(X, y), start=1):
        X_train, X_valid = X.iloc[train_index].copy(), X.iloc[valid_index].copy()
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    
        xgb = XGBRegressor(**params, early_stopping_rounds=50, random_state=100).fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=0)
        xgb_pred = xgb.predict(X_valid)
        fold_rmse = np.sqrt(mean_squared_error(y_valid, xgb_pred))
        scores.append(fold_rmse)
    
    return np.mean(scores)
    
study = optuna.create_study(direction='minimize', study_name='XGB-RMSE-Optimization')
study.optimize(objective, n_trials=20)

print(f'Best cross-validation RMSE: {study.best_value:,.5f}')
print(f'Best parameters: {study.best_params}')


[I 2025-06-01 10:14:40,200] A new study created in memory with name: XGB-RMSE-Optimization
[I 2025-06-01 10:14:45,203] Trial 0 finished with value: 5666.36947834308 and parameters: {'n_estimators': 1757, 'learning_rate': 0.05293766275028138, 'max_depth': 9, 'min_child_weight': 5, 'alpha': 2.904366088438459, 'lambda': 0.5824824169679047, 'subsample': 0.8990037232339464, 'colsample_bytree': 0.542351218854296, 'gamma': 0.0058640317644126}. Best is trial 0 with value: 5666.36947834308.
[I 2025-06-01 10:14:48,539] Trial 1 finished with value: 5455.99483738274 and parameters: {'n_estimators': 3414, 'learning_rate': 0.03839347971685261, 'max_depth': 6, 'min_child_weight': 3, 'alpha': 0.1793695607552481, 'lambda': 1.913832546623206, 'subsample': 0.9160907081896379, 'colsample_bytree': 0.5959275419560863, 'gamma': 6.877353589871887e-06}. Best is trial 1 with value: 5455.99483738274.
[I 2025-06-01 10:14:49,667] Trial 2 finished with value: 6068.377665595668 and parameters: {'n_estimators': 778, 

Best cross-validation RMSE: 5,038.79968
Best parameters: {'n_estimators': 6997, 'learning_rate': 0.10475586215396379, 'max_depth': 4, 'min_child_weight': 5, 'alpha': 0.8073042254123297, 'lambda': 3.4289739927114273, 'subsample': 0.9008408451716037, 'colsample_bytree': 0.5072475787712835, 'gamma': 5.15257817272365e-05}


In [18]:
final_model = XGBRegressor(**study.best_params).fit(X, y)
test_xgb = final_model.predict(X_test)

In [19]:
sub = pd.read_csv('/kaggle/input/prediction-of-e-commerce-users/submission.csv')
sub['e_users'] = test_xgb
sub.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")
print(sub.head(8))

Your submission was successfully saved!
              datetime       e_users
0  2024-09-01 00:00:00  41241.968750
1  2024-09-01 01:00:00  37196.851562
2  2024-09-01 02:00:00  35705.703125
3  2024-09-01 03:00:00  32623.773438
4  2024-09-01 04:00:00  31936.003906
5  2024-09-01 05:00:00  32502.173828
6  2024-09-01 06:00:00  35552.734375
7  2024-09-01 07:00:00  37682.941406
