In [7]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
#%load_ext cudf.pandas
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import optuna

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import warnings
warnings.filterwarnings('ignore', category=pd.errors.PerformanceWarning)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/prediction-of-e-commerce-users/submission.csv
/kaggle/input/prediction-of-e-commerce-users/train_df.csv
/kaggle/input/prediction-of-e-commerce-users/test_df.csv


In [8]:
train = pd.read_csv('/kaggle/input/prediction-of-e-commerce-users/train_df.csv', parse_dates=['datetime'])
test = pd.read_csv('/kaggle/input/prediction-of-e-commerce-users/test_df.csv', parse_dates=['datetime'])

In [9]:
train.shape

(23376, 5)

In [4]:
test.shape

(2928, 4)

In [40]:
train.sample(8)

Unnamed: 0,datetime,e_users,promotion_1,promotion_2,promotion_3
21890,2024-07-01 02:00:00,39151,17.83,95.14,87.21
20570,2024-05-07 02:00:00,39195,13.13,57.57,82.38
23071,2024-08-19 07:00:00,38198,19.97,82.02,77.04
9695,2023-02-08 23:00:00,69176,11.29,91.32,80.51
17027,2023-12-11 11:00:00,75381,11.53,92.97,79.4
11322,2023-04-17 18:00:00,51071,17.39,83.99,65.05
11299,2023-04-16 19:00:00,57598,14.73,89.53,77.01
19178,2024-03-10 02:00:00,58402,11.4,89.85,84.88


In [8]:
train.isnull().sum()

datetime       0
e_users        0
promotion_1    0
promotion_2    0
promotion_3    0
dtype: int64

In [9]:
test.isnull().sum()

datetime       0
promotion_1    0
promotion_2    0
promotion_3    0
dtype: int64

In [11]:
train.describe()

Unnamed: 0,e_users,promotion_1,promotion_2,promotion_3
count,23376.0,23376.0,23376.0,23376.0
mean,52846.660207,15.598461,81.392785,74.453931
std,11911.319411,4.827803,14.24688,9.991424
min,29365.0,1.05,15.93,41.6
25%,43573.75,11.89,75.2,67.88
50%,51071.5,15.13,86.31,76.715
75%,61238.25,18.9725,91.64,82.36
max,96130.0,31.48,100.05,92.48


In [10]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23376 entries, 0 to 23375
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   datetime     23376 non-null  object 
 1   e_users      23376 non-null  int64  
 2   promotion_1  23376 non-null  float64
 3   promotion_2  23376 non-null  float64
 4   promotion_3  23376 non-null  float64
dtypes: float64(3), int64(1), object(1)
memory usage: 913.3+ KB


In [10]:
def fe(df):
    df['day_of_year'] = df['datetime'].dt.dayofyear
    df['year_sin'] = np.sin(2 * np.pi * df['day_of_year'] / 365)
    df['year_cos'] = np.cos(2 * np.pi * df['day_of_year'] / 365)
    
    df['month'] = df['datetime'].dt.month
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    
    df['hour'] = df['datetime'].dt.hour
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)

    df['week'] = df['datetime'].dt.isocalendar().week
    df['sin_week'] = np.sin(2 * np.pi * df['week'] / 52).astype(float)
    df['cos_week'] = np.cos(2 * np.pi * df['week'] / 52).astype(float)

    df['day'] = df['datetime'].dt.day
    df['year'] = df['datetime'].dt.year

    df['on_season'] = np.where(df['datetime'].dt.month.isin([11, 12, 1, 2, 3, 4]), 1, 0)

    df['is_off_week'] = np.where(df['datetime'].dt.dayofweek.isin([2,3,4]), 1, 0)

    for lag in [1, 7, 10, 14, 24, 48]:  
        for col in ['promotion_1', 'promotion_2', 'promotion_3']:
            df[f'{col}_lag_{lag}'] = df[col].shift(lag).fillna(0.0)

    for period in [12, 24, 48, 72]:
        for col in features:
            df[f'{col}_mean_{period}'] = df[col].rolling(period).mean()
            df[f'{col}_std_{period}'] = df[col].rolling(period).std()
            df[f'{col}_max_{period}'] = df[col].rolling(period).max()
            df[f'{col}_min_{period}'] = df[col].rolling(period).min()
            df[f'{col}_median_{period}'] = df[col].rolling(period).median()
    
    df.drop(['datetime'], axis=1, inplace=True)

    return df

In [41]:
train.columns

Index(['datetime', 'e_users', 'promotion_1', 'promotion_2', 'promotion_3'], dtype='object')

In [11]:
features = ['promotion_1', 'promotion_2', 'promotion_3']

In [12]:
def add_interact(df):
    for i, c1 in enumerate(features):
        for c2 in (features[i+1:]):
            m = f'{c1}*{c2}'
            df[m] = df[c1] * df[c2]
            a = f'{c1}+{c2}'
            df[a] = df[c1] + df[c2]
            d = f'{c1}/{c2}'
            df[d] = df[c1] / (df[c2] + 1e-3)
            r = f'{c2}/{c1}'
            df[r] = df[c2] / (df[c1] + 1e-3)
    return df

In [13]:
X = train.copy()
y = X.pop('e_users')
X = fe(X)
X = add_interact(X)

X_test = test.copy()
X_test = fe(X_test)
X_test = add_interact(X_test)

In [37]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23376 entries, 0 to 23375
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   promotion_1  23376 non-null  float64
 1   promotion_2  23376 non-null  float64
 2   promotion_3  23376 non-null  float64
 3   day_of_year  23376 non-null  int32  
 4   year_sin     23376 non-null  float64
 5   year_cos     23376 non-null  float64
 6   month        23376 non-null  int32  
 7   month_sin    23376 non-null  float64
 8   month_cos    23376 non-null  float64
 9   day          23376 non-null  int32  
 10  year         23376 non-null  int32  
dtypes: float64(7), int32(4)
memory usage: 1.6 MB


In [36]:
X.head()

Unnamed: 0,promotion_1,promotion_2,promotion_3,day_of_year,year_sin,year_cos,month,month_sin,month_cos,day,year
0,11.67,95.28,80.76,1,0.017213,0.999852,1,0.5,0.866025,1,2022
1,11.33,93.25,80.81,1,0.017213,0.999852,1,0.5,0.866025,1,2022
2,11.01,91.23,80.85,1,0.017213,0.999852,1,0.5,0.866025,1,2022
3,10.82,89.4,80.89,1,0.017213,0.999852,1,0.5,0.866025,1,2022
4,10.8,88.14,80.94,1,0.017213,0.999852,1,0.5,0.866025,1,2022


In [15]:
from sklearn.model_selection import TimeSeriesSplit
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import RobustScaler

tscv = TimeSeriesSplit(n_splits=5)
oof_xgb = np.zeros(len(y))
test_xgb = np.zeros(len(X_test))

def objective(trial):
    params = {
        'objective': 'reg:squarederror', 'eval_metric': 'rmse',
        'n_estimators': trial.suggest_int('n_estimators', 2000, 10000),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'alpha': trial.suggest_float('alpha', 1e-5, 10, log=True),
        'lambda': trial.suggest_float('lambda', 1e-5, 10, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'eval_metric': 'rmse',
        'seed': 100
    }

    scores = []
    
    for fold, (train_index, valid_index) in enumerate(tscv.split(X, y), start=1):
        X_train, X_valid = X.iloc[train_index].copy(), X.iloc[valid_index].copy()
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    
        xgb = XGBRegressor(**params, early_stopping_rounds=50, random_state=100).fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=0)
        xgb_pred = xgb.predict(X_valid)
        fold_rmse = np.sqrt(mean_squared_error(y_valid, xgb_pred))
        scores.append(fold_rmse)
    
    return np.mean(scores)
    
study = optuna.create_study(direction='minimize', study_name='XGB-RMSE-Optimization')
study.optimize(objective, n_trials=5)

print(f'Best cross-validation RMSE: {study.best_value:,.5f}')
print(f'Best parameters: {study.best_params}')


[I 2025-06-09 08:58:19,087] A new study created in memory with name: XGB-RMSE-Optimization
[I 2025-06-09 08:58:26,394] Trial 0 finished with value: 4292.7316655625755 and parameters: {'n_estimators': 4646, 'learning_rate': 0.26977375693762073, 'max_depth': 4, 'min_child_weight': 1, 'alpha': 0.19406841680393572, 'lambda': 5.735851280490532, 'subsample': 0.8791897331495083, 'colsample_bytree': 0.7437421526712569, 'gamma': 0.01346503058910976}. Best is trial 0 with value: 4292.7316655625755.
[I 2025-06-09 08:58:46,377] Trial 1 finished with value: 4214.969028541517 and parameters: {'n_estimators': 8175, 'learning_rate': 0.19203218461804566, 'max_depth': 7, 'min_child_weight': 5, 'alpha': 1.5160428860905724, 'lambda': 0.0016559193964454184, 'subsample': 0.8618474062333319, 'colsample_bytree': 0.8777964463432684, 'gamma': 0.014515275480058482}. Best is trial 1 with value: 4214.969028541517.
[I 2025-06-09 08:59:16,341] Trial 2 finished with value: 4186.666136506118 and parameters: {'n_estima

Best cross-validation RMSE: 4,176.91088
Best parameters: {'n_estimators': 2862, 'learning_rate': 0.0527345743076874, 'max_depth': 8, 'min_child_weight': 9, 'alpha': 0.003265985324528557, 'lambda': 1.0121660329501618e-05, 'subsample': 0.7607076534219779, 'colsample_bytree': 0.8859135540062066, 'gamma': 0.00029614519126804946}


In [20]:
final_model = XGBRegressor(**study.best_params).fit(X, y)
test_xgb = final_model.predict(X_test)

In [21]:
sub = pd.read_csv('/kaggle/input/prediction-of-e-commerce-users/submission.csv')
sub['e_users'] = test_xgb
sub.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")
print(sub.head(8))

Your submission was successfully saved!
              datetime       e_users
0  2024-09-01 00:00:00  48791.855469
1  2024-09-01 01:00:00  46189.179688
2  2024-09-01 02:00:00  45492.847656
3  2024-09-01 03:00:00  43279.503906
4  2024-09-01 04:00:00  42158.039062
5  2024-09-01 05:00:00  42030.113281
6  2024-09-01 06:00:00  42941.570312
7  2024-09-01 07:00:00  42360.406250
