In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
#%load_ext cudf.pandas
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import optuna

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import warnings
warnings.filterwarnings('ignore', category=pd.errors.PerformanceWarning)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/prediction-of-e-commerce-users/submission.csv
/kaggle/input/prediction-of-e-commerce-users/train_df.csv
/kaggle/input/prediction-of-e-commerce-users/test_df.csv


In [2]:
train = pd.read_csv('/kaggle/input/prediction-of-e-commerce-users/train_df.csv', parse_dates=['datetime'])
test = pd.read_csv('/kaggle/input/prediction-of-e-commerce-users/test_df.csv', parse_dates=['datetime'])

In [3]:
train.shape

(23376, 5)

In [4]:
test.shape

(2928, 4)

In [5]:
train.sample(8)

Unnamed: 0,datetime,e_users,promotion_1,promotion_2,promotion_3
18822,2024-02-24 06:00:00,54888,11.7,91.69,86.58
3523,2022-05-27 19:00:00,41214,19.85,83.24,75.13
18029,2024-01-22 05:00:00,66994,7.51,78.05,83.15
3558,2022-05-29 06:00:00,39884,16.74,92.08,90.06
21200,2024-06-02 08:00:00,42979,20.75,74.03,67.03
20871,2024-05-19 15:00:00,45855,22.32,74.93,50.44
20015,2024-04-13 23:00:00,44128,12.84,92.28,73.92
79,2022-01-04 07:00:00,61658,14.33,96.19,84.29


In [6]:
train.isnull().sum()

datetime       0
e_users        0
promotion_1    0
promotion_2    0
promotion_3    0
dtype: int64

In [7]:
test.isnull().sum()

datetime       0
promotion_1    0
promotion_2    0
promotion_3    0
dtype: int64

In [8]:
train.describe()

Unnamed: 0,datetime,e_users,promotion_1,promotion_2,promotion_3
count,23376,23376.0,23376.0,23376.0,23376.0
mean,2023-05-02 23:29:59.999999744,52846.660207,15.598461,81.392785,74.453931
min,2022-01-01 00:00:00,29365.0,1.05,15.93,41.6
25%,2022-09-01 11:45:00,43573.75,11.89,75.2,67.88
50%,2023-05-02 23:30:00,51071.5,15.13,86.31,76.715
75%,2024-01-01 11:15:00,61238.25,18.9725,91.64,82.36
max,2024-08-31 23:00:00,96130.0,31.48,100.05,92.48
std,,11911.319411,4.827803,14.24688,9.991424


In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23376 entries, 0 to 23375
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   datetime     23376 non-null  datetime64[ns]
 1   e_users      23376 non-null  int64         
 2   promotion_1  23376 non-null  float64       
 3   promotion_2  23376 non-null  float64       
 4   promotion_3  23376 non-null  float64       
dtypes: datetime64[ns](1), float64(3), int64(1)
memory usage: 913.3 KB


In [10]:
features = ['promotion_1', 'promotion_2', 'promotion_3']

In [11]:
def fe(df):
    
    df['day_of_year'] = df['datetime'].dt.dayofyear
    df['year_sin'] = np.sin(2 * np.pi * df['day_of_year'] / 365)
    df['year_cos'] = np.cos(2 * np.pi * df['day_of_year'] / 365)
    
    df['month'] = df['datetime'].dt.month
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    
    df['hour'] = df['datetime'].dt.hour
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)

    df['week'] = df['datetime'].dt.isocalendar().week
    df['sin_week'] = np.sin(2 * np.pi * df['week'] / 52).astype(float)
    df['cos_week'] = np.cos(2 * np.pi * df['week'] / 52).astype(float)

    df['dayofweek'] = df['datetime'].dt.dayofweek
    df['sin_dayofweek'] = np.sin(2 * np.pi * df['dayofweek'] / 7)
    df['cos_dayofweek'] = np.cos(2 * np.pi * df['dayofweek'] / 7)  

    for lag in [1, 7, 10, 14, 24]:  
        for col in ['promotion_1', 'promotion_2', 'promotion_3']:
            df[f'{col}_lag_{lag}'] = df[col].shift(lag).fillna(0.0)
    

    df['day'] = df['datetime'].dt.day
    df['year'] = df['datetime'].dt.year

    df['on_season'] = np.where(df['datetime'].dt.month.isin([11, 12, 1, 2, 3, 4]), 1, 0)

    df['is_off_week'] = np.where(df['datetime'].dt.dayofweek.isin([2,3,4]), 1, 0)

    for period in [6, 12, 24, 48, 72]:
        for col in features:
            df[f'{col}_mean_{period}'] = df[col].rolling(period).mean()
            df[f'{col}_std_{period}'] = df[col].rolling(period).std()
            df[f'{col}_max_{period}'] = df[col].rolling(period).max()
            df[f'{col}_min_{period}'] = df[col].rolling(period).min()
            df[f'{col}_median_{period}'] = df[col].rolling(period).median()
    
    df.drop(['datetime'], axis=1, inplace=True)

    return df

In [12]:
train.columns

Index(['datetime', 'e_users', 'promotion_1', 'promotion_2', 'promotion_3'], dtype='object')

In [13]:
def add_interact(df):
    for i, c1 in enumerate(features):
        for c2 in (features[i+1:]):
            m = f'{c1}*{c2}'
            df[m] = df[c1] * df[c2]
            a = f'{c1}+{c2}'
            df[a] = df[c1] + df[c2]
            d = f'{c1}/{c2}'
            df[d] = df[c1] / (df[c2] + 1e-5)
    return df

In [14]:
X = train.copy()
y = X.pop('e_users')
X = fe(X)
X = add_interact(X)

X_test = test.copy()
X_test = fe(X_test)
X_test = add_interact(X_test)

oof_preds = {}
test_preds = {}

In [15]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23376 entries, 0 to 23375
Columns: 121 entries, promotion_1 to promotion_2/promotion_3
dtypes: UInt32(1), float64(112), int32(6), int64(2)
memory usage: 21.0 MB


In [16]:
X.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,promotion_1,promotion_2,promotion_3,day_of_year,year_sin,year_cos,month,month_sin,month_cos,hour,...,promotion_3_median_72,promotion_1*promotion_2,promotion_1+promotion_2,promotion_1/promotion_2,promotion_1*promotion_3,promotion_1+promotion_3,promotion_1/promotion_3,promotion_2*promotion_3,promotion_2+promotion_3,promotion_2/promotion_3
0,11.67,95.28,80.76,1,0.017213,0.999852,1,0.5,0.866025,0,...,,1111.9176,106.95,0.122481,942.4692,92.43,0.144502,7694.8128,176.04,1.179792
1,11.33,93.25,80.81,1,0.017213,0.999852,1,0.5,0.866025,1,...,,1056.5225,104.58,0.121501,915.5773,92.14,0.140205,7535.5325,174.06,1.153941
2,11.01,91.23,80.85,1,0.017213,0.999852,1,0.5,0.866025,2,...,,1004.4423,102.24,0.120684,890.1585,91.86,0.136178,7375.9455,172.08,1.128386
3,10.82,89.4,80.89,1,0.017213,0.999852,1,0.5,0.866025,3,...,,967.308,100.22,0.121029,875.2298,91.71,0.133762,7231.566,170.29,1.105204
4,10.8,88.14,80.94,1,0.017213,0.999852,1,0.5,0.866025,4,...,,951.912,98.94,0.122532,874.152,91.74,0.133432,7134.0516,169.08,1.088955


In [17]:
from sklearn.model_selection import TimeSeriesSplit
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import RobustScaler

tscv = TimeSeriesSplit(n_splits=5)
oof_lgbm = np.zeros(len(y))
test_lgbm = np.zeros(len(X_test))

def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 1000, 12000),
        'min_child_samples': trial.suggest_int('min_child_samples', 3, 10),
        'num_leaves': trial.suggest_int('num_leaves', 5, 200),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.2, log=True),
        'min_split_gain': trial.suggest_float('min_split_gain', 0.0, 8.0),
        'max_depth': trial.suggest_int('max_depth', 1, 13),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 11),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.5, 1.0),
        'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'goss']),
        'verbose': -1,
        'n_jobs': -1,
    }

    scores = []
    
    for fold, (train_index, valid_index) in enumerate(tscv.split(X, y), start=1):
        X_train, X_valid = X.iloc[train_index].copy(), X.iloc[valid_index].copy()
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    
        lgbm = LGBMRegressor(**params, early_stopping_rounds=50, random_state=100).fit(X_train, y_train, eval_set=[(X_valid, y_valid)])
        lgbm_pred = lgbm.predict(X_valid)
        fold_rmse = np.sqrt(mean_squared_error(y_valid, lgbm_pred))
        scores.append(fold_rmse)
    
    return np.mean(scores)
    
study = optuna.create_study(direction='minimize', study_name='LGBM-RMSE-Optimization')
study.optimize(objective, n_trials=15)

print(f'Best cross-validation RMSE: {study.best_value:,.5f}')
print(f'Best parameters: {study.best_params}')


[I 2025-06-01 11:27:03,734] A new study created in memory with name: LGBM-RMSE-Optimization
[I 2025-06-01 11:29:47,686] Trial 0 finished with value: 3077.744978533651 and parameters: {'n_estimators': 9371, 'min_child_samples': 9, 'num_leaves': 32, 'learning_rate': 0.008540184005285069, 'min_split_gain': 0.28828392618322773, 'max_depth': 9, 'reg_alpha': 8.21052332120606, 'reg_lambda': 2.414847737842317, 'colsample_bytree': 0.5803833580558936, 'min_child_weight': 0.744541391700195, 'boosting_type': 'gbdt'}. Best is trial 0 with value: 3077.744978533651.
[I 2025-06-01 11:30:58,288] Trial 1 finished with value: 3359.753490239296 and parameters: {'n_estimators': 8458, 'min_child_samples': 9, 'num_leaves': 182, 'learning_rate': 0.04147741571817897, 'min_split_gain': 0.32018411235077604, 'max_depth': 8, 'reg_alpha': 5.863331324008509, 'reg_lambda': 3.2937071914663805, 'colsample_bytree': 0.7569571709296823, 'min_child_weight': 0.6791799223293601, 'boosting_type': 'gbdt'}. Best is trial 0 with

Best cross-validation RMSE: 3,077.74498
Best parameters: {'n_estimators': 9371, 'min_child_samples': 9, 'num_leaves': 32, 'learning_rate': 0.008540184005285069, 'min_split_gain': 0.28828392618322773, 'max_depth': 9, 'reg_alpha': 8.21052332120606, 'reg_lambda': 2.414847737842317, 'colsample_bytree': 0.5803833580558936, 'min_child_weight': 0.744541391700195, 'boosting_type': 'gbdt'}


In [18]:
import json

with open('/kaggle/working/best_lgbm_params.json', 'w') as f:
    json.dump(study.best_params, f)

In [19]:
final_model = LGBMRegressor(**study.best_params).fit(X, y)
test_lgbm = final_model.predict(X_test)

In [20]:
sub = pd.read_csv('/kaggle/input/prediction-of-e-commerce-users/submission.csv')
sub['e_users'] = test_lgbm
sub.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")
print(sub.head(8))

Your submission was successfully saved!
              datetime       e_users
0  2024-09-01 00:00:00  50296.725021
1  2024-09-01 01:00:00  47985.873109
2  2024-09-01 02:00:00  47363.870368
3  2024-09-01 03:00:00  44951.018478
4  2024-09-01 04:00:00  43564.323514
5  2024-09-01 05:00:00  42423.202305
6  2024-09-01 06:00:00  43324.454340
7  2024-09-01 07:00:00  44735.747604
