In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/prediction-of-e-commerce-users/submission.csv
/kaggle/input/prediction-of-e-commerce-users/train_df.csv
/kaggle/input/prediction-of-e-commerce-users/test_df.csv


In [2]:
train = pd.read_csv('/kaggle/input/prediction-of-e-commerce-users/train_df.csv', parse_dates=['datetime'])
test = pd.read_csv('/kaggle/input/prediction-of-e-commerce-users/test_df.csv', parse_dates=['datetime'])

In [3]:
train.shape

(23376, 5)

In [4]:
test.shape

(2928, 4)

In [5]:
train.sample(8)

Unnamed: 0,datetime,e_users,promotion_1,promotion_2,promotion_3
20881,2024-05-20 01:00:00,39902,14.54,78.56,75.64
12903,2023-06-22 15:00:00,43455,23.58,78.84,58.75
16684,2023-11-27 04:00:00,49039,13.46,96.38,86.84
15002,2023-09-18 02:00:00,39062,16.53,67.26,77.1
4780,2022-07-19 04:00:00,38320,16.86,74.73,78.62
14831,2023-09-10 23:00:00,47175,14.72,75.09,77.08
2510,2022-04-15 14:00:00,45704,17.88,86.97,60.85
8739,2022-12-31 03:00:00,56034,9.57,95.14,85.15


In [6]:
train.isnull().sum()

datetime       0
e_users        0
promotion_1    0
promotion_2    0
promotion_3    0
dtype: int64

In [7]:
test.isnull().sum()

datetime       0
promotion_1    0
promotion_2    0
promotion_3    0
dtype: int64

In [8]:
train.describe()

Unnamed: 0,datetime,e_users,promotion_1,promotion_2,promotion_3
count,23376,23376.0,23376.0,23376.0,23376.0
mean,2023-05-02 23:29:59.999999744,52846.660207,15.598461,81.392785,74.453931
min,2022-01-01 00:00:00,29365.0,1.05,15.93,41.6
25%,2022-09-01 11:45:00,43573.75,11.89,75.2,67.88
50%,2023-05-02 23:30:00,51071.5,15.13,86.31,76.715
75%,2024-01-01 11:15:00,61238.25,18.9725,91.64,82.36
max,2024-08-31 23:00:00,96130.0,31.48,100.05,92.48
std,,11911.319411,4.827803,14.24688,9.991424


In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23376 entries, 0 to 23375
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   datetime     23376 non-null  datetime64[ns]
 1   e_users      23376 non-null  int64         
 2   promotion_1  23376 non-null  float64       
 3   promotion_2  23376 non-null  float64       
 4   promotion_3  23376 non-null  float64       
dtypes: datetime64[ns](1), float64(3), int64(1)
memory usage: 913.3 KB


In [10]:
def fe(df):
    df['day_of_year'] = df['datetime'].dt.dayofyear
    df['year_sin'] = np.sin(2 * np.pi * df['day_of_year'] / 365)
    df['year_cos'] = np.cos(2 * np.pi * df['day_of_year'] / 365)
    
    df['month'] = df['datetime'].dt.month
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    
    df['hour'] = df['datetime'].dt.hour
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)

    df['day'] = df['datetime'].dt.day
    df['year'] = df['datetime'].dt.year
    
    df.drop(['datetime'], axis=1, inplace=True)

    return df

In [11]:
train.columns

Index(['datetime', 'e_users', 'promotion_1', 'promotion_2', 'promotion_3'], dtype='object')

In [12]:
features = ['promotion_1', 'promotion_2', 'promotion_3']

In [13]:
def add_interact(df):
    for i, c1 in enumerate(features):
        for c2 in (features[i+1:]):
            m = f'{c1}*{c2}'
            df[m] = df[c1] * df[c2]
            d = f'{c1}/{c2}'
            df[d] = df[c1] / (df[c2] + 1e-3)
            r = f'{c2}/{c1}'
            df[r] = df[c2] / (df[c1] + 1e-3)
    return df

In [14]:
X = train.copy()
y = X.pop('e_users')
X = fe(X)
X = add_interact(X)

X_test = test.copy()
X_test = fe(X_test)
X_test = add_interact(X_test)

oof_preds = {}
test_preds = {}

In [15]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23376 entries, 0 to 23375
Data columns (total 23 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   promotion_1              23376 non-null  float64
 1   promotion_2              23376 non-null  float64
 2   promotion_3              23376 non-null  float64
 3   day_of_year              23376 non-null  int32  
 4   year_sin                 23376 non-null  float64
 5   year_cos                 23376 non-null  float64
 6   month                    23376 non-null  int32  
 7   month_sin                23376 non-null  float64
 8   month_cos                23376 non-null  float64
 9   hour                     23376 non-null  int32  
 10  hour_sin                 23376 non-null  float64
 11  hour_cos                 23376 non-null  float64
 12  day                      23376 non-null  int32  
 13  year                     23376 non-null  int32  
 14  promotion_1*promotion_

In [16]:
X.head()

Unnamed: 0,promotion_1,promotion_2,promotion_3,day_of_year,year_sin,year_cos,month,month_sin,month_cos,hour,...,year,promotion_1*promotion_2,promotion_1/promotion_2,promotion_2/promotion_1,promotion_1*promotion_3,promotion_1/promotion_3,promotion_3/promotion_1,promotion_2*promotion_3,promotion_2/promotion_3,promotion_3/promotion_2
0,11.67,95.28,80.76,1,0.017213,0.999852,1,0.5,0.866025,0,...,2022,1111.9176,0.12248,8.163825,942.4692,0.1445,6.919716,7694.8128,1.179777,0.847598
1,11.33,93.25,80.81,1,0.017213,0.999852,1,0.5,0.866025,1,...,2022,1056.5225,0.1215,8.229636,915.5773,0.140204,7.131762,7535.5325,1.153927,0.866586
2,11.01,91.23,80.85,1,0.017213,0.999852,1,0.5,0.866025,2,...,2022,1004.4423,0.120683,8.285351,890.1585,0.136176,7.342657,7375.9455,1.128372,0.886212
3,10.82,89.4,80.89,1,0.017213,0.999852,1,0.5,0.866025,3,...,2022,967.308,0.121028,8.261713,875.2298,0.13376,7.47528,7231.566,1.105191,0.9048
4,10.8,88.14,80.94,1,0.017213,0.999852,1,0.5,0.866025,4,...,2022,951.912,0.122531,8.160356,874.152,0.133431,7.493751,7134.0516,1.088941,0.918301


In [17]:
from sklearn.model_selection import TimeSeriesSplit
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import RobustScaler

tscv = TimeSeriesSplit(n_splits=5)
oof_xgb = np.zeros(len(y))
test_xgb = np.zeros(len(X_test))

for fold, (train_index, valid_index) in enumerate(tscv.split(X, y), start=1):
    X_train, X_valid = X.iloc[train_index].copy(), X.iloc[valid_index].copy()
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    
    xgb = XGBRegressor(n_estimators=1000, learning_rate=0.05, objective='reg:squarederror', eval_metric='rmse').fit(X_train, y_train)
    valid_pred = xgb.predict(X_valid)
    oof_xgb[valid_index] = valid_pred
    test_xgb += xgb.predict(X_test) / 5

    fold_rmse = np.sqrt(mean_squared_error(y_valid, valid_pred))
    print(f"Fold {fold} RMSE: {fold_rmse:.6f}")
    
oof_preds['XGB'] = oof_xgb
test_preds['XGB'] = test_xgb

overall_rmse = np.sqrt(mean_squared_error(y, oof_xgb))
print(f"\nOverall OOF RMSLE: {overall_rmse:.7f}")  


Fold 1 RMSE: 6546.547806
Fold 2 RMSE: 7034.829956
Fold 3 RMSE: 4310.793711
Fold 4 RMSE: 5289.315027
Fold 5 RMSE: 4453.972422

Overall OOF RMSLE: 25054.2131490


In [18]:
sub = pd.read_csv('/kaggle/input/prediction-of-e-commerce-users/submission.csv')
sub['e_users'] = test_xgb
sub.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")
print(sub.head(8))

Your submission was successfully saved!
              datetime       e_users
0  2024-09-01 00:00:00  43031.228516
1  2024-09-01 01:00:00  39281.670410
2  2024-09-01 02:00:00  37808.028809
3  2024-09-01 03:00:00  35372.660156
4  2024-09-01 04:00:00  34019.045898
5  2024-09-01 05:00:00  34032.298828
6  2024-09-01 06:00:00  35496.968750
7  2024-09-01 07:00:00  37363.539551
