In [1]:
pip install pandas numpy holidays scikit-learn lightgbm

Defaulting to user installation because normal site-packages is not writeable
Collecting holidays
  Using cached holidays-0.65-py3-none-any.whl.metadata (26 kB)
Using cached holidays-0.65-py3-none-any.whl (1.2 MB)
Installing collected packages: holidays
Successfully installed holidays-0.65
Note: you may need to restart the kernel to use updated packages.


In [3]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
import holidays
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import TimeSeriesSplit
import lightgbm as lgb






In [4]:
# Step 2: Load data
train = pd.read_csv('train.csv', parse_dates=['date'])
test = pd.read_csv('test.csv', parse_dates=['date'])

In [5]:
# STEP 2: Handle Missing Values
# Impute missing target values using group median
train['num_sold'] = train.groupby(['country', 'store', 'product'])['num_sold'] \
                       .transform(lambda x: x.fillna(x.median()))

In [6]:
# STEP 3: Add Basic Date Features
for df in [train, test]:
    # Weekend feature
    df['dayofweek'] = df['date'].dt.dayofweek
    df['is_weekend'] = (df['dayofweek'] >= 5).astype('int8')
    
    # Month/year features
    df['month'] = df['date'].dt.month.astype('int8')
    df['year'] = df['date'].dt.year.astype('int16')

In [7]:
# STEP 4: Add Holiday Features
country_holidays = {
    'Canada': holidays.CA(years=range(2010, 2021)),
    'Finland': holidays.FI(years=range(2010, 2021)),
    'Italy': holidays.IT(years=range(2010, 2021)),
    'Kenya': holidays.KE(years=range(2010, 2021)),
    'Norway': holidays.NO(years=range(2010, 2021)),
    'Singapore': holidays.SG(years=range(2010, 2021))
}

In [8]:
for df in [train, test]:
    df['is_holiday'] = 0
    for country, cal in country_holidays.items():
        country_mask = df['country'] == country
        df.loc[country_mask, 'is_holiday'] = df.loc[country_mask, 'date'].isin(cal).astype('int8')

  df.loc[country_mask, 'is_holiday'] = df.loc[country_mask, 'date'].isin(cal).astype('int8')
  df.loc[country_mask, 'is_holiday'] = df.loc[country_mask, 'date'].isin(cal).astype('int8')
  df.loc[country_mask, 'is_holiday'] = df.loc[country_mask, 'date'].isin(cal).astype('int8')
  df.loc[country_mask, 'is_holiday'] = df.loc[country_mask, 'date'].isin(cal).astype('int8')
  df.loc[country_mask, 'is_holiday'] = df.loc[country_mask, 'date'].isin(cal).astype('int8')
  df.loc[country_mask, 'is_holiday'] = df.loc[country_mask, 'date'].isin(cal).astype('int8')
  df.loc[country_mask, 'is_holiday'] = df.loc[country_mask, 'date'].isin(cal).astype('int8')
  df.loc[country_mask, 'is_holiday'] = df.loc[country_mask, 'date'].isin(cal).astype('int8')
  df.loc[country_mask, 'is_holiday'] = df.loc[country_mask, 'date'].isin(cal).astype('int8')
  df.loc[country_mask, 'is_holiday'] = df.loc[country_mask, 'date'].isin(cal).astype('int8')
  df.loc[country_mask, 'is_holiday'] = df.loc[country_mask, 'date'].is

In [9]:
# STEP 5: Add Seasonality Features
for df in [train, test]:
    # Yearly seasonality
    df['dayofyear'] = df['date'].dt.dayofyear.astype('int16')
    df['year_sin'] = np.sin(2 * np.pi * df['dayofyear']/365).astype('float32')
    df['year_cos'] = np.cos(2 * np.pi * df['dayofyear']/365).astype('float32')

In [10]:
# STEP 6: Memory-Efficient Lag Features
# Process training data in groups
def add_lags(group):
    group = group.sort_values('date')
    for lag in [7, 14, 28]:  # 1, 2, 4 weeks
        group[f'lag_{lag}'] = group['num_sold'].shift(lag).astype('float32')
    return group

In [12]:
train = train.groupby(['country', 'store', 'product'], group_keys=False).apply(add_lags)

# Get last lag values for test data
last_lags = train.groupby(['country', 'store', 'product']).last().reset_index()
test = test.merge(last_lags[['country', 'store', 'product', 'lag_7', 'lag_14', 'lag_28']],
                 on=['country', 'store', 'product'], how='left')

  train = train.groupby(['country', 'store', 'product'], group_keys=False).apply(add_lags)


In [13]:
# STEP 7: Optimized Rolling Features
def add_rolling(group):
    group = group.sort_values('date')
    for window in [7, 28]:  # 1 week and 4 weeks
        group[f'rolling_{window}'] = group['num_sold'].shift(1).rolling(window, min_periods=1).mean().astype('float32')
    return group

In [14]:
train = train.groupby(['country', 'store', 'product'], group_keys=False).apply(add_rolling)

# Get last rolling values for test data
last_rolling = train.groupby(['country', 'store', 'product']).last().reset_index()
test = test.merge(last_rolling[['country', 'store', 'product', 'rolling_7', 'rolling_28']],
                 on=['country', 'store', 'product'], how='left')


  train = train.groupby(['country', 'store', 'product'], group_keys=False).apply(add_rolling)


In [15]:
# STEP 8: Fill Remaining NAs
for col in ['lag_7', 'lag_14', 'lag_28', 'rolling_7', 'rolling_28']:
    test[col] = test[col].fillna(test[col].median())


In [16]:
# STEP 9: Encode Categorical Features
from sklearn.preprocessing import LabelEncoder

for col in ['country', 'store', 'product']:
    le = LabelEncoder()
    combined = pd.concat([train[col], test[col]])
    le.fit(combined)
    train[col] = le.transform(train[col]).astype('int8')
    test[col] = le.transform(test[col]).astype('int8')

In [17]:
# STEP 10: Prepare Features
features = [
    'year', 'month', 'dayofweek', 'is_weekend', 'is_holiday',
    'year_sin', 'year_cos', 
    'lag_7', 'lag_14', 'lag_28',
    'rolling_7', 'rolling_28',
    'country', 'store', 'product'
]

In [18]:
X_train = train[features]
y_train = np.log1p(train['num_sold'])  # Log transform
X_test = test[features]

In [19]:
# STEP 11: Train Model
from sklearn.model_selection import TimeSeriesSplit
import lightgbm as lgb

tscv = TimeSeriesSplit(n_splits=3)
models = []

In [20]:
for fold, (train_idx, val_idx) in enumerate(tscv.split(X_train)):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    
    model = lgb.LGBMRegressor(
        objective='regression',
        metric='mape',
        num_leaves=31,
        learning_rate=0.1,
        n_estimators=500,
        random_state=42
    )
    
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        callbacks=[lgb.early_stopping(50, verbose=0)]
    )
    models.append(model)
    print(f"Fold {fold+1} trained")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001411 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1755
[LightGBM] [Info] Number of data points in the train set: 57534, number of used features: 15
[LightGBM] [Info] Start training from score 5.740918
Fold 1 trained
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002195 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1757
[LightGBM] [Info] Number of data points in the train set: 115066, number of used features: 15
[LightGBM] [Info] Start training from score 5.768754
Fold 2 trained
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003454 seconds.
You can set `force_row_wise=true` to remove the overh

In [21]:
# STEP 12: Generate Predictions
test_preds = np.mean([model.predict(X_test) for model in models], axis=0)
final_pred = np.expm1(test_preds)  # Convert from log scale

In [22]:
# Create submission
submission = pd.DataFrame({
    'id': test['id'],
    'num_sold': final_pred
})
submission.to_csv('submissiondeep.csv', index=False)