In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/forecast-datframe/forecast_complete_data.csv
/kaggle/input/items-orders/Orders data.xlsx
/kaggle/input/items-orders/Associated_Item_data.csv


In [2]:
# Load the prepared history.  `order_date` is parsed to datetime64 at read time
# so that the later cut-off comparison and chronological sort operate on real
# dates instead of on strings (string ordering is only chronological when the
# text happens to be in ISO year-month-day form).
result = pd.read_csv(
    '/kaggle/input/forecast-datframe/forecast_complete_data.csv',
    parse_dates=['order_date'],
)

In [3]:
# Forecast window: the 14 calendar days from 2025-04-11 through 2025-04-24,
# both ends included, one entry per day.
forecast_start_date = pd.Timestamp('2025-04-11')
forecast_end_date = pd.Timestamp('2025-04-24')
future_dates = pd.date_range(forecast_start_date, forecast_end_date, freq='D')

In [4]:
# One row per distinct (customer, item) pair present in the history.
pair_columns = ['customer_id', 'order_item_id']
customer_items = result.loc[:, pair_columns].drop_duplicates()

In [5]:
# Build the forecast template as the cross product of customer-item pairs and
# forecast dates: each pair is repeated once per date (pair-major order) and
# the full date range is tiled once per pair.
n_days = len(future_dates)
n_pairs = len(customer_items)

customer_repeated = np.repeat(customer_items['customer_id'].values, n_days)
item_repeated = np.repeat(customer_items['order_item_id'].values, n_days)
dates_tiled = np.tile(future_dates, n_pairs)

# Assemble straight from the numpy arrays; column order is id, item, date.
future_df = pd.DataFrame(
    dict(
        customer_id=customer_repeated,
        order_item_id=item_repeated,
        order_date=dates_tiled,
    )
)

In [6]:
# Weekend flag: pandas numbers Monday as 0, so Saturday and Sunday are 5 and 6.
future_df['is_weekend'] = (future_df['order_date'].dt.dayofweek >= 5).astype(int)

In [7]:
# One 0/1 indicator column per weekday; the columns are labelled with the
# integers 0 (Monday) through 6 (Sunday).
weekday = future_df['order_date'].dt.dayofweek
for day in range(7):
    future_df[day] = weekday.eq(day).astype(int)

In [8]:
import holidays

# A `holidays` calendar is populated lazily: when it is created without
# `years=` it contains no entries until individual dates are looked up, so
# iterating over it right after construction yields nothing and every row
# would be flagged 0.  Build it explicitly for the years that the forecast
# window covers.
forecast_years = sorted(future_df['order_date'].dt.year.unique().tolist())
india_holidays = holidays.India(years=forecast_years)

# The calendar is keyed by `datetime.date`; convert the keys to pandas
# Timestamps so membership is tested against values of the same type as the
# datetime64 `order_date` column.
holiday_dates = set(pd.to_datetime(list(india_holidays.keys())))

# Vectorized membership check; normalize() removes any time-of-day component
# so that only the calendar date is compared.
future_df['holiday'] = (
    future_df['order_date'].dt.normalize().isin(holiday_dates).astype(int)
)

In [9]:
# Chronological split at 2025-03-01: earlier rows train the model, rows on or
# after that date are held out.  Features are taken by position, from the
# sixth column of `result` onward.
is_train_period = result['order_date'] < '2025-03-01'
is_test_period = result['order_date'] >= '2025-03-01'

X_train = result.loc[is_train_period].iloc[:, 5:]
X_test = result.loc[is_test_period].iloc[:, 5:]

In [10]:
# Training target: the fifth column of `result` (position 4), restricted to
# the rows dated before 2025-03-01.
train_rows = result['order_date'] < '2025-03-01'
y_train = result.loc[train_rows].iloc[:, 4]

In [11]:
# For gradient boosting approach
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [12]:
# Gradient-boosted regressor with 100 trees and a 0.05 learning rate; every
# other hyper-parameter is left at the library default.
model = LGBMRegressor(
    learning_rate=0.05,
    n_estimators=100,
)
model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.941116 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 740
[LightGBM] [Info] Number of data points in the train set: 17494136, number of used features: 17
[LightGBM] [Info] Start training from score 0.034840


In [13]:
def prepare_forecast_features(result, future_df, feature_cols, default_strategy='mean'):
    """Attach each customer-item pair's most recent feature values to a forecast template.

    For every (customer_id, order_item_id) group in ``result`` the rows are
    ordered by ``order_date`` and the last entry of each feature column is
    taken; those values are then left-joined onto ``future_df``.

    Note that pandas' ``GroupBy.last`` picks the last *non-null* entry of each
    column independently, so the values returned for one pair may originate
    from different rows when the most recent row has gaps.

    Parameters
    ----------
    result : pandas.DataFrame
        Historical data. Must contain ``customer_id``, ``order_item_id``,
        ``order_date`` and every column named in ``feature_cols``.
    future_df : pandas.DataFrame
        Forecast template containing ``customer_id`` and ``order_item_id``.
        It is not modified.
    feature_cols : sequence of str
        Names of the feature columns to carry forward.
    default_strategy : str, default 'mean'
        Currently unused; accepted only so existing calls keep working.
        No imputation is performed by this function.

    Returns
    -------
    pandas.DataFrame
        ``future_df`` with the columns in ``feature_cols`` appended, in the
        same row order as ``future_df``. Pairs that do not occur in ``result``
        receive NaN in the feature columns.

    Raises
    ------
    KeyError
        If ``result`` lacks a key column, ``order_date`` or a requested
        feature column.
    """
    key_cols = ['customer_id', 'order_item_id']
    feature_cols = list(feature_cols)

    # Fail early with an explicit message instead of deep inside pandas.
    required = key_cols + ['order_date'] + feature_cols
    missing = [col for col in required if col not in result.columns]
    if missing:
        raise KeyError(f"result is missing required column(s): {missing}")

    # 1. Latest feature values per pair.  Only the columns that are actually
    #    needed are sorted and aggregated (the history can be very large), and
    #    a stable sort keeps the outcome deterministic when dates tie.
    needed = list(dict.fromkeys(required))
    grouped = (
        result[needed]
        .sort_values('order_date', kind='mergesort')
        .groupby(key_cols, as_index=False)
        .last()
    )

    # 2. Left-join onto the forecast template so every template row is kept.
    final_df = future_df.merge(
        grouped[key_cols + feature_cols],
        on=key_cols,
        how='left',
    )

    return final_df


# History-derived model inputs that are carried forward from each pair's
# past rows into the forecast window.
feature_columns = [
    'rolling_std_14d',
    'is_std_imputed',
    'discount_percentage',
    'lag_1_day',
    'lag_7_days',
    'lag_14_days',
    'lag_30_days',
    'rolling_mean_7d',
    'rolling_sum_30d',
]

# Join those features onto the customer-item-date forecast template.
forecast_features = prepare_forecast_features(result, future_df, feature_columns)

In [14]:
# Model input frame: everything from the fourth column onward, selected by
# position.  The three leading columns skipped here are the ones the forecast
# template was built with (customer_id, order_item_id, order_date).
test_model = forecast_features.iloc[:,3:]

In [15]:
# Remove the day-of-week indicator stored under the integer label 6.
# errors='ignore' turns this into a no-op when no such column exists, which
# also makes the cell safe to re-run.
test_model = test_model.drop(columns=6, errors='ignore')

In [16]:
# Score every row of the forecast frame with the fitted model.
# NOTE(review): presumably the columns of `test_model` are in the same order
# as the training columns (`X_train`); confirm, since a positional mismatch
# would not necessarily raise an error.
test_pred = model.predict(test_model)
# Bare expression so the notebook displays the prediction array.
test_pred

array([4.10740469e-04, 4.10740469e-04, 4.10740469e-04, ...,
       7.79091759e-01, 7.79091759e-01, 1.64093578e+00])

In [17]:
# Round the continuous predictions to whole quantities and wrap them in a
# single-column frame (the column is labelled 0).
# NOTE(review): no customer / item / date keys are attached here, so rows can
# only be matched back to the forecast template by position - confirm that is
# what consumers of this frame expect.
rounded_pred = test_pred.round()
test_quantities = pd.DataFrame(rounded_pred)
test_quantities

Unnamed: 0,0
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
...,...
1611297,1.0
1611298,1.0
1611299,1.0
1611300,1.0


In [18]:
# Persist the rounded forecasts to the current working directory.  With the
# pandas defaults the frame's index is written as the first CSV column.
test_quantities.to_csv('forecasted_quantities.csv')