In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

path = 'kaggle/input'

# Walk the input directory tree and print the full path of every file found.
for folder, _subdirs, files in os.walk(path):
    for name in files:
        print(os.path.join(folder, name))

kaggle/input\holidays_events.csv
kaggle/input\oil.csv
kaggle/input\sample_submission.csv
kaggle/input\stores.csv
kaggle/input\test.csv
kaggle/input\train.csv
kaggle/input\transactions.csv


In [85]:
# Load the training set from the input directory and preview its first rows.
train_df = pd.read_csv(os.path.join(path, 'train.csv'))
train_df.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0
1,1,2013-01-01,1,BABY CARE,0.0,0
2,2,2013-01-01,1,BEAUTY,0.0,0
3,3,2013-01-01,1,BEVERAGES,0.0,0
4,4,2013-01-01,1,BOOKS,0.0,0


In [86]:
# Load the store metadata (used later for the state/city lookup) and preview it.
stores = pd.read_csv(os.path.join(path, 'stores.csv'))
stores.head()

Unnamed: 0,store_nbr,city,state,type,cluster
0,1,Quito,Pichincha,D,13
1,2,Quito,Pichincha,D,13
2,3,Quito,Pichincha,D,8
3,4,Quito,Pichincha,D,9
4,5,Santo Domingo,Santo Domingo de los Tsachilas,D,4


In [87]:
# Summarise the key categorical dimensions. The distinct values are computed
# once and bound to names first: this avoids nesting same-type quotes inside
# the f-strings (a SyntaxError before Python 3.12) and a second unique() pass.
store_nbr_values = np.sort(train_df['store_nbr'].unique())
family_values = np.sort(train_df['family'].unique())
cluster_values = np.sort(stores['cluster'].unique())

print(f'store_nbr: {store_nbr_values}\nlength: {len(store_nbr_values)}\n')

print(f'family: {family_values}\nlength: {len(family_values)}\n')

print(f'cluster: {cluster_values}\n')

store_nbr: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
 49 50 51 52 53 54]
length: 54

family: ['AUTOMOTIVE' 'BABY CARE' 'BEAUTY' 'BEVERAGES' 'BOOKS' 'BREAD/BAKERY'
 'CELEBRATION' 'CLEANING' 'DAIRY' 'DELI' 'EGGS' 'FROZEN FOODS' 'GROCERY I'
 'GROCERY II' 'HARDWARE' 'HOME AND KITCHEN I' 'HOME AND KITCHEN II'
 'HOME APPLIANCES' 'HOME CARE' 'LADIESWEAR' 'LAWN AND GARDEN' 'LINGERIE'
 'LIQUOR,WINE,BEER' 'MAGAZINES' 'MEATS' 'PERSONAL CARE' 'PET SUPPLIES'
 'PLAYERS AND ELECTRONICS' 'POULTRY' 'PREPARED FOODS' 'PRODUCE'
 'SCHOOL AND OFFICE SUPPLIES' 'SEAFOOD']
length: 33

cluster: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17]



In [88]:
# Load the oil price series and parse its dates.
oil = pd.read_csv(os.path.join(path, 'oil.csv'))
oil['date'] = pd.to_datetime(oil['date'])
# Fill gaps forward first, then backward so a missing leading value is covered too.
oil['dcoilwtico'] = oil['dcoilwtico'].ffill().bfill()
oil.head()

Unnamed: 0,date,dcoilwtico
0,2013-01-01,93.14
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.2


In [89]:
# Load the holiday/event calendar and parse its dates.
holiday_events = pd.read_csv(os.path.join(path, 'holidays_events.csv'))
holiday_events['date'] = pd.to_datetime(holiday_events['date'])
# Keep only events that were not transferred. The explicit copy makes the
# result an independent frame, so adding a column below does not write into a
# slice of the original (which triggers pandas' SettingWithCopyWarning).
holiday_events = holiday_events[holiday_events['transferred'] == False].copy()
holiday_events['holiday'] = True
holiday_events.head()

Unnamed: 0,date,type,locale,locale_name,description,transferred,holiday
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta,False,True
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi,False,True
2,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,False,True
3,2012-04-14,Holiday,Local,Libertad,Cantonizacion de Libertad,False,True
4,2012-04-21,Holiday,Local,Riobamba,Cantonizacion de Riobamba,False,True


In [90]:
# Capture the distinct store numbers and product families as category indexes.
# preprocess() later replaces each raw value with its position in these indexes.
store_nbr_cat = train_df['store_nbr'].astype('category').cat
store_nbr_categories = store_nbr_cat.categories

family_cat = train_df['family'].astype('category').cat
family_categories = family_cat.categories

# Print both indexes so the value -> position mapping can be inspected.
print(f'store_nbr categories: {store_nbr_categories}')
print(f'family categories: {family_categories}')

store_nbr categories: Index([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
       37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54],
      dtype='int64')
family categories: Index(['AUTOMOTIVE', 'BABY CARE', 'BEAUTY', 'BEVERAGES', 'BOOKS',
       'BREAD/BAKERY', 'CELEBRATION', 'CLEANING', 'DAIRY', 'DELI', 'EGGS',
       'FROZEN FOODS', 'GROCERY I', 'GROCERY II', 'HARDWARE',
       'HOME AND KITCHEN I', 'HOME AND KITCHEN II', 'HOME APPLIANCES',
       'HOME CARE', 'LADIESWEAR', 'LAWN AND GARDEN', 'LINGERIE',
       'LIQUOR,WINE,BEER', 'MAGAZINES', 'MEATS', 'PERSONAL CARE',
       'PET SUPPLIES', 'PLAYERS AND ELECTRONICS', 'POULTRY', 'PREPARED FOODS',
       'PRODUCE', 'SCHOOL AND OFFICE SUPPLIES', 'SEAFOOD'],
      dtype='object')


In [93]:
from statsmodels.tsa.deterministic import DeterministicProcess, CalendarFourier
from sklearn.preprocessing import StandardScaler


def get_location(df):
    """Attach each store's ``state`` and ``city`` to ``df``.

    The lookup columns come from the notebook-level ``stores`` frame and are
    joined onto ``df`` with a left merge keyed on ``store_nbr``.

    Returns:
        A new DataFrame with the ``state`` and ``city`` columns appended.
    """
    location_lookup = stores[['store_nbr', 'state', 'city']]
    return df.merge(location_lookup, on='store_nbr', how='left')

def get_oil(df):
    """Attach the oil price (``dcoilwtico``) for each row's date.

    The price series is read from ``oil.csv``, expanded to a gap-free daily
    calendar and filled (forward, then backward) *before* it is merged. Filling
    on the calendar rather than on the merged rows means the value assigned to
    a date depends neither on the row order of ``df`` nor on where ``df``
    happens to start: a frame that begins on a day without a quote receives
    the most recent earlier price instead of being back-filled from a later
    one.

    Args:
        df: Frame with a datetime ``date`` column.

    Returns:
        A new DataFrame with a ``dcoilwtico`` column appended.
    """
    oil = pd.read_csv(os.path.join(path, 'oil.csv'))
    oil['date'] = pd.to_datetime(oil['date'])

    # One price per date, in chronological order.
    prices = (
        oil.drop_duplicates(subset='date')
        .set_index('date')['dcoilwtico']
        .sort_index()
    )

    if len(df) and not prices.empty:
        # Cover both the quoted range and every date requested by df.
        first_day = min(prices.index.min(), df['date'].min())
        last_day = max(prices.index.max(), df['date'].max())
        calendar = pd.date_range(first_day, last_day, freq='D')
        prices = prices.reindex(calendar).ffill().bfill()

    daily_prices = prices.rename_axis('date').reset_index()

    return df.merge(daily_prices, on='date', how='left')

def _event_flag(df, events, locale, df_key=None):
    """Return a boolean array marking rows of ``df`` hit by ``events`` of one locale.

    National events match on ``date`` alone; otherwise ``df[df_key]`` must also
    equal the event's ``locale_name``.
    """
    scoped = events[events['locale'] == locale]

    if df_key is None:
        return df['date'].isin(scoped['date']).to_numpy()

    # De-duplicate the join keys so the left merge keeps exactly one output row
    # per input row, in the input order.
    keys = scoped[['date', 'locale_name']].drop_duplicates()
    matched = df[['date', df_key]].merge(
        keys,
        left_on=['date', df_key],
        right_on=['date', 'locale_name'],
        how='left',
        indicator=True,
    )
    return (matched['_merge'] == 'both').to_numpy()


def _any_locale_flag(df, events):
    """Combine the national, regional (by state) and local (by city) event flags."""
    return (
        _event_flag(df, events, 'National')
        | _event_flag(df, events, 'Regional', 'state')
        | _event_flag(df, events, 'Local', 'city')
    )


def get_holiday(df):
    """Add holiday and special-day indicator columns to ``df``.

    Events are read from ``holidays_events.csv``; transferred entries are
    dropped. Types ``Holiday``/``Transfer`` feed the ``holiday`` flag and types
    ``Additional``/``Event``/``Bridge`` feed the ``special_day`` flag. An event
    applies to a row when it is national, or regional with ``locale_name``
    equal to the row's ``state``, or local with ``locale_name`` equal to the
    row's ``city``.

    Compared with merging the event tables directly, the flags are computed as
    plain boolean arrays so that:
      * several events on the same date never multiply the rows of ``df``;
      * a match at any locale sets the flag (no NaN-dependent ``|`` on object
        columns);
      * ``before_holiday`` / ``after_holiday`` look at the neighbouring row of
        the same store *and family* series rather than the neighbouring row
        of the store, which is usually another family on the same date.

    Args:
        df: Frame with datetime ``date`` plus ``store_nbr``, ``state`` and
            ``city`` columns, ordered by date within each series.

    Returns:
        A new DataFrame (default integer index) with integer columns
        ``holiday``, ``before_holiday``, ``after_holiday``, ``special_day`` and
        ``store_closed`` appended, in that order.
    """
    holiday_events = pd.read_csv(os.path.join(path, 'holidays_events.csv'))
    holiday_events['date'] = pd.to_datetime(holiday_events['date'])

    holiday_events = holiday_events[holiday_events['transferred'] == False]

    holidays = holiday_events[holiday_events['type'].isin(['Holiday', 'Transfer'])]
    special_days = holiday_events[holiday_events['type'].isin(['Additional', 'Event', 'Bridge'])]

    # Work on a fresh frame with a default index (what the merge-based version
    # returned) so the caller's frame is never modified.
    df = df.reset_index(drop=True)

    df['holiday'] = _any_locale_flag(df, holidays).astype(int)

    # Shift within each individual time series; fall back to per-store when the
    # frame has no family column.
    series_keys = [col for col in ('store_nbr', 'family') if col in df.columns]
    holiday_by_series = df.groupby(series_keys)['holiday']
    df['before_holiday'] = holiday_by_series.shift(-1).fillna(0).astype(int)
    df['after_holiday'] = holiday_by_series.shift(1).fillna(0).astype(int)

    df['special_day'] = _any_locale_flag(df, special_days).astype(int)

    # Flag 1 January on every row.
    df['store_closed'] = ((df['date'].dt.month == 1) & (df['date'].dt.day == 1)).astype(int)

    return df

def get_seasonality(df, order=1, min_date=pd.to_datetime('2013-01-01'), max_date=pd.to_datetime('2017-08-31')):
    """Append deterministic seasonal terms to ``df``.

    A daily calendar from ``min_date`` to ``max_date`` is passed to a
    ``DeterministicProcess`` configured with seasonal indicators and one
    ``CalendarFourier`` term (frequency ``'YE'``, the given ``order``). Its
    in-sample columns are left-merged onto ``df`` by matching ``df['date']``
    against the calendar index.

    Args:
        df: Frame with a datetime ``date`` column.
        order: Order of the calendar Fourier term.
        min_date: First day of the calendar.
        max_date: Last day of the calendar.

    Returns:
        The merged DataFrame.
    """
    calendar = pd.date_range(min_date, max_date, freq='D')

    process = DeterministicProcess(
        index=calendar,
        order=0,
        constant=False,
        seasonal=True,
        additional_terms=[CalendarFourier(freq='YE', order=order)],
        drop=False,
    )

    return df.merge(process.in_sample(), left_on='date', right_index=True, how='left')

def get_lags(df, lags=(1,)):
    """Add lagged-sales columns and a lagged percentage-change column.

    For every ``lag`` a column ``sales_lag_<lag>`` holds the ``sales`` value
    ``lag`` rows earlier within the same (``store_nbr``, ``family``) series,
    with missing values set to 0. ``pct_change`` holds the previous row's
    one-step percentage change of ``sales`` within the series.

    A percentage change measured from a zero-sales row is +/-inf; such values,
    like undefined ones, are replaced by 1 so that the column is always finite.

    Args:
        df: Frame with ``store_nbr``, ``family`` and ``sales`` columns, ordered
            by date within each series. It is modified in place.
        lags: Iterable of positive row offsets.

    Returns:
        ``(df, lag_names)`` where ``lag_names`` lists the lag columns added.
    """
    lag_names = [f'sales_lag_{lag}' for lag in lags]
    series_keys = [df['store_nbr'], df['family']]
    sales_by_series = df.groupby(['store_nbr', 'family'])['sales']

    for lag, name in zip(lags, lag_names):
        df[name] = sales_by_series.shift(lag).fillna(0)

    current_change = sales_by_series.transform(lambda s: s.pct_change(periods=1))
    # inf is not NaN, so fillna alone would let it through to the model.
    current_change = current_change.replace([np.inf, -np.inf], np.nan).fillna(1)

    # Shift by one row so the feature only uses information from earlier rows;
    # the first row of each series has no predecessor and gets the same filler.
    df['pct_change'] = current_change.groupby(series_keys).shift(1).fillna(1)

    return df, lag_names

def get_windows(df, avg_windows=(7, 30)):
    """Add trailing rolling-mean columns of ``sales`` per store/family series.

    For every ``window`` a column ``sales_window_<window>`` holds the mean of
    the ``window`` rows *preceding* the current row within the same
    (``store_nbr``, ``family``) series. The current row is excluded on purpose:
    including it would put the value being predicted into its own feature, and
    for rows whose ``sales`` is still unknown (NaN) the window could never be
    complete. Windows with fewer than ``window`` known values are set to 0.

    Args:
        df: Frame with ``store_nbr``, ``family`` and ``sales`` columns, ordered
            by date within each series. It is modified in place.
        avg_windows: Iterable of window lengths in rows.

    Returns:
        ``(df, window_names)`` where ``window_names`` lists the columns added.
    """
    window_names = [f'sales_window_{window}' for window in avg_windows]
    sales_by_series = df.groupby(['store_nbr', 'family'])['sales']

    for window, name in zip(avg_windows, window_names):
        trailing_mean = sales_by_series.transform(
            # shift(1) drops the current row from the window.
            lambda s, w=window: s.shift(1).rolling(window=w, min_periods=w).mean()
        )
        df[name] = trailing_mean.fillna(0)

    return df, window_names


def preprocess(df, proc_date=None, test=False):
    """Turn raw sales rows into the numeric feature frame used by the model.

    Args:
        df: Raw rows with at least ``id``, ``date``, ``store_nbr``, ``family``
            and ``sales`` columns. The input is copied, not modified.
        proc_date: When given, only rows whose ``date`` equals this value are
            kept (after all features have been computed on the full frame).
        test: When False, rows whose lag features are all zero or that have
            any zero window feature are dropped.

    Returns:
        ``(df, dates)``: the feature frame without ``date``, ``state``,
        ``city`` and ``id``, and the ``date`` column of the retained rows.
    """
    df = df.copy()
    df['date'] = pd.to_datetime(df['date'])

    # Feature steps; the order matters because later steps read columns that
    # earlier ones add (e.g. get_holiday needs state/city from get_location).
    df = get_location(df)
    df = get_oil(df)
    df = get_holiday(df)
    df, lag_names = get_lags(df, lags=[1, 7])
    # Blank out sales on rows where every lag is zero, before the window
    # features are computed from sales.
    df.loc[df[lag_names].eq(0).all(axis=1), 'sales'] = np.nan
    df, window_names = get_windows(df, avg_windows=[7, 30])
    df = get_seasonality(df, order=4)

    # Replace raw store numbers / family names with their position in the
    # notebook-level category indexes; get_loc raises KeyError for unseen values.
    df['store_nbr'] = df['store_nbr'].apply(lambda x: store_nbr_categories.get_loc(x))
    df['family'] = df['family'].apply(lambda x: family_categories.get_loc(x))

    if proc_date is not None:
        df = df[df['date'] == proc_date]

    if not test:
        df = df[~(df[lag_names].eq(0).all(axis=1))]
        df = df[~(df[window_names].eq(0).any(axis=1))]

    # Sales and every sales-derived feature are moved to log1p scale.
    for col in lag_names + window_names + ['sales']:
        df[col] = np.log1p(df[col])

    # Fixed shift/scale for the oil price.
    # NOTE(review): 67.8197 / 25.6708 look like a precomputed mean / std — confirm.
    df['dcoilwtico'] = (df['dcoilwtico'] - 67.8197) / 25.6708

    # Keep the dates for the caller before the non-numeric columns are dropped.
    dates = df['date']
    df = df.drop(columns=['date', 'state', 'city', 'id'])

    return df, dates

In [94]:
# Preprocess a single series (store 22, family HOME CARE) and export it for
# manual inspection.
train_sample = pd.read_csv(os.path.join(path, 'train.csv'))
train_sample = train_sample[(train_sample['store_nbr'] == 22) & (train_sample['family'] == 'HOME CARE')]
train_sample = train_sample.reset_index(drop=True)
train_sample, dates = preprocess(train_sample)

# preprocess() drops the date column; re-attach it to derive the weekday name.
train_sample['date'] = dates
# train_sample.info()
train_sample['day_of_week'] = train_sample['date'].dt.day_name()
train_sample.to_csv('train_sample_preprocessed.csv', index=False)

In [None]:
# Run only the location and holiday steps on the same single series and export
# the result so the holiday flags can be checked by hand.
train_sample = pd.read_csv(os.path.join(path, 'train.csv'))
train_sample = train_sample[(train_sample['store_nbr'] == 22) & (train_sample['family'] == 'HOME CARE')]
train_sample['date'] = pd.to_datetime(train_sample['date'])
train_sample = get_location(train_sample)
holiday_test = get_holiday(train_sample)

holiday_test.to_csv('holiday_test.csv', index=False)

In [None]:
# Build an exploration frame from the full training set with location, oil and
# holiday columns only; sales stay on their original scale here.
# x = preprocess(train_df)
x = train_df.copy()
x['date'] = pd.to_datetime(x['date'])
x = x.merge(stores[['store_nbr', 'city', 'state']], on='store_nbr', how='left')
x = get_oil(x)
x = get_holiday(x)
# x.sample(1000, random_state=42).to_csv('train_preprocessed.csv', index=False)

In [None]:
# Enumerate the distinct (store, family) pairs, then keep the rows of ten
# pairs drawn with a fixed random_state.
combos = x[['store_nbr', 'family']].drop_duplicates().reset_index(drop=True)

combo_sample = x.merge(combos.sample(10, random_state=1), on=['store_nbr', 'family'], how='inner')

combo_sample.info()

In [None]:
# Restrict the sampled series to January-April 2016 and plot each one on its
# own axis (5 x 2 grid, shared x axis).
# Note: this cell overwrites combo_sample with the filtered frame.
combo_sample['date'] = pd.to_datetime(combo_sample['date'])
combo_sample = combo_sample[combo_sample['date'].dt.year == 2016]
combo_sample = combo_sample[combo_sample['date'].dt.month < 5]
groups = combo_sample.groupby(['store_nbr', 'family'])

fig, axs = plt.subplots(5, 2, figsize=(12, 15), sharex=True)
axs = axs.flatten()

# zip stops at the shorter of the two, so at most ten groups are drawn.
for ax, ((store_nbr, family), group) in zip(axs, groups):
    group = group.sort_values('date')
    ax.plot(group['date'], group['sales'], marker='o', markersize=2)
    ax.set_title(f'Store {store_nbr}, Family: {family}')
    ax.tick_params(axis='x', rotation=45)
    # Fit the y axis to this series' own range.
    ax.set_ylim(group['sales'].min(), group['sales'].max())

plt.tight_layout()
plt.show()

In [None]:
# Take the group at position 1 (i.e. the second group) from the groups object
# and sort it by date. The names first_key / first_group are historical.
first_key, first_group = list(groups)[1]
first_group = first_group.sort_values('date')
sales = first_group['sales'].reset_index(drop=True)

# Create lag plots for lags 1 through 7 on a 2 x 4 grid (eight axes, seven used)
fig, axes = plt.subplots(2, 4, figsize=(15, 8))
axes = axes.flatten()

for lag in range(1, 8): 
    shifted = sales.shift(lag)
    # Ensure we only compare valid (non-NaN) points
    valid = shifted.notna()
    axes[lag - 1].scatter(shifted[valid], sales[valid], s=20, alpha=0.7)
    axes[lag - 1].set_title(f'Lag {lag}')
    axes[lag - 1].set_xlabel(f'sales(t-{lag})')
    axes[lag - 1].set_ylabel('sales(t)')

# Disable the unused eighth subplot
axes[-1].axis('off')

plt.tight_layout()
plt.show()

In [None]:
from statsmodels.graphics.tsaplots import plot_acf

# Autocorrelation of the selected series for lags 0 through 7.
plot_acf(sales, lags=7)
plt.show()

In [None]:
# Source: https://www.kaggle.com/code/ryanholbrook/seasonality

def plot_periodogram(ts, detrend='linear', ax=None):
    """Draw the periodogram of ``ts`` on a log-scaled frequency axis.

    The spectrum is computed with ``scipy.signal.periodogram`` using a sampling
    frequency of 365 observations per unit, a boxcar window and
    ``scaling='spectrum'``; the x ticks carry calendar-style labels.

    Args:
        ts: The series to analyse.
        detrend: Passed through to ``periodogram``.
        ax: Axes to draw on; a new figure is created when omitted.

    Returns:
        The axes that were drawn on.
    """
    from scipy.signal import periodogram

    samples_per_year = pd.Timedelta("365D") / pd.Timedelta("1D")
    freqs, power = periodogram(
        ts,
        fs=samples_per_year,
        detrend=detrend,
        window="boxcar",
        scaling='spectrum',
    )

    if ax is None:
        ax = plt.subplots()[1]

    # Tick position -> label.
    tick_labels = {
        1: "Annual (1)",
        2: "Semiannual (2)",
        4: "Quarterly (4)",
        6: "Bimonthly (6)",
        12: "Monthly (12)",
        26: "Biweekly (26)",
        52: "Weekly (52)",
        104: "Semiweekly (104)",
    }

    ax.step(freqs, power, color="purple")
    ax.set_xscale("log")
    ax.set_xticks(list(tick_labels.keys()))
    ax.set_xticklabels(list(tick_labels.values()), rotation=30)
    ax.ticklabel_format(axis="y", style="sci", scilimits=(0, 0))
    ax.set_ylabel("Variance")
    ax.set_title("Periodogram")
    return ax

In [None]:
# Periodogram of the series selected for the lag plots.
plot_periodogram(sales)

In [None]:
# test_x = pd.read_csv(os.path.join(path, 'test.csv'))
# test_x = preprocess(test_x, test_set=True)

# test_x.to_csv('test_preprocessed.csv', index=True)

In [None]:
# Preprocess the full training set, export a 10,000-row sample (fixed
# random_state) for inspection and print the resulting schema.
train_x, dates = preprocess(train_df)
train_x.sample(10000, random_state=42).to_csv('train_preprocessed.csv', index=True)
train_x.info()

In [None]:
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.model_selection import train_test_split

class CustomDataset(Dataset):
    """Tensor view of a preprocessed feature frame.

    The feature columns (everything except ``target_col``) are interpreted by
    position: column 0 is the store index, column 1 the family index, the next
    ``num_cont`` columns form ``x_cont`` and all remaining columns form
    ``x_time``.

    All tensors are built once in ``__init__`` so that ``__getitem__`` is a
    plain tensor lookup instead of materialising a pandas row per sample.

    Args:
        dataframe: Preprocessed frame; every feature column must be numeric.
        target_col: Name of the target column.
        num_cont: Number of columns that go into ``x_cont``.
    """

    def __init__(self, dataframe, target_col='sales', num_cont=4):
        self.dataframe = dataframe
        self.target_col = target_col
        self.features = dataframe.drop(columns=[target_col])
        self.targets = dataframe[target_col].values

        cont_end = 2 + num_cont
        # Cast the index columns to integers (truncating, like a long tensor
        # built from a float) and everything else to float32.
        self._store_nbr = torch.as_tensor(self.features.iloc[:, 0].to_numpy().astype(np.int64))
        self._family = torch.as_tensor(self.features.iloc[:, 1].to_numpy().astype(np.int64))
        self._x_cont = torch.as_tensor(self.features.iloc[:, 2:cont_end].to_numpy(dtype=np.float32))
        self._x_time = torch.as_tensor(self.features.iloc[:, cont_end:].to_numpy(dtype=np.float32))
        self._y = torch.as_tensor(np.asarray(self.targets, dtype=np.float32))

    def __len__(self):
        """Return the number of rows in the frame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return ``(store_nbr, family, x_cont, x_time, y)`` for row ``idx``."""
        return (
            self._store_nbr[idx],
            self._family[idx],
            self._x_cont[idx],
            self._x_time[idx],
            self._y[idx],
        )
    
def load_data(df, ttsplit=False, **kwargs):
    """Preprocess ``df`` and wrap the result in DataLoader(s).

    Args:
        df: Raw rows accepted by ``preprocess``.
        ttsplit: When True, split the preprocessed rows 80/20 (fixed
            ``random_state=42``) and return a pair of loaders.
        **kwargs: Forwarded unchanged to every ``DataLoader``.

    Returns:
        One ``DataLoader``, or ``(train_loader, val_loader)`` when ``ttsplit``
        is True.
    """
    features, _dates = preprocess(df)

    if not ttsplit:
        return DataLoader(CustomDataset(features), **kwargs)

    train_part, val_part = train_test_split(features, test_size=0.2, random_state=42)
    train_dataset = CustomDataset(train_part)
    val_dataset = CustomDataset(val_part)
    return DataLoader(train_dataset, **kwargs), DataLoader(val_dataset, **kwargs)
    
class RMSLELoss(torch.nn.Module):
    """Root mean squared error between ``log1p`` of predictions and targets."""

    def forward(self, y_pred, y_true):
        """Return the scalar RMSLE of ``y_pred`` against ``y_true``."""
        log_gap = torch.log1p(y_pred) - torch.log1p(y_true)
        return log_gap.pow(2).mean().sqrt()

In [None]:
# class Standardize(torch.nn.Module):
#     def __init__(self, mean, std):
#         super().__init__()
#         self.register_buffer("mean", torch.tensor(mean, dtype=torch.float32))
#         self.register_buffer("std",  torch.tensor(std,  dtype=torch.float32))

#     def forward(self, x):
#         return (x - self.mean) / self.std

class SalesModel(torch.nn.Module):
    """Feed-forward regressor over store/family embeddings and numeric features.

    Args:
        num_stores: Number of distinct store indexes.
        store_embed: Width of the store embedding.
        num_families: Number of distinct family indexes.
        fam_embed: Width of the family embedding.
        input_size: Combined width of ``x_cont`` and ``x_time``.
    """

    def __init__(self, num_stores, store_embed, num_families, fam_embed, input_size):
        super().__init__()
        self.store_embedding = torch.nn.Embedding(num_stores, store_embed)
        self.family_embedding = torch.nn.Embedding(num_families, fam_embed)
        self.fc1 = torch.nn.Linear(store_embed + fam_embed + input_size, 128)
        self.fc2 = torch.nn.Linear(128, 64)
        self.fc3 = torch.nn.Linear(64, 1)

    def forward(self, store_nbr, family, x_cont, x_time):
        """Return one prediction per row as a tensor of shape ``(batch,)``.

        Args:
            store_nbr: Long tensor of store indexes, shape ``(batch,)``.
            family: Long tensor of family indexes, shape ``(batch,)``.
            x_cont: Float tensor, shape ``(batch, n_cont)``.
            x_time: Float tensor, shape ``(batch, n_time)``.
        """
        store_emb = self.store_embedding(store_nbr)
        family_emb = self.family_embedding(family)
        x = torch.cat([store_emb, family_emb, x_cont, x_time], dim=1)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        # Drop only the trailing feature axis. A bare squeeze() would also drop
        # the batch axis for a batch of one and return a 0-d tensor.
        return x.squeeze(-1)

In [None]:
from pathlib import Path
from datetime import datetime
from torch.utils.tensorboard import SummaryWriter
from torch.nn import MSELoss


def train(
    exp_dir: str = "logs",
    num_epoch: int = 50,
    lr: float = 1e-3,
    batch_size: int = 128,
    seed: int = 42,
    **kwargs,
):
    """Train ``SalesModel`` on ``train.csv`` and save its weights.

    The training file is preprocessed and split by ``load_data``; the model is
    optimised with AdamW against an MSE loss on the (log1p-scaled) target.
    Losses are written to TensorBoard and the final weights are saved both as
    ``sales_model.th`` in the working directory and inside the run's log
    directory.

    Args:
        exp_dir: Parent directory for the timestamped TensorBoard run folder.
        num_epoch: Number of passes over the training split.
        lr: AdamW learning rate.
        batch_size: Batch size for both loaders.
        seed: Seed applied to torch and numpy.
        **kwargs: Accepted for call-site compatibility; not used.
    """
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        print("CUDA not available, using CPU")
        device = torch.device("cpu")

    model_name = 'sales_model'

    # set random seed so each run is deterministic
    torch.manual_seed(seed)
    np.random.seed(seed)

    # directory with timestamp to save tensorboard logs and model checkpoints
    log_dir = Path(exp_dir) / f"{model_name}_{datetime.now().strftime('%m%d_%H%M%S')}"
    logger = SummaryWriter(log_dir)

    df = pd.read_csv(os.path.join(path, 'train.csv'))
    train_data, val_data = load_data(df, ttsplit=True, shuffle=True, batch_size=batch_size, num_workers=0)

    # Size the first layer from the data instead of a hard-coded number: every
    # feature column except the two index columns (store, family) is fed to the
    # network directly, so this stays correct when preprocess() gains or loses
    # columns.
    input_size = train_data.dataset.features.shape[1] - 2

    model = SalesModel(
        num_stores=54,
        store_embed=4,
        num_families=33,
        fam_embed=4,
        input_size=input_size,
    )
    model = model.to(device)

    # create loss function and optimizer
    loss_func = MSELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    global_step = 0

    print('Training started...\n')

    # training loop
    for epoch in range(num_epoch):
        model.train()
        epoch_losses = []    # every step of this epoch
        recent_losses = []   # steps since the last 100-step log point
        val_losses = []

        for batch in train_data:
            store_nbr, family, x_cont, x_time, y = (item.to(device) for item in batch)

            pred = model(store_nbr, family, x_cont, x_time)
            # Flatten both sides so a final batch of one cannot trigger a
            # shape-broadcast in the loss.
            loss = loss_func(pred.reshape(-1), y.reshape(-1))
            epoch_losses.append(loss.item())
            recent_losses.append(loss.item())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if global_step % 100 == 0:
                train_loss = sum(recent_losses) / len(recent_losses)
                logger.add_scalar("train_loss", train_loss, global_step)
                if global_step % 5000 == 0:
                    print(f"Epoch: {epoch}, Step {global_step}: train_loss={train_loss:.4f}")
                recent_losses = []

            global_step += 1

        model.eval()
        with torch.inference_mode():
            for batch in val_data:
                store_nbr, family, x_cont, x_time, y = (item.to(device) for item in batch)

                pred = model(store_nbr, family, x_cont, x_time)
                loss = loss_func(pred.reshape(-1), y.reshape(-1))
                val_losses.append(loss.item())

        # Average over the whole epoch; the 100-step buffer is reset while
        # training and would cover only the tail of the epoch (or nothing).
        epoch_train_loss = sum(epoch_losses) / len(epoch_losses) if epoch_losses else float('nan')
        epoch_val_loss = sum(val_losses) / len(val_losses) if val_losses else float('nan')

        logger.add_scalar("train_loss", epoch_train_loss, global_step)
        logger.add_scalar("val_loss", epoch_val_loss, global_step)

        print(
            f"Epoch {epoch + 1:2d} / {num_epoch:2d}: "
            f"train_loss={epoch_train_loss:.4f} "
            f"val_loss={epoch_val_loss:.4f}\n"
        )

    # Flush pending TensorBoard events to disk.
    logger.close()

    # save and overwrite the model in the root directory
    torch.save(model.state_dict(), f'{model_name}.th')

    # save a copy of model weights in the log directory
    torch.save(model.state_dict(), log_dir / f'{model_name}.th')

    print(f"Model saved to {log_dir / f'{model_name}.th'}")

In [None]:
def evaluate(
    model_path: str = "sales_model.th",
    batch_size: int = 128,
    **kwargs,
):
    """Predict sales for the first date of the test set.

    The saved weights are loaded into a ``SalesModel``, the tail of the
    training data is joined with the test rows to provide history for the lag
    and window features, and the rows of the first test date are scored.

    Args:
        model_path: Path to a state dict saved by ``train``.
        batch_size: Batch size used for inference.
        **kwargs: Accepted for call-site compatibility; not used.

    Returns:
        A 1-D numpy array of non-negative sales predictions (original scale),
        in the row order produced by ``preprocess`` for that date.
    """
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        print("CUDA not available, using CPU")
        device = torch.device("cpu")

    store_embed = 4
    fam_embed = 4

    # Read the numeric input width from the checkpoint so the network always
    # matches the weights it is about to load.
    state_dict = torch.load(model_path, map_location=device)
    input_size = state_dict['fc1.weight'].shape[1] - store_embed - fam_embed

    model = SalesModel(
        num_stores=54,
        store_embed=store_embed,
        num_families=33,
        fam_embed=fam_embed,
        input_size=input_size,
    )
    model.load_state_dict(state_dict)
    model = model.to(device)
    model.eval()

    train_df = pd.read_csv(os.path.join(path, 'train.csv'))
    test_df = pd.read_csv(os.path.join(path, 'test.csv'))
    test_df['sales'] = np.nan

    merged_df = pd.concat([train_df, test_df], ignore_index=True)
    merged_df['date'] = pd.to_datetime(merged_df['date'])

    # The first date without sales is the one to predict. Keep 31 days of
    # history so the 30-day window feature built by preprocess() has enough
    # rows (the same margin the day-by-day forecasting cell uses).
    proc_date = merged_df[merged_df['sales'].isna()]['date'].min()
    context_date = proc_date - pd.Timedelta(days=31)
    merged_df = merged_df[merged_df['date'] >= context_date].copy()

    print(f'Context date: {context_date}')
    print(f'Processing date: {proc_date}')

    df, _ = preprocess(merged_df, proc_date, test=True)
    test_data = DataLoader(CustomDataset(df, target_col='sales'), batch_size=batch_size, shuffle=False, num_workers=0)

    predictions = []

    with torch.inference_mode():
        for store_nbr, family, x_cont, x_time, _ in test_data:
            store_nbr, family, x_cont, x_time = store_nbr.to(device), family.to(device), x_cont.to(device), x_time.to(device)
            pred = model(store_nbr, family, x_cont, x_time)
            # Back from log1p scale; reshape keeps a batch of one 1-D so that
            # np.concatenate below always receives arrays, never scalars.
            pred = torch.expm1(pred).reshape(-1)
            predictions.append(pred.cpu().numpy())

    predictions = np.concatenate(predictions)
    # Sales cannot be negative.
    predictions = np.clip(predictions, 0, None)

    return predictions

In [None]:
# Day-by-day forecast over the whole test period: each date is predicted from
# the history available so far, and the predictions are written back into
# merged_df so they serve as lagged sales for the following dates.
model_path = 'sales_model.th'

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    print("CUDA not available, using CPU")
    device = torch.device("cpu")

# Read the numeric input width from the checkpoint (fc1 input width minus the
# two 4-wide embeddings) so the network always matches the saved weights.
state_dict = torch.load(model_path, map_location=device)

model = SalesModel(
    num_stores=54,
    store_embed=4,
    num_families=33,
    fam_embed=4,
    input_size=state_dict['fc1.weight'].shape[1] - 4 - 4,
)
model.load_state_dict(state_dict)
model = model.to(device)
model.eval()

train_df = pd.read_csv(os.path.join(path, 'train.csv'))
test_df = pd.read_csv(os.path.join(path, 'test.csv'))
test_df['sales'] = np.nan

merged_df = pd.concat([train_df, test_df], ignore_index=True)
merged_df['date'] = pd.to_datetime(merged_df['date'])

start_date = merged_df[merged_df['sales'].isna()]['date'].min()
end_date = merged_df['date'].max()
context_date = start_date - pd.Timedelta(days=31)
# Copy the filtered rows: predictions are assigned into this frame below, and
# assigning into a slice of another frame is unreliable in pandas.
merged_df = merged_df[merged_df['date'] >= context_date].copy()

print(f'Context date: {context_date}')

for proc_date in pd.date_range(start_date, end_date):
    print(f'Processing date: {proc_date}')

    df, _ = preprocess(merged_df, proc_date, test=True)
    test_data = DataLoader(CustomDataset(df, target_col='sales'), batch_size=128, shuffle=False, num_workers=0)

    predictions = []

    with torch.inference_mode():
        for store_nbr, family, x_cont, x_time, _ in test_data:
            store_nbr, family, x_cont, x_time = store_nbr.to(device), family.to(device), x_cont.to(device), x_time.to(device)
            pred = model(store_nbr, family, x_cont, x_time)
            # Back from log1p scale; reshape keeps a batch of one 1-D so that
            # np.concatenate always receives arrays, never scalars.
            pred = torch.expm1(pred).reshape(-1)
            predictions.append(pred.cpu().numpy())

    predictions = np.concatenate(predictions)
    predictions = np.clip(predictions, 0, None)
    merged_df.loc[merged_df['date'] == proc_date, 'sales'] = predictions

merged_df.to_excel('evaluation.xlsx', index=False)

# The submission covers every forecast date, i.e. everything from the first
# date that had no sales value.
submission = merged_df[merged_df['date'] >= start_date][['id', 'sales']]
submission.to_csv('submission.csv', index=False)

In [None]:
# Export the preprocessed rows of one series for manual comparison.
# After preprocess() store_nbr and family hold positions in the category
# indexes, so 0 and 30 select the first store and the family at position 30.
z, _ = preprocess(train_df)

z = z[z['store_nbr'] == 0]
z = z[z['family'] == 30]

z.to_excel('eval_counter.xlsx', index=False)

In [None]:
# Debug export: preprocess the first test date with ten days of history and
# write both the raw context rows and the resulting features to Excel.
train_df = pd.read_csv(os.path.join(path, 'train.csv'))
test_df = pd.read_csv(os.path.join(path, 'test.csv'))
test_df['sales'] = np.nan

merged_df = pd.concat([train_df, test_df], ignore_index=True)
merged_df['date'] = pd.to_datetime(merged_df['date'])

proc_date = merged_df[merged_df['sales'].isna()]['date'].min()
print(f'Processing date: {proc_date}')
context_date = proc_date - pd.Timedelta(days=10)
print(f'Context date: {context_date}')

merged_df = merged_df[merged_df['date'] >= context_date]

# preprocess() returns (features, dates); unpack it so that df is the feature
# frame. Binding the whole tuple to df made df.to_excel fail below.
df, _ = preprocess(merged_df, proc_date, test=True)

merged_df.to_excel('merged_df.xlsx', index=False, sheet_name='merged_preprocessed')
df.to_excel('eval_df.xlsx', index=False, sheet_name='df_preprocessed')

In [None]:
# Training run configuration and launch.
# NOTE(review): train() writes 'sales_model.th', which the forecasting cells
# earlier in the notebook load; on a fresh checkout this cell has to run
# before them.
exp_dir = "logs"
num_epoch = 10
lr = 1e-3
batch_size = 256
seed = 42

train(
    exp_dir=exp_dir,
    num_epoch=num_epoch,
    lr=lr,
    batch_size=batch_size,
    seed=seed,
)