In [2]:
import numpy as np 
import pandas as pd 
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.nn import functional as F
from pathlib import Path
from torch.autograd import Variable
import random
import math
from tqdm.auto import tqdm
import os
from datetime import datetime, timedelta
file_root = Path('/home/scc/Downloads/playData')

        
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ['TORCH_USE_CUDA_DSA'] = '1'

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
def all_seed(seed = 6666):
    """
    设置随机种子
    """
    np.random.seed(seed)
    random.seed(seed)
    # CPU
    torch.manual_seed(seed) 
    # GPU
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        torch.cuda.manual_seed(seed) 
    # python 全局
    os.environ['PYTHONHASHSEED'] = str(seed) 
    # cudnn
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.enabled = False
    print(f'Set env random_seed = {seed}')

In [10]:
all_seed(2023)

Set env random_seed = 2023


# Data Review

In [4]:
tr_df = pd.read_csv(file_root.joinpath('train.csv'))
te_df = pd.read_csv(file_root.joinpath('test.csv'))
print(te_df.shape[0]/5/3/5)
print(tr_df.shape[0]/5/3/5/365)
print(tr_df.num_sold.min())
display(tr_df.head())
te_df.head()

365.0
5.002739726027397
2


Unnamed: 0,id,date,country,store,product,num_sold
0,0,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Improve Your Coding,63
1,1,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Train More LLMs,66
2,2,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Win Friends and Influence People,9
3,3,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Win More Kaggle Competitions,59
4,4,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Write Better,49


Unnamed: 0,id,date,country,store,product
0,136950,2022-01-01,Argentina,Kaggle Learn,Using LLMs to Improve Your Coding
1,136951,2022-01-01,Argentina,Kaggle Learn,Using LLMs to Train More LLMs
2,136952,2022-01-01,Argentina,Kaggle Learn,Using LLMs to Win Friends and Influence People
3,136953,2022-01-01,Argentina,Kaggle Learn,Using LLMs to Win More Kaggle Competitions
4,136954,2022-01-01,Argentina,Kaggle Learn,Using LLMs to Write Better


In [6]:
country_str2code = dict(zip(
    sorted(tr_df.country.unique()),
    range(tr_df.country.nunique())
))
store_str2code = dict(zip(
    sorted(tr_df.store.unique()),
    range(tr_df.store.nunique())
))
product_str2code = dict(zip(
    sorted(tr_df['product'].unique()),
    range(tr_df['product'].nunique())
))

print(country_str2code, '\n', store_str2code, '\n', product_str2code)
tr_df.country.value_counts(), tr_df.store.value_counts(), tr_df['product'].value_counts()

{'Argentina': 0, 'Canada': 1, 'Estonia': 2, 'Japan': 3, 'Spain': 4} 
 {'Kagglazon': 0, 'Kaggle Learn': 1, 'Kaggle Store': 2} 
 {'Using LLMs to Improve Your Coding': 0, 'Using LLMs to Train More LLMs': 1, 'Using LLMs to Win Friends and Influence People': 2, 'Using LLMs to Win More Kaggle Competitions': 3, 'Using LLMs to Write Better': 4}


(Argentina    27390
 Canada       27390
 Estonia      27390
 Japan        27390
 Spain        27390
 Name: country, dtype: int64,
 Kaggle Learn    45650
 Kaggle Store    45650
 Kagglazon       45650
 Name: store, dtype: int64,
 Using LLMs to Improve Your Coding                 27390
 Using LLMs to Train More LLMs                     27390
 Using LLMs to Win Friends and Influence People    27390
 Using LLMs to Win More Kaggle Competitions        27390
 Using LLMs to Write Better                        27390
 Name: product, dtype: int64)

# DataLoader

In [8]:
PRED_LEN = 56

In [7]:
def generate_time_slice(df, his_len=60, pred_len=7):
    time_slices = sorted(df['date_time'].unique())
    train_slice_list = []
    for st, ed in zip(range(len(time_slices) - his_len), range(his_len - 1, len(time_slices) - pred_len)):
        train_slice_list.append([time_slices[st], time_slices[ed], time_slices[ed + pred_len]])
    return train_slice_list


class regDataset(Dataset):
    def __init__(self, pd_df, slice_list, his_len=60, pred_len=7):
        super(regDataset, self).__init__()
        self.his_len = his_len
        self.pred_len = pred_len
        self.df = pd_df.set_index('date_time')
        self.slice_list = slice_list
         
    def __len__(self):
        return len(self.slice_list)
    
    def __getitem__(self,idx):
        sl = self.slice_list[0]
        tr_sl = self.df.loc[sl[0]:sl[1], ['tp', 'tp_id', 'num_sold']].reset_index().sort_values(by=['date_time', 'tp'], ignore_index=True) 
        lb_sl = self.df.loc[sl[1]+1:sl[2], ['tp', 'tp_id', 'num_sold']].reset_index().sort_values(by=['date_time', 'tp'], ignore_index=True) 
        series = torch.Tensor(tr_sl[['num_sold']].values.reshape(-1, 75)).float()
        labels = torch.Tensor(lb_sl[['num_sold']].values.reshape(-1, 75)).float()
        
        aa = tr_sl[['tp_id']].values.reshape(-1, 75)[0]
        emb_ = np.stack([pd.DataFrame(aa)[0].str.split('&&', expand=True).values.astype(int) for _ in range(self.his_len)])
        return torch.Tensor(emb_).long(), series, labels

In [9]:
tr_df['country_id'] = tr_df['country'].map(country_str2code)
tr_df['store_id'] = tr_df['store'].map(store_str2code)
tr_df['product_id'] = tr_df['product'].map(product_str2code)
tr_df['date_time'] = pd.to_datetime(tr_df['date'], format="%Y-%m-%d")
tr_df['tp_id'] = tr_df.country_id.map(str) + '&&'\
                + tr_df.store_id.map(str)   + "&&"\
                + tr_df['product_id'].map(str) + "&&"\
                + tr_df['date_time'].dt.day.map(str) + "&&"\
                + tr_df['date_time'].dt.dayofweek.map(str) 
tr_df['tp'] = tr_df.country + '&&' + tr_df.store  + "&&" + tr_df['product']
total_slice_list = generate_time_slice(tr_df, pred_len=PRED_LEN)

In [11]:
np.random.shuffle(total_slice_list)
tr_len = int(len(total_slice_list) * 0.8)

In [12]:
tr_slice = total_slice_list[:tr_len]
val_slice = total_slice_list[tr_len:]
tr_dataset = regDataset(tr_df, tr_slice, his_len=60, pred_len=PRED_LEN)
val_dataset = regDataset(tr_df, val_slice, his_len=60, pred_len=PRED_LEN)

tr_dataloader = DataLoader(tr_dataset, shuffle=True, batch_size=24)
val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=24)

In [13]:
emb, series_, label = tr_dataset[1]
emb.shape, series_.shape, label.shape

(torch.Size([60, 75, 5]), torch.Size([60, 75]), torch.Size([56, 75]))

# Model Archi

In [14]:
class regNet(nn.Module):
    def __init__(self, pred_len=7):
        super(regNet, self).__init__()
        self.pred_len = pred_len
        self.city_emb = nn.Embedding(5, 4)
        self.store_emb = nn.Embedding(3, 4)
        self.product_emb = nn.Embedding(5, 8)
        self.day_of_month_emb = nn.Embedding(32, 6)
        self.day_of_week_emb = nn.Embedding(8, 6)
        self.pos_layer_norm = nn.LayerNorm(12, eps=1e-12)

        encoder_norm = nn.LayerNorm(75, eps=1e-5)
        encoder_layer = nn.TransformerEncoderLayer(d_model=75, nhead=5, norm_first=True, batch_first=True)
        self.tf_encode = nn.TransformerEncoder(encoder_layer, num_layers=2, norm=encoder_norm)
        
        decode_norm = nn.LayerNorm(75, eps=1e-5)
        decode_layer = nn.TransformerDecoderLayer(d_model=75, nhead=5, norm_first=True, batch_first=True)
        self.tf_decode = nn.TransformerDecoder(decode_layer, num_layers=2, norm=decode_norm)
        self.out_w = nn.Parameter(torch.randn(60, self.pred_len), requires_grad=True)
        self.out_bais = nn.Parameter(torch.zeros(self.pred_len), requires_grad=True)

    def forward(self, emb_ipt, seires_ipt):
        # emb_ipt [b, 60, 75, 3]
        city_res, store_res, product_res = [], [], []
        day_res, week_res = [], []
        batch_size = emb_ipt.shape[0]
        for i in range(batch_size):
            city_res.append(self.city_emb(emb_ipt[i, :, :, 0])[:, 0, :])
            store_res.append(self.store_emb(emb_ipt[i, :, :, 1])[:, 0, :])
            product_res.append(self.product_emb(emb_ipt[i, :, :, 2])[:, 0, :])
            day_res.append(self.day_of_month_emb(emb_ipt[i, :, :, 3])[:, 0, :])
            week_res.append(self.day_of_week_emb(emb_ipt[i, :, :, 4])[:, 0, :])
        
        city = torch.stack(city_res)
        store = torch.stack(store_res)
        product = torch.stack(product_res)
        day = torch.stack(day_res)
        week = torch.stack(week_res)
        # [b, 60, 4+4+8] 
        enity_emb = torch.cat([city, store, product], dim=-1)
        
        pos_emb = self.pos_layer_norm(torch.cat([day, week], dim=-1))
        seires_ipt = torch.einsum('ble,bln->bln', [pos_emb, seires_ipt])
        # [b, 60, 75] -> [b, 60, 75]
        encode_out = self.tf_encode(seires_ipt)
        seq_len = encode_out.size(1)
        b = encode_out.size(0)
        # n, nhead, len, len
        mask = torch.stack([torch.tril(torch.ones(seq_len, seq_len)) for i in range(5 * b)]).to(seires_ipt.device)
        # print('encode_out.shape=', encode_out.shape, ", mask.shape=", mask.shape)
        decode_out = self.tf_decode(encode_out, encode_out[:, -1, :].unsqueeze(1), tgt_mask=mask)
        # [b, 60, 75] -> [b, 60, 75] -> [b, 7, 75]
        decode_out = torch.einsum('ble,bln->bln', [enity_emb, decode_out])
        res = torch.einsum("bln,lp->bnp", [decode_out, self.out_w]) + self.out_bais
        return torch.relu(torch.einsum("bnp->bpn", res))

In [15]:
# test
emb, series_, label = tr_dataset[1]
print(emb.shape, series_.shape, label.shape)
reg = regNet(PRED_LEN).to(device)
reg(emb.unsqueeze(0).to(device), series_.unsqueeze(0).to(device)).shape

torch.Size([60, 75, 5]) torch.Size([60, 75]) torch.Size([56, 75])


torch.Size([1, 56, 75])

# Training 

In [19]:
def SMAPE(y_pred, y):
    res = 2*(y - y_pred).abs() / (y.abs() + y_pred.abs() + 1e-5)
    return res.sum() / torch.prod(torch.Tensor(np.array(res.shape)))

In [20]:
from torch.optim.lr_scheduler import StepLR


def trainer(train_loader, valid_loader, model, config, device):
    model.to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate']) 
    scheduler = StepLR(optimizer, step_size=10, gamma=0.8)
    save_path = config['save_path']
    if not os.path.isdir('./models'):
        os.mkdir('./models')
    n_epochs, best_loss, step, early_stop_count = config['n_epochs'], math.inf, 0, 0
    for epoch in range(n_epochs):
        model.train()
        loss_record = []
        mape_record = []
        train_pbar = tqdm(train_loader, position=0, leave=True)
        for emb, series_, y in train_pbar:
            optimizer.zero_grad()             
            emb, series_, y = emb.to(device), series_.to(device), y.to(device)  
            pred = model(emb, series_)
            loss = criterion(pred, y)
            smape = SMAPE(pred.cpu().detach(), y.cpu().detach())
            loss.backward()                   

            optimizer.step()    
            step += 1
            l_ = loss.detach().item()
            loss_record.append(l_)
            mape_record.append(smape)
            train_pbar.set_description(f'Epoch [{epoch+1}/{n_epochs}]')
            train_pbar.set_postfix({'loss': f'{l_:.5f}', "smape" : f'{smape:.5f}'})
        
        scheduler.step()
        mean_train_loss = sum(loss_record)/len(loss_record)
        mean_train_mape = sum(mape_record)/len(mape_record)
        # model.eval() # 设置模型为评估模式
        loss_record = []
        mape_record = []
        for emb, series_, y in valid_loader:
            emb, series_, y = emb.to(device), series_.to(device), y.to(device)  
            with torch.no_grad():
                pred = model(emb, series_)
                loss = criterion(pred, y)
                smape = SMAPE(pred.cpu().detach(), y.cpu().detach())

            loss_record.append(loss.item())
            mape_record.append(smape)
            
        mean_valid_loss = sum(loss_record)/len(loss_record)
        mean_valid_mape = sum(mape_record)/len(mape_record)
        print(f'Epoch [{epoch+1}/{n_epochs}]: Train SMAPE: {mean_train_mape:.4f} Loss: {mean_train_loss:.5f}, Valid SMAPE: {mean_valid_mape:.4f} Loss: {mean_valid_loss:.5f}')

        if mean_valid_loss < best_loss: # mean_valid_loss
            best_loss = mean_valid_loss
            torch.save(model.state_dict(), save_path) 
            print('Saving model with loss {:.3f}...'.format(best_loss))
            early_stop_count = 0
        else: 
            early_stop_count += 1

        if early_stop_count >= config['early_stop']:
            print('\nModel is not improving, so we halt the training session.')
            return

In [21]:
config = {
    'seed': 6666,
    'n_epochs': 20,      
    'learning_rate': 2e-3,#1e-4,           
    'early_stop': 300,
    'save_path': './models/model.ckpt'
}
print(device)

cuda


In [22]:
reg = regNet(PRED_LEN)
trainer(tr_dataloader, val_dataloader, reg, config, device)

  0%|          | 0/57 [00:00<?, ?it/s]

Epoch [1/20]: Train SMAPE: 1.7627 Loss: 47242.41619, Valid SMAPE: 1.5506 Loss: 52277.82448
Saving model with loss 52277.824...


  0%|          | 0/57 [00:00<?, ?it/s]

Epoch [2/20]: Train SMAPE: 1.3220 Loss: 34065.22896, Valid SMAPE: 1.2096 Loss: 36192.95312
Saving model with loss 36192.953...


  0%|          | 0/57 [00:00<?, ?it/s]

Epoch [3/20]: Train SMAPE: 1.1130 Loss: 26303.09714, Valid SMAPE: 1.1736 Loss: 32861.38490
Saving model with loss 32861.385...


  0%|          | 0/57 [00:00<?, ?it/s]

Epoch [4/20]: Train SMAPE: 1.0787 Loss: 25684.40858, Valid SMAPE: 1.1685 Loss: 32757.00039
Saving model with loss 32757.000...


  0%|          | 0/57 [00:00<?, ?it/s]

Epoch [5/20]: Train SMAPE: 1.0772 Loss: 25648.15252, Valid SMAPE: 1.1737 Loss: 32666.45859
Saving model with loss 32666.459...


  0%|          | 0/57 [00:00<?, ?it/s]

Epoch [6/20]: Train SMAPE: 1.0808 Loss: 25652.03194, Valid SMAPE: 1.1807 Loss: 32737.59818


  0%|          | 0/57 [00:00<?, ?it/s]

Epoch [7/20]: Train SMAPE: 1.0784 Loss: 25642.57048, Valid SMAPE: 1.1920 Loss: 32795.92161


  0%|          | 0/57 [00:00<?, ?it/s]

Epoch [8/20]: Train SMAPE: 1.0789 Loss: 25633.61582, Valid SMAPE: 1.1923 Loss: 32935.71745


  0%|          | 0/57 [00:00<?, ?it/s]

Epoch [9/20]: Train SMAPE: 1.0780 Loss: 25619.77515, Valid SMAPE: 1.1895 Loss: 32906.20729


  0%|          | 0/57 [00:00<?, ?it/s]

Epoch [10/20]: Train SMAPE: 1.0740 Loss: 25605.22300, Valid SMAPE: 1.1963 Loss: 32964.00339


  0%|          | 0/57 [00:00<?, ?it/s]

Epoch [11/20]: Train SMAPE: 1.0712 Loss: 25591.57271, Valid SMAPE: 1.1929 Loss: 33036.81771


  0%|          | 0/57 [00:00<?, ?it/s]

Epoch [12/20]: Train SMAPE: 1.0691 Loss: 25583.86451, Valid SMAPE: 1.1961 Loss: 33073.07474


  0%|          | 0/57 [00:00<?, ?it/s]

Epoch [13/20]: Train SMAPE: 1.0681 Loss: 25579.34481, Valid SMAPE: 1.1935 Loss: 33120.10781


  0%|          | 0/57 [00:00<?, ?it/s]

Epoch [14/20]: Train SMAPE: 1.0660 Loss: 25574.03941, Valid SMAPE: 1.1894 Loss: 33121.64271


  0%|          | 0/57 [00:00<?, ?it/s]

Epoch [15/20]: Train SMAPE: 1.0643 Loss: 25570.89261, Valid SMAPE: 1.1901 Loss: 33136.57266


  0%|          | 0/57 [00:00<?, ?it/s]

Epoch [16/20]: Train SMAPE: 1.0629 Loss: 25568.18795, Valid SMAPE: 1.1914 Loss: 33133.31120


  0%|          | 0/57 [00:00<?, ?it/s]

Epoch [17/20]: Train SMAPE: 1.0618 Loss: 25566.74883, Valid SMAPE: 1.1922 Loss: 33145.89245


  0%|          | 0/57 [00:00<?, ?it/s]

Epoch [18/20]: Train SMAPE: 1.0605 Loss: 25564.77724, Valid SMAPE: 1.1924 Loss: 33196.31823


  0%|          | 0/57 [00:00<?, ?it/s]

Epoch [19/20]: Train SMAPE: 1.0593 Loss: 25563.58748, Valid SMAPE: 1.1941 Loss: 33224.69349


  0%|          | 0/57 [00:00<?, ?it/s]

Epoch [20/20]: Train SMAPE: 1.0584 Loss: 25562.63949, Valid SMAPE: 1.1889 Loss: 33153.26875


# forecast

In [23]:
te_df['country_id'] = te_df['country'].map(country_str2code)
te_df['store_id'] = te_df['store'].map(store_str2code)
te_df['product_id'] = te_df['product'].map(product_str2code)
te_df['date_time'] = pd.to_datetime(te_df['date'], format="%Y-%m-%d")
te_df['tp_id'] = te_df.country_id.map(str) + '&&'\
                + te_df.store_id.map(str)   + "&&"\
                + te_df['product_id'].map(str) + "&&"\
                + te_df['date_time'].dt.day.map(str) + "&&"\
                + te_df['date_time'].dt.dayofweek.map(str) 
te_df['tp'] = te_df.country + '&&' + te_df.store  + "&&" + te_df['product']

In [24]:
tr_df_cp = tr_df.copy(deep=True).set_index('date_time')

In [25]:
reg = regNet(PRED_LEN)
reg.load_state_dict(torch.load(config['save_path'], map_location='cpu'))
reg.to(device)

pred_statr_dt = te_df.date_time.min() - timedelta(days=60)
tt_pred_df = pd.concat([tr_df_cp.loc[pred_statr_dt:, :], te_df.set_index('date_time')])
tt_pred_df = tt_pred_df.reset_index()
tt_pred_df['c'] = tt_pred_df['tp'] + "&&" + tt_pred_df['date_time'].dt.strftime('%Y%m%d')
tt_pred_df = tt_pred_df.set_index('date_time')

his_len = 60
groupby_cols = ['date_time', 'tp', 'tp_id']
forcast_df_final = pd.DataFrame(columns=['date_time', 'tp', 'tp_id', 'num_sold'])
tqd_bar = tqdm(range(365 - PRED_LEN + 1))
for d in tqd_bar:
    pred_ipt_st = pred_statr_dt + timedelta(days=d)
    tqd_bar.set_description(f'start_dt={pred_ipt_st}')
    pred_ipt_ed = pred_ipt_st + timedelta(days=59)
    pred_f_ed = pred_ipt_ed + timedelta(days=PRED_LEN)
    if d:
        # update num_sold
        tmp_mg = tt_pred_df.reset_index().merge(forcast_df_final.rename(columns={"num_sold": 'pred'})[['c', 'pred']], how='left', on='c')
        pred_bool = (~tmp_mg.pred.isna()) & (tmp_mg['date_time'] >= te_df.date_time.min())
        tmp_mg.loc[pred_bool, 'num_sold'] = tmp_mg[pred_bool].apply(
            lambda c: c['pred'] if np.isnan(c['num_sold']) else (c['num_sold']+ c['pred'])/2, axis=1)
        tt_pred_df = tmp_mg.drop(columns='pred').set_index('date_time')
    sl = [pred_ipt_st, pred_ipt_ed, pred_f_ed]
    pred_ipt = tt_pred_df.loc[sl[0]:sl[1], ['tp', 'tp_id', 'num_sold']].reset_index().sort_values(by=['date_time', 'tp'], ignore_index=True)
    aa = pred_ipt[['tp_id']].values.reshape(-1, 75)[0]
    series = torch.Tensor(pred_ipt[['num_sold']].values.reshape(-1, 75)).float()
    emb_ = np.stack([pd.DataFrame(aa)[0].str.split('&&', expand=True).values.astype(int) for _ in range(his_len)])
    emb_ = torch.Tensor(emb_).long()
    with torch.no_grad():
        pred_res = reg(emb_.unsqueeze(0).to(device), series.unsqueeze(0).to(device))
    pred_flatten = pred_res[0].T.reshape(PRED_LEN*75).cpu().detach().numpy()
    forcast_df = tt_pred_df.loc[sl[1] + timedelta(1):sl[2], ['tp', 'tp_id', 'num_sold']].reset_index().sort_values(by=['date_time', 'tp'], ignore_index=True)
    forcast_df['num_sold'] = pred_flatten
    
    forcast_df_final = pd.concat([forcast_df_final, forcast_df])
    forcast_df_final = forcast_df_final.groupby(groupby_cols, as_index=False)['num_sold'].mean()
    forcast_df_final['c'] = forcast_df_final['tp'] + "&&" + forcast_df_final['date_time'].dt.strftime('%Y%m%d')

  0%|          | 0/310 [00:00<?, ?it/s]

In [26]:
sub_df = pd.read_csv(file_root.joinpath('sample_submission.csv'))

In [27]:
te_df['c'] = te_df['tp'] + "&&" + te_df['date_time'].dt.strftime('%Y%m%d')
sub_df = sub_df[['id']].merge(
    te_df.merge(forcast_df_final[['c', 'num_sold']], how='left', on='c')[['id', 'num_sold']],
    on = 'id'
)

sub_df['num_sold'] = np.round(sub_df['num_sold']).map(int)
sub_df.to_csv('submission.csv', index=False)
!head submission.csv

id,num_sold
136950,259
136951,0
136952,219
136953,207
136954,0
136955,0
136956,0
136957,248
136958,218
