<a href="https://colab.research.google.com/github/sardaharsh/AIDS_assignment2/blob/main/LSTM_Example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'walmart-sales-forecast:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F2107830%2F3502310%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20241012%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20241012T135532Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D8cde4e3400e701f3f546c6586fa449bfdcaabd6a433f5f4b266352fa14da24d26db090a72a31f71a65c8035eda7998e250269de0a2b6df1bcc42547f4428b0cadc9bc93a9f322957ec92fde87a285d6ee61187b98f9e83d97c06bb5a0cd46d043c2b9dc1fe84818a01f98a74c3d092f042df3fdb3faf696af96c1e9caa5692a59ed05e9306d6ebf50238fcc7c1e90d817ca8f5becfbfff1e58c712c7cf486df4c7dd524c005b1b68b5040cdc8502afed1b00373c8012cca930d1431a43ae792d953a9003a74f1b2e98c990399591d43508e48f3ad60aac3811a78351cca7fbbc3fa873b36c8ba0e7e9ee4a87473d37d1b5b81a37a3607d89a710e7c6140a0a43'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading walmart-sales-forecast, 3524213 bytes compressed
Downloaded and uncompressed: walmart-sales-forecast
Data source import complete.


In [2]:
!pip install --upgrade lightning wandb

Collecting lightning
  Downloading lightning-2.4.0-py3-none-any.whl.metadata (38 kB)
Collecting wandb
  Downloading wandb-0.18.3-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.7 kB)
Collecting lightning-utilities<2.0,>=0.10.0 (from lightning)
  Downloading lightning_utilities-0.11.7-py3-none-any.whl.metadata (5.2 kB)
Collecting torchmetrics<3.0,>=0.7.0 (from lightning)
  Downloading torchmetrics-1.4.3-py3-none-any.whl.metadata (19 kB)
Collecting pytorch-lightning (from lightning)
  Downloading pytorch_lightning-2.4.0-py3-none-any.whl.metadata (21 kB)
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting gitpython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-2.16.0-py2.py3-none-any.whl.metadata (9.8 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp31

In [3]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import lightning as pl
from lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping
from lightning.pytorch.loggers import WandbLogger
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import wandb
from lightning.pytorch.loggers import WandbLogger

In [4]:
train = pd.read_csv("/kaggle/input/walmart-sales-forecast/train.csv")

In [5]:
train

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
0,1,1,2010-02-05,24924.50,False
1,1,1,2010-02-12,46039.49,True
2,1,1,2010-02-19,41595.55,False
3,1,1,2010-02-26,19403.54,False
4,1,1,2010-03-05,21827.90,False
...,...,...,...,...,...
421565,45,98,2012-09-28,508.37,False
421566,45,98,2012-10-05,628.10,False
421567,45,98,2012-10-12,1061.02,False
421568,45,98,2012-10-19,760.01,False


In [6]:
train = train[train["Store"] == 1]

In [7]:
features = pd.read_csv("/kaggle/input/walmart-sales-forecast/features.csv")

In [8]:
features

Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday
0,1,2010-02-05,42.31,2.572,,,,,,211.096358,8.106,False
1,1,2010-02-12,38.51,2.548,,,,,,211.242170,8.106,True
2,1,2010-02-19,39.93,2.514,,,,,,211.289143,8.106,False
3,1,2010-02-26,46.63,2.561,,,,,,211.319643,8.106,False
4,1,2010-03-05,46.50,2.625,,,,,,211.350143,8.106,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8185,45,2013-06-28,76.05,3.639,4842.29,975.03,3.00,2449.97,3169.69,,,False
8186,45,2013-07-05,77.50,3.614,9090.48,2268.58,582.74,5797.47,1514.93,,,False
8187,45,2013-07-12,79.37,3.614,3789.94,1827.31,85.72,744.84,2150.36,,,False
8188,45,2013-07-19,82.84,3.737,2961.49,1047.07,204.19,363.00,1059.46,,,False


In [9]:
features = features[features["Store"] == 1]

In [10]:
train = train.merge(features, on="Date")

In [11]:
train

Unnamed: 0,Store_x,Dept,Date,Weekly_Sales,IsHoliday_x,Store_y,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday_y
0,1,1,2010-02-05,24924.50,False,1,42.31,2.572,,,,,,211.096358,8.106,False
1,1,1,2010-02-12,46039.49,True,1,38.51,2.548,,,,,,211.242170,8.106,True
2,1,1,2010-02-19,41595.55,False,1,39.93,2.514,,,,,,211.289143,8.106,False
3,1,1,2010-02-26,19403.54,False,1,46.63,2.561,,,,,,211.319643,8.106,False
4,1,1,2010-03-05,21827.90,False,1,46.50,2.625,,,,,,211.350143,8.106,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10239,1,99,2012-08-31,20.06,False,1,80.49,3.638,21442.73,102.32,21.70,13209.64,3032.96,222.305480,6.908,False
10240,1,99,2012-09-07,0.05,True,1,83.96,3.730,5204.68,35.74,50.94,4120.32,2737.17,222.439015,6.908,True
10241,1,99,2012-09-14,0.03,False,1,74.97,3.717,17212.52,7.00,18.79,1523.11,7992.72,222.582019,6.908,False
10242,1,99,2012-10-05,635.00,False,1,68.55,3.617,8077.89,,18.22,3617.43,3626.14,223.181477,6.573,False


In [12]:
train.isna().sum()

Unnamed: 0,0
Store_x,0
Dept,0
Date,0
Weekly_Sales,0
IsHoliday_x,0
Store_y,0
Temperature,0
Fuel_Price,0
MarkDown1,6587
MarkDown2,7229


In [13]:
train = train.fillna(0)

In [14]:
train

Unnamed: 0,Store_x,Dept,Date,Weekly_Sales,IsHoliday_x,Store_y,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday_y
0,1,1,2010-02-05,24924.50,False,1,42.31,2.572,0.00,0.00,0.00,0.00,0.00,211.096358,8.106,False
1,1,1,2010-02-12,46039.49,True,1,38.51,2.548,0.00,0.00,0.00,0.00,0.00,211.242170,8.106,True
2,1,1,2010-02-19,41595.55,False,1,39.93,2.514,0.00,0.00,0.00,0.00,0.00,211.289143,8.106,False
3,1,1,2010-02-26,19403.54,False,1,46.63,2.561,0.00,0.00,0.00,0.00,0.00,211.319643,8.106,False
4,1,1,2010-03-05,21827.90,False,1,46.50,2.625,0.00,0.00,0.00,0.00,0.00,211.350143,8.106,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10239,1,99,2012-08-31,20.06,False,1,80.49,3.638,21442.73,102.32,21.70,13209.64,3032.96,222.305480,6.908,False
10240,1,99,2012-09-07,0.05,True,1,83.96,3.730,5204.68,35.74,50.94,4120.32,2737.17,222.439015,6.908,True
10241,1,99,2012-09-14,0.03,False,1,74.97,3.717,17212.52,7.00,18.79,1523.11,7992.72,222.582019,6.908,False
10242,1,99,2012-10-05,635.00,False,1,68.55,3.617,8077.89,0.00,18.22,3617.43,3626.14,223.181477,6.573,False


In [15]:
train.columns.to_list()

['Store_x',
 'Dept',
 'Date',
 'Weekly_Sales',
 'IsHoliday_x',
 'Store_y',
 'Temperature',
 'Fuel_Price',
 'MarkDown1',
 'MarkDown2',
 'MarkDown3',
 'MarkDown4',
 'MarkDown5',
 'CPI',
 'Unemployment',
 'IsHoliday_y']

## Split on train and val

In [16]:
val = train[train["Date"] >= "2012-01-01"]
train = train[train["Date"] < "2012-01-01"]

In [17]:
train = train.drop(columns=["Date"]).astype("float")
val = val.drop(columns=["Date"]).astype("float")

## Scale data

In [18]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
train = scaler.fit_transform(train)
val = scaler.transform(val)

In [19]:
train

array([[0.        , 0.        , 0.12607961, ..., 0.0825055 , 1.        ,
        0.        ],
       [0.        , 0.        , 0.2293145 , ..., 0.09835706, 1.        ,
        1.        ],
       [0.        , 0.        , 0.2075873 , ..., 0.10346363, 1.        ,
        0.        ],
       ...,
       [0.        , 1.        , 0.01062418, ..., 0.96123998, 0.43396226,
        0.        ],
       [0.        , 1.        , 0.01331322, ..., 0.98062   , 0.43396226,
        0.        ],
       [0.        , 1.        , 0.00617503, ..., 1.        , 0.43396226,
        1.        ]])

In [20]:
class TimeSeriesDataset(torch.utils.data.Dataset):
    def __init__(self, data, seq_len):
        self.seq_len = seq_len
        self.data = []
        self.targets = []
        data = pd.DataFrame(data)
        data.columns = ['Store_x','Dept','Weekly_Sales','IsHoliday_x','Store_y','Temperature','Fuel_Price','MarkDown1','MarkDown2','MarkDown3','MarkDown4','MarkDown5','CPI','Unemployment','IsHoliday_y']
        data_parts = data.groupby(["Store_x", "Dept"])
        for idx, data in data_parts:
            for i in range(data.shape[0] - self.seq_len):
                self.data.append(torch.tensor(data[i:i+self.seq_len].drop(columns=["Weekly_Sales", "Store_y"]).values).to(torch.float32))
                self.targets.append(torch.tensor(data.iloc[i+self.seq_len]["Weekly_Sales"]).to(torch.float32))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.targets[idx]

In [21]:
train_dataset = TimeSeriesDataset(train, 20)
val_dataset = TimeSeriesDataset(val, 20)

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=32, shuffle=False)

In [22]:
train_dataset[0]

(tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, 1.2284e-01, 4.1637e-02, 0.0000e+00,
          0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 8.2506e-02, 1.0000e+00,
          0.0000e+00],
         [0.0000e+00, 0.0000e+00, 1.0000e+00, 5.5289e-02, 2.4408e-02, 0.0000e+00,
          0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 9.8357e-02, 1.0000e+00,
          1.0000e+00],
         [0.0000e+00, 0.0000e+00, 0.0000e+00, 8.0533e-02, 0.0000e+00, 0.0000e+00,
          0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0346e-01, 1.0000e+00,
          0.0000e+00],
         [0.0000e+00, 0.0000e+00, 0.0000e+00, 1.9964e-01, 3.3740e-02, 0.0000e+00,
          0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0678e-01, 1.0000e+00,
          0.0000e+00],
         [0.0000e+00, 0.0000e+00, 0.0000e+00, 1.9733e-01, 7.9684e-02, 0.0000e+00,
          0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.1010e-01, 1.0000e+00,
          0.0000e+00],
         [0.0000e+00, 0.0000e+00, 0.0000e+00, 3.9804e-01, 1.0983e

In [23]:
class LSTMModel(pl.LightningModule):
    def __init__(self, input_dim, hidden_dim, num_layers, dropout, output_dim):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x, _ = self.lstm(x)
        x = self.fc(x[:, -1, :])
        return x

    def training_step(self, batch, batch_idx):
        data, targets = batch
        outputs = self(data)
        loss = nn.MSELoss()(outputs, targets)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        data, targets = batch
        outputs = self(data)
        loss = nn.MSELoss()(outputs, targets)
        self.log('val_loss', loss)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.0002)

In [24]:
model = LSTMModel(input_dim=13, hidden_dim=50, num_layers=1, dropout=0.2, output_dim=1)

wandb_logger = WandbLogger(project='time_series_forecasting')

checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',
    dirpath='/kaggle/working/checkpoints/',
    filename='best-checkpoint',
    save_top_k=1,
    mode='min'
)

early_stopping_callback = EarlyStopping(
    monitor='val_loss',
    patience=5,
    mode='min'
)

trainer = pl.Trainer(
    max_epochs=50,
    logger=wandb_logger,
    callbacks=[checkpoint_callback, early_stopping_callback]
)

trainer.fit(model, train_dataloader, val_dataloader)

INFO: GPU available: False, used: False
INFO:lightning.pytorch.utilities.rank_zero:GPU available: False, used: False
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


INFO: 
  | Name | Type   | Params | Mode 
----------------------------------------
0 | lstm | LSTM   | 13.0 K | train
1 | fc   | Linear | 51     | train
----------------------------------------
13.1 K    Trainable params
0         Non-trainable params
13.1 K    Total params
0.052     Total estimated model params size (MB)
2         Modules in train mode
0         Modules in eval mode
INFO:lightning.pytorch.callbacks.model_summary:
  | Name | Type   | Params | Mode 
----------------------------------------
0 | lstm | LSTM   | 13.0 K | train
1 | fc   | Linear | 51     | train
----------------------------------------
13.1 K    Trainable params
0         Non-trainable params
13.1 K    Total params
0.052     Total estimated model params size (MB)
2         Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

  return F.mse_loss(input, target, reduction=self.reduction)


Training: |          | 0/? [00:00<?, ?it/s]

  return F.mse_loss(input, target, reduction=self.reduction)


Validation: |          | 0/? [00:00<?, ?it/s]

  return F.mse_loss(input, target, reduction=self.reduction)


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

In [25]:
wandb.finish()

VBox(children=(Label(value='0.014 MB of 0.014 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇█████
train_loss,▁▄▃▂▃▂▂▄▁▂▃▃▁▃▇▄▃▁▅▂▂▃▅▂▁█▃▃
trainer/global_step,▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
val_loss,▂█▁▃▃▄▃▁

0,1
epoch,7.0
train_loss,0.0159
trainer/global_step,1423.0
val_loss,0.0197
