In [1]:
from os import path
from pathlib import Path
import pandas as pd

data_dir = path.join(Path().resolve(), "data")
dataset_dir = path.join(data_dir, "dataset")

csv_train = pd.read_csv(path.join(dataset_dir, "train.csv"))
csv_test = pd.read_csv(path.join(dataset_dir, "test.csv"))
csv_clothing_master = pd.read_csv(path.join(dataset_dir, "clothing_master.csv"))

In [2]:
for col in ["Division Name", "Department Name", "Class Name"]:
    df_encoded = pd.get_dummies(
        csv_clothing_master[col].astype(str), prefix=col
    ).astype(int)
    csv_clothing_master = csv_clothing_master.drop(col, axis=1)
    csv_clothing_master = pd.concat([csv_clothing_master, df_encoded], axis=1)


def preprocess(x: pd.DataFrame) -> pd.DataFrame:
    x = x.copy()
    x = x.merge(csv_clothing_master, on="Clothing ID")
    x = x.drop(
        [
            "Title",
            "Review Text",
        ],
        axis=1,
    )
    return x


train = preprocess(csv_train)
test = preprocess(csv_test)

train_x, train_y = (
    train.drop(["Recommended IND", "Rating"], axis=1),  # TODO: utilize "Rating"
    train["Recommended IND"],
)


In [3]:
import torch
from torch.utils import data
import lightning as L

L.seed_everything(42, workers=True)

train_val = data.TensorDataset(
    torch.tensor(train_x.values, dtype=torch.float32),
    torch.tensor(train_y, dtype=torch.float32),
)

train_size = int(len(train_val) * 0.9)
val_size = len(train_val) - train_size
dataset_train, dataset_val = data.random_split(
    train_val, lengths=[train_size, val_size]
)
dataset_test = data.TensorDataset(torch.tensor(test.values, dtype=torch.float32))

Seed set to 42


In [4]:
from torch import optim, nn
from torch.optim import optimizer


class LightningModel(L.LightningModule):
    def __init__(self, model: nn.Module):
        super().__init__()
        self.model = model
        self.loss_fn = nn.functional.mse_loss
        self.save_hyperparameters()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.model(x)

    def configure_optimizers(self) -> optimizer.Optimizer:
        return optim.Adam(self.parameters(), lr=1e-3)

    def training_step(self, batch: list[torch.Tensor]) -> torch.Tensor:
        inputs, target = batch
        output = self(inputs)
        loss = self.loss_fn(output, target)
        self.log(
            "train_loss",
            loss,
            prog_bar=True,
            sync_dist=True,
            on_step=True,
            on_epoch=False,
        )
        return loss

    def validation_step(self, batch: list[torch.Tensor]) -> torch.Tensor:
        inputs, target = batch
        output = self(inputs)
        loss = self.loss_fn(output, target)
        self.log(
            "val_loss",
            loss,
            prog_bar=True,
            sync_dist=True,
            on_step=True,
            on_epoch=True,
        )
        return loss

    def predict_step(self, batch: list[torch.Tensor]) -> torch.Tensor:
        inputs, *_ = batch
        output = self(inputs)
        return output


In [5]:
from lightning.pytorch.loggers import WandbLogger
import wandb

wandb.login()
logger = WandbLogger(name="atmaCup17", project="atmaCup17", log_model="all")

model = LightningModel(
    nn.Sequential(nn.Linear(len(train_x.columns), 64), nn.ReLU(), nn.Linear(64, 1))
)

trainer = L.Trainer(
    accelerator="gpu",
    max_epochs=20,
    log_every_n_steps=10,
    devices=1,
    logger=logger,
)

batch_size = 32
num_workers = 2
trainer.fit(
    model,
    train_dataloaders=data.DataLoader(
        dataset_train, batch_size=batch_size, num_workers=num_workers, pin_memory=True
    ),
    val_dataloaders=data.DataLoader(
        dataset_val, batch_size=batch_size, num_workers=num_workers, pin_memory=True
    ),
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mras0q[0m ([33mras0q-team[0m). Use [1m`wandb login --relogin`[0m to force relogin
/home/kira/ghq/github.com/ras0q/atmaCup17/.venv/lib/python3.12/site-packages/lightning/pytorch/utilities/parsing.py:208: Attribute 'model' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['model'])`.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3060 Ti') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 1.9 K  | train
---------------------------------------------
1.9 K     Trainable params
0         Non-trainable params
1.9 K     Total params
0.008     Total estimated model params size (MB)
4         Modules in train mode
0         Modules in eval mode


                                                                           

  loss = self.loss_fn(output, target)


Epoch 0:   0%|          | 0/282 [00:00<?, ?it/s] 

  loss = self.loss_fn(output, target)


Epoch 0: 100%|██████████| 282/282 [00:01<00:00, 197.19it/s, v_num=ewhg, train_loss=0.209] 

  loss = self.loss_fn(output, target)


Epoch 0: 100%|██████████| 282/282 [00:01<00:00, 174.21it/s, v_num=ewhg, train_loss=0.209, val_loss_step=0.177, val_loss_epoch=0.200]

  loss = self.loss_fn(output, target)


Epoch 19: 100%|██████████| 282/282 [00:01<00:00, 148.96it/s, v_num=ewhg, train_loss=0.189, val_loss_step=0.154, val_loss_epoch=0.167] 

`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 19: 100%|██████████| 282/282 [00:02<00:00, 136.51it/s, v_num=ewhg, train_loss=0.189, val_loss_step=0.154, val_loss_epoch=0.167]


In [6]:
outputs = trainer.predict(
    model,
    data.DataLoader(
        dataset_test, batch_size=batch_size, num_workers=num_workers, pin_memory=True
    ),
)
assert outputs is not None

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 349/349 [00:00<00:00, 402.85it/s]


In [7]:
import csv
import time
import numpy as np

with open(path.join(data_dir, f"submit_{int(time.time())}.csv"), "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["target"])
    writer.writerows([[np.mean(x.tolist())] for x in outputs])