In [1]:
from os import path
from pathlib import Path
import pandas as pd

data_dir = path.join(Path().resolve(), "data")
dataset_dir = path.join(data_dir, "dataset")

csv_train = pd.read_csv(path.join(dataset_dir, "train.csv"))
csv_test = pd.read_csv(path.join(dataset_dir, "test.csv"))
csv_clothing_master = pd.read_csv(path.join(dataset_dir, "clothing_master.csv"))

In [2]:
import torch
from torch.utils import data
import lightning as L

L.seed_everything(42, workers=True)


def preprocess(x: pd.DataFrame) -> pd.DataFrame:
    x = x.copy()
    x = x.merge(csv_clothing_master, on="Clothing ID")
    x = x.drop(
        columns=[
            "Title",
            "Review Text",
            "Division Name",
            "Department Name",
            "Class Name",
        ]
    )
    return x


train = preprocess(csv_train)
test = preprocess(csv_test)

train_x, train_y = (
    train.drop(columns=["Recommended IND", "Rating"]),  # TODO: utilize "Rating"
    train["Recommended IND"],
)

train_val = data.TensorDataset(
    torch.tensor(train_x.values, dtype=torch.float32),
    torch.tensor(train_y, dtype=torch.float32),
)

train_size = int(len(train_val) * 0.9)
val_size = len(train_val) - train_size
dataset_train, dataset_val = data.random_split(
    train_val, lengths=[train_size, val_size]
)
dataset_test = data.TensorDataset(torch.tensor(test.values, dtype=torch.float32))

Seed set to 42


In [3]:
from torch import optim, nn
from torch.optim import optimizer


class LightningModel(L.LightningModule):
    def __init__(self, model: nn.Module):
        super().__init__()
        self.model = model
        self.loss_fn = nn.functional.mse_loss
        self.save_hyperparameters()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.model(x)

    def configure_optimizers(self) -> optimizer.Optimizer:
        return optim.Adam(self.parameters(), lr=1e-3)

    def training_step(self, batch: list[torch.Tensor]) -> torch.Tensor:
        inputs, target = batch
        output = self(inputs)
        loss = self.loss_fn(output, target)
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch: list[torch.Tensor]) -> torch.Tensor:
        inputs, target = batch
        output = self(inputs)
        loss = self.loss_fn(output, target)
        self.log("val_loss", loss)
        return loss

    def predict_step(self, batch: list[torch.Tensor]) -> torch.Tensor:
        inputs, *_ = batch
        output = self(inputs)
        return output


In [4]:
model = LightningModel(
    nn.Sequential(nn.Linear(len(train_x.columns), 64), nn.ReLU(), nn.Linear(64, 1))
)
trainer = L.Trainer(accelerator="gpu", max_epochs=2, log_every_n_steps=10, devices=1)

trainer.fit(model, train_dataloaders=dataset_train, val_dataloaders=dataset_val)

/home/kira/ghq/github.com/ras0q/atmaCup17/.venv/lib/python3.12/site-packages/lightning/pytorch/utilities/parsing.py:208: Attribute 'model' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['model'])`.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/kira/ghq/github.com/ras0q/atmaCup17/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
You are using a CUDA devi

                                                                           

  loss = self.loss_fn(output, target)
/home/kira/ghq/github.com/ras0q/atmaCup17/.venv/lib/python3.12/site-packages/lightning/pytorch/utilities/data.py:78: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 3. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.


Epoch 0:   0%|          | 6/9000 [00:00<02:18, 64.76it/s, v_num=10]

  loss = self.loss_fn(output, target)


Epoch 1: 100%|██████████| 9000/9000 [00:42<00:00, 210.67it/s, v_num=10]

`Trainer.fit` stopped: `max_epochs=2` reached.


Epoch 1: 100%|██████████| 9000/9000 [00:42<00:00, 210.64it/s, v_num=10]


In [6]:
outputs = trainer.predict(model, dataloaders=dataset_test)
assert outputs is not None

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 11155/11155 [00:11<00:00, 950.68it/s]


In [9]:
import csv
import time

with open(path.join(data_dir, f"submit_{int(time.time())}.csv"), "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["target"])
    writer.writerows([x.tolist() for x in outputs])