I just wanted to practice implementing GANs. I don't know whether a GAN is a good choice for this task, or whether this GAN is implemented correctly.

Please don't feel bad.

If there are any oddities in this notebook, I would appreciate your pointing them out.

In [None]:
# For Google Colab
# This setup is disabled: it is wrapped in a string literal, so nothing below runs.
# To use it on Colab, remove the surrounding triple quotes. The enclosed commands mount
# Google Drive, install the Kaggle CLI, upload and install `kaggle.json`, then download
# and unzip the competition data.
"""
from google.colab import drive
drive.mount('/content/drive')

# Install kaggle packages
!pip install -q kaggle
!pip install -q kaggle-cli

# Lib
from google.colab import files

# Please Upload `kaggle.json` file
uploaded = files.upload()

# Then copy kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!ls ~/.kaggle

!kaggle competitions download -c tabular-playground-series-jun-2022
!unzip -o tabular-playground-series-jun-2022.zip -d tabular-playground-series-jun-2022
# !kaggle kernels output ehekatlact/tps2206-pytorch-lightning-fine-tuning2 -p ./DataSet
"""

In [None]:
%%capture
!pip install wandb
!pip install pytorch_lightning

In [None]:
import wandb
try:
    # On Kaggle: Add-ons -> Secrets -> store your W&B API key under the label `wandb_api`.
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    secret_value_0 = user_secrets.get_secret("wandb_api")
    wandb.login(key=secret_value_0)
    anony = None
except Exception:
    # Best-effort login: any failure (not on Kaggle, secret missing, login rejected) falls
    # back to anonymous mode. `Exception` is caught instead of using a bare `except` so
    # that KeyboardInterrupt / SystemExit still stop the cell.
    anony = "must"
    print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')

### Lib

In [None]:
# common
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import time, gc, string, math
from tqdm.notebook import tqdm
import warnings
import shutil
from collections import defaultdict
import heapq
import datetime
import random
from collections import OrderedDict
import glob
import copy

# sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

# pytorch
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from torch import optim
from torch.optim import lr_scheduler

# pytorch lightning
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger


In [None]:
# Let pandas render up to 200 columns and 200 rows before truncating the display.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, 200)

In [None]:
# for google colab
# os.chdir("/content/drive/MyDrive/colab_data/TPS2206")

In [None]:
# Disabled (kept as a string literal, so it does not run): when enabled, it deletes any
# existing ./model/ directory and recreates it empty.
"""
os.makedirs('model', exist_ok=True)
shutil.rmtree('./model/')
os.makedirs('model', exist_ok=True)
"""

### Read DF

In [None]:
# Load the competition table (indexed by `row_id`) and the sample submission
# (indexed by its `row-col` key).
data, sub = (
    pd.read_csv(csv_path, index_col=index_name)
    for csv_path, index_name in (
        ('../input/tabular-playground-series-jun-2022/data.csv', 'row_id'),
        ('../input/tabular-playground-series-jun-2022/sample_submission.csv', 'row-col'),
    )
)

In [None]:
# Mean-impute every column whose name does not contain "F_4".
# Columns containing "F_4" are skipped here, so their NaNs are still present afterwards.
for col in data.columns:
    if "F_4" in col:
        continue
    data[col] = data[col].fillna(data[col].mean())

In [None]:
# Names of the columns that still contain at least one missing value.
na_col_list = [col for col in data.columns if data[col].isna().sum() != 0]

In [None]:
# For each column with missing values, remember WHICH rows are missing.
# The rows are stored as index labels (not positions) because they are later consumed
# through `.loc`, which is label-based. For a 0..N-1 index the two coincide, but storing
# labels keeps the lookup correct for any index.
na_index_of = {
    col: data.index[data[col].isnull()].tolist()
    for col in na_col_list
}

In [None]:
# Split the columns into categorical and continuous ones by cardinality:
# a column with fewer than 32 distinct values is treated as categorical and
# label-encoded in place; every other column is treated as continuous.
cat_col_list, val_col_list = [], []
le = LabelEncoder()
for col in data.columns:
    if data[col].nunique() >= 32:
        val_col_list.append(col)
        continue
    cat_col_list.append(col)
    data[col] = le.fit_transform(data[col])

In [None]:
# Fill every remaining missing value with its column mean.
data = data.fillna(data.mean())

## Pytorch

### DataSet and DataLoader

In [None]:
class TrainDataset(Dataset):
    """Row-wise dataset that yields generator inputs and imputation targets.

    Args:
        X_val: 2-D array of continuous features, indexed by row.
        X_cat: 2-D array of label-encoded categorical features, indexed by row.
        X_na: 2-D array holding the target columns (the columns to impute).
        na_col_list: optional sequence of integer column positions inside ``X_val``
            that may be masked. When given, one of them is overwritten with a
            standard-normal random value on every access; when ``None`` the row is
            returned unmodified.

    Each item is ``(val_inputs, cat_inputs, outputs, zeros, ones, i)`` where ``i`` is the
    position within ``na_col_list`` of the masked column (0 when nothing was masked), and
    ``zeros`` / ``ones`` are constant length-1 label tensors.
    """

    def __init__(self, X_val, X_cat, X_na, na_col_list=None):
        self.X_val = X_val
        self.X_cat = X_cat
        self.X_na = X_na
        self.na_col_list = na_col_list

    def __len__(self):
        return len(self.X_na)

    def __getitem__(self, item):
        # Copy the row so that masking never writes back into the shared feature matrix.
        pre_val_inputs = self.X_val[item].copy()
        i = 0
        if self.na_col_list is not None:
            # Replace exactly one of the maskable continuous inputs with random noise.
            x = np.random.randn()
            # Use the dataset's own list: the previous code read a notebook-level global
            # of the same name, which only worked while both lists had the same length.
            i = random.randrange(0, len(self.na_col_list))
            pre_val_inputs[self.na_col_list[i]] = x
        val_inputs = torch.tensor(pre_val_inputs, dtype=torch.float32)
        cat_inputs = torch.tensor(self.X_cat[item], dtype=torch.int32)
        outputs = torch.tensor(self.X_na[item], dtype=torch.float32)
        zeros = torch.zeros(1, dtype=torch.float32)
        ones = torch.ones(1, dtype=torch.float32)

        return val_inputs, cat_inputs, outputs, zeros, ones, i

In [None]:
# Smoke test: build a TrainDataset on the full table and fetch its first item.
data_val = data.loc[:, val_col_list].values
data_cat = data.loc[:, cat_col_list].values
data_na = data.loc[:, na_col_list].values
# TrainDataset uses the entries of its last argument as integer positions into a row of
# `data_val`, so the column NAMES must be translated to positions first. Passing the
# names directly raises IndexError on the first item.
na_col_positions = [val_col_list.index(col) for col in na_col_list]
ds = TrainDataset(data_val, data_cat, data_na, na_col_positions)
v, c, o, zero, ones, i = ds[0]
del data_val, data_cat, data_na
gc.collect()

In [None]:
class DataModule(pl.LightningDataModule):
    """Supplies the train / validation / predict DataLoaders to the Lightning Trainer.

    Passing this object to the trainer lets it pick the matching loader for each stage.

    Args:
        train: DataFrame used for training.
        valid: DataFrame used for validation and prediction.
        cat_col_list: names of the categorical columns.
        val_col_list: names of the continuous columns.
        na_col_list: names of the columns to impute; each must also be in ``val_col_list``.
        batch_size: batch size shared by all loaders.
    """

    def __init__(self, train, valid, cat_col_list, val_col_list, na_col_list, batch_size):
        # Initialise the Lightning base class. Skipping this call is what previously
        # forced the manual `_log_hyperparams` attribute and the dummy
        # `prepare_data_per_node` override.
        super().__init__()
        self.train_val = train.loc[:, val_col_list].values
        self.valid_val = valid.loc[:, val_col_list].values
        self.train_cat = train.loc[:, cat_col_list].values
        self.valid_cat = valid.loc[:, cat_col_list].values
        self.train_na = train.loc[:, na_col_list].values
        self.valid_na = valid.loc[:, na_col_list].values
        self.batch_size = batch_size
        # Positions of the imputed columns inside the continuous-feature matrix.
        self.na_col_list = [val_col_list.index(col) for col in na_col_list]

    def _build_loader(self, ds, training):
        """Wrap ``ds`` in a DataLoader; training loaders shuffle and drop the last partial batch."""
        return DataLoader(
            ds,
            batch_size=self.batch_size,
            shuffle=training,
            pin_memory=True,
            drop_last=training,
            num_workers=CFG.num_workers,
            # DataLoader rejects persistent_workers=True when there are no workers.
            persistent_workers=CFG.num_workers > 0,
        )

    def train_dataloader(self):
        # Training rows get one maskable column replaced by noise (see TrainDataset).
        ds = TrainDataset(self.train_val, self.train_cat, self.train_na, self.na_col_list)
        return self._build_loader(ds, training=True)

    def val_dataloader(self):
        ds = TrainDataset(self.valid_val, self.valid_cat, self.valid_na)
        return self._build_loader(ds, training=False)

    def predict_dataloader(self):
        ds = TrainDataset(self.valid_val, self.valid_cat, self.valid_na)
        return self._build_loader(ds, training=False)

    def teardown(self, stage=None):
        # Release cached GPU memory and run the garbage collector when a stage finishes.
        torch.cuda.empty_cache()
        gc.collect()

### Pytorch Model

In [None]:
class Generator(nn.Module):
    """MLP that predicts the imputed columns from continuous and categorical inputs.

    Every categorical column gets its own embedding table; the embeddings are
    concatenated with the continuous features and passed through five linear layers.

    Args:
        val_input_size: number of continuous input features.
        cat_input_size: number of categorical input columns (may be 0).
        output_size: number of values to generate per row.
        hidden_size: base width of the hidden layers (default 100, as before).
        emb_dim: embedding dimension per categorical column (default 8, as before).
        num_embeddings: rows per embedding table (default 32, as before). Must be
            larger than the largest encoded category id fed to the model.
    """

    def __init__(self, val_input_size, cat_input_size, output_size,
                 hidden_size=100, emb_dim=8, num_embeddings=32):
        super().__init__()
        self.cat_input_size = cat_input_size
        _emb_list = [nn.Embedding(num_embeddings, emb_dim) for _ in range(cat_input_size)]
        self.emb_list = nn.ModuleList(_emb_list)
        self.fc1 = nn.Linear(val_input_size+emb_dim*cat_input_size, hidden_size*4)
        self.bn1 = nn.BatchNorm1d(hidden_size*4)
        self.fc2 = nn.Linear(hidden_size*4, hidden_size*4)
        self.fc3 = nn.Linear(hidden_size*4, hidden_size*2)
        self.fc4 = nn.Linear(hidden_size*2, hidden_size)
        self.fc5 = nn.Linear(hidden_size, output_size)

    def forward(self, val_x, cat_x):
        # Design notes: do not combine dropout with batch norm, and apply batch norm
        # before the activation function.
        # Collecting all pieces in one list keeps the model usable when there are no
        # categorical columns (concatenating an empty list of embeddings would fail).
        features = [val_x]
        for i in range(self.cat_input_size):
            features.append(self.emb_list[i](cat_x[:, i]))
        x = torch.cat(features, dim=1)
        x = F.silu(self.bn1((self.fc1(x))))
        x = F.silu(self.fc2(x))
        x = F.silu(self.fc3(x))
        x = F.silu(self.fc4(x))
        x = self.fc5(x)
        return x

In [None]:
class Discriminator(nn.Module):
    """MLP that maps a row of (real or generated) values to a single probability.

    Args:
        input_size: number of values per row fed to the network.
    """

    def __init__(self, input_size):
        super().__init__()
        base_width = 50
        wide, mid, narrow = base_width * 8, base_width * 4, base_width * 2
        self.fc1 = nn.Linear(input_size, wide)
        self.bn1 = nn.BatchNorm1d(wide)
        self.fc2 = nn.Linear(wide, mid)
        self.fc3 = nn.Linear(mid, narrow)
        self.fc4 = nn.Linear(narrow, base_width)
        self.fc5 = nn.Linear(base_width, 1)

    def forward(self, x):
        # Design notes: do not combine dropout with batch norm, and apply batch norm
        # before the activation function.
        hidden = F.silu(self.bn1(self.fc1(x)))
        for layer in (self.fc2, self.fc3, self.fc4):
            hidden = F.silu(layer(hidden))
        # Sigmoid squashes the final score into the (0, 1) range.
        return torch.sigmoid(self.fc5(hidden))

In [None]:
class RMSELoss(nn.Module):
    """Root-mean-squared-error loss: the square root of ``nn.MSELoss``."""

    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()

    def forward(self, yhat, y):
        mean_squared_error = self.mse(yhat, y)
        return mean_squared_error.sqrt()

In [None]:
class NNModel(pl.LightningModule):
    """LightningModule that trains a generator/discriminator pair for imputation.

    Label convention used throughout: the discriminator target is 0 for real rows and
    1 for generated rows.

    Args:
        generator: module called as ``generator(val_x, cat_x)``.
        discriminator: module called on a batch of real or generated rows; its output
            is compared with the label tensors via ``nn.BCELoss``.
    """
    # https://pytorch-lightning.readthedocs.io/en/stable/notebooks/lightning_examples/basic-gan.html
    def __init__(self, generator: nn.Module, discriminator: nn.Module):
        super().__init__()
        self.generator = generator
        self.discriminator = discriminator
        self.criterion_gen = RMSELoss()
        self.criterion_dis = nn.BCELoss()
        # Stored for reference only: the optimizers below use their own fixed rates.
        self.lr = CFG.lr

    def forward(self, val_x, cat_x) -> torch.Tensor:
        gen = self.generator(val_x, cat_x)
        return gen

    # Setup Optimizer and Scheduler
    def configure_optimizers(self):
        """Return one Adam optimizer and one cosine scheduler per network.

        Index 0 is the generator and index 1 the discriminator, matching
        ``optimizer_idx`` in ``training_step``.
        """
        gen_optimizer_params = [
            {"params": list(self.generator.parameters()),
             "weight_decay": CFG.weight_decay,
             "lr": 1e-3
            },
        ]
        dis_optimizer_params = [
            {"params": list(self.discriminator.parameters()),
             "weight_decay": CFG.weight_decay,
             "lr": 1e-2
            },
        ]

        gen_optimizer = optim.Adam(gen_optimizer_params)
        dis_optimizer = optim.Adam(dis_optimizer_params)

        interval = "step"
        # Each scheduler is attached to the optimizer it is named after. They used to be
        # crossed, so each network's learning rate followed the other one's schedule.
        gen_scheduler = lr_scheduler.CosineAnnealingLR(gen_optimizer,
                                                T_max=2000,
                                                eta_min=1e-5,
                                                )
        dis_scheduler = lr_scheduler.CosineAnnealingLR(dis_optimizer,
                                                T_max=3000,
                                                eta_min=1e-5,
                                                )

        return [gen_optimizer, dis_optimizer], [{"scheduler": gen_scheduler, "interval": interval}, {"scheduler": dis_scheduler, "interval": interval}]

    # training valid test steps
    def training_step(self, batch_data, batch_idx, optimizer_idx):
        """Run one generator step (``optimizer_idx == 0``) or one discriminator step.

        ``batch_data`` is a batch produced by the training DataLoader. The returned
        mapping must contain ``"loss"``, which Lightning hands to the optimizer; the
        ``"log"`` entry is read back in ``training_epoch_end``.
        """
        X_val, X_cat, X_na, zeros, ones, i = batch_data
        # train generator
        if optimizer_idx == 0:
            # The generator does well when its output is judged real (label 0).
            gen = self(X_val, X_cat)
            dis = self.discriminator(gen)
            g_loss = self.criterion_dis(dis, zeros)
            # Reconstruction error on the column that was masked in EACH row: row r is
            # scored on column i[r]. Indexing with `[:, i]` would instead build a
            # (batch, batch) cross product that mostly scores columns whose true value
            # is still visible in the input.
            rows = torch.arange(gen.size(0), device=gen.device)
            rmse = self.criterion_gen(gen[rows, i], X_na[rows, i])
            tqdm_dict = {"g_loss": g_loss.detach(), "rmse": rmse.detach()}
            output = OrderedDict({"loss": g_loss+rmse, "progress_bar": tqdm_dict, "log": tqdm_dict})

        # train discriminator
        else:  # optimizer_idx == 1:
            # The discriminator does well when generated rows are judged fake (label 1)
            # and real rows are judged real (label 0).
            # The generated batch is detached: this step must not backpropagate into
            # the generator.
            gen = self(X_val, X_cat).detach()
            fake_dis = self.discriminator(gen)
            fake_loss = self.criterion_dis(fake_dis, ones)
            real_dis = self.discriminator(X_na)
            real_loss = self.criterion_dis(real_dis, zeros)
            loss = fake_loss+real_loss
            tqdm_dict = {"fake_loss": fake_loss.detach(), "real_loss": real_loss.detach()}
            output = OrderedDict({"loss": loss, "progress_bar": tqdm_dict, "log": tqdm_dict})

        return output

    def training_epoch_end(self, outputs):
        """Average and log the per-step metrics of one epoch.

        ``outputs`` holds one entry per batch; each entry is indexed by optimizer
        (0 = generator step, 1 = discriminator step).
        """
        def epoch_mean(optimizer_slot, key):
            return torch.stack([step[optimizer_slot]['log'][key] for step in outputs]).mean()

        rmse = epoch_mean(0, 'rmse')
        g_loss = epoch_mean(0, 'g_loss')
        fake_loss = epoch_mean(1, 'fake_loss')
        real_loss = epoch_mean(1, 'real_loss')
        self.log('rmse', rmse, prog_bar=True)
        self.log('g_loss', g_loss, prog_bar=True)
        self.log('fake_loss', fake_loss, prog_bar=True)
        self.log('real_loss', real_loss, prog_bar=True)
        if (self.current_epoch+1) % CFG.print_epoch_freq == 0:
            print("epoch:", self.current_epoch, "rmse:", rmse.item(), "g_loss", g_loss.item(), "fake_loss", fake_loss.item(), "real_loss", real_loss.item())

    # validation_step / validation_epoch_end are intentionally not implemented.

    def predict_step(self, batch_data, batch_idx):
        """Return the generator output for one prediction batch."""
        X_val, X_cat, X_na, _, _, _ = batch_data
        outputs = self(X_val, X_cat)
        # Only drop a trailing singleton feature axis. A bare squeeze() would also drop
        # the batch axis for a batch of one row, which breaks concatenating the batches.
        outputs = outputs.squeeze(-1)
        # If the criterion were a "with logits" loss, a sigmoid would have to be applied here.
        # outputs = torch.sigmoid(outputs)
        return outputs

In [None]:
# Run configuration, read as class attributes (CFG.<name>) by the rest of the notebook.
# NOTE: the training loop logs `CFG.__dict__` as hyperparameters, so anything added to
# this class body (including a docstring) ends up in that log.
class CFG:
    num_workers = 2  # DataLoader worker processes: 4 on Colab, 2 on Kaggle
    weight_decay=0  # weight decay passed to both Adam optimizers
    scheduler_type="ReduceLR"  # NOTE(review): not read anywhere in the visible notebook
    print_epoch_freq=1  # print the epoch metrics every this many epochs
    max_epochs=5  # epochs per Trainer.fit call
    batch_size=1000  # batch size of every DataLoader
    lr = 1e-3  # copied to NNModel.lr; the optimizers use their own hard-coded rates
    min_lr = 1e-6  # NOTE(review): not read anywhere in the visible notebook
    loop_end = 20  # number of train -> predict -> refill iterations
    debug = False  # when True, max_epochs is reduced to 1 below

# Debug runs train for a single epoch per iteration.
if CFG.debug:
    CFG.max_epochs=1

In [None]:
# Registry: model name -> checkpoint path. Being a defaultdict(str), an unknown model name
# yields "", which the training loop treats as "no earlier checkpoint to load".
checkpoint_path_of = defaultdict(str)

In [None]:
# Iterative imputation: every pass trains a fresh GAN (warm-started from the previous
# pass), predicts the NA columns for all rows, and writes the predictions back into the
# originally-missing cells of `data`, so the next pass trains on the refined table.
model_name_prefix = datetime.datetime.now().strftime('%m%d%H%M%S')
dirpath = "./model/"

for loop in range(CFG.loop_end):
    # The same table is used for training and for prediction.
    train = data
    valid = data
    result_data = data.copy()

    dm = DataModule(train, valid, cat_col_list, val_col_list, na_col_list, CFG.batch_size)

    # create model
    pre_model_name = "model"+model_name_prefix + "_" +str(loop-1)
    model_name = "model"+model_name_prefix + "_" +str(loop)
    gen = Generator(len(val_col_list), len(cat_col_list), len(na_col_list))
    dis = Discriminator(len(na_col_list))
    model = NNModel(gen, dis)
    pre_checkpoint_path = checkpoint_path_of[pre_model_name]
    if pre_checkpoint_path != "":
        print('read model')
        # load_from_checkpoint is a classmethod that RETURNS the restored model, so its
        # result is bound explicitly. The earlier version assigned the path to a
        # misspelled variable and only worked through a stale variable from the previous
        # iteration plus the side effect of sharing `gen` / `dis`.
        model = NNModel.load_from_checkpoint(pre_checkpoint_path, generator=gen, discriminator=dis)

    # train
    logger = WandbLogger()
    # Log only the real settings, not the dunder entries of the class namespace.
    logger.log_hyperparams({k: v for k, v in vars(CFG).items() if not k.startswith("__")})
    callbacks = [
                # pl.callbacks.EarlyStopping('valid_avg_loss', patience=5),  # stop when the monitored value has not improved for `patience` epochs
                pl.callbacks.ModelCheckpoint(dirpath=dirpath, filename=model_name, save_last=True, save_weights_only=False),  # checkpoint settings
                pl.callbacks.LearningRateMonitor(),  # write the learning rates to the log
    ]
    trainer = pl.Trainer(accelerator="auto", devices="auto", max_epochs=CFG.max_epochs, logger=logger, callbacks=callbacks, enable_progress_bar=True)
    trainer.fit(model, datamodule=dm)
    wandb.finish()

    # Reload the checkpoint written for this pass and remember the path that was
    # actually found on disk, so the next pass warm-starts from exactly this file.
    checkpoint_path = glob.glob(dirpath+model_name+"*.ckpt")[0]
    model = NNModel.load_from_checkpoint(checkpoint_path, generator=gen, discriminator=dis)
    checkpoint_path_of[model_name] = checkpoint_path

    # predict
    results = trainer.predict(model=model, datamodule=dm)
    outputs = torch.cat(results, dim=0)

    # result_data.loc[valid_index, na_col_list] = outputs.tolist()
    # Assign as one float64 array instead of a nested Python list of every value.
    result_data.loc[:, na_col_list] = outputs.double().cpu().numpy()

    # fill_na: only the cells that were missing in the raw data receive predictions.
    for col in na_col_list:
        na_index = na_index_of[col]
        data.loc[na_index, col] = result_data.loc[na_index, col]

    gc.collect()


In [None]:
# Summary statistics of the table whose NA columns hold the last pass's generator output.
result_data.describe()

In [None]:
# Display the table whose NA columns were overwritten with the last pass's predictions.
result_data

In [None]:
# Display the working table, in which only the originally-missing cells were refilled.
data

In [None]:
# Each submission key is split on '-': the first part is used as the row label and the
# second as the column name; the matching value is then looked up in `data`.
ind_list, val_list = [], []
for i in tqdm(sub.index):
    parts = i.split('-')
    row, col = int(parts[0]), parts[1]
    val = data[col][row]
    ind_list.append(i)
    val_list.append(val)

In [None]:
# Write the looked-up values with a single .loc call. The chained form
# `sub['value'].loc[...] = ...` can assign into a temporary Series and leave `sub`
# unchanged (always the case under pandas copy-on-write).
sub.loc[ind_list, 'value'] = val_list

In [None]:
# Write the submission file (keeping the index as the first column), then display it.
sub.to_csv("submission.csv", index=True)
sub