In [None]:
# For Google Colab only. The setup steps below are wrapped in a string literal,
# so nothing in this cell is executed as written. To use them on Colab, move the
# contents of the string into a real code cell (the `!` lines are notebook shell
# escapes and are not valid inside plain Python).
"""
from google.colab import drive
drive.mount('/content/drive')

# Install kaggle packages
!pip install -q kaggle
!pip install -q kaggle-cli

# Lib
from google.colab import files

# Please Upload `kaggle.json` file
uploaded = files.upload()

# Then copy kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!ls ~/.kaggle

!kaggle competitions download -c tabular-playground-series-jun-2022
!unzip -o tabular-playground-series-jun-2022.zip -d tabular-playground-series-jun-2022
!kaggle kernels output ehekatlact/tps2206-lgbm-gpu-loop -p ./DataSet
"""

In [None]:
%%capture
!pip install wandb
!pip install pytorch_lightning

In [None]:
import wandb

# Log in to Weights & Biases with the key stored in Kaggle Secrets.
# `anony` ends up as None after a successful login and as "must" when the
# secret (or the Kaggle secrets module itself) is unavailable.
try:
    # add-ons -> secrets -> set your wandb api key
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    secret_value_0 = user_secrets.get_secret("wandb_api")
    wandb.login(key=secret_value_0)
    anony = None
except Exception:
    # `Exception` (not a bare `except`) so that Ctrl-C / kernel interrupts and
    # SystemExit still propagate instead of being treated as "no API key".
    anony = "must"
    print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')

### Libraries

In [None]:
# common
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import time, gc, string, math
from tqdm.notebook import tqdm
import warnings
import shutil
from collections import defaultdict

# sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold

# pytorch
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from torch import optim
from torch.optim import lr_scheduler

# pytorch lightning
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger


In [None]:
# Start every run with an empty `model` directory so checkpoints written by a
# previous run cannot be picked up by accident.
if os.path.isdir('model'):
    shutil.rmtree('./model/')
os.makedirs('model', exist_ok=True)

### Read DataFrames

In [None]:
# `data` is the competition table; `sub` is the sample submission, indexed by
# its 'row-col' id column so individual cells can be addressed by id later.
data = pd.read_csv(
    '../input/tabular-playground-series-jun-2022/data.csv',
)
sub = pd.read_csv(
    '../input/tabular-playground-series-jun-2022/sample_submission.csv',
    index_col='row-col',
)

In [None]:
# Names of the columns that contain at least one missing value, ordered from the
# most to the fewest missing values (ties keep their original column order).
# The counts are computed once instead of once per sort comparison, and the
# result is simply an empty list when no column has missing values (the former
# pop-from-the-end loop raised IndexError in that case).
na_counts = data.isna().sum()
na_columns_list = [
    name
    for name in sorted(data.columns.to_list(), key=lambda x: na_counts[x], reverse=True)
    if na_counts[name] > 0
]

In [None]:
# For every column with missing values, remember the positional row indices
# where the value is missing and where it is present. These are captured now
# because later cells overwrite the missing entries of `data`.
missing_list_of = {}
no_missing_list_of = {}
for col in na_columns_list:
    is_missing = data[col].isnull()
    missing_list_of[col] = list(np.where(is_missing)[0])
    no_missing_list_of[col] = list(np.where(~is_missing)[0])

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise only the columns that have no missing values at all.
# NOTE(review): this selection is purely "no NaN", so an id column such as
# `row_id` is scaled as well if it has no NaN; `row_id` is dropped from the
# features before training, so that appears harmless - confirm.
scaler = StandardScaler()
non_na_columns = [name for name in data.columns if data[name].isna().sum() == 0]

data[non_na_columns] = scaler.fit_transform(data[non_na_columns])

In [None]:
# Previously generated submission used as the starting imputation; it is
# indexed by the same 'row-col' ids ("<row>-<column name>") as `sub`.
other_sub = pd.read_csv(
    "../input/tps2206-lgbm-gpu-loop/submission.csv",
    index_col='row-col',
)

In [None]:
# Group the values of `other_sub` by target column: column name -> [(row, value), ...].
# Iterating over the 'value' Series directly avoids one label lookup per row.
other_sub_dic = defaultdict(list)
for row_col, val in tqdm(other_sub['value'].items(), total=len(other_sub)):
    parts = row_col.split('-')
    other_sub_dic[parts[1]].append((int(parts[0]), val))

# Write the grouped values into `data`, one column at a time.
for col in tqdm(data.columns):
    # `.get` does not insert empty entries for columns that have no values.
    index_value_list = other_sub_dic.get(col)
    if not index_value_list:
        continue
    index_list = [index for index, _ in index_value_list]
    value_list = [value for _, value in index_value_list]
    # Single `.loc[rows, col]` assignment: the former `data[col].loc[...] = ...`
    # is chained assignment and is not guaranteed to write back into `data`.
    data.loc[index_list, col] = value_list

## PyTorch

In [None]:
# Training configuration shared by the cells below.
# (Documented with comments rather than a docstring on purpose: the class
# namespace itself is passed to the experiment logger later on.)
class CFG:
    num_workers = 2                # DataLoader worker processes
    weight_decay = 1e-6            # weight decay passed to the optimizer
    scheduler_type = "ReduceLR"    # one of "StepLR", "Cosine", "ReduceLR"
    print_epoch_freq = 1           # print the average loss every N epochs
    max_epochs = 120
    batch_size = 1000
    lr = 1e-2
    debug = False                  # True: one epoch, first two NA columns only


# Debug mode shrinks the run to a quick smoke test.
if CFG.debug:
    CFG.max_epochs = 1
    na_columns_list = na_columns_list[:2]

### DataSet and DataLoader

In [None]:
class TrainDataset(Dataset):
    """Map-style dataset over an indexable feature container and optional targets.

    Args:
        X_train: indexable container of feature rows (``X_train[i]`` must be
            convertible with ``torch.tensor``).
        y_train: indexable container of targets aligned with ``X_train``, or
            ``None`` when there are no targets (prediction). In that case a
            zero placeholder target is returned for every row so that batches
            always have the same ``(inputs, outputs)`` structure.
    """

    def __init__(self, X_train, y_train):
        self.X = X_train
        if y_train is not None:
            self.y = y_train
        else:
            # NumPy placeholder (not a torch tensor): re-wrapping a tensor with
            # `torch.tensor(...)` in __getitem__ emits a copy-construct
            # UserWarning for every single item.
            self.y = np.zeros(len(self.X), dtype=np.float32)

    def __len__(self):
        """Number of rows."""
        return len(self.X)

    def __getitem__(self, item):
        """Return ``(inputs, outputs)`` for row ``item`` as float32 tensors."""
        inputs = torch.tensor(self.X[item], dtype=torch.float32)
        # `self.y` is always set in __init__, so no None check is needed here.
        outputs = torch.tensor(self.y[item], dtype=torch.float32)
        return inputs, outputs

In [None]:
class DataModule(pl.LightningDataModule):
    """Bundles the train / validation / predict DataLoaders for one target column.

    When this object is passed to the trainer, the trainer requests the
    matching loader for each stage (fit, validate, predict).

    Args:
        X_train, X_valid, X_test: feature tables exposing ``.values``.
        y_train, y_valid: target containers exposing ``.values``.
        batch_size: batch size used by every loader.
    """

    def __init__(self, X_train, y_train, X_valid, y_valid, X_test, batch_size):
        # Initialise the Lightning base class. It sets up the internal
        # attributes (e.g. `_log_hyperparams`, `prepare_data_per_node`) that
        # previously had to be patched in by hand because this call was missing.
        super().__init__()
        self.X_train = X_train.values
        self.X_valid = X_valid.values
        self.X_test = X_test.values
        self.Y_train = y_train.values
        self.Y_valid = y_valid.values
        self.Y_test = None  # no targets for the rows to be predicted
        self.batch_size = batch_size

    def _make_loader(self, features, targets, shuffle, drop_last):
        """Build a DataLoader over ``TrainDataset(features, targets)``."""
        return DataLoader(
            TrainDataset(features, targets),
            batch_size=self.batch_size,
            shuffle=shuffle,
            pin_memory=True,
            drop_last=drop_last,
            num_workers=CFG.num_workers,
            # persistent_workers=True is rejected by DataLoader when num_workers == 0.
            persistent_workers=CFG.num_workers > 0,
        )

    def train_dataloader(self):
        """Shuffled training loader; the last incomplete batch is dropped."""
        return self._make_loader(self.X_train, self.Y_train, shuffle=True, drop_last=True)

    def val_dataloader(self):
        """Validation loader covering every validation row."""
        return self._make_loader(self.X_valid, self.Y_valid, shuffle=False, drop_last=False)

    def predict_dataloader(self):
        """Prediction loader covering every test row, in order.

        ``drop_last`` must be False here: dropping the final partial batch
        would return fewer predictions than there are rows to fill.
        """
        return self._make_loader(self.X_test, self.Y_test, shuffle=False, drop_last=False)

    def teardown(self, stage=None):
        """Release the feature arrays and free cached memory.

        After this runs the feature arrays are gone, so a new DataModule must
        be built before the data can be served again.
        """
        self.X_train = None
        self.X_valid = None
        self.X_test = None
        torch.cuda.empty_cache()  # TODO: unclear whether this actually helps; nothing is reported
        gc.collect()

### PyTorch Model

In [None]:
class DNN(nn.Module):
    """Six-layer fully connected regressor with batch norm after every layer.

    Layer widths: input_size -> 512 -> 256 -> 128 -> 64 -> 32 -> 1.

    Args:
        model_name: label stored on the instance as ``name``.
        input_size: number of input features.
    """

    def __init__(self, model_name, input_size):
        super().__init__()
        self.name = model_name

        self.fc1 = nn.Linear(input_size, 512)
        self.bn1 = nn.BatchNorm1d(512)

        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)

        self.fc3 = nn.Linear(256, 128)
        self.bn3 = nn.BatchNorm1d(128)

        self.fc4 = nn.Linear(128, 64)
        self.bn4 = nn.BatchNorm1d(64)

        self.fc5 = nn.Linear(64, 32)
        self.bn5 = nn.BatchNorm1d(32)

        self.fc6 = nn.Linear(32, 1)
        self.bn6 = nn.BatchNorm1d(1)

    def forward(self, x):
        """Map a (batch, input_size) tensor to a (batch, 1) tensor."""
        # Design notes: dropout is not combined with batch norm, and batch norm
        # is applied before the activation function.
        hidden_stack = (
            (self.fc1, self.bn1),
            (self.fc2, self.bn2),
            (self.fc3, self.bn3),
            (self.fc4, self.bn4),
            (self.fc5, self.bn5),
        )
        for linear, norm in hidden_stack:
            x = F.silu(norm(linear(x)))
        # Output layer: linear + batch norm, no activation.
        return self.bn6(self.fc6(x))

In [None]:
class RMSELoss(nn.Module):
    """Root-mean-squared-error criterion: the square root of ``nn.MSELoss``."""

    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()

    def forward(self, yhat, y):
        """Return ``sqrt(MSE(yhat, y))``."""
        mean_squared_error = self.mse(yhat, y)
        return mean_squared_error.sqrt()

In [None]:
class NNModel(pl.LightningModule):
    """LightningModule that trains a wrapped regression network with RMSE loss.

    NOTE: `training_epoch_end` and `validation_epoch_end` are epoch-end hooks of
    the Lightning 1.x API.

    Args:
        model: network mapping a (batch, features) tensor to a (batch, 1) tensor.
    """

    def __init__(self, model: nn.Module):
        super().__init__()
        self.model = model
        self.criterion = RMSELoss()

    def forward(self, x) -> torch.Tensor:
        """Run the wrapped network."""
        return self.model(x)

    # Setup Optimizer and Scheduler
    def configure_optimizers(self):
        """Create the Adam optimizer and the scheduler chosen by ``CFG.scheduler_type``.

        Returns:
            ``([optimizer], [scheduler_config])`` in the format Lightning expects.

        Raises:
            ValueError: if ``CFG.scheduler_type`` is not "StepLR", "Cosine" or "ReduceLR".
        """
        optimizer_params = [
            {
                "params": list(self.model.parameters()),
                "weight_decay": CFG.weight_decay,
                "lr": CFG.lr,
            },
        ]
        optimizer = optim.Adam(optimizer_params)

        monitor = ""
        if CFG.scheduler_type == "StepLR":
            # Lower the learning rate every fixed number of steps.
            scheduler = lr_scheduler.StepLR(optimizer, step_size=10_000, gamma=0.9)
            interval = "step"
        elif CFG.scheduler_type == "Cosine":
            # Move the learning rate up and down periodically, per step.
            scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=1000, eta_min=1e-5)
            interval = "step"
        elif CFG.scheduler_type == "ReduceLR":
            # Lower the learning rate when the validation loss stops improving.
            scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=1, factor=0.5)
            interval = "epoch"
            monitor = "valid_avg_loss"
        else:
            # A plain string cannot be raised in Python 3; raise a real exception.
            raise ValueError(
                "not defined scheduler_type: {!r}".format(CFG.scheduler_type)
            )
        return [optimizer], [{"scheduler": scheduler, "interval": interval, "monitor": monitor}]

    def _batch_loss(self, batch_data):
        """Compute the criterion on one ``(X, y)`` batch."""
        X, y = batch_data
        # squeeze(-1) removes only the trailing feature axis, so a batch that
        # contains a single row stays 1-D instead of collapsing to a scalar.
        outputs = self(X).squeeze(-1)
        return self.criterion(outputs, y)

    # training valid test steps
    def training_step(self, batch_data, batch_idx):
        """Return the training loss for one batch from the train DataLoader.

        The return value must be the loss: Lightning uses it for the
        backward pass / optimizer step.
        """
        return self._batch_loss(batch_data)

    def training_epoch_end(self, outputs):
        """Log (and periodically print) the mean training loss of the epoch.

        Args:
            outputs: list with the result of every `training_step` of the epoch.
        """
        avg_loss = torch.stack([x['loss'] for x in outputs]).mean()
        self.log('train_avg_loss', avg_loss, prog_bar=True)
        if (self.current_epoch+1) % CFG.print_epoch_freq == 0:
            print("epoch:", self.current_epoch, "train_avg_loss:", avg_loss.item())

    def validation_step(self, batch_data, batch_idx):
        """Return ``{'valid_loss': loss}`` for one validation batch."""
        return {'valid_loss': self._batch_loss(batch_data)}

    def validation_epoch_end(self, outputs):
        """Log (and periodically print) the mean validation loss of the epoch.

        The logged key 'valid_avg_loss' is the one named as `monitor` for the
        "ReduceLR" scheduler in `configure_optimizers`.

        Args:
            outputs: list with the result of every `validation_step` of the epoch.
        """
        avg_loss = torch.stack([x['valid_loss'] for x in outputs]).mean()
        self.log('valid_avg_loss', avg_loss, prog_bar=True)
        if (self.current_epoch+1) % CFG.print_epoch_freq == 0:
            print("epoch:", self.current_epoch, "valid_avg_loss:", avg_loss.item())
        return avg_loss

    def predict_step(self, batch_data, batch_idx):
        """Return the 1-D predictions for one batch (targets are ignored)."""
        X, _ = batch_data
        # If the criterion were a with-logits loss, a sigmoid would be applied here.
        return self(X).squeeze(-1)

In [None]:
from sklearn.model_selection import train_test_split
import time

# For every column with missing values whose name contains "F_4": train a DNN on
# the rows where the column is present, then overwrite the originally missing
# rows of `data` with the network's predictions. Other columns are skipped.
st_time = time.time()
for i, col in enumerate(na_columns_list):
    print("="*10, col, "="*10)
    print("{}/{}".format(i+1, len(na_columns_list)))
    if "F_4" not in col:
        print("skip {}".format(col))
        continue

    # split data (positional row indices captured before `data` was filled in)
    missing_list = missing_list_of[col]
    no_missing_list = no_missing_list_of[col]

    train = data.iloc[no_missing_list]
    test = data.iloc[missing_list]
    X = train.drop([col, 'row_id'], axis=1)
    y = train[col]
    X_test = test.drop([col, 'row_id'], axis=1)
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, random_state=42)
    dm = DataModule(X_train, y_train, X_valid, y_valid, X_test, CFG.batch_size)

    model_name = "name" + col
    dirpath = "./model/"
    dnn = DNN(model_name, len(X_train.columns))
    model = NNModel(dnn)

    # train
    # `anony` comes from the W&B login cell: None when logged in, "must" otherwise.
    logger = WandbLogger(anonymous=anony)
    # Log only the real settings, not the dunder entries of the class namespace.
    logger.log_hyperparams({k: v for k, v in vars(CFG).items() if not k.startswith("__")})
    checkpoint_cb = pl.callbacks.ModelCheckpoint(dirpath=dirpath, filename=model_name, save_top_k=1, monitor="valid_avg_loss")  # keep only the best checkpoint
    callbacks = [
        pl.callbacks.EarlyStopping('valid_avg_loss', patience=3),  # stop after 3 validation checks without improvement
        checkpoint_cb,
        pl.callbacks.LearningRateMonitor(),  # write the learning rate to the log
    ]
    trainer = pl.Trainer(accelerator="auto", devices="auto", max_epochs=CFG.max_epochs, logger=logger, callbacks=callbacks, enable_progress_bar=False)
    trainer.fit(model, datamodule=dm)

    # predict
    # Ask the callback where the best checkpoint was written instead of
    # rebuilding the file name by hand.
    checkpoint = torch.load(checkpoint_cb.best_model_path)
    model.load_state_dict(checkpoint['state_dict'])
    # The previous DataModule released its arrays in teardown(); build a new one.
    dm = DataModule(X_train, y_train, X_valid, y_valid, X_test, CFG.batch_size)
    results = trainer.predict(model=model, datamodule=dm)
    # reshape(-1) keeps a batch with a single row concatenable.
    outputs = torch.cat([batch.reshape(-1) for batch in results], dim=0)
    if len(outputs) != len(missing_list):
        raise ValueError(
            "got {} predictions for {} missing rows of {}; the predict DataLoader "
            "must not drop the last batch".format(len(outputs), len(missing_list), col)
        )

    # One positional assignment on the frame itself; the former chained form
    # `data[col].iloc[...] = ...` is not guaranteed to write back into `data`.
    data.iloc[missing_list, data.columns.get_loc(col)] = outputs.tolist()

    en_time = time.time()
    print("elapse", en_time-st_time)
    gc.collect()

In [None]:
# Look up the value for every submission id. Ids have the form
# "<row>-<column name>": the first part is the row label in `data`, the second
# the column to read.
ind_list = []
val_list = []
for row_col in tqdm(sub.index):
    id_parts = row_col.split('-')
    row_label = int(id_parts[0])
    column_name = id_parts[1]
    ind_list.append(row_col)
    val_list.append(data.at[row_label, column_name])

In [None]:
# Single `.loc[rows, column]` assignment on the frame itself; the chained form
# `sub['value'].loc[...] = ...` is not guaranteed to write back into `sub`.
sub.loc[ind_list, 'value'] = val_list

In [None]:
# Write the submission file (the 'row-col' index is written as the first
# column, which is the to_csv default) and show the frame as the cell output.
sub.to_csv("submission.csv")
sub