motivation

With the previous method, I found a significant difference in prediction quality between rows with only one NA column and rows with two or more NA columns.

It seems that when several NA columns in the same row are filled with the mean (or a similar statistic), the filled values interact with each other and adversely affect the results.

Predicting separately according to how many NA columns a row has improves the score.

In [None]:
# For Google Colab
"""
from google.colab import drive
drive.mount('/content/drive')

# Install kaggle packages
!pip install -q kaggle
!pip install -q kaggle-cli

# Lib
from google.colab import files

# Please Upload `kaggle.json` file
uploaded = files.upload()

# Then copy kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!ls ~/.kaggle

!kaggle competitions download -c tabular-playground-series-jun-2022
!unzip -o tabular-playground-series-jun-2022.zip -d tabular-playground-series-jun-2022
!kaggle kernels output oxzplvifi/tps2206-gbm-resnet-imputation -p ./DataSet
"""

In [None]:
%%capture
!pip install wandb
!pip install pytorch_lightning

In [None]:
import wandb

# Log in to Weights & Biases with the API key stored in Kaggle Secrets
# (Add-ons -> Secrets, label `wandb_api`). `anony` ends up as None when the
# login succeeded and as "must" when the key could not be obtained.
try:
    # add-ons -> secrets -> set your wandb api key
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    secret_value_0 = user_secrets.get_secret("wandb_api")
    wandb.login(key=secret_value_0)
    anony = None
except Exception:
    # `except Exception` instead of a bare `except` so that
    # KeyboardInterrupt / SystemExit are not swallowed here.
    anony = "must"
    print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')


In [None]:
# `anony` is None after a successful login and "must" otherwise; passing it
# makes the run anonymous when no API key was available.
wandb.init(project="tps2206", anonymous=anony)

### Lib

In [None]:
# common
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import time, gc, string, math
from tqdm.notebook import tqdm
import warnings
import shutil
from collections import defaultdict
import heapq
import datetime
import random
from collections import OrderedDict
import glob
import copy

# sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# pytorch
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from torch import optim
from torch.optim import lr_scheduler

# pytorch lightning
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger


In [None]:
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)

In [None]:
# for google colab
# os.chdir("/content/drive/MyDrive/colab_data/TPS2206")

In [None]:
"""
os.makedirs('model', exist_ok=True)
shutil.rmtree('./model/')
os.makedirs('model', exist_ok=True)
"""

### Read DF

In [None]:
# Load the competition table (indexed by `row_id`) and the sample submission
# (indexed by `row-col`, i.e. "<row_id>-<column name>" keys).
data = pd.read_csv('../input/tabular-playground-series-jun-2022/data.csv', index_col='row_id')
sub = pd.read_csv('../input/tabular-playground-series-jun-2022/sample_submission.csv', index_col='row-col')

In [None]:
# Mean-impute every column whose name does not contain "F_4"; the F_4
# columns keep their NaNs so they can be predicted later.
non_f4_columns = [name for name in data.columns if "F_4" not in name]
for name in non_f4_columns:
    column_mean = data[name].mean()
    data[name] = data[name].fillna(column_mean)

In [None]:
# Names of the columns that still contain at least one NaN.
na_col_list = [name for name in data.columns if data[name].isna().any()]

In [None]:
# Work only on the columns that still have missing values.
f4data = data[na_col_list]

In [None]:
# For every NA column, remember which row labels are missing a value
# (`na_index_of`) and which are not (`no_na_index_of`).
na_index_of = {}
no_na_index_of = {}
for col in na_col_list:
    is_missing = f4data[col].isna().to_numpy()
    na_index_of[col] = f4data.index[is_missing]
    no_na_index_of[col] = f4data.index[~is_missing]

In [None]:
# Number of NaN values in each row of f4data; the single resulting column is labelled 0.
na_cnt = pd.DataFrame(f4data.isna().sum(axis=1))

In [None]:
na_cnt.groupby([0]).size()

In [None]:
# Map each NA count 0..na_cnt_max to the row labels having exactly that
# many missing values.
na_cnt_max = 5
na_cnt_index_of = {
    n_missing: na_cnt.index[(na_cnt[0] == n_missing).to_numpy()]
    for n_missing in range(na_cnt_max + 1)
}

In [None]:
def na_no_na_index_of(col, cnt):
    """Split the rows that have exactly `cnt` missing values by column `col`.

    Args:
        col: key into `na_index_of` / `no_na_index_of`.
        cnt: key into `na_cnt_index_of` (number of NaNs per row).

    Returns:
        Tuple `(na_index, no_na_index)`: row labels with `cnt` NaNs where
        `col` is missing, and row labels with `cnt` NaNs where it is present.
    """
    missing_rows = na_index_of[col]
    present_rows = no_na_index_of[col]
    rows_with_cnt = na_cnt_index_of[cnt]
    return (
        missing_rows.intersection(rows_with_cnt),
        present_rows.intersection(rows_with_cnt),
    )

In [None]:
na_no_na_index_of('F_4_0', 1)

In [None]:
# Replace the remaining NaNs with the sentinel -1 (the same value TrainDataset
# writes into masked feature positions).
f4data = f4data.fillna(-1)

## Pytorch

### DataSet and DataLoader

In [None]:
class TrainDataset(Dataset):
    """Row-wise dataset that can mask random input features with -1.

    Args:
        X: 2-D array-like of features, indexed by row (``X[item]``).
        y: 1-D array-like of targets aligned with ``X``.
        na_num: number of feature positions set to -1 in every returned
            sample; 0 disables masking.
    """

    def __init__(self, X, y, na_num):
        self.X = X
        self.y = y
        self.na_num = na_num
        self.index_end = X.shape[1]
        self.index_list = list(range(self.index_end))

    def __len__(self):
        return len(self.X)

    def __getitem__(self, item):
        """Return ``(inputs, target)`` as float32 tensors for row ``item``."""
        # torch.tensor() copies the row. Masking the copy (instead of
        # ``self.X[item]``, which is a view for NumPy arrays) keeps the
        # stored features intact, so masks do not accumulate across accesses.
        inputs = torch.tensor(self.X[item], dtype=torch.float32)
        if self.na_num != 0:
            # Use torch's RNG here rather than np.random (original author's
            # note: np.random ended up with the same seed).
            na_positions = torch.randperm(self.index_end)[:self.na_num]
            inputs[na_positions] = -1
        outputs = torch.tensor(self.y[item], dtype=torch.float32)

        return inputs, outputs

In [None]:
class DataModule(pl.LightningDataModule):
    """Provides the train / validation / predict DataLoaders to the Trainer.

    Args:
        X_train, y_train: training features / targets (objects exposing
            ``.values``).
        X_valid, y_valid: validation features / targets.
        X_test: features of the rows to predict; their targets are
            placeholder zeros.
        na_num: number of features masked with -1 per sample for the train
            and validation loaders.
        batch_size: batch size shared by all loaders.
    """

    def __init__(self, X_train, y_train, X_valid, y_valid, X_test, na_num, batch_size):
        # Initialise the Lightning base class so its internal attributes are
        # set by the framework; this replaces the manual
        # `_log_hyperparams = None` / dummy `prepare_data_per_node` workarounds.
        super().__init__()
        self.X_train = X_train.values
        self.y_train = y_train.values
        self.X_valid = X_valid.values
        self.y_valid = y_valid.values
        self.X_test = X_test.values
        self.y_test = np.zeros(X_test.shape[0])
        self.na_num = na_num
        self.batch_size = batch_size

    def _make_loader(self, X, y, na_num, *, shuffle, drop_last):
        # Build a DataLoader over a TrainDataset with the shared settings.
        ds = TrainDataset(X, y, na_num)
        return DataLoader(ds, batch_size=self.batch_size, shuffle=shuffle, pin_memory=True, drop_last=drop_last, num_workers=CFG.num_workers, persistent_workers=False)

    def train_dataloader(self):
        return self._make_loader(self.X_train, self.y_train, self.na_num, shuffle=True, drop_last=True)

    def val_dataloader(self):
        return self._make_loader(self.X_valid, self.y_valid, self.na_num, shuffle=False, drop_last=False)

    def predict_dataloader(self):
        # when predict, already fill -1 -> no additional masking (na_num=0)
        return self._make_loader(self.X_test, self.y_test, 0, shuffle=False, drop_last=False)

    def teardown(self, stage=None):
        # TODO: not sure this is the right way to release memory
        torch.cuda.empty_cache()
        gc.collect()

### Pytorch Model

In [None]:
class DNN(nn.Module):
    """Fully connected regressor: input -> 400 -> 400 -> 200 -> 100 -> 1.

    Batch normalisation follows the first linear layer only; every hidden
    layer uses the SiLU activation.

    Args:
        input_size: number of input features.
    """

    def __init__(self, input_size):
        super().__init__()
        width = 100
        # Layers are created in the same order as before so that seeded
        # parameter initialisation is unchanged.
        self.fc1 = nn.Linear(input_size, 4 * width)
        self.bn1 = nn.BatchNorm1d(4 * width)
        self.fc2 = nn.Linear(4 * width, 4 * width)
        self.fc3 = nn.Linear(4 * width, 2 * width)
        self.fc4 = nn.Linear(2 * width, width)
        self.fc5 = nn.Linear(width, 1)

    def forward(self, x):
        # Do not combine dropout with batch norm.
        # Batch norm goes before the activation function.
        hidden = F.silu(self.bn1(self.fc1(x)))
        for layer in (self.fc2, self.fc3, self.fc4):
            hidden = F.silu(layer(hidden))
        return self.fc5(hidden)

In [None]:
class RMSELoss(nn.Module):
    """Root-mean-squared-error loss: the square root of ``nn.MSELoss``."""

    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()

    def forward(self, yhat, y):
        """Return sqrt(MSE(yhat, y)) as a scalar tensor."""
        mean_squared_error = self.mse(yhat, y)
        return mean_squared_error.sqrt()

In [None]:
class NNModel(pl.LightningModule):
    """LightningModule that trains a regression network with RMSE loss.

    Reference:
    https://pytorch-lightning.readthedocs.io/en/stable/notebooks/lightning_examples/basic-gan.html

    Args:
        model: network mapping a feature batch to one prediction per row
            (output shape ``(batch, 1)``).
    """

    def __init__(self, model: nn.Module):
        super().__init__()
        self.model = model
        self.criterion = RMSELoss()
        self.lr = CFG.lr

    def forward(self, x) -> torch.Tensor:
        return self.model(x)

    # Setup Optimizer and Scheduler
    def configure_optimizers(self):
        """Adam plus ReduceLROnPlateau driven by ``valid_avg_loss``."""
        optimizer_params = [
            {"params": list(self.model.parameters()),
             "weight_decay": CFG.weight_decay,
             # Use the configured learning rate instead of a hard-coded 1e-3.
             "lr": self.lr,
            },
        ]

        optimizer = optim.Adam(optimizer_params)

        scheduler = lr_scheduler.ReduceLROnPlateau(optimizer,
                                                'min',
                                                patience=3,
                                                factor=0.5
                                                )
        interval = "epoch"
        monitor = "valid_avg_loss"

        return [optimizer], [{"scheduler": scheduler, "interval": interval, "monitor": monitor}]

    # training valid test steps
    def training_step(self, batch_data, batch_idx):
        # batch_data: one batch from the DataModule's train_dataloader.
        # Must return the loss (it is handed to the optimizer internally).
        X, y = batch_data
        # squeeze(-1) drops only the output dimension, so a batch holding a
        # single sample keeps shape (1,) instead of becoming 0-dim.
        op = self(X).squeeze(-1)
        loss = self.criterion(op, y)
        return loss

    def training_epoch_end(self, outputs):
        # Receives the return values of all training steps of the epoch as a list.
        loss_list = [x['loss'] for x in outputs]
        avg_loss = torch.stack(loss_list).mean()
        self.log('train_avg_loss', avg_loss, prog_bar=True)
        if (self.current_epoch+1) % CFG.print_epoch_freq == 0:
            print("epoch:", self.current_epoch, "train_avg_loss:", avg_loss.item())

    def validation_step(self, batch_data, batch_idx):
        # Returns a dict consumed by validation_epoch_end.
        X, y = batch_data
        op = self(X).squeeze(-1)
        loss = self.criterion(op, y)
        return {'valid_loss': loss}

    def validation_epoch_end(self, outputs):
        loss_list = [x['valid_loss'] for x in outputs]
        avg_loss = torch.stack(loss_list).mean()
        self.log('valid_avg_loss', avg_loss, prog_bar=True)
        if (self.current_epoch+1) % CFG.print_epoch_freq == 0:
            print("epoch:", self.current_epoch, "valid_avg_loss:", avg_loss.item())
        return avg_loss

    def predict_step(self, batch_data, batch_idx):
        # Used for actual inference; the target in the batch is ignored.
        X, _ = batch_data
        # Keep the batch dimension so per-batch outputs can be concatenated.
        outputs = self(X).squeeze(-1)
        # If the criterion is a with-logits loss, add a sigmoid here.
        # outputs = torch.sigmoid(outputs)
        return outputs

In [None]:
# Global training configuration, read as class attributes (e.g. CFG.lr).
# NOTE: CFG.__dict__ is logged as hyperparameters by the training loop, so
# this class is documented with comments only.
class CFG:
    num_workers = 2  # DataLoader workers; 4 on Colab, 2 on Kaggle?
    weight_decay=0  # weight decay given to the Adam parameter group
    print_epoch_freq=1  # print the average loss every N epochs
    max_epochs=30  # passed to pl.Trainer(max_epochs=...)
    max_batch_size=1000  # upper bound used when computing the batch size
    lr = 1e-3  # stored on NNModel as self.lr
    min_lr = 1e-6  # not referenced in the visible code
    debug = False  # True -> quick smoke run (see below)

# Debug mode: train for a single epoch on only the first NA column.
if CFG.debug:
    CFG.max_epochs=1
    na_col_list = na_col_list[:1]

In [None]:
checkpoint_path_of = defaultdict(str)

In [None]:
# Impute the NA columns round by round. Round `cnt` handles the rows that have
# exactly cnt+1 missing values: for every NA column a model is trained on the
# complete rows (with `cnt` random inputs masked to -1) and then predicts that
# column for the rows where it is missing. The filled table feeds the next round.
model_name_prefix = datetime.datetime.now().strftime('%m%d%H%M%S')

for cnt in range(0, na_cnt_max):  # use cnt+1
    print("="*10, "na_cnt {}/{}".format(cnt+1, na_cnt_max), "="*10)
    # Predictions of this round are written to a copy so that every column is
    # predicted from the same input table.
    result_f4data = copy.deepcopy(f4data)
    for col in na_col_list:
        print("="*10, col, "="*10)
        # split data
        # train cnt == 0 then test cnt == 1
        _, no_na_index = na_no_na_index_of(col, 0)  # select non na records.
        train = f4data.loc[no_na_index]
        na_index, _ = na_no_na_index_of(col, cnt+1)
        if len(na_index) == 0:
            # Nothing to impute for this column in this round. `continue`
            # (not `break`) so the remaining columns are still processed.
            continue
        test = f4data.loc[na_index]
        X = train.drop(col, axis=1)
        y = train[col]
        X_test = test.drop(col, axis=1)
        X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, random_state=42)

        # data module
        batch_size = min(CFG.max_batch_size, (len(X_train)+100-1)//100)  # len(X) == batch size raises an error.
        print("batch_size :{}".format(batch_size))
        dm = DataModule(X_train, y_train, X_valid, y_valid, X_test, cnt, batch_size)

        # create model
        cur_model_name = "model" + model_name_prefix+"_" + col+ "_" + str(cnt)
        dirpath = "./model/"
        dnn = DNN(X_train.shape[1])
        model = NNModel(dnn)

        # train
        logger = WandbLogger()
        logger.log_hyperparams(CFG.__dict__)
        # Keep a handle on the checkpoint callback to read `best_model_path` later.
        checkpoint_cb = pl.callbacks.ModelCheckpoint(dirpath=dirpath, filename=cur_model_name, save_top_k=1, monitor="valid_avg_loss", save_weights_only=False)  # checkpoint-saving settings
        callbacks = [
                    pl.callbacks.EarlyStopping('valid_avg_loss', patience=10),  # stop when valid_avg_loss has not improved for 10 validation rounds
                    checkpoint_cb,
                    pl.callbacks.LearningRateMonitor(),  # log the learning rate
        ]
        trainer = pl.Trainer(accelerator="auto", devices="auto", max_epochs=CFG.max_epochs, logger=logger, callbacks=callbacks, enable_progress_bar=False)
        trainer.fit(model, datamodule=dm)
        wandb.finish()

        # load_best_model
        # `load_from_checkpoint` is a classmethod that RETURNS the restored
        # module, so call it on the class and keep the result.
        checkpoint_path = checkpoint_cb.best_model_path
        model = NNModel.load_from_checkpoint(checkpoint_path, model=dnn)
        checkpoint_path_of[cur_model_name] = checkpoint_path

        # predict
        results = trainer.predict(model=model, datamodule=dm)
        # reshape(-1) makes every batch 1-D, so a batch holding a single
        # sample can still be concatenated.
        outputs = torch.cat([batch.reshape(-1) for batch in results], dim=0)

        # write result
        result_f4data.loc[na_index, col] = outputs.tolist()
        display(result_f4data.loc[na_index, col].head())

        torch.cuda.empty_cache()
        gc.collect()
    f4data = result_f4data
    f4data.to_pickle("f4data_{}.pkl".format(cnt))


In [None]:
# Copy the imputed columns back into the full table.
data.loc[:, f4data.columns] = f4data

In [None]:
data.head()

In [None]:
# Look up the value for every submission key of the form "<row_id>-<column>".
ind_list, val_list = [], []
for key in tqdm(sub.index):
    parts = key.split('-')
    row_id = int(parts[0])
    col_name = parts[1]
    cell_value = data[col_name][row_id]
    ind_list.append(key)
    val_list.append(cell_value)

In [None]:
# Single .loc call instead of chained `sub['value'].loc[...] = ...`, which may
# assign into a temporary copy and leave `sub` unchanged.
sub.loc[ind_list, 'value'] = val_list

In [None]:
sub.to_csv("submission.csv", index=True)
sub

TODO: Also try a fine-tuning version. It might turn out to be surprisingly good...?