In [1]:
import os
import io
import pickle
from glob import glob

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from contextlib import contextmanager
from time import time
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error

import wandb
from wandb.lightgbm import wandb_callback, log_summary
import lightgbm as lgb

from sklearn.preprocessing import LabelEncoder

%matplotlib inline

import warnings
warnings.filterwarnings("ignore")


import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


# Config

In [2]:

# EXP_NAME = "exp007"
EXP_NAME = "debug"

class configs:
    EXP_CATEGORY = "baseline"
    EXP_NAME = EXP_NAME
    OUTPUT_DIR = os.path.join("/workspace", "working", EXP_NAME)
    
    INPUT_DIR = os.path.join("/workspace", "input", "atmaCup15_dataset")
    ORIG_TRAIN_CSV = os.path.join(INPUT_DIR, "train.csv")
    ORIG_TEST_CSV = os.path.join(INPUT_DIR, "test.csv")
    
    TRAIN_CSV = "/workspace/working/anime_svd/train_anime2vec.csv"
    ANIME_CSV = os.path.join(INPUT_DIR, "anime.csv")
    TEST_CSV = "/workspace/working/anime_svd/test_anime2vec.csv"
    SAMPLE_SUB_CSV = os.path.join(INPUT_DIR, "sample_submission.csv")
    target_colname = "score"
    unused_cols = ["fold", "score", "oof", "seen"]
    W2V_MODEL_PATH = "/workspace/working/word2vec.gensim.model"
    
    COMPETITION = "atmaCup15"
    USER_NAME = "taro"
    wandb_available = True
    
    # train
    num_boost_round = 1000
    early_stopping_rounds = 200
    verbose_eval = 500
    FOLDS = [0, 1, 2, 3, 4]

    TRAIN_BATCH_SIZE = 128
    VALID_BATCH_SIZE = 128
    EPOCHS = 30
    # EPOCHS = 2
    LEARNING_RATE = 1e-2


if EXP_NAME is "debug":
    configs.wandb_available = False
    configs.num_boost_round = 10
else:
    os.makedirs(configs.OUTPUT_DIR)

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = "cpu"

# Utils

In [4]:
class Timer:
    def __init__(self, logger=None, format_str="{:.3f}[s]", prefix=None, suffix=None, sep=" "):

        if prefix: format_str = str(prefix) + sep + format_str
        if suffix: format_str = format_str + sep + str(suffix)
        self.format_str = format_str
        self.logger = logger
        self.start = None
        self.end = None

    @property
    def duration(self):
        if self.end is None:
            return 0
        return self.end - self.start

    def __enter__(self):
        self.start = time()

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.end = time()
        out_str = self.format_str.format(self.duration)
        if self.logger:
            self.logger.info(out_str)
        else:
            print(out_str)


In [5]:
def root_mean_squared_error(y_true, y_pred):
    """mean_squared_error の root (0.5乗)"""
    return mean_squared_error(y_true, y_pred) ** .5

## utils for data

In [6]:
def merge_by_anime_id(left_df, right_df):
    return pd.merge(left_df["anime_id"], right_df, on="anime_id", how="left").drop(columns=["anime_id"])

## features func

In [7]:
def create_anime_numeric_feature(input_df: pd.DataFrame):
    """input_dfは train or test.csv のデータが入ってくることを想定しています."""
    
    use_columns = [
        "members", 
    ]
    
    return merge_by_anime_id(input_df, anime_df)[use_columns]

#### label encoding

In [8]:
def create_anime_genres_label_encoding(input_df: pd.DataFrame):
    target_col = "genres"
    encoder = LabelEncoder()
    encoded_df = pd.DataFrame({
        "anime_id": anime_df["anime_id"],
        f"{target_col}_le": encoder.fit_transform(anime_df[target_col].fillna("nan"))
    })
    return merge_by_anime_id(input_df, encoded_df)

In [9]:
def create_anime_source_label_encoding(input_df: pd.DataFrame):
    target_col = "source"
    encoder = LabelEncoder()
    encoded_df = pd.DataFrame({
        "anime_id": anime_df["anime_id"],
        f"{target_col}_le": encoder.fit_transform(anime_df[target_col].fillna("nan"))
    })
    return merge_by_anime_id(input_df, encoded_df)

#### count encoding

In [10]:
# animeのtypeをカウントエンコーディング
def create_anime_type_count_encoding(input_df: pd.DataFrame):
    target_col = "type"
    count = anime_df[target_col].map(anime_df["type"].value_counts())
    encoded_df = pd.DataFrame({
        "anime_id": anime_df["anime_id"],
        f"{target_col}_count": count
    })
    
    return merge_by_anime_id(input_df, encoded_df)

In [11]:
def create_anime_studios_count_encoding(input_df: pd.DataFrame):
    target_col = "studios"
    count = anime_df[target_col].map(anime_df["type"].value_counts())
    encoded_df = pd.DataFrame({
        "anime_id": anime_df["anime_id"],
        f"{target_col}_count": count
    })
    
    return merge_by_anime_id(input_df, encoded_df)

In [12]:
def create_anime_producers_count_encoding(input_df: pd.DataFrame):
    target_col = "producers"
    count = anime_df[target_col].map(anime_df["type"].value_counts())
    encoded_df = pd.DataFrame({
        "anime_id": anime_df["anime_id"],
        f"{target_col}_count": count
    })
    
    return merge_by_anime_id(input_df, encoded_df)

In [13]:
def create_anime_animeid_count_encoding(input_df: pd.DataFrame):
    target_col = "anime_id"
    count = anime_df[target_col].map(anime_df["type"].value_counts())
    encoded_df = pd.DataFrame({
        "anime_id": anime_df["anime_id"],
        f"{target_col}_count": count
    })
    
    return merge_by_anime_id(input_df, encoded_df)

#### one-hot encoding

In [14]:
# animeのtypeをone-hotエンコーディング
def create_anime_type_one_hot_encoding(input_df: pd.DataFrame):
    # 対象の列のユニーク集合を取る
    target_colname = "type"
    target_series = anime_df[target_colname]
    unique_values = target_series.unique()

    # ユニークな値ごとに列を作る
    out_df = pd.DataFrame()
    for value in unique_values:
        is_value = target_series == value
        if value == "Unknown":
            out_df["Unknown_type"] = is_value.astype(int)
        else:
            out_df[value] = is_value.astype(int)
    
    out_df["anime_id"] = anime_df["anime_id"]
    return merge_by_anime_id(input_df, out_df)

In [15]:
# animeのtypeをone-hotエンコーディング
def create_anime_rating_one_hot_encoding(input_df: pd.DataFrame):
    # 対象の列のユニーク集合を取る
    target_colname = "rating"
    target_series = anime_df[target_colname]
    unique_values = target_series.unique()

    # ユニークな値ごとに列を作る
    out_df = pd.DataFrame()
    for value in unique_values:
        is_value = target_series == value
        if value == "Unknown":
            out_df["Unknown_rate"] = is_value.astype(int)
        else:
            out_df[value] = is_value.astype(int)
    
    out_df["anime_id"] = anime_df["anime_id"]
    return merge_by_anime_id(input_df, out_df)

In [16]:
# 上で定義した関数をまとめて実行
def create_feature(input_df, config_):
    # functions に特徴量作成関数を配列で定義しました.
    # どの関数も同じ input / output のインターフェイスなので for で回せて嬉しいですね ;)
    functions = [
        create_anime_numeric_feature,
        # label encoding
        create_anime_genres_label_encoding, 
        create_anime_source_label_encoding, 
        # count encoding
        # create_anime_type_count_encoding,
        create_anime_studios_count_encoding,
        create_anime_producers_count_encoding,
        create_anime_animeid_count_encoding, 
        # one-hot encoding
        create_anime_type_one_hot_encoding,
        create_anime_rating_one_hot_encoding,
    ]
    
    out_df = pd.DataFrame()
    func_name_list = []
    for func in functions:
        func_name = str(func.__name__)
        func_name_list.append(func_name)
        with Timer(prefix=f"create {func_name}"):
            _df = func(input_df)
        out_df = pd.concat([out_df, _df], axis=1)
    
    config_.preprocess_funcs = func_name_list
    return out_df, config_

In [17]:
def save_model(model, fold):
    save_path = os.path.join(configs.OUTPUT_DIR, f"model_fold{fold}.pkl")
    # pickle.dump(model, save_path)
    with open(save_path, 'wb') as f:
        pickle.dump(model, f)
    print(f"SAVED: {save_path}")

# Loss

In [18]:
# def cls_to_reg(y_pred):
#     # class予測を線形和で1-10に変換。
#     return (y_pred * np.arange(1,11).reshape(1,10)).mean(axis=1)

# def custom_loss(y_true, y_pred, alpha=0.5):
#     """
#     y_true [batch]
#     y_pred [batch, 10]
#     """
#     y_true_onehot = np.eye(10)[y_true]
#     ce_loss = (- y_true_onehot  * np.log(y_pred)).sum(axis=1).mean()
#     # 回帰風ロスを追加で使ってもいい。
#     y_pred_continuous = cls_to_reg(y_pred)
#     mse_loss = ((y_true - y_pred_continuous )**2).mean()
#     return ce_loss + alpha * mse_loss

def cls_to_reg(y_pred):
    # class予測を線形和で1-10に変換。
    return (y_pred * torch.arange(1,11).view(1,10).to(device)).mean(axis=1)

# def custom_loss(y_true, y_pred, alpha=0.5, smooth=1e-9):
def custom_loss(y_pred, y_true, alpha=0.5, smooth=1e-9):
    # y_true_cast = torch.tensor(y_true-1, dtype=torch.long)
    # y_true_onehot = torch.eye(10).to(device)
    # y_true_onehot = y_true_onehot[y_true_cast]
    # y_true_cast = y_true_cast.to(device)
    # ce_loss = (- y_true_onehot  * torch.log(y_pred + smooth)).sum(axis=1).mean()
    CE_Loss = nn.CrossEntropyLoss()
    ce_loss = CE_Loss(y_pred, torch.y_true)
    y_pred_continuous = cls_to_reg(y_pred)
    mse_loss = ((y_true - y_pred_continuous )**2).mean()
    return ce_loss + alpha * mse_loss

class CustomLoss():
    def __init__(self, alpha=0.5, smooth=1e-9):
        self.CE_LOSS = nn.CrossEntropyLoss()
        self.alpha = alpha
    
    def __call__(self, y_pred, y_true):
        y_pred_continuous = cls_to_reg(y_pred)
        ce_loss = self.CE_LOSS(y_pred, y_true).to(device)
        mse_loss = ((y_true - y_pred_continuous )**2).mean()
        return ce_loss + self.alpha * mse_loss


# Dataset

In [19]:

class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X.values, dtype=torch.float32)
        self.y = torch.tensor(y.values, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        inputs = self.X[idx] 
        labels = torch.tensor(self.y[idx] - 1, dtype=torch.long)
        return inputs, labels

# Model

In [20]:
class SimpleNN(nn.Module):
    def __init__(self, input_size, hidden_size=32, output_size=10):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# train fn

In [21]:
def fit_nn_model(df):
    models = []
    evals_results_list = [] 
    n_records = len(df)
    # training data の target と同じだけのゼロ配列を用意
    oof_pred = np.zeros((n_records, ), dtype=np.float32)
    target = []
    for fold in configs.FOLDS: 
        # この部分が交差検証のところです。データセットを cv instance によって分割します
        # training data を trian/valid に分割
        train_df_ = df[df["fold"] != fold].reset_index(drop=True)
        valid_df_ = df[df["fold"] == fold].reset_index(drop=True)
        idx_valid = df[df["fold"] == fold].index.values
        
        
        feat_cols = [col for col in train_df_.columns if col not in configs.unused_cols]
        print("feature nums = ", len(feat_cols))
        
        # dataset を作成
        train_dataset = CustomDataset(train_df_[feat_cols], train_df_[configs.target_colname])         
        valid_dataset = CustomDataset(valid_df_[feat_cols], valid_df_[configs.target_colname])         

        # dataloader を作成
        train_loader = DataLoader(train_dataset, batch_size=configs.TRAIN_BATCH_SIZE, shuffle=True)
        valid_loader = DataLoader(valid_dataset, batch_size=configs.VALID_BATCH_SIZE, shuffle=False)
        
        input_size = len(feat_cols)
        model = SimpleNN(input_size)
        model.to(device)
        # criterion = nn.MSELoss()
        # criterion = custom_loss
        criterion = CustomLoss()
        # criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=configs.LEARNING_RATE)
        for epoch in range(configs.EPOCHS):
            train_loss = 0.0
            valid_loss = 0.0
            model.train()
            for i, (inputs, labels) in enumerate(train_loader):
                inputs = inputs.to(device)
                labels = labels.to(device)
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                train_loss += loss.item() * inputs.size(0)
            train_loss = train_loss / len(train_loader.dataset)

            model.eval()
            for i, (inputs, labels) in enumerate(valid_loader):
                inputs = inputs.to(device)
                labels = labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                valid_loss += loss.item() * inputs.size(0)
            valid_loss = valid_loss / len(valid_loader.dataset)
            if epoch % 10 == 0:            
                print(f"epoch {epoch} train_loss {train_loss:.4f} valid_loss {valid_loss:.4f}")
            # print(f"epoch {epoch} train_loss {train_loss:.4f} valid_loss {valid_loss:.4f}")


        # この fold でのモデルを保存
        model.eval()
        models.append(model)
        oof_preds, targets = [], []
        # oof 予測
        with torch.no_grad():
            for inputs, labels in valid_loader:
                inputs = inputs.to(device)
                labels = labels.to(device)
                outputs = model(inputs)
                
                outputs = torch.softmax(outputs, dim=1)
                outputs = outputs.detach().cpu().numpy()
                # 10class 分類の場合
                outputs = np.argmax(outputs, axis=1)
                oof_preds.extend(outputs.reshape(-1).tolist())
                targets.extend(labels.detach().cpu().numpy().reshape(-1))
        # 評価
        rmse = root_mean_squared_error(targets, oof_preds)
        print(f"fold {fold}:  rmse {rmse}")
    
        oof_pred[idx_valid] = oof_preds
    # oof 予測値を保存
    rmse = root_mean_squared_error(df[configs.target_colname], oof_pred)
    print(f"OVERALL: rmse {rmse}")
    return models, oof_pred

# Load data

In [22]:
anime_df = pd.read_csv(configs.ANIME_CSV)

train_df = pd.read_csv(configs.TRAIN_CSV)
test_df = pd.read_csv(configs.TEST_CSV)

original_train_df = pd.read_csv(configs.ORIG_TRAIN_CSV)
original_test_df = pd.read_csv(configs.ORIG_TEST_CSV)

In [23]:
train_df = pd.merge(train_df, original_train_df, on=["user_id", "anime_id"], how="left")
test_df = pd.merge(test_df, original_test_df, on=["user_id", "anime_id"], how="left")

## preprocess
- CountEncoding
- OneHotEncoding

In [24]:
# 実行して train / test 用の特徴量を作ります.

with Timer(prefix="train..."):
    train_feat_df, configs = create_feature(train_df, configs)

with Timer(prefix="test..."):
    test_feat_df, configs = create_feature(test_df, configs)

# X = train_feat_df.values
# print(train_feat_df.columns)
# y = train_df["score"].values


create create_anime_numeric_feature 0.050[s]
create create_anime_genres_label_encoding 0.015[s]
create create_anime_source_label_encoding 0.013[s]
create create_anime_studios_count_encoding 0.013[s]
create create_anime_producers_count_encoding 0.013[s]
create create_anime_animeid_count_encoding 0.011[s]
create create_anime_type_one_hot_encoding 0.015[s]
create create_anime_rating_one_hot_encoding 0.018[s]
train... 0.158[s]
create create_anime_numeric_feature 0.040[s]
create create_anime_genres_label_encoding 0.014[s]
create create_anime_source_label_encoding 0.011[s]
create create_anime_studios_count_encoding 0.011[s]
create create_anime_producers_count_encoding 0.011[s]
create create_anime_animeid_count_encoding 0.015[s]
create create_anime_type_one_hot_encoding 0.017[s]
create create_anime_rating_one_hot_encoding 0.016[s]
test... 0.142[s]


## utils for data

In [25]:
def merge_by_anime_id(left_df, right_df):
    return pd.merge(left_df["anime_id"], right_df, on="anime_id", how="left").drop(columns=["anime_id"])

## features func

In [26]:
def create_anime_numeric_feature(input_df: pd.DataFrame):
    """input_dfは train or test.csv のデータが入ってくることを想定しています."""
    
    use_columns = [
        "members", 
    ]
    
    return merge_by_anime_id(input_df, anime_df)[use_columns]

#### label encoding

In [27]:
def create_anime_genres_label_encoding(input_df: pd.DataFrame):
    target_col = "genres"
    encoder = LabelEncoder()
    encoded_df = pd.DataFrame({
        "anime_id": anime_df["anime_id"],
        f"{target_col}_le": encoder.fit_transform(anime_df[target_col].fillna("nan"))
    })
    return merge_by_anime_id(input_df, encoded_df)

In [28]:
def create_anime_source_label_encoding(input_df: pd.DataFrame):
    target_col = "source"
    encoder = LabelEncoder()
    encoded_df = pd.DataFrame({
        "anime_id": anime_df["anime_id"],
        f"{target_col}_le": encoder.fit_transform(anime_df[target_col].fillna("nan"))
    })
    return merge_by_anime_id(input_df, encoded_df)

#### count encoding

In [29]:
# animeのtypeをカウントエンコーディング
def create_anime_type_count_encoding(input_df: pd.DataFrame):
    target_col = "type"
    count = anime_df[target_col].map(anime_df["type"].value_counts())
    encoded_df = pd.DataFrame({
        "anime_id": anime_df["anime_id"],
        f"{target_col}_count": count
    })
    
    return merge_by_anime_id(input_df, encoded_df)

In [30]:
def create_anime_studios_count_encoding(input_df: pd.DataFrame):
    target_col = "studios"
    count = anime_df[target_col].map(anime_df["type"].value_counts())
    encoded_df = pd.DataFrame({
        "anime_id": anime_df["anime_id"],
        f"{target_col}_count": count
    })
    
    return merge_by_anime_id(input_df, encoded_df)

In [31]:
def create_anime_producers_count_encoding(input_df: pd.DataFrame):
    target_col = "producers"
    count = anime_df[target_col].map(anime_df["type"].value_counts())
    encoded_df = pd.DataFrame({
        "anime_id": anime_df["anime_id"],
        f"{target_col}_count": count
    })
    
    return merge_by_anime_id(input_df, encoded_df)

In [32]:
def create_anime_animeid_count_encoding(input_df: pd.DataFrame):
    target_col = "anime_id"
    count = anime_df[target_col].map(anime_df["type"].value_counts())
    encoded_df = pd.DataFrame({
        "anime_id": anime_df["anime_id"],
        f"{target_col}_count": count
    })
    
    return merge_by_anime_id(input_df, encoded_df)

#### one-hot encoding

In [33]:
# animeのtypeをone-hotエンコーディング
def create_anime_type_one_hot_encoding(input_df: pd.DataFrame):
    # 対象の列のユニーク集合を取る
    target_colname = "type"
    target_series = anime_df[target_colname]
    unique_values = target_series.unique()

    # ユニークな値ごとに列を作る
    out_df = pd.DataFrame()
    for value in unique_values:
        is_value = target_series == value
        if value == "Unknown":
            out_df["Unknown_type"] = is_value.astype(int)
        else:
            out_df[value] = is_value.astype(int)
    
    out_df["anime_id"] = anime_df["anime_id"]
    return merge_by_anime_id(input_df, out_df)

In [34]:
# animeのtypeをone-hotエンコーディング
def create_anime_rating_one_hot_encoding(input_df: pd.DataFrame):
    # 対象の列のユニーク集合を取る
    target_colname = "rating"
    target_series = anime_df[target_colname]
    unique_values = target_series.unique()

    # ユニークな値ごとに列を作る
    out_df = pd.DataFrame()
    for value in unique_values:
        is_value = target_series == value
        if value == "Unknown":
            out_df["Unknown_rate"] = is_value.astype(int)
        else:
            out_df[value] = is_value.astype(int)
    
    out_df["anime_id"] = anime_df["anime_id"]
    return merge_by_anime_id(input_df, out_df)

In [35]:
# 上で定義した関数をまとめて実行
def create_feature(input_df, config_):
    # functions に特徴量作成関数を配列で定義しました.
    # どの関数も同じ input / output のインターフェイスなので for で回せて嬉しいですね ;)
    functions = [
        create_anime_numeric_feature,
        # label encoding
        create_anime_genres_label_encoding, 
        create_anime_source_label_encoding, 
        # count encoding
        # create_anime_type_count_encoding,
        create_anime_studios_count_encoding,
        create_anime_producers_count_encoding,
        create_anime_animeid_count_encoding, 
        # one-hot encoding
        create_anime_type_one_hot_encoding,
        create_anime_rating_one_hot_encoding,
    ]
    
    out_df = pd.DataFrame()
    func_name_list = []
    for func in functions:
        func_name = str(func.__name__)
        func_name_list.append(func_name)
        with Timer(prefix=f"create {func_name}"):
            _df = func(input_df)
        out_df = pd.concat([out_df, _df], axis=1)
    
    config_.preprocess_funcs = func_name_list
    return out_df, config_

In [36]:
input_df = pd.concat([train_df, train_feat_df], axis=1)
input_df = input_df.drop(["user_id", "anime_id"], axis=1)
input_df = input_df.fillna(0)

In [37]:
input_df.columns

Index(['wo_score_user_factor_0', 'wo_score_user_factor_1',
       'wo_score_user_factor_2', 'wo_score_user_factor_3',
       'wo_score_user_factor_4', 'wo_score_user_factor_5',
       'wo_score_user_factor_6', 'wo_score_user_factor_7',
       'wo_score_user_factor_8', 'wo_score_user_factor_9',
       'wo_score_user_factor_10', 'wo_score_user_factor_11',
       'wo_score_user_factor_12', 'wo_score_user_factor_13',
       'wo_score_user_factor_14', 'wo_score_user_factor_15',
       'wo_score_user_factor_16', 'wo_score_user_factor_17',
       'wo_score_user_factor_18', 'wo_score_user_factor_19',
       'wo_score_user_factor_20', 'wo_score_user_factor_21',
       'wo_score_user_factor_22', 'wo_score_user_factor_23',
       'wo_score_user_factor_24', 'wo_score_user_factor_25',
       'wo_score_user_factor_26', 'wo_score_user_factor_27',
       'wo_score_user_factor_28', 'wo_score_user_factor_29',
       'wo_score_user_factor_30', 'wo_score_user_factor_31',
       'wo_score_user_factor_32', 

# Train

In [38]:
models, oof_pred = fit_nn_model(input_df)

feature nums =  85
epoch 0 train_loss 5065508.8644 valid_loss 10532.0888
epoch 10 train_loss 3.8804 valid_loss 3.4119
epoch 20 train_loss 3.0082 valid_loss 3.0079
fold 0:  rmse 1.5794281581309637
feature nums =  85
epoch 0 train_loss 5899303.8610 valid_loss 4093.3482
epoch 10 train_loss 3.4508 valid_loss 3.3833
epoch 20 train_loss 3.0079 valid_loss 3.0075
fold 1:  rmse 1.5793642694455174
feature nums =  85
epoch 0 train_loss 1907817.3848 valid_loss 3923.4999
epoch 10 train_loss 3.3549 valid_loss 3.3127
epoch 20 train_loss 3.0077 valid_loss 3.0065
fold 2:  rmse 1.5793526644513813
feature nums =  85
epoch 0 train_loss 7334626.8646 valid_loss 2815.1057
epoch 10 train_loss 108.4727 valid_loss 114.6432
epoch 20 train_loss 3.0100 valid_loss 3.0093
fold 3:  rmse 1.5795963514237064
feature nums =  85
epoch 0 train_loss 5564094.3717 valid_loss 18567.3366
epoch 10 train_loss 4.2503 valid_loss 3.8570
epoch 20 train_loss 3.0095 valid_loss 3.0094
fold 4:  rmse 1.5795963514237064
OVERALL: rmse {rmse

# Result

# Inference

['svd', 'members', 'tyoe_count', 'TV', 'Special', 'Movie', 'Unknown','ONA', 'OVA', 'Music']

In [39]:
test_df = pd.concat([test_df, test_feat_df], axis=1)
test_df = test_df[feat_cols]
print(test_df.columns)
print(len(test_df.columns))

NameError: name 'feat_cols' is not defined

In [None]:
# k 個のモデルの予測を作成. shape = (5, N_test,).
pred = np.array([model.predict(test_df.values) for model in models])


# k 個のモデルの予測値の平均 shape = (N_test,).
pred = np.mean(pred, axis=0) # axis=0 なので shape の `k` が潰れる 

## Inference predictions

In [None]:
fig, ax = plt.subplots(figsize=(8, 6))

vmax = .02
bins = np.linspace(0, 10, 100)
ax.hist(pred, bins=bins, density=True, alpha=.5, label="Test")
ax.hist(oof, bins=bins, density=True, alpha=.5, label="OutOfFold")
ax.grid()
ax.legend()
# ax.set_title("テストと学習時の予測傾向差分")

fig.tight_layout()

# save infer csv

In [None]:
pd.DataFrame({
    "score": pred
}).to_csv(os.path.join(configs.OUTPUT_DIR, "submission.csv"), index=False)


In [None]:
oof_score