# ProbSpace: YouTube動画視聴回数予測

In [None]:
# Name of the output directory; how it is used later is outside this view.
out_dir = "out_tmp"
# IPython shell escape: create the directory, no error if it already exists.
!mkdir -p $out_dir

In [None]:
import pandas as pd
import numpy as np
import scipy

import itertools
import os, datetime, gc, re, random
import time, datetime
import pickle
from tqdm.notebook import tqdm
from imblearn.over_sampling import SMOTE
import optuna
import bhtsne, umap

from janome.tokenizer import Tokenizer
from janome.analyzer import Analyzer
from janome.tokenfilter import *
from janome.charfilter import UnicodeNormalizeCharFilter, RegexReplaceCharFilter
import unicodedata

import lightgbm as lgb
import xgboost as xgb
from catboost import Pool, CatBoostRegressor, CatBoostClassifier

from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.linear_model import BayesianRidge, ElasticNet, Lasso, LogisticRegression, Ridge, SGDRegressor
from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor
from sklearn.ensemble import StackingRegressor, VotingRegressor
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor

from ngboost import NGBRegressor
from ngboost.ngboost import NGBoost
from ngboost.learners import default_tree_learner
from ngboost.scores import MLE, CRPS, LogScore
from ngboost.distns import Normal, LogNormal

from sklearn.linear_model import BayesianRidge, ElasticNet, Lasso, LogisticRegression, Ridge, SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.cluster import KMeans, MiniBatchKMeans, DBSCAN
from sklearn.model_selection import KFold, RepeatedKFold, StratifiedKFold, cross_validate, cross_val_predict, train_test_split
from sklearn.metrics import mean_squared_error, roc_auc_score, roc_curve, log_loss
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, Normalizer, RobustScaler, QuantileTransformer, PowerTransformer
from sklearn.feature_selection import SelectFromModel, RFE, SelectPercentile, SelectKBest

import tensorflow as tf
import tensorflow_addons as tfa

from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras import backend as K
from tensorflow.keras import utils
from tensorflow.keras.metrics import MeanSquaredError
from tensorflow.keras.optimizers import Adadelta, Adagrad, Adam, Adamax, Ftrl, Nadam, RMSprop, SGD
from tensorflow.keras.regularizers import l1, l2, l1_l2
from tensorflow.keras.callbacks import LearningRateScheduler, EarlyStopping, TensorBoard, LambdaCallback, ReduceLROnPlateau

from tensorflow.keras import layers
from tensorflow.keras.layers import Concatenate, Lambda
from tensorflow.keras.layers import Activation, Average, Dense, Dropout, Flatten, BatchNormalization, LeakyReLU, Input
from tensorflow.keras.layers import GaussianDropout, GaussianNoise
from tensorflow.keras.layers import Conv2D, SeparableConv2D, MaxPooling2D, AveragePooling2D, GlobalAveragePooling2D

from tensorflow.keras.losses import MeanSquaredLogarithmicError

import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno 

import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Raise the pandas display limits so wide / long frames are shown in full.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 100)

In [None]:
# Timestamp taken when this cell runs (naive local time).
start = datetime.datetime.now()

In [None]:
# Function for variable description
def description(df):
    """Build a one-row-per-column summary table for *df*.

    The result has the columns ``Name``, ``dtypes``, ``Missing`` (null
    count), ``Uniques`` (``nunique``), ``Mean``, ``Std``, ``Minimum``,
    ``Maximum``, the first three row values, and ``dimension`` (the shape
    of *df* as a string, repeated on every row).

    The NaN-aware numpy reductions are applied to the whole frame, so this
    presumably expects every column to be numeric - TODO confirm.  The
    frame must have at least three rows because ``df.iloc[2]`` is read.
    """
    summary = pd.DataFrame(df.dtypes, columns=['dtypes'])
    summary = summary.reset_index()
    summary["Name"] = summary['index']
    summary = summary[["Name",'dtypes']]
    summary["Missing"] = df.isnull().sum().values
    summary["Uniques"] = df.nunique().values
    # NOTE(review): astype() is handed the whole dtypes Series rather than a
    # single dtype - confirm the resulting cast is what is intended.
    summary["Mean"] = np.nanmean(df, axis=0).astype(df.dtypes)
    summary["Std"] = np.nanstd(df, axis=0).astype(df.dtypes)
    summary["Minimum"] = np.nanmin(df, axis=0).astype(df.dtypes)
    summary["Maximum"] = np.nanmax(df, axis=0).astype(df.dtypes)
    # Sample values: the first three rows, positionally.
    summary["First Value"] = df.iloc[0].values
    summary["Second Value"] = df.iloc[1].values
    summary["Third Value"] = df.iloc[2].values
    summary["dimension"] = str(df.shape)
    return summary

In [None]:
def get_hist(target):
    """Draw a 100-bin histogram of *target* and print max, min, mean and std.

    Each statistic is printed right-aligned in a 10-character field with
    thousands separators and six decimals.  Returns ``None``.
    """
    plt.hist(target, bins=100)

    # (label, name of the reduction method on `target`), in print order.
    report = (
        ("max:  ", "max"),
        ("min:  ", "min"),
        ("mean: ", "mean"),
        ("std:  ", "std"),
    )
    for label, stat in report:
        value = getattr(target, stat)()
        print(f"{label}{value:>10,.6f}")

def get_hist4(target1, title1, target2, title2, target3, title3, target4, title4):
    """Plot four titled histograms stacked vertically plus an overlay panel.

    Panels 1-4 show ``target1`` .. ``target4`` (100 bins each) under their
    respective titles.  The fifth panel, titled "OVERALL", overlays the
    first three targets in red, green and blue; ``target4`` is not drawn
    there.  Returns ``None``.
    """
    fig = plt.figure(figsize=(18, 18))

    axes = [fig.add_subplot(5, 1, row) for row in range(1, 6)]
    panels, overlay = axes[:4], axes[4]
    samples = (target1, target2, target3, target4)

    for ax, title in zip(panels, (title1, title2, title3, title4)):
        ax.set_title(title)
    overlay.set_title("OVERALL")

    for ax, sample in zip(panels, samples):
        ax.hist(sample, bins=100)

    # Only the first three samples are overlaid; the fourth is left out.
    for sample, colour in zip(samples[:3], ('red', 'green', 'blue')):
        overlay.hist(sample, bins=100, alpha=0.2, color=colour)

    fig.show()


In [None]:
def plot_history(history):
    """Plot training and validation RMSE per epoch.

    Parameters
    ----------
    history
        Object exposing ``.history`` (a mapping that must contain
        ``'root_mean_squared_error'`` and
        ``'val_root_mean_squared_error'``) and ``.epoch``; presumably the
        value returned by a Keras ``model.fit`` call - verify against caller.

    The y axis is clipped to [0, 5].
    """
    hist = pd.DataFrame(history.history)
    hist['epoch'] = history.epoch

    plt.figure()
    plt.xlabel('Epoch')
    # The plotted series are root-mean-squared errors, so label them as such.
    plt.ylabel('Root Mean Squared Error [y]')
    plt.plot(hist['epoch'], hist['root_mean_squared_error'],
             label='Train Error')
    plt.plot(hist['epoch'], hist['val_root_mean_squared_error'],
             label='Val Error')
    plt.ylim([0, 5])
    plt.legend()

## Load Data

In [None]:
%%time

# Load the raw train / test tables.
train_data = pd.read_csv("./input/train_data.csv")
test_data = pd.read_csv("./input/test_data.csv")

# Target is modelled on the log1p scale.
y = np.log1p(train_data['y']).copy()
test_id = test_data.id

# Feature frames: drop the identifier (and the target for train).
train = train_data.drop(['id', 'y'], axis=1).copy()
test  = test_data.drop(['id'], axis=1).copy()

# The raw count columns are log1p-transformed as well.
cols_to_log = ['likes', 'dislikes', 'comment_count']
train[cols_to_log] = np.log1p(train[cols_to_log])
test[cols_to_log]  = np.log1p(test[cols_to_log])

# Train rows first, then test rows, with a fresh 0..n-1 index.
traintest = pd.concat([train, test]).reset_index(drop=True)


## 目的変数の分布

In [None]:
# Histogram and summary statistics of the log1p-transformed target.
get_hist(y)

## missing data

In [None]:
# Per-column missing-value count and percentage for the training table.
n_rows = train_data.shape[0]
for col in train_data.columns:
    n_missing = train_data[col].isnull().sum()
    print(f"{col:<20}: {n_missing} ({n_missing / n_rows * 100:.2f}%)")

In [None]:
# Per-column missing-value count and percentage for the test table.
n_rows = test_data.shape[0]
for col in test_data.columns:
    n_missing = test_data[col].isnull().sum()
    print(f"{col:<20}: {n_missing} ({n_missing / n_rows * 100:.2f}%)")

# seedの固定化

In [None]:
def seed_everything(seed=1234):
    """Seed the Python, NumPy and TensorFlow RNGs and export PYTHONHASHSEED.

    NOTE(review): PYTHONHASHSEED is written to ``os.environ`` of the running
    process; whether that affects the current interpreter's string hashing
    should be confirmed.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)


# 特徴量生成

In [None]:
# Precompiled patterns shared by the text-feature helpers below.
_ALNUM_ONLY_RE = re.compile(r'^[a-zA-Z0-9]+$')
_ALNUM_CHAR_RE = re.compile(r'[a-zA-Z0-9]')


def _contains_japanese(text):
    """Return True if any character's Unicode name marks it as kanji or kana."""
    for ch in text:
        try:
            name = unicodedata.name(ch)
        except ValueError:
            # unicodedata.name raises ValueError for unnamed characters.
            continue
        if "CJK UNIFIED" in name or "HIRAGANA" in name or "KATAKANA" in name:
            return True
    return False


def _is_ascii_alnum(text):
    """Return True if *text* matches ``^[a-zA-Z0-9]+$``."""
    return _ALNUM_ONLY_RE.match(text) is not None


def _collection_date_to_iso(stamp):
    """Turn a 'yy.dd.mm' string into '20yy-mm-ddT00:00:00.000Z'."""
    parts = stamp.split('.')
    return '20' + parts[0] + '-' + parts[2] + '-' + parts[1] + 'T00:00:00.000Z'


def create_features(df, features):
    """Add the hand-crafted base features to *df*.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``title``, ``tags``, ``description``, ``likes``,
        ``dislikes``, ``comment_count``, ``comments_disabled``,
        ``ratings_disabled``, ``channelId``, ``channelTitle``,
        ``categoryId``, ``publishedAt`` and ``collection_date``.
        It is modified in place.
    features : dict
        Lists under the keys ``"cat"``, ``"num"`` and ``"ohe"``; the names
        of the new columns are appended to them in place.

    Returns
    -------
    (df, features)
        The same two objects that were passed in.
    """
    # ------------------------------------------
    # Missing values
    # ------------------------------------------
    df['tags'] = df['tags'].fillna("[none]")
    # Missing descriptions fall back to "<tags with spaces><title>".
    # `.str.replace` is required here: `Series.replace` only matches whole
    # cell values and would leave the '|' separators untouched.
    fallback_description = df['tags'].str.replace("|", " ", regex=False) + df['title']
    df['description'] = df['description'].fillna(fallback_description)

    # Rows without tags get the nouns of their lower-cased title instead.
    tokenizer = Tokenizer()
    char_filters = [UnicodeNormalizeCharFilter(), RegexReplaceCharFilter(r"[0123456789!#$%&()=~|\-^\\@`{;:+*},./\<>?_♪®」—]", "")]
    token_filters = [POSKeepFilter(['名詞'])]
    analyzer = Analyzer(char_filters=char_filters, tokenizer=tokenizer, token_filters=token_filters)

    no_tags = df['tags'] == "[none]"
    df.loc[no_tags, 'tags'] = df.loc[no_tags, 'title'].str.lower().apply(
        lambda text: "|".join(token.surface for token in analyzer.analyze(text)))

    # ------------------------------------------
    # Binning
    # ------------------------------------------
    count_cols = ['likes', 'dislikes', 'comment_count']

    # Quantile bins (up to 10000, duplicate edges dropped), as integer codes.
    for col in count_cols:
        df[f'{col}_qcut'] = pd.qcut(df[col], 10000, labels=False, duplicates='drop')
    features["cat"] += [f'{col}_qcut' for col in count_cols]

    # Equal-width bins, ceil(max + 1) of them, left-closed, as integer codes.
    for col in count_cols:
        n_bins = np.ceil(df[col].max() + 1).astype('int')
        df[f'{col}_cut'] = pd.cut(df[col], n_bins, right=False, labels=False,
                                  duplicates='drop', include_lowest=True)
    features["cat"] += [f'{col}_cut' for col in count_cols]

    # ------------------------------------------
    # Ratios (+1 in the denominator avoids division by zero)
    # ------------------------------------------
    df['likes_dislikes_ratio'] = df['likes'] / (df['dislikes'] + 1)
    df['comment_count_likes_ratio'] = df['comment_count'] / (df['likes'] + 1)
    df['comment_count_dislikes_ratio'] = df['comment_count'] / (df['dislikes'] + 1)

    features["num"] += ['likes_dislikes_ratio', 'comment_count_likes_ratio', 'comment_count_dislikes_ratio']

    # ------------------------------------------
    # Products with the "disabled" flags
    # ------------------------------------------
    df['mul_likes_comments_disabled'] = df['likes'] * df['comments_disabled']
    df['mul_dislikes_comments_disabled'] = df['dislikes'] * df['comments_disabled']
    df['mul_comment_count_ratings_disabled'] = df['comment_count'] * df['ratings_disabled']

    features["num"] += ['mul_likes_comments_disabled', 'mul_dislikes_comments_disabled', 'mul_comment_count_ratings_disabled']

    # ------------------------------------------
    # Frequency encoding
    # ------------------------------------------
    for col in ['channelId', 'channelTitle', 'categoryId']:
        freq_name = f'freq_{col}'
        df[freq_name] = df[col].map(df[col].value_counts())
        features["num"] += [freq_name]

    # ------------------------------------------
    # Number of '|'-separated tags
    # ------------------------------------------
    df['n_tags'] = df['tags'].astype(str).apply(lambda tags: len(tags.split("|")))
    features["num"] += ['n_tags']

    # ------------------------------------------
    # Description / title length features
    # ------------------------------------------
    df['http_count_in_desc'] = df['description'].apply(lambda text: text.lower().count("http"))
    df['len_description'] = df['description'].apply(len)
    df['len_title'] = df['title'].apply(len)

    features["num"] += ['http_count_in_desc', 'len_description', 'len_title']

    # ------------------------------------------
    # Script / language features of title, tags and description
    # ------------------------------------------
    text_cols = ['title', 'tags', 'description']

    # Contains at least one kanji / hiragana / katakana character.
    for col in text_cols:
        df[f'isJa_{col}'] = df[col].apply(_contains_japanese)
    features["ohe"] += [f'isJa_{col}' for col in text_cols]

    # Consists of ASCII letters and digits only.
    for col in text_cols:
        df[f'isalnum_{col}'] = df[col].apply(_is_ascii_alnum)
    features["ohe"] += [f'isalnum_{col}' for col in text_cols]

    # Number of ASCII letters / digits.
    for col in text_cols:
        df[f'inclEn_{col}'] = df[col].apply(lambda text: len(_ALNUM_CHAR_RE.findall(text.lower())))
    features["num"] += [f'inclEn_{col}' for col in text_cols]

    # ------------------------------------------
    # Publication / collection date parts and elapsed-time features
    # ------------------------------------------
    df['publishedAt'] = pd.to_datetime(df['publishedAt'], utc=True)
    published_parts = ['year', 'month', 'day', 'hour', 'minute', 'second', 'dayofweek']
    for part in published_parts:
        df[f'publishedAt_{part}'] = getattr(df['publishedAt'].dt, part)

    df['collection_date'] = df['collection_date'].map(_collection_date_to_iso)
    df['collection_date'] = pd.to_datetime(df['collection_date'], utc=True)
    collection_parts = ['year', 'month', 'day']
    for part in collection_parts:
        df[f'collection_date_{part}'] = getattr(df['collection_date'].dt, part)

    # Whole days between publication and collection, plus transforms of it.
    df['delta'] = (df['collection_date'] - df['publishedAt']).dt.days
    # np.log maps delta == 0 to -inf and negative deltas to NaN; this is
    # left as is so the generated feature values do not change.
    df['log_delta'] = np.log(df['delta'])
    df['sqrt_delta'] = np.sqrt(df['delta'])
    df['pow_delta'] = df['delta'] ** 2
    df['log_pow_delta'] = np.log(df['delta']) ** 2
    # Days since the earliest publication / collection date in this frame.
    df['publishedAt_delta'] = (df['publishedAt'] - df['publishedAt'].min()).dt.days
    df['collection_delta'] = (df['collection_date'] - df['collection_date'].min()).dt.days

    features["cat"] += [f'publishedAt_{part}' for part in published_parts]
    features["cat"] += [f'collection_date_{part}' for part in collection_parts]

    features["num"] += ['delta', 'log_delta', 'sqrt_delta', 'pow_delta', 'log_pow_delta',
                        'publishedAt_delta', 'collection_delta']

    return df, features
    
def create_features2(df, features, cols_groupby, cols_transform, target_func, option):
    """Add group-wise aggregate features (and optional transforms of them).

    For every grouping column in ``cols_groupby``, every column in
    ``cols_transform`` that is not itself a grouping column, and every
    aggregation in ``target_func``, the column
    ``<group>_<col>_<func>`` = ``df.groupby(group)[col].transform(func)``
    is added.  For each key of ``option`` that is truthy an additional
    ``<group>_<col>_<key>_<func>`` column holds a transform of that
    aggregate:

    ========  =====================
    key       transform of ``x``
    ========  =====================
    log       ``log1p(x)``
    sqrt      ``sqrt(x)``
    sqrt_log  ``log1p(sqrt(x))``
    log_sqrt  ``sqrt(log1p(x))``
    pow       ``x ** 2``
    pow_log   ``2 * log1p(x)``
    log_pow   ``log1p(x) ** 2``
    ========  =====================

    Columns are generated in the order the caller lists them (duplicates
    dropped), so the resulting column order is reproducible across runs.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place.
    features : dict
        ``features['num']`` receives the new column names, in place.
    cols_groupby, cols_transform : sequence of column names
    target_func : sequence
        Aggregations accepted by ``GroupBy.transform``.
    option : dict
        Must contain all seven keys listed above (``KeyError`` otherwise).

    Returns
    -------
    (df, features)
        The same two objects that were passed in.
    """
    # Keep the caller's ordering; a set difference would make the column
    # order depend on string hashing.
    group_cols = set(cols_groupby)
    cols_to_transform = [c for c in dict.fromkeys(cols_transform) if c not in group_cols]

    # (option key, transform applied to the aggregated series), in output order.
    derived = (
        ("log", np.log1p),
        ("sqrt", np.sqrt),
        ("sqrt_log", lambda s: np.log1p(np.sqrt(s))),
        ("log_sqrt", lambda s: np.sqrt(np.log1p(s))),
        ("pow", lambda s: s ** 2),
        ("pow_log", lambda s: 2 * np.log1p(s)),
        ("log_pow", lambda s: np.log1p(s) ** 2),
    )

    for col_base in cols_groupby:
        for col in cols_to_transform:
            for func in target_func:
                # Aggregate once and reuse it for every derived column.
                aggregated = df.groupby(col_base)[col].transform(func)

                base_name = '_'.join(map(str, [col_base, col, func]))
                df[base_name] = aggregated
                features['num'] += [base_name]

                for key, transform in derived:
                    if option[key]:
                        name = '_'.join(map(str, [col_base, col, key, func]))
                        df[name] = transform(aggregated)
                        features['num'] += [name]

    return df, features

def create_features3(df, features, cols_transform, option):
    """Add element-wise transforms of the given columns.

    For each column in ``cols_transform`` and each truthy key of ``option``
    a column named ``<key>_<col>`` is added to ``df`` and its name appended
    to ``features["num"]``.  Keys and transforms of ``x``:
    ``log`` -> ``log1p(x)``, ``sqrt`` -> ``sqrt(x)``,
    ``sqrt_log`` -> ``log1p(sqrt(x))``, ``log_sqrt`` -> ``sqrt(log1p(x))``,
    ``pow`` -> ``x ** 2``, ``pow_log`` -> ``log1p(x ** 2)``,
    ``log_pow`` -> ``log1p(x) ** 2``.

    ``option`` must contain all seven keys.  ``df`` and ``features`` are
    modified in place and also returned.
    """
    # (option key, transform), listed in the order columns are emitted.
    recipes = (
        ("log", lambda s: np.log1p(s)),
        ("sqrt", lambda s: np.sqrt(s)),
        ("sqrt_log", lambda s: np.log1p(np.sqrt(s))),
        ("log_sqrt", lambda s: np.sqrt(np.log1p(s))),
        ("pow", lambda s: pow(s, 2)),
        ("pow_log", lambda s: np.log1p(pow(s, 2))),
        ("log_pow", lambda s: pow(np.log1p(s), 2)),
    )

    for col in cols_transform:
        for key, recipe in recipes:
            if not option[key]:
                continue
            new_col = f"{key}_{col}"
            df[new_col] = recipe(df[col])
            features["num"] += [new_col]

    return df, features

def create_features4(df, features, option):
    """Add 0/1 keyword-presence flags for title, tags and description.

    For every text source enabled in ``option`` (keys ``"title"``,
    ``"tags"``, ``"description"``) the lower-cased text is tokenised with
    janome, the tokens are joined with spaces, and each keyword of that
    source is searched as a substring.  A keyword found in at least one row
    yields an integer column ``title_<word>`` / ``tags_<word>`` /
    ``desc_<word>`` (1 where found, 0 elsewhere); keywords never found
    produce no column.  New column names are appended to
    ``features["ohe"]`` in order of first occurrence.

    Flags are assigned by row position, so the result does not depend on
    the labels of ``df.index``.

    ``df`` and ``features`` are modified in place and also returned.  When
    all three options are falsy nothing is done.
    """
    if not (option["title"] or option["tags"] or option["description"]):
        return df, features

    tokenizer = Tokenizer()

    # (option key / source column, column-name prefix, keywords)
    sources = (
        ("title", "title", ["video", "official"]),
        ("tags", "tags", ["music", "video", "official"]),
        ("description", "desc", ["com", "http"]),
    )
    # Keywords that share a flag column with another keyword.
    aliases = {"公式": "official"}

    active = [source for source in sources if option[source[0]]]
    lowered = [df[col].str.lower() for col, _, _ in active]

    n_rows = len(df)
    flags = {}  # column name -> 0/1 array; dict order == first-hit order

    for row, sentences in enumerate(tqdm(zip(*lowered))):
        for (_, prefix, words), sentence in zip(active, sentences):
            text = " ".join(tokenizer.tokenize(sentence, wakati=True))
            for word in words:
                # A substring test also covers the plural "<word>s".
                if word not in text:
                    continue
                name = f"{prefix}_{aliases.get(word, word)}"
                if name not in flags:
                    flags[name] = np.zeros(n_rows, dtype='int')
                flags[name][row] = 1

    for name, values in flags.items():
        df[name] = values
    features["ohe"] += list(flags)

    return df, features

# ラベルエンコーディング

In [None]:
def label_encoder(df, cols_to_encode=None):
    """Label-encode the given columns of *df* in place.

    Each column gets its own ``LabelEncoder`` fitted on that column's
    distinct non-null values; only the non-null cells are replaced by their
    integer codes, so missing values stay missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place.
    cols_to_encode : sequence of column names, optional
        Columns to encode; ``None`` (the default) encodes nothing.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    if cols_to_encode is None:
        cols_to_encode = []

    for column in cols_to_encode:
        distinct = pd.Series(df[column].unique())
        distinct = distinct[distinct.notnull()]

        encoder = LabelEncoder()
        encoder.fit(distinct)

        present = df[column].notnull()
        df.loc[present, column] = encoder.transform(df.loc[present, column])

    return df

# 標準化

In [None]:
def standardization(df, fnc_name="Standard", cols_to_std=None):
    """Rescale the selected columns of *df* in place with a sklearn scaler.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place.
    fnc_name : str
        One of ``"Standard"``, ``"MinMax"`` (range -1..1), ``"MaxAbs"``,
        ``"Normalize"`` (L2), ``"Robust"``, ``"Quantile"`` (normal output)
        or ``"yeo"`` (Yeo-Johnson).
    cols_to_std : sequence of column names, optional
        Columns to rescale.  With ``None`` or an empty sequence ``df`` is
        returned untouched.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.

    Raises
    ------
    KeyError
        If ``fnc_name`` is not one of the names above.
    """
    # Factories rather than instances: only the requested scaler is built.
    scaler_factories = {
        "Standard": lambda: StandardScaler(copy=True, with_mean=True, with_std=True),
        "MinMax": lambda: MinMaxScaler(feature_range=(-1, 1), copy=True),
        "MaxAbs": lambda: MaxAbsScaler(copy=True),
        "Normalize": lambda: Normalizer(norm='l2'),
        "Robust": lambda: RobustScaler(with_centering=True, with_scaling=True,
                                       quantile_range=(25.0, 75.0), copy=True),
        "Quantile": lambda: QuantileTransformer(n_quantiles=1000, output_distribution='normal',
                                                ignore_implicit_zeros=False, subsample=100000,
                                                random_state=None, copy=True),
        "yeo": lambda: PowerTransformer(method='yeo-johnson', standardize=True, copy=True),
    }
    make_scaler = scaler_factories[fnc_name]

    columns = [] if cols_to_std is None else cols_to_std
    if len(columns) == 0:
        # Nothing to scale; fitting a scaler on zero columns would fail.
        return df

    df[columns] = make_scaler().fit_transform(df[columns])

    return df

# featuresの削除

In [None]:
def drop_features(df, features, features_to_drop):
    """Drop columns from *df* and remove their names from the feature lists.

    ``features_to_drop`` and ``features`` are dicts with the keys ``"num"``,
    ``"cat"``, ``"date"`` and ``"ohe"``.  Every listed column is dropped
    from ``df`` in place and removed from the list of the same key in
    ``features``.  Both objects are returned.
    """
    groups = ("num", "cat", "date", "ohe")

    doomed = []
    for group in groups:
        doomed = doomed + features_to_drop[group]
    df.drop(doomed, axis=1, inplace=True)

    for group in groups:
        for col in features_to_drop[group]:
            features[group].remove(col)

    return df, features


# 学習/予測用データの準備 (#1)

In [None]:
def check_features(df, features):
    """Print the column count of *df* next to the sizes of the feature lists.

    The first line compares ``len(df.columns)`` with the summed lengths of
    ``features['num']``, ``['cat']``, ``['date']`` and ``['ohe']``; the
    second line gives the per-group breakdown.  Returns ``None``.
    """
    rule = "-" * 40

    print(rule)
    n_num = len(features['num'])
    n_cat = len(features['cat'])
    n_date = len(features['date'])
    n_ohe = len(features['ohe'])
    print(f"実特徴量数: {len(df.columns)} / 計算上の特徴量数: {n_num+n_cat+n_date+n_ohe}")
    print(rule)
    print(f"(内訳) num_features: {n_num}, cat_features: {n_cat}, date_features: {n_date}, ohe_features: {n_ohe}")
    print(rule)

In [None]:
%%time

seed_everything(seed=47)

# Train rows first, then test rows, so features are built on both at once.
df = pd.concat([train, test]).reset_index(drop=True)

# Fold category 43 into category 30 (reason not visible here - TODO confirm).
df.loc[df['categoryId']==43, 'categoryId'] = 30

# Feature bookkeeping: column names grouped by kind.
features = {"cat": ['video_id', 'title', 'channelId', 'channelTitle', 'categoryId', 'tags', 'thumbnail_link', 'description', 'comments_disabled', 'ratings_disabled'],
            "date": ['publishedAt', 'collection_date'],
            "num": ['likes', 'dislikes', 'comment_count'],
            "ohe": []
           }

# Label-encode categoryId before the base features are generated.
print("ラベルエンコーディング")
df = label_encoder(df, cols_to_encode=['categoryId'])

# Base feature generation.
print("特徴量生成")
df, features = create_features(df, features)

# Keyword flags; every source is disabled here, so this call is a no-op.
print("特徴量生成4")
df, features = create_features4(df, features, option={"title": False, "tags": False, "description": False})

# Label-encode every categorical and date column.
print("ラベルエンコーディング")
df = label_encoder(df, cols_to_encode=features["cat"]+features["date"])
    
# Group-wise aggregates (max / min / mean) per the listed grouping columns.
print("特徴量生成2 categoryId - likes/dislikes/comment_count/channelId")
df, features = create_features2(df, features,
                                cols_groupby=['categoryId', 'comments_disabled', 'ratings_disabled', 'n_tags', 'len_description'],
                                cols_transform=['likes', 'dislikes', 'comment_count', 'channelId'],
                                                #'likes_cut', 'dislikes_cut', 'comment_count_cut',
                                                #'likes_qcut', 'dislikes_qcut', 'comment_count_qcut'],
                                target_func=['max', 'min', 'mean'],
                                option={"log": False,\
                                        "sqrt": False, "sqrt_log": False, "log_sqrt": False,\
                                        "pow": False, "pow_log": False, "log_pow": False})

# Per-channel aggregates of every num / cat / ohe feature collected so far.
# NOTE(review): list(set(...)) hands the columns over in hash order - confirm
# the resulting column order does not need to be reproducible.
print("特徴量生成2 channelId - all features")
df, features = create_features2(df, features,
                                cols_groupby=['channelId'],
                                cols_transform=list(set(features['num'] + features['cat'] + features['ohe'])),
                                target_func=['max', 'min', 'mean'],
                                option={"log": False, \
                                        "sqrt": False, "sqrt_log": False, "log_sqrt": False,\
                                        "pow": False, "pow_log": False, "log_pow": False})
# sqrt / pow / log_pow transforms of every column whose name mentions
# likes, dislikes or comment_count.
print("特徴量生成3")
df, features = create_features3(df, features,
                                cols_transform=[c for c in df.columns if ('likes' in c)|('dislikes' in  c)|('comment_count' in c)],
                                option={"log": False,\
                                        "sqrt": True, "sqrt_log": False, "log_sqrt": False,\
                                        "pow": True, "pow_log": False, "log_pow": True})
# Compare the real column count with the bookkeeping.
print("特徴量のチェック")
check_features(df, features)

# Columns to drop: raw text / identifier columns and the raw date columns.
print("特徴量削除")
feats_to_drop = {"cat": ['video_id', 'channelId', 'title', 'channelTitle', 'tags', 'thumbnail_link', 'description'],
                 "date": ['publishedAt', 'collection_date'],
                 "ohe": [],
                 "num": []}

#-----------------------------------------------------------------------------------------------------------
# CV: 0.725691, LB: 0.723
#-----------------------------------------------------------------------------------------------------------
# Additionally drop the fine-grained publication-time parts.
feats_to_drop["cat"] += ['publishedAt_second', 'publishedAt_minute', 'publishedAt_hour', 'publishedAt_day', 'publishedAt_dayofweek']
df, features = drop_features(df, features, feats_to_drop)

# Drop constant columns (exactly one distinct value) and forget each of them
# in whichever feature group lists it - a constant column is not necessarily
# a "num" feature, and list.remove raises ValueError on a missing name.
constant_cols = list(df.columns[df.nunique() == 1])
for col in constant_cols:
    for group_cols in features.values():
        if col in group_cols:
            group_cols.remove(col)
df.drop(constant_cols, axis=1, inplace=True)

# Dtype conversion: everything to float, then cat / ohe columns back to int.
print("データ型変換")
#df[features["num"]] = df[features["num"]].astype('float32')
df = df.astype('float')
df[features["cat"]] = df[features["cat"]].astype('int')
df[features["ohe"]] = df[features["ohe"]].astype('int')

# Scale the numeric features to [-1, 1].
# NOTE(review): the scaler is fitted on train and test rows together.
print("標準化")
df = standardization(df, fnc_name="MinMax", cols_to_std=features["num"])

# Split back into train / test: the first len(y) rows are the training rows.
X_train = df.iloc[:y.shape[0], :].reset_index(drop=True)
X_test  = df.iloc[y.shape[0]:, :].reset_index(drop=True)

# Compare the real column count with the bookkeeping again.
print("特徴量のチェック")
check_features(df, features)

# Report the number of missing and infinite values in both matrices.
# NOTE(review): the last message spells 無現値数 where the others use 無限値数.
print(f"学習データ中の欠損値数: {X_train.isnull().sum().sum()}")
print(f"学習データ中の無限値数: {np.count_nonzero(np.isinf(X_train))}")
print(f"予測データ中の欠損値数: {X_test.isnull().sum().sum()}")
print(f"予測データ中の無現値数: {np.count_nonzero(np.isinf(X_test))}")


In [None]:
%%time

# Show the per-column summaries of the final train / test matrices.
display(description(X_train))
display(description(X_test))

# データ分割
### comments_disabled(True/False)によるデータ分割
---

In [None]:
%%time

# Row masks come from the raw `train` / `test` frames; they are applied to
# the engineered matrices, which share the same 0..n-1 row order.
train_is_false = train['comments_disabled']==False
test_is_false = test['comments_disabled']==False
train_is_true = train['comments_disabled']==True
test_is_true = test['comments_disabled']==True

# Subset with comments enabled (comments_disabled == False).
X_train_false, y_false = X_train[train_is_false], y[train_is_false]
X_test_false = X_test[test_is_false]
X_train_false_index, X_test_false_index = X_train_false.index, X_test_false.index

# Subset with comments disabled (comments_disabled == True).
X_train_true, y_true = X_train[train_is_true], y[train_is_true]
X_test_true = X_test[test_is_true]
X_train_true_index, X_test_true_index = X_train_true.index, X_test_true.index


# Single Model
---

In [None]:
N_SPLITS = 10         # number of cross-validation folds
SEED = 47             # random_state of the fold splitter
LEARNING_RATE = 5e-4  # initial learning rate; also the base of the LR schedule
BATCH_SIZE = 32       # mini-batch size passed to model.fit
PATIENCE = 20         # early-stopping patience, in epochs

In [None]:
def create_callbacks():
    """Return the Keras callbacks used for training.

    The list holds, in order, an ``EarlyStopping`` on ``val_loss`` with
    ``PATIENCE`` epochs of patience that restores the best weights, and a
    ``LearningRateScheduler`` that sets the rate to
    ``LEARNING_RATE * 0.95 ** epoch`` at every epoch.
    """
    def exponential_decay(epoch):
        # Reads LEARNING_RATE at call time, once per epoch.
        return LEARNING_RATE * 0.95 ** epoch

    early_stopping = EarlyStopping(monitor='val_loss',
                                   min_delta=0,
                                   patience=PATIENCE,
                                   verbose=0,
                                   mode='auto',
                                   baseline=None,
                                   restore_best_weights=True)

    return [early_stopping, LearningRateScheduler(exponential_decay)]

## nn model def.

In [None]:
def nn(lr, input_shape):
    """Build and compile the deeper fully-connected regression network.

    Architecture: ReLU / He-normal ``Dense`` layers of 256, 128, 64, 32,
    16, 8 and 8 units followed by a single linear output unit.

    Parameters
    ----------
    lr : float
        Learning rate of the Nadam optimizer.
    input_shape : int
        Number of input features (passed to the first layer as
        ``input_dim``).

    Returns
    -------
    A compiled ``Sequential`` model with mean-squared-error loss and a
    root-mean-squared-error metric.
    """
    hidden_units = [2 ** 7, 2 ** 6, 2 ** 5, 2 ** 4, 2 ** 3, 2 ** 3]

    stack = [Dense(2 ** 8, activation='relu', input_dim=input_shape, kernel_initializer='he_normal')]
    stack += [Dense(units, activation='relu', kernel_initializer='he_normal') for units in hidden_units]
    stack.append(Dense(1))
    model = Sequential(stack)

    # Compile with the Nadam optimizer and an MSE loss; only the optimizer
    # that is actually used is constructed.
    nadam_opt = Nadam(learning_rate=lr, beta_1=0.9, beta_2=0.999)
    model.compile(optimizer=nadam_opt, loss='mean_squared_error',
                  metrics=[tf.keras.metrics.RootMeanSquaredError()])

    return model

## nn2 model def.

In [None]:
def nn2(lr, input_shape):
    """Build and compile the shallower fully-connected regression network.

    The model is a stack of ReLU ``Dense`` layers with He-normal
    initialisation whose widths go 256 -> 128 -> 64 -> 8, followed by a
    single linear output unit.

    Args:
        lr: Learning rate handed to the Nadam optimizer.
        input_shape: Number of input features (forwarded as ``input_dim``
            of the first layer).

    Returns:
        A compiled Keras ``Sequential`` model that minimises mean squared
        error and tracks root mean squared error as its metric.
    """
    # Widths of the hidden layers that follow the 256-unit input layer.
    hidden_units = (2 ** 7, 2 ** 6, 2 ** 3)

    layers = [
        Dense(2 ** 8, activation='relu', input_dim=input_shape, kernel_initializer='he_normal'),
    ]
    layers.extend(
        Dense(units, activation='relu', kernel_initializer='he_normal')
        for units in hidden_units
    )
    layers.append(Dense(1))  # linear output for regression
    model = Sequential(layers)

    # Compile with the Nadam optimizer and an MSE loss. Only the optimizer
    # that is actually used is constructed, and it honours the `lr` argument.
    nadam_opt = Nadam(learning_rate=lr, beta_1=0.9, beta_2=0.999)

    model.compile(
        optimizer=nadam_opt,
        loss='mean_squared_error',
        metrics=[tf.keras.metrics.RootMeanSquaredError()],
    )

    return model

## nn model学習、予測

In [None]:
%%time

# Cross-validated training of the `nn` model: collects out-of-fold predictions
# for the training set, accumulates test predictions over the folds, then
# writes a submission CSV and pickles of both prediction vectors.
history, history_false = [], []
score, score_false = [], []
pred_train, pred_train_false = np.zeros((X_train.shape[0])), np.zeros((X_train_false.shape[0]))
pred_full_test, pred_full_test_false = 0, 0

# Folds are stratified on the `comments_disabled` column of `train`.
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

for fold_id, (train_idx, val_idx) in enumerate(tqdm(skf.split(X_train, (train.comments_disabled)))):
    print("*"*80)
    print(f"Started TF learning(1) fold:{fold_id+1} / {N_SPLITS}")

    # Train and predict on all data; a fresh model and fresh callbacks per fold.
    model = nn(lr=LEARNING_RATE, input_shape=X_train.shape[1])
    callbacks = create_callbacks()

    # Positional split of features and target for this fold.
    tr_X, val_X = X_train.iloc[train_idx].copy(), X_train.iloc[val_idx].copy()
    tr_y, val_y = y.iloc[train_idx], y.iloc[val_idx]

    history.append(model.fit(tr_X, tr_y, batch_size=BATCH_SIZE,
                             epochs=300,
                             verbose=2,
                             validation_data=(val_X, val_y),
                             callbacks=callbacks))

    # Out-of-fold predictions, flattened to 1-D before being stored.
    pred_train[val_idx] = model.predict(val_X).reshape(-1)
    score.append(model.evaluate(val_X, val_y, batch_size=BATCH_SIZE, verbose=0, return_dict=True))
    # Sum of the per-fold test predictions; divided by N_SPLITS below.
    pred_full_test = pred_full_test + model.predict(X_test)

    # Use the positionally selected `val_y` (not label-based `y[val_idx]`) so
    # the score matches the rows the model was validated on regardless of the
    # index carried by `y`. np.sqrt(MSE) avoids the deprecated `squared=False`.
    # NOTE(review): reported as RMSLE because the target is presumably
    # log1p-transformed (predictions are mapped back with expm1) - confirm.
    RMSLE = np.sqrt(mean_squared_error(val_y, pred_train[val_idx]))
    print(f"RMSLE={RMSLE}")

RMSLE_overall = np.sqrt(mean_squared_error(y, pred_train))
print(f"Overall RMSLE={RMSLE_overall}")

# Fold-averaged test prediction, flattened to 1-D (computed once, used twice).
test_pred_mean = (pred_full_test/N_SPLITS).reshape(-1)

# Make submission
print("Saving submission file")
submission = pd.DataFrame({'id': test_id, 'y': np.expm1(test_pred_mean)})
submission.to_csv(f"./{out_dir}/submission_NN_SEED{SEED}_FOLDS{N_SPLITS}_CV{RMSLE_overall:.6f}.csv", index=False)

# Persist the out-of-fold and averaged test predictions.
with open(f"./{out_dir}/NN_train_SEED{SEED}_FOLDS{N_SPLITS}.pickle", 'wb') as f:
    pickle.dump(pred_train, f)
with open(f"./{out_dir}/NN_test_SEED{SEED}_FOLDS{N_SPLITS}.pickle", 'wb') as f:
    pickle.dump(test_pred_mean, f)


In [None]:
%%time

# Show the last PATIENCE + 2 logged epochs of every fold. Iterating over the
# recorded histories directly (instead of indexing with range(N_SPLITS)) keeps
# this cell usable even if training stopped before all folds were run.
for fold_history in history:
    hist = pd.DataFrame(fold_history.history)
    hist['epoch'] = fold_history.epoch
    display(hist.tail(PATIENCE+2))


In [None]:
%%time

# Plot the training curves of every fold that was actually recorded; looping
# over `history` itself avoids an IndexError when fewer than N_SPLITS folds ran.
for fold_history in history:
    plot_history(fold_history)

## nn2 model学習、予測

In [None]:
%%time

# Cross-validated training of the `nn2` model: collects out-of-fold predictions
# for the training set, accumulates test predictions over the folds, then
# writes a submission CSV and pickles of both prediction vectors.
history, history_false = [], []
score, score_false = [], []
pred_train, pred_train_false = np.zeros((X_train.shape[0])), np.zeros((X_train_false.shape[0]))
pred_full_test, pred_full_test_false = 0, 0

# Folds are stratified on the `comments_disabled` column of `train`.
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

for fold_id, (train_idx, val_idx) in enumerate(tqdm(skf.split(X_train, (train.comments_disabled)))):
    print("*"*80)
    print(f"Started TF learning(1) fold:{fold_id+1} / {N_SPLITS}")

    # Train and predict on all data; a fresh model and fresh callbacks per fold.
    # This section trains the shallower architecture, so build it with nn2()
    # to match the NN2 output files written below.
    model = nn2(lr=LEARNING_RATE, input_shape=X_train.shape[1])
    callbacks = create_callbacks()

    # Positional split of features and target for this fold.
    tr_X, val_X = X_train.iloc[train_idx].copy(), X_train.iloc[val_idx].copy()
    tr_y, val_y = y.iloc[train_idx], y.iloc[val_idx]

    history.append(model.fit(tr_X, tr_y, batch_size=BATCH_SIZE,
                             epochs=300,
                             verbose=2,
                             validation_data=(val_X, val_y),
                             callbacks=callbacks))

    # Out-of-fold predictions, flattened to 1-D before being stored.
    pred_train[val_idx] = model.predict(val_X).reshape(-1)
    score.append(model.evaluate(val_X, val_y, batch_size=BATCH_SIZE, verbose=0, return_dict=True))
    # Sum of the per-fold test predictions; divided by N_SPLITS below.
    pred_full_test = pred_full_test + model.predict(X_test)

    # Use the positionally selected `val_y` (not label-based `y[val_idx]`) so
    # the score matches the rows the model was validated on regardless of the
    # index carried by `y`. np.sqrt(MSE) avoids the deprecated `squared=False`.
    # NOTE(review): reported as RMSLE because the target is presumably
    # log1p-transformed (predictions are mapped back with expm1) - confirm.
    RMSLE = np.sqrt(mean_squared_error(val_y, pred_train[val_idx]))
    print(f"RMSLE={RMSLE}")

RMSLE_overall = np.sqrt(mean_squared_error(y, pred_train))
print(f"Overall RMSLE={RMSLE_overall}")

# Fold-averaged test prediction, flattened to 1-D (computed once, used twice).
test_pred_mean = (pred_full_test/N_SPLITS).reshape(-1)

# Make submission
print("Saving submission file")
submission = pd.DataFrame({'id': test_id, 'y': np.expm1(test_pred_mean)})
submission.to_csv(f"./{out_dir}/submission_NN2_SEED{SEED}_FOLDS{N_SPLITS}_CV{RMSLE_overall:.6f}.csv", index=False)

# Persist the out-of-fold and averaged test predictions.
with open(f"./{out_dir}/NN2_train_SEED{SEED}_FOLDS{N_SPLITS}.pickle", 'wb') as f:
    pickle.dump(pred_train, f)
with open(f"./{out_dir}/NN2_test_SEED{SEED}_FOLDS{N_SPLITS}.pickle", 'wb') as f:
    pickle.dump(test_pred_mean, f)


In [None]:
%%time

# Show the last PATIENCE + 2 logged epochs of every fold. Iterating over the
# recorded histories directly (instead of indexing with range(N_SPLITS)) keeps
# this cell usable even if training stopped before all folds were run.
for fold_history in history:
    hist = pd.DataFrame(fold_history.history)
    hist['epoch'] = fold_history.epoch
    display(hist.tail(PATIENCE+2))


In [None]:
%%time

# Plot the training curves of every fold that was actually recorded; looping
# over `history` itself avoids an IndexError when fewer than N_SPLITS folds ran.
for fold_history in history:
    plot_history(fold_history)

In [None]:
# Report how much time has passed since `start` was set.
elapsed = datetime.datetime.now() - start
print(elapsed)