In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt 
import plotly.express as px 
import seaborn as sns 
import os 
import requests, json 
import cv2 
from PIL import Image 
import math 
from tqdm import tqdm 
from typing import Dict 
import wandb 
from wandb.keras import WandbCallback
from kaggle_secrets import UserSecretsClient 
import time 
from statistics import mean
import random 
from wordcloud import WordCloud
import gc 

import re 
import nltk 
from nltk.corpus import stopwords

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error 
from sklearn.model_selection import StratifiedKFold 

import tensorflow as tf 
from tensorflow import keras 
from tensorflow.keras.layers import Input, Dense, Embedding, GRU, Flatten, Dropout, concatenate, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.callbacks import LearningRateScheduler, ReduceLROnPlateau

In [None]:
!apt-get install p7zip
!p7zip -d -f -k /kaggle/input/mercari-price-suggestion-challenge/train.tsv.7z
!unzip -o /kaggle/input/mercari-price-suggestion-challenge/sample_submission_stg2.csv.zip
!unzip -o /kaggle/input/mercari-price-suggestion-challenge/test_stg2.tsv.zip

In [None]:
# Run-wide settings read by the cells below.
config = {
    # bookkeeping / run labels
    "competition": "mercari",
    "train": True,
    "type": "train",
    "inferece": True,  # (sic) key spelling kept exactly as it was
    "debug": False,
    "model_name": "rnn",
    "device": "cpu",

    # cross-validation and training hyper-parameters
    "n_fold": 4,
    "seed": 42,
    "batch_size": 20000,
    "epoch": 10,
}

In [None]:
# user_secrets = UserSecretsClient()
# url = user_secrets.get_secret("WEB_HOOK_URL") 

# user_secrets = UserSecretsClient()
# api = user_secrets.get_secret("wandb_api")


# def setup_db(fold):
#     if fold == 0:
#         wandb.login(key=api)
#     run = wandb.init(
#         project = config["competition"], 
#         name = config["model_name"], 
#         config = config, 
#         group = config["model_name"], 
#         job_type = config["type"]
#     )

# def slack(txt):
#     requests.post(url, data=json.dumps({
#         "username": "kaggle", 
#         "text": txt 
#     }))

In [None]:
def reduce_mem_usage(train_data):
    """Downcast the columns of a DataFrame in place to reduce its memory usage.

    * signed-integer columns are cast to the narrowest of int8/int16/int32/int64
      whose range contains the column's min and max (bounds are inclusive);
    * float columns are cast to float16 or float32 when min and max fit,
      otherwise to float64;
    * object / pandas-string columns are cast to ``category``;
    * every other dtype (bool, unsigned int, category, datetime, nullable
      extension types, ...) is left untouched, which also makes the function
      safe to call twice on the same frame.

    NOTE: only the value *range* is checked for floats, so float16/float32
    downcasting can lose precision.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``train_data`` object.
    """
    start_mem = train_data.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in train_data.columns:
        col_type = train_data[col].dtype

        # Text columns become categoricals.
        if col_type == object or isinstance(col_type, pd.StringDtype):
            train_data[col] = train_data[col].astype('category')
            continue

        # Only plain numpy signed-int / float columns are downcast.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'if':
            continue

        c_min = train_data[col].min()
        c_max = train_data[col].max()

        if col_type.kind == 'i':
            # An empty column yields NaN min/max; every comparison is then False
            # and the column keeps its dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    train_data[col] = train_data[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    train_data[col] = train_data[col].astype(candidate)
                    break
            else:
                # Out of float32 range, infinite, or all-NaN.
                train_data[col] = train_data[col].astype(np.float64)

    end_mem = train_data.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return train_data

In [None]:
%%time 

# Debug runs read only the first 8000 rows of each file; nrows=None reads the whole file.
row_limit = 8000 if config["debug"] else None

train = pd.read_csv("train.tsv", sep="\t", nrows=row_limit)
test = pd.read_csv("test_stg2.tsv", sep="\t", nrows=row_limit)

# Shrink both frames (train first, then test) before feature engineering.
train, test = reduce_mem_usage(train), reduce_mem_usage(test)

In [None]:
train.head()

# feature engineering. 

In [None]:
def get_top(x):
    """Return the first (top-level) entry of a split category path.

    Returns "" when ``x`` is not a list (e.g. NaN for a missing category)
    or when the list is empty.
    """
    if type(x) != list or len(x) < 1:
        return ""
    return x[0]
def get_sub(x):
    """Return the second (sub-category) entry of a split category path.

    Returns "" when ``x`` is not a list (e.g. NaN for a missing category)
    or when the path has fewer than two levels.
    """
    if type(x) != list or len(x) < 2:
        return ""
    return x[1]
def get_item(x):
    """Return the third (item-level) entry of a split category path.

    Returns "" when ``x`` is not a list (e.g. NaN for a missing category)
    or when the path has fewer than three levels.
    """
    if type(x) != list or len(x) < 3:
        return ""
    return x[2]
    
    
nltk.download('stopwords')
stop_words = stopwords.words('english')
# Frozen copy of the stop-word list so cln_txt tests membership in O(1).
_stop_word_set = frozenset(stop_words)
non_alphanums = re.compile(u'[^A-Za-z0-9]+')

def cln_txt(x):
    """Normalise a text string into space-separated lower-case alphanumeric tokens.

    Each whitespace-separated token is lower-cased and stripped of every
    character outside ``A-Za-z0-9``. Tokens that are English stop words
    (before or after stripping) or that end up empty are dropped.

    The stop-word test is done on the lower-cased token, so capitalised stop
    words ("The") and stop words with attached punctuation ("the,") are
    removed as well.

    Parameters
    ----------
    x : str
        Raw text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ("" if none survive).
    """
    new = []
    for t in x.split():
        lowered = t.lower()
        # Checked before stripping so contractions such as "don't" still match.
        if lowered in _stop_word_set:
            continue
        cleaned = non_alphanums.sub("", lowered)
        if cleaned == "" or cleaned in _stop_word_set:
            continue
        new.append(cleaned)
    return " ".join(new)

    
def add_feature(train, test):
    """Build the engineered feature frames for the train and test sets.

    Train (without ``price``/``train_id``) and test (without ``test_id``) are
    stacked so that every encoding is fitted on both, then split back.

    Columns added: ``category_isnan``, ``category_len``, ``top``, ``sub``,
    ``item``, ``no_brand``, ``not_discription``, ``discription_len``,
    ``name_len``, ``condition_shipping``.
    Columns rewritten: ``name`` and ``item_description`` (cleaned text via
    ``cln_txt``), ``brand_name`` (cleaned, then integer category codes).
    Column removed: ``category_name``.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain ``price`` and ``train_id`` plus the shared raw columns.
    test : pandas.DataFrame
        Must contain ``test_id`` plus the shared raw columns.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``train`` (with ``price`` re-attached) and ``test``; both are
        independent copies, so later column assignments on them are safe.
    """
    n_train = train.shape[0]
    df = pd.concat([train.drop(["price", "train_id"], axis=1), test.drop("test_id", axis=1)])
    y = train["price"]
    del train, test
    gc.collect()

    # Text columns are made categorical before the .apply calls below
    # (presumably so each function runs once per distinct value; verify
    # against the pandas version in use).
    df["name"] = df.name.astype("category")
    df["category_name"] = df.category_name.astype("category")
    df["brand_name"] = df.brand_name.astype("category")
    df["item_description"] = df.item_description.astype("category")

    # category name: missing flag, then split "a/b/c" into its levels
    df["category_isnan"] = df.category_name.isnull()
    df["category_isnan"] = df.category_isnan.apply(lambda x: 1 if x is True else 0)
    df["category_len"] =  df.category_name.apply(lambda x: x.split("/"))

    df["top"] = df.category_len.apply(get_top)
    df["sub"] = df.category_len.apply(get_sub)
    df["item"] = df.category_len.apply(get_item)
    # Non-list entries (missing categories) count as 0 levels.
    df["category_len"] = df.category_len.apply(lambda x: 0 if type(x) != list else len(x))

    df.drop(["category_name"], axis=1, inplace=True)

    # brand_name: missing flag
    df["no_brand"] = df.brand_name.isnull()
    df["no_brand"] = df.no_brand.apply(lambda x: 1 if x is True else 0)

    # item_description: placeholder flag and word count
    df["not_discription"] = df.item_description.apply(lambda x: 1 if x == "No description yet" else 0)
    df["item_description_dummy"] = df.item_description.apply(lambda x: "" if x == "No description yet" else x)
    df["discription_len"] = df.item_description_dummy.apply(lambda x: len(x.split()) if isinstance(x, str) else 0)

    df.drop("item_description_dummy", axis=1, inplace=True)

    # A missing description can come through the applies above as NaN. Treat it
    # as "no description" with zero words so the numeric features stay finite.
    df["not_discription"] = df["not_discription"].astype(float).fillna(1).astype(int)
    df["discription_len"] = df["discription_len"].astype(float).fillna(0).astype(int)

    # name: word count (0 for a missing name)
    df["name_len"] = df.name.apply(lambda x: len(x.split()) if isinstance(x, str) else 0)
    df["name_len"] = df["name_len"].astype(float).fillna(0).astype(int)

    # item condition and shipping flag combined into one categorical key
    df["condition_shipping"] = df["item_condition_id"].astype(str) + "_" + df["shipping"].astype(str)

    # label-encode the derived categorical columns
    cate_col = [ "top", "sub", "item", "condition_shipping"]
    for col in cate_col:
        la = LabelEncoder()
        df[col] = la.fit_transform(df[col])

    # text cleaning
    df["name"] = df.name.apply(cln_txt)
    df["brand_name"] = df.brand_name.apply(cln_txt)
    df["item_description"] = df.item_description.apply(cln_txt)

    # Only brand_name is turned into integer codes; name and item_description
    # stay as cleaned text for the tokenizer.
    df["brand_name"] = df.brand_name.astype("category")
    df["brand_name"] = df.brand_name.cat.codes

    # Copies (not views of df) so assigning columns to the results is well defined.
    train = df.iloc[:n_train].copy()
    train["price"] = y
    test = df.iloc[n_train:].copy()

    del df, y
    gc.collect()

    return train, test

In [None]:
%%time

train, test = add_feature(train, test)

# Cleansing NLP

In [None]:

'''
Vocabulary class `Lang`.
It builds a word-to-index vocabulary and is used to tokenize the `name` and `item_description` text.

'''

class Lang(object):
    """Word-level vocabulary built from an iterable of texts.

    Indices 0-3 are reserved for ``<PAD>``, ``<CLS>``, ``<EOS>`` and ``<UNK>``;
    every new whitespace-separated word gets the next free index (starting
    at 4) in order of first appearance.

    Attributes
    ----------
    name : iterable
        The texts passed to the constructor; non-string entries are ignored.
    word2index : dict
        word -> integer id (including the four reserved tokens).
    index2word : dict
        integer id -> word, the inverse of ``word2index``.
    word2count : dict
        word -> number of occurrences over all texts.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {"<CLS>": 1, "<EOS>": 2, "<UNK>": 3, "<PAD>": 0}
        self.index2word = {}
        self.word2count = {}

        self._create_vocab()

    def _create_vocab(self):
        """Fill ``word2index``, ``word2count`` and ``index2word`` from ``self.name``."""
        for name in self.name:
            if not isinstance(name, str):
                continue
            for txt in name.split():
                if txt not in self.word2index:
                    self.word2index[txt] = len(self.word2index)
                # .get() so a text containing a reserved token (already in
                # word2index but not yet counted) does not raise KeyError.
                self.word2count[txt] = self.word2count.get(txt, 0) + 1
        self.index2word = {v: k for k, v in self.word2index.items()}

    def fit_transform(self, doc):
        """Encode ``doc`` as ``[<CLS>, ids..., <EOS>]``.

        Words missing from the vocabulary map to ``<UNK>``. A non-string
        ``doc`` (e.g. NaN or None) yields ``[<CLS>, <EOS>]``. The vocabulary
        itself is not modified.
        """
        token_list = [self.word2index["<CLS>"]]
        if isinstance(doc, str):
            unknown = self.word2index["<UNK>"]
            token_list.extend(self.word2index.get(txt, unknown) for txt in doc.split())
        token_list.append(self.word2index["<EOS>"])
        return token_list
    
    
def _encode_text_column(train, test, column):
    """Replace one text column of train and test with token-id lists.

    A single ``Lang`` vocabulary is built from the column's values in both
    frames (cast to ``str``); each value is then encoded with
    ``Lang.fit_transform`` and written back into ``train`` / ``test``.

    Returns the vocabulary size and the word -> count dictionary.
    """
    n_train = train.shape[0]
    stacked = pd.concat([train[[column]], test[[column]]])
    vocab = Lang(stacked[column].astype(str).to_list())

    stacked[column] = stacked[column].apply(vocab.fit_transform)

    train[column] = stacked.iloc[:n_train][column]
    test[column] = stacked.iloc[n_train:][column]

    vocab_size = len(vocab.word2index)
    word_counts = vocab.word2count
    del stacked, vocab
    gc.collect()
    return vocab_size, word_counts


def transform_nlp(train, test):
    """Tokenize the ``name`` and ``item_description`` columns of both frames.

    Both frames are modified in place: each text value is replaced by its
    list of token ids. A progress marker (1, then 2) is printed after each
    column.

    Returns
    -------
    tuple
        ``(train, test, vocab_name, vocab_dis, cnt_name, cnt_dis)``: the two
        frames, the vocabulary sizes for name / description, and the word
        count dictionaries for name / description.
    """
    vocab_name, cnt_name = _encode_text_column(train, test, "name")
    print(1)

    vocab_dis, cnt_dis = _encode_text_column(train, test, "item_description")
    print(2)

    return train, test, vocab_name, vocab_dis, cnt_name, cnt_dis


In [None]:
%%time 

train, test, vocab_name, vocab_dis, cnt_name, cnt_dis = transform_nlp(train, test)

In [None]:
train.head()

In [None]:
train.isnull().sum().to_frame()

In [None]:
def show_cloud(name, desc):
    """Draw two side-by-side word clouds from word-frequency dictionaries.

    ``name`` feeds the left panel ("name vocab") and ``desc`` the right panel
    ("discription vocab"); each cloud shows at most 15 words.
    """
    # Both clouds are generated (name first) before the figure is created.
    clouds = [
        WordCloud(width=1500, height=1100, background_color="white", max_words=15).generate_from_frequencies(freqs)
        for freqs in (name, desc)
    ]
    titles = ("name vocab", "discription vocab")

    fig, axes = plt.subplots(1, 2, figsize=(15, 6))
    for panel, cloud, title in zip(axes.ravel(), clouds, titles):
        panel.imshow(cloud)
        panel.set_title(title)

    plt.tight_layout()
    
show_cloud(cnt_name, cnt_dis)

del cnt_name, cnt_dis 
gc.collect()

# Dataset

In [None]:
def get_keras_data(dataset: pd.DataFrame, name_maxlen: int = 10, desc_maxlen: int = 128) -> Dict[str, np.ndarray]:
    """Convert a feature frame into the dict of arrays fed to the Keras model.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain ``name`` and ``item_description`` columns holding
        token-id sequences; every other column is used as a numeric feature.
    name_maxlen : int, default 10
        Length the ``name`` sequences are padded / truncated to.
    desc_maxlen : int, default 128
        Length the ``item_description`` sequences are padded / truncated to.

    Returns
    -------
    dict
        ``"name"``: array of shape (rows, name_maxlen),
        ``"item_desc"``: array of shape (rows, desc_maxlen),
        ``"feature"``: array of shape (rows, number of remaining columns).
    """
    text_cols = ("name", "item_description")
    # Select the remaining columns by name instead of copying the whole frame
    # with drop() only to read its column index.
    feature_col = [col for col in dataset.columns if col not in text_cols]
    x = {
        "name": pad_sequences(dataset.name, maxlen=name_maxlen),
        "item_desc": pad_sequences(dataset.item_description, maxlen=desc_maxlen),
        "feature": np.array(dataset[feature_col].values)
    }
    return x

x = get_keras_data(train.drop("price", axis=1))
print(x)

# Model 

In [None]:

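'''
Text is embedded token by token into an embedding space, and an RNN is used to
obtain the final hidden state of the sequence.
The remaining features go through an ordinary linear (dense) transformation.
The two outputs are then concatenated to produce the final output of the model.

Reference: https://www.kaggle.com/knowledgegrappler/a-simple-nn-solution-with-keras-0-48611-pl
'''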


# Model hyper-parameters. The three *_shape entries are the second dimension
# of the arrays in `x`, so they follow whatever get_keras_data produced.
params = dict(
    hidden_dim=64,
    name_shape=x["name"].shape[1],
    desc_shape=x["item_desc"].shape[1],
    feature_shape=x["feature"].shape[1],
)


# tensorflow
def rmsle_cust(y_true, y_pred):
    """Root mean squared logarithmic error of one batch, as a Keras metric.

    Both tensors are clipped to at least ``K.epsilon()`` before
    ``log(value + 1)`` is taken, so negative predictions cannot produce NaN.

    The mean is taken over every element of the batch before the square root.
    Averaging over ``axis=-1`` would, for a model whose output has a last
    dimension of 1 (as ``Dense(1)`` does), reduce the expression to the
    absolute log difference per sample, i.e. a mean absolute log error.
    """
    first_log = K.log(K.clip(y_pred, K.epsilon(), None) + 1.)
    second_log = K.log(K.clip(y_true, K.epsilon(), None) + 1.)
    return K.sqrt(K.mean(K.square(first_log - second_log)))

# ndarray 
def rmsle(y, y_pred):
    """Root mean squared logarithmic error between two equal-length sequences.

    Computed as ``sqrt(mean((log1p(y_pred) - log1p(y)) ** 2))`` in float64.
    Values below 0 are clipped to 0 first, so a negative prediction (possible
    with a linear output layer) is scored as 0 instead of raising a math
    domain error.

    Parameters
    ----------
    y : array-like
        True values.
    y_pred : array-like
        Predicted values, same length as ``y``.

    Returns
    -------
    float
        The RMSLE, or NaN when both inputs are empty.
    """
    assert len(y) == len(y_pred)
    if len(y) == 0:
        return float("nan")
    # float64 regardless of input dtype (the inputs may be float16 / float32).
    actual = np.clip(np.asarray(y, dtype=np.float64), 0.0, None)
    predicted = np.clip(np.asarray(y_pred, dtype=np.float64), 0.0, None)
    log_diff = np.log1p(predicted) - np.log1p(actual)
    return float(np.sqrt(np.mean(np.square(log_diff))))


def build_model(
    name_shape=params["name_shape"],
    desc_shape=params["desc_shape"],
    feature_shape=params["feature_shape"]
):
    """Build and compile the three-input price regression network.

    Inputs ``name`` and ``item_desc`` are token-id sequences; each is embedded
    (``params["hidden_dim"]`` dimensions) and summarised by a GRU (8 and 16
    units). Input ``feature`` goes through a 32-unit ReLU dense layer. The
    three branches are concatenated and passed through 128- and 64-unit ReLU
    dense layers, each followed by 10% dropout, then a single linear output.

    The model is compiled with MSE loss, the Adam optimizer, and the MAE and
    ``rmsle_cust`` metrics. The default argument values are read from
    ``params`` when the function is defined.
    """
    embed_dim = params["hidden_dim"]

    # --- inputs -----------------------------------------------------------
    name_in = Input(shape=[name_shape], name="name")
    desc_in = Input(shape=[desc_shape], name="item_desc")
    feature_in = Input(shape=[feature_shape], name="feature")

    # --- text branches: embedding followed by a GRU -------------------------
    name_vectors = Embedding(vocab_name, embed_dim)(name_in)
    desc_vectors = Embedding(vocab_dis, embed_dim)(desc_in)

    name_state = GRU(8)(name_vectors)
    desc_state = GRU(16)(desc_vectors)

    # --- numeric branch -----------------------------------------------------
    dense_features = Dense(32, activation="relu")(feature_in)

    merged = concatenate([
        Flatten()(name_state),
        Flatten()(desc_state),
        dense_features,
    ])

    # --- head: Dense -> Dropout twice, then the linear output ---------------
    for units in (128, 64):
        # The Dropout layer is created before its Dense layer, which keeps the
        # layer construction order unchanged.
        dropout = Dropout(0.1)
        merged = dropout(Dense(units, activation="relu")(merged))

    price = Dense(1)(merged)

    network = Model([name_in, desc_in, feature_in], price)
    network.compile(loss="mse", optimizer="adam", metrics=["mae", rmsle_cust])
    return network

model = build_model()
model.summary()

# Train phase 

In [None]:
# The TPU is connected and its distribution strategy created only when the
# device is set to "tpu" and the run is not a debug run.
use_tpu = config["device"] == "tpu" and config["debug"] is not True
if use_tpu:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
    tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)


def get_callbacks(fold):
    """Create the training callbacks for one fold.

    Creates the ``models`` directory if needed and returns, in this order:

    1. ``ReduceLROnPlateau``: halves the learning rate after 10 epochs
       without ``val_loss`` improvement;
    2. ``EarlyStopping``: stops after 60 such epochs and restores the best
       weights;
    3. ``ModelCheckpoint``: saves the full model with the best ``val_loss``
       to ``models/rnn_<fold>.hdf5``.
    """
    os.makedirs("models", exist_ok=True)
    checkpoint_filepath = f"models/rnn_{fold}.hdf5"

    reduce_lr = ReduceLROnPlateau(
        monitor="val_loss",
        factor=0.5,
        patience=10,
        verbose=1,
    )
    early_stop = EarlyStopping(
        monitor="val_loss",
        patience=60,
        verbose=1,
        mode="auto",
        restore_best_weights=True,
    )
    checkpoint = keras.callbacks.ModelCheckpoint(
        checkpoint_filepath,
        monitor='val_loss',
        verbose=1,
        save_best_only=True,
        save_weights_only=False,
        mode='auto',
        save_freq='epoch',
        options=None,
    )
    return reduce_lr, early_stop, checkpoint

def submit(pred, name):
    """Write ``pred`` as the ``price`` column of ``submission.csv``.

    The ``test_id`` column is taken from ``./sample_submission_stg2.csv``.
    ``name`` is accepted but not used; the output file name is fixed.
    """
    submission = pd.read_csv("./sample_submission_stg2.csv").loc[:, ["test_id"]]
    submission = submission.assign(price=pred)
    submission.to_csv("submission.csv", index=False)

    # Release the frame right away.
    del submission
    gc.collect()
    

def main(train, test):
    """Run stratified K-fold training, report the CV score and write the submission.

    Prices are cut into 10 equal-width bins, which are used only to stratify
    the folds. For every fold a fresh model is trained, the validation fold
    and the full test set are predicted, and the fold RMSLE is printed.
    Outside debug mode the mean of the per-fold test predictions is written
    with ``submit``.

    ``train`` is not modified. A ``rank`` column, if one is already present,
    is ignored rather than used as a feature.

    Parameters
    ----------
    train : pandas.DataFrame
        Feature frame including the ``price`` target.
    test : pandas.DataFrame
        Feature frame with the same feature columns and no ``price``.

    Returns
    -------
    numpy.ndarray
        Out-of-fold predictions ordered like the rows of ``train``.
    """
    debug = config["debug"]
    n_splits = 2 if debug else config["n_fold"]
    epochs = 1 if debug else config["epoch"]
    batch_size = 12 if debug else config["batch_size"]

    prices = train["price"]
    # Stratification label only; kept local so the caller's frame is untouched.
    price_bin = pd.cut(prices, bins=10, labels=False)
    features = train.drop(columns=["price", "rank"], errors="ignore")

    # The test inputs are identical for every fold, so they are built once.
    x_test = get_keras_data(test)

    predict_val, val_idx, predict_test = [], [], []
    kf = StratifiedKFold(n_splits=n_splits, random_state=config["seed"], shuffle=True)

    for fold, (tr, va) in enumerate(kf.split(features, price_bin)):
        x_train = get_keras_data(features.iloc[tr])
        x_val = get_keras_data(features.iloc[va])
        y_train, y_val = prices.iloc[tr], prices.iloc[va]

        model = build_model()
        lr, es, sv = get_callbacks(fold+1)

        model.fit(x_train,
                  y_train,
                  validation_data=(x_val, y_val),
                  epochs=epochs,
                  batch_size=batch_size,
                  callbacks=[lr, es, sv])

        # Use the training batch size for inference; predict() otherwise
        # falls back to batches of 32.
        predv = model.predict(x_val, batch_size=batch_size).flatten()
        predt = model.predict(x_test, batch_size=batch_size).flatten()

        print(f"fold: {fold+1} | rmsle: {rmsle(y_val.values.ravel(), predv)}")
        predict_val.append(predv)
        predict_test.append(predt)
        val_idx.append(va)

        del x_train, x_val, model
        gc.collect()

    del x_test
    gc.collect()

    # Put the out-of-fold predictions back into the row order of `train`:
    # argsort of the concatenated validation indices is the permutation that
    # sorts them to 0..n-1.
    predict_val = np.concatenate(predict_val)
    val_idx = np.concatenate(val_idx)
    val_idx = np.argsort(val_idx)
    predict_val = predict_val[val_idx]

    print("===========================================================")
    print(f"CV SCORE: {rmsle(prices.values.ravel(), predict_val)}")
    print("===========================================================")

    predict_test_mean = np.mean(predict_test, 0)

    # Debug runs may use a truncated test set, so no submission is written.
    if debug is not True:
        submit(predict_test_mean, "mean")

    del predict_test_mean
    gc.collect()
    return predict_val

In [None]:
if __name__ == "__main__":
    main(train, test)