### Mercari Price Suggestion Challenge
The files consist of a list of product listings. These files are tab-delimited.

Fields:
- train_id or test_id - the id of the listing

- name - the title of the listing. Note that we have cleaned the data to remove text that looks like prices (e.g. $20) to avoid leakage. These removed prices are represented as [rm]

- item_condition_id - the condition of the items provided by the seller

- category_name - category of the listing

- brand_name

- price - the price that the item was sold for. This is the target variable that you will predict. The unit is USD. This column doesn't exist in test.tsv since that is what you will predict.

- shipping - 1 if the shipping fee is paid by the seller, 0 if it is paid by the buyer

- item_description - the full description of the item. Note that we have cleaned the data to remove text that looks like prices (e.g. $20) to avoid leakage. These removed prices are represented as [rm]

In [None]:
import pandas as pd
import numpy as np

from nltk.tokenize import word_tokenize
import nltk
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


from tqdm._tqdm_notebook import tqdm_notebook

import os
import itertools

import matplotlib.pyplot as plt
import itertools
from collections import Counter
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn import preprocessing

from tensorflow.keras.layers import Dense, Input, Embedding, Concatenate, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.utils import plot_model 

In [None]:
tqdm_notebook.pandas()

### Dataset download

In [None]:
# Treat the session as Google Colab when the string form of the IPython shell
# object mentions the 'google.colab' package.
RunningInCOLAB = 'google.colab' in str(get_ipython())
if RunningInCOLAB:
    print("Running in colab")
    from google.colab import drive
    drive.mount('/content/gdrive', force_remount=True)
    # NOTE(review): colab_root does not match the mount point used above
    # ('/content/gdrive'); it is not used in this cell - confirm before relying on it.
    colab_root = '/content/drive'
    root_dir = "/content/gdrive/My Drive/"
    base_dir = root_dir + 'project-mercari-price/'
    # exist_ok avoids the separate isdir() check and the race between check and create.
    os.makedirs(base_dir, exist_ok=True)
else:
    # Outside Colab, work directly in the current directory.
    root_dir = os.getcwd()
    base_dir = root_dir

# Every relative path used afterwards is resolved against base_dir.
os.chdir(base_dir)

In [None]:
dataset_downloaded_path = os.path.join(base_dir, "dataset_downloaded.ignore")
dataset_downloaded = os.path.isfile(dataset_downloaded_path)
dataset_downloaded

In [None]:
if not dataset_downloaded:
  # install kaggle to download dataset
  ! pip install kaggle python-dotenv

In [None]:
# set to True if you want to save kaggle credentials into a .env file
persist_credentials = False

if not dataset_downloaded:
    # Path of the optional .env file holding KAGGLE_USERNAME and KAGGLE_KEY.
    kaggle_env = os.path.join(base_dir, '.env')
    if not os.path.isfile(kaggle_env):
        kaggle_user = input("Insert kaggle username")
        kaggle_key = input("Insert kaggle key; generate one from kaggle account")

        # Only create the file when persistence was requested, and write while
        # the handle is still open. Creating it unconditionally would leave an
        # empty .env behind, and the isfile() check above would then skip the
        # prompt on every later run.
        if persist_credentials:
            with open(kaggle_env, 'w') as envfile:
                envfile.write(
                    f"KAGGLE_USERNAME={kaggle_user}\n"
                    f"KAGGLE_KEY={kaggle_key}\n"
                )

        # Expose the credentials to the current process through the environment.
        os.environ["KAGGLE_USERNAME"] = kaggle_user
        os.environ["KAGGLE_KEY"] = kaggle_key

        # Do not keep the secrets around as notebook globals.
        del kaggle_user
        del kaggle_key

In [None]:
if not dataset_downloaded:
    # Pull the credentials from the .env file when one is present.
    if os.path.isfile(kaggle_env):
        from dotenv import load_dotenv
        load_dotenv(dotenv_path=kaggle_env)
    # Echo the user name as a quick check that credentials are available.
    print(os.getenv("KAGGLE_USERNAME"))

In [None]:
if not dataset_downloaded:
    # download and extract dataset
    ! kaggle competitions download -c mercari-price-suggestion-challenge

    # create file so that we know we already downloaded
    with open(dataset_downloaded_path, 'w') as dd_file:
        dataset_downloaded = True
        dd_file.write("")

    print('cwd: ', os.getcwd())
    
    os.listdir()

In [None]:
if not dataset_downloaded:
    ! 7z x train.tsv.7z
    ! 7z x test.tsv.7z

In [None]:
os.listdir()

## Load dataset

In [None]:
# Explicit column types for the raw file; text columns use the pandas string dtype.
dtypes = dict(
    name='string',
    item_condition_id='byte',
    category_name='string',
    brand_name='string',
    price='float',
    shipping='boolean',
    item_description='string',
)
# Read the tab-delimited training file and drop the listing id column.
data = pd.read_csv("train.tsv", sep='\t', dtype=dtypes).drop(columns=["train_id"])
data

In [None]:
data.dtypes

In [None]:
data.shape

In [None]:
# Report, column by column, how many entries are missing.
for column, null_count in data.isnull().sum().items():
    print(f"number of null value in {column} : {null_count}")

In [None]:
# Drop rows without a description. The explicit copy makes `data` an
# independent frame, so the column assignments below do not trigger pandas'
# SettingWithCopyWarning.
data = data[data["item_description"].notna()].copy()
# Missing brand / category become the literal placeholder "NA".
data["brand_name"] = data["brand_name"].fillna(value="NA")
data["category_name"] = data["category_name"].fillna(value="NA")
data.shape

In [None]:
def flat_list(l):
    """Flatten one level of nesting: return a list with the items of every element of *l*, in order."""
    flattened = []
    for sublist in l:
        flattened.extend(sublist)
    return flattened


def plot_common_tokens(tokens, title, n=20):
    """Show a bar chart of the *n* most frequent tokens.

    Args:
        tokens: iterable of token sequences (one sequence per text).
        title: title displayed above the chart.
        n: number of most frequent tokens to plot.
    """
    # Count directly from a lazy chain so no intermediate flat list is built.
    counts = Counter(itertools.chain.from_iterable(tokens))
    # Rank once and split the (token, count) pairs into the two plot axes.
    most_common = counts.most_common(n)
    common_words = [word for word, _ in most_common]
    common_counts = [count for _, count in most_common]
    plt.figure(figsize=(18, 6))
    sns.barplot(x=common_words, y=common_counts)
    plt.title(title)
    plt.show()


In [None]:
#plot_common_tokens(data["item_description_tokens"], "Most Common Tokens from Descriptions")

In [None]:
X = data[['name', 'item_condition_id', 'category_name', 'brand_name',
       'shipping','item_description']]
y = data['price']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1000)
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=0.2, random_state=1000)

In [None]:
def getTokens(data, tokenizer=None, dataFit=None, num_words=5000):
    """Convert texts to integer sequences, fitting a tokenizer first if none is given.

    Args:
        data: texts to convert.
        tokenizer: an already fitted tokenizer; when None a new ``Tokenizer``
            is created and fitted.
        dataFit: texts used to fit a new tokenizer; defaults to *data*.
        num_words: passed to ``Tokenizer`` when a new one is created.

    Returns:
        A ``(sequences, tokenizer)`` tuple so the tokenizer can be reused on
        other data.
    """
    fit_corpus = data if dataFit is None else dataFit

    if tokenizer is None:
        tokenizer = Tokenizer(num_words=num_words)
        tokenizer.fit_on_texts(fit_corpus)

    return tokenizer.texts_to_sequences(data), tokenizer

In [None]:
tokenizers = {}

In [None]:
X_train["item_description_t"], tokenizers["item_description"] = getTokens(X_train["item_description"])
X_train["name_t"], tokenizers["name"] = getTokens(X_train["name"])
#X_train["brand_name_t"], tokenizers["brand_name"] = getTokens(X_train["brand_name"])
#X_train["category_name_t"], tokenizers["category_name"] = getTokens(X_train["category_name"])

In [None]:
tokenizers

In [None]:
X_train["item_description_t"]

In [None]:
# Learn the category -> integer mapping on the training split and apply it in one step.
cat_le = preprocessing.LabelEncoder()
X_train["category_name_l"] = cat_le.fit_transform(X_train["category_name"])

In [None]:
# Learn the brand -> integer mapping on the training split and apply it in one step.
brand_le = preprocessing.LabelEncoder()
X_train["brand_name_l"] = brand_le.fit_transform(X_train["brand_name"])

In [None]:
len(tokenizers["item_description"].word_index)

In [None]:
# TODO: handle unknown values - labels not seen while fitting the LabelEncoders currently make transform() crash

In [None]:
X_train.loc[995196, ["name", "name_t"]]

In [None]:
X_train.head()

In [None]:
# Vocabulary sizes: number of distinct words each tokenizer has seen, plus one
# (presumably for index 0, which the Keras Tokenizer never assigns to a word).
desc_vocab_size = 1 + len(tokenizers['item_description'].word_index)
name_vocab_size = 1 + len(tokenizers['name'].word_index)

print(desc_vocab_size)
print(name_vocab_size)

In [None]:
X_train["item_description_t"]

In [None]:
test = X_train["item_description_t"]
padded = pad_sequences(test, padding='post', maxlen=20)

In [None]:

maxlen = 100

inputDesc_train= pad_sequences(X_train["item_description_t"],
                                                  padding='post', maxlen=maxlen)

In [None]:
inputName_train = pad_sequences(X_train["name_t"], padding='post', maxlen=20)

In [None]:
import tensorflow.keras.backend as K
msle = tf.keras.losses.MeanSquaredLogarithmicError()


def root_mean_squared_logarithmic_error(y_true, y_pred):
    """Return the square root of the mean squared logarithmic error of the predictions."""
    mean_squared_log_error = msle(y_true, y_pred)
    return K.sqrt(mean_squared_log_error)


In [None]:
y_train

In [None]:
desc_vocab_size

In [None]:
X_train[["item_condition_id", "category_name_l", "brand_name_l", "shipping"]]

In [None]:
def _embedding_input_dim(tokenizer):
    """Return the smallest Embedding ``input_dim`` covering every index *tokenizer* can emit."""
    # +1 because the Keras Tokenizer numbers words from 1 (0 is never assigned).
    full_vocab = len(tokenizer.word_index) + 1
    if tokenizer.num_words is None:
        return full_vocab
    # With num_words set, texts_to_sequences only emits indices below num_words.
    return min(tokenizer.num_words, full_vocab)


# Derive the embedding sizes from the fitted tokenizers instead of hard-coded
# guesses, which allocated embedding rows for indices that can never occur.
name_vocab_size = _embedding_input_dim(tokenizers["name"])
desc_vocab_size = _embedding_input_dim(tokenizers["item_description"])

In [None]:
def getModel():
    """Build the three-input price regression network.

    Branches, concatenated before the regression head:
      * 4 tabular features -> Dense(4)
      * 20 name token ids -> Embedding(16) -> Flatten -> Dense(8)
      * 100 description token ids -> Embedding(32) -> Flatten -> Dense(16)

    Reads the module-level ``name_vocab_size`` and ``desc_vocab_size`` for the
    embedding table sizes.

    Returns:
        An uncompiled ``Model`` taking ``[features, name, description]`` and
        producing a single linear output.
    """
    # Tabular branch.
    features_in = Input(shape=(4,))
    features_branch = Dense(4, activation='relu')(features_in)

    # Name branch.
    name_in = Input(shape=(20,))
    name_branch = Embedding(input_dim=name_vocab_size, output_dim=16, input_length=20)(name_in)
    name_branch = Flatten()(name_branch)
    name_branch = Dense(8, activation='relu')(name_branch)

    # Description branch.
    desc_in = Input(shape=(100,))
    desc_branch = Embedding(input_dim=desc_vocab_size, output_dim=32, input_length=100)(desc_in)
    desc_branch = Flatten()(desc_branch)
    desc_branch = Dense(16, activation='relu')(desc_branch)

    # Merge the branches and regress to a single value.
    merged = Concatenate()([features_branch, name_branch, desc_branch])
    hidden = Dense(32, activation='relu')(merged)
    prediction = Dense(1, activation='linear')(hidden)

    return Model(inputs=[features_in, name_in, desc_in], outputs=prediction)
    

In [None]:
model = getModel()
model.summary()

In [None]:
plot_model(model)

In [None]:
# Train on mean squared error; the remaining entries are only reported as metrics.
model.compile(
    optimizer='adam',
    loss='mse',
    metrics=[
        tf.keras.metrics.RootMeanSquaredError(),
        'mean_squared_logarithmic_error',
        root_mean_squared_logarithmic_error,
    ],
)

In [None]:
inputA_train = X_train[["item_condition_id", "category_name_l", "brand_name_l", "shipping"]].values.astype('int32')

In [None]:
inputA_train.astype('int32')

In [None]:
# Fit on the three training inputs, in the order the model's inputs were declared.
history = model.fit(
    x=[inputA_train, inputName_train, inputDesc_train],
    y=y_train,
    batch_size=256,
    epochs=5,
    verbose=True,
    #validation_data=(X_validation, y_validation),
)