### Mercari Price 
The files consist of a list of product listings. These files are tab-delimited.

Fields:
- train_id or test_id - the id of the listing

- name - the title of the listing. Note that we have cleaned the data to remove text that looks like prices (e.g. $20) to avoid leakage. These removed prices are represented as [rm]

- item_condition_id - the condition of the items provided by the seller

- category_name - category of the listing

- brand_name

- price - the price that the item was sold for. This is the target variable that you will predict. The unit is USD. This column doesn't exist in test.tsv since that is what you will predict.

- shipping - 1 if shipping fee is paid by seller and 0 by buyer

- item_description - the full description of the item. Note that we have cleaned the data to remove text that looks like prices (e.g. $20) to avoid leakage. These removed prices are represented as [rm]

## Setup

In [None]:
! pip install pydot graphviz emoji transformers

import pandas as pd
import numpy as np

from nltk.tokenize import word_tokenize
import nltk
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import tensorflow.keras.backend as K


from tqdm._tqdm_notebook import tqdm_notebook

import os
import itertools

import matplotlib.pyplot as plt
import itertools
from collections import Counter
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
import sklearn

from tensorflow.keras.layers import Dense, Input, Embedding, Concatenate, Flatten, Dropout, LSTM, GlobalMaxPool1D, GRU, Bidirectional, GlobalAveragePooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.utils import plot_model

from nltk.corpus import stopwords
import string

import pickle

from tensorflow.keras.preprocessing.text import text_to_word_sequence 
from nltk.stem import WordNetLemmatizer

from nltk.tokenize import TweetTokenizer

import emoji
import os

from sklearn.feature_extraction.text import CountVectorizer

import transformers
from transformers import DistilBertTokenizer, TFDistilBertModel, pipeline

nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')


stop_words = set(stopwords.words('english'))
stop_words.remove("no")

tqdm_notebook.pandas()

In [None]:
msle = tf.keras.losses.MeanSquaredLogarithmicError()

def root_mean_squared_logarithmic_error(y_true, y_pred):
    """Return the square root of the MSLE between ``y_true`` and ``y_pred``.

    The mean squared logarithmic error is computed by the module-level
    ``msle`` loss object on exactly the tensors passed in; the backend
    ``sqrt`` is then applied to that value.
    """
    mean_squared_log_error = msle(y_true, y_pred)
    return K.sqrt(mean_squared_log_error)

np.random.seed(1000)

## Colab

In [None]:
# True when the string form of the IPython shell mentions google.colab.
RunningInCOLAB = 'google.colab' in str(get_ipython())

# check if in colab
# Google Drive is mounted only when the mount point directory does not exist yet.
if RunningInCOLAB and not os.path.isdir('/content/gdrive'):
    print("Running in colab")
    from google.colab import drive
    drive.mount('/content/gdrive', force_remount=True)
    # NOTE(review): colab_root is not read anywhere in the visible part of this file and
    # names '/content/drive' rather than the '/content/gdrive' mount point - confirm it is needed.
    colab_root = '/content/drive'
      
# Pick the working directory: a project folder on Drive in Colab, the current directory otherwise.
if RunningInCOLAB:
    root_dir = "/content/gdrive/My Drive/"
    base_dir = root_dir + 'project-mercari-price/'
    if not os.path.isdir(base_dir):
        os.mkdir(base_dir)
else:
    root_dir= os.getcwd()
    base_dir = root_dir

# Every relative path used later in the notebook is resolved against base_dir.
os.chdir(base_dir)

os.getcwd()

## Dataset download

In [None]:
dataset_downloaded_path = os.path.join(base_dir, "dataset_downloaded.ignore")
dataset_downloaded = os.path.isfile(dataset_downloaded_path)
dataset_downloaded

if not dataset_downloaded:
  # install kaggle to download dataset
  ! pip install kaggle python-dotenv

# set to True if you want to save kaggle credentials into a .env file
persist_credentials = False

if not dataset_downloaded:
    # Ask for the Kaggle credentials unless a .env file (holding KAGGLE_USERNAME
    # and KAGGLE_KEY) already exists in the project directory.
    kaggle_env = os.path.join(base_dir, '.env')
    if not os.path.isfile(kaggle_env):
        kaggle_user = input("Insert kaggle username")
        kaggle_key = input("Insert kaggle key; generate one from kaggle account")

        # Only touch the disk when the user asked for persistence: the file is
        # written while it is still open, and no empty .env is left behind
        # (an empty file would suppress the prompt on the next run).
        if persist_credentials:
            with open(kaggle_env, 'w') as envfile:
                envfile.write(
                    f"KAGGLE_USERNAME={kaggle_user}\n"
                    f"KAGGLE_KEY={kaggle_key}\n"
                )

        # set env vars so the kaggle CLI can authenticate in this session
        os.environ["KAGGLE_USERNAME"] = kaggle_user
        os.environ["KAGGLE_KEY"] = kaggle_key

        # do not keep the secrets around as notebook globals
        del kaggle_user
        del kaggle_key

if not dataset_downloaded:
  # loading env vars if .env file exists
    if os.path.isfile(kaggle_env):
        from dotenv import load_dotenv
        load_dotenv(dotenv_path=kaggle_env)
    print(os.environ.get("KAGGLE_USERNAME"))

if not dataset_downloaded:
    # download and extract dataset
    ! kaggle competitions download -c mercari-price-suggestion-challenge

    # create file so that we know we already downloaded
    with open(dataset_downloaded_path, 'w') as dd_file:
        dataset_downloaded = True
        dd_file.write("")

    print('cwd: ', os.getcwd())
    
    os.listdir()

if not dataset_downloaded:
    ! 7z x train.tsv.7z
    ! 7z x test.tsv.7z

os.listdir()

## Load dataset

In [None]:
dtypes={
    'name': 'string',
    'item_condition_id': 'int32',
    'category_name': 'string',
    'brand_name': 'string',
    'price': 'float',
    'shipping': 'int32',
    'item_description': 'string'
}
data = pd.read_csv("train.tsv", sep='\t', dtype=dtypes)
data = data.drop(columns=["train_id"])
print(data.dtypes)
print(data.shape)
data

In [None]:
dtypes={
    'name': 'string',
    'item_condition_id': 'int32',
    'category_name': 'string',
    'brand_name': 'string',
    'price': 'float',
    'shipping': 'int32',
    'item_description': 'string'
}
test = pd.read_csv("test.tsv", sep='\t', dtype=dtypes)
test = test.drop(columns=["test_id"])
print(test.dtypes)
print(test.shape)
test

In [None]:
for column in data.columns:
    print("number of null value in {} : {}".format(column,data[column].isnull().sum()))

In [None]:
for column in test.columns:
    print("number of null value in {} : {}".format(column,test[column].isnull().sum()))

In [None]:
data["category_name"].value_counts()

In [None]:
data["price"].describe()

# Data cleaning

Handle missing values and wrong prices

https://www.mercari.com/us/help_center/article/69

In [None]:
len(data[data["price"]<5])

In [None]:
data=data[data["price"]>=5]

In [None]:
data = data[data["item_description"].notna()]
data["brand_name"] = data["brand_name"].fillna(value="NA")
data["category_name"] = data["category_name"].fillna(value="NA")
# see warnings -> inplace?
data.shape

In [None]:
test["brand_name"] = test["brand_name"].fillna(value="NA")
test["category_name"] = test["category_name"].fillna(value="NA")
# see warnings -> inplace?
test.shape

In [None]:
data

# Preprocessing

In [None]:
data["item_description"]=data["item_description"].str.lower()
data["name"]=data["name"].str.lower()
data.head()

In [None]:
test["item_description"]=test["item_description"].str.lower()
test["name"]=test["name"].str.lower()
test.head()

In [None]:
tweetTokenizer = TweetTokenizer()

def list_to_str(l):
    """Join the ``str()`` form of every element of ``l`` with single spaces."""
    return ' '.join(map(str, l))
   
def _is_emoji_token(token):
    """Return True when ``token`` is an emoji according to the installed ``emoji`` package."""
    # Newer emoji releases dropped the UNICODE_EMOJI table and expose is_emoji() instead.
    checker = getattr(emoji, "is_emoji", None)
    if checker is not None:
        return checker(token)
    table = emoji.UNICODE_EMOJI
    # Some releases nest the table per language code ('en', ...); older ones are a flat
    # mapping, in which case the lookup falls back to the table itself.
    return token in table.get("en", table)


def textCleanup(df, flag=True):
    """Tokenize, lemmatize and filter a column of text.

    Args:
        df: object exposing ``to_frame`` (a pandas Series in the callers
            of this notebook) holding one text per row.
        flag: when true (default) each cleaned token list is joined back
            into a single space-separated string; otherwise the token
            lists are returned as they are.

    Returns:
        The ``clean`` column built from ``df``.

    Side effects:
        Shows progress bars (``progress_apply``) and plots the most
        common tokens through ``plot_common_tokens``.
    """
    df = df.to_frame(name="str")
    # TweetTokenizer is slower than keras' text_to_word_sequence (about two minutes versus
    # twenty seconds on this dataset, per the original notes) but handles emojis correctly.
    df["clean"] = df["str"].progress_apply(tweetTokenizer.tokenize)

    lemmatizer = WordNetLemmatizer()

    # First pass: drop stop words, lemmatize what is left.
    df["clean"] = df["clean"].progress_apply(
        lambda sentence: [lemmatizer.lemmatize(word) for word in sentence if word not in stop_words]
    )
    # Second pass: drop punctuation, stop words (again, now on the lemmatized forms),
    # single characters that are not digits, and emojis.
    df["clean"] = df["clean"].progress_apply(
        lambda sentence: [
            w for w in sentence
            if w not in string.punctuation
            and w not in stop_words
            and (len(w) > 1 or w.isdigit())
            and not _is_emoji_token(w)
        ]
    )
    plot_common_tokens(df["clean"], "Most Common Tokens without StopWords", n=20)
    if flag:
        df["clean"] = df["clean"].progress_apply(list_to_str)
    return df["clean"]

In [None]:
def preprocessData(data):
    """Add cleaned versions of the description and name columns to ``data``.

    ``item_description_clean`` and ``name_clean`` are filled with the
    result of ``textCleanup`` on ``item_description`` and ``name``
    respectively. ``data`` is modified in place and also returned.
    """
    steps = (
        ('description clean up', "item_description", "item_description_clean"),
        ('name clean up', "name", "name_clean"),
    )
    for message, source_column, target_column in steps:
        print(message)
        data[target_column] = textCleanup(data[source_column])

    return data

In [None]:
def flat_list(l):
    """Flatten one nesting level of ``l`` into a new list."""
    return list(itertools.chain.from_iterable(l))

In [None]:
def plot_common_tokens(tokens, title, n=20):
    """Show a bar plot of the ``n`` most frequent tokens.

    Args:
        tokens: iterable of token sequences (one per text).
        title: title of the plot.
        n: how many of the most common tokens to draw.
    """
    frequencies = Counter(itertools.chain.from_iterable(tokens))
    most_common = frequencies.most_common(n)
    labels = [token for token, _ in most_common]
    heights = [count for _, count in most_common]

    plt.figure(figsize=(18, 6))
    sns.barplot(x=labels, y=heights)
    plt.title(title)
    plt.show()

In [None]:
data = preprocessData(data)
data.head()

In [None]:
test = preprocessData(test)
test.head()

# Load preprocessed

In [None]:
# save preprocessed data train
force_overwrite = False
fname = 'train_preprocess.npy'

if force_overwrite or not os.path.isfile(fname):
  print("saving preprocess train data")
  data.to_pickle(fname)
else:
  print("loading existing preprocess train data")
  data = pd.read_pickle(fname)

In [None]:
force_overwrite = False
fname = 'test_preprocess.npy'

if force_overwrite or not os.path.isfile(fname):
  print("saving preprocess test data")
  test.to_pickle(fname)
else:
  print("loading existing preprocess test data")
  test = pd.read_pickle(fname)

# Normal distribution as regressor

In [None]:
train_normal, validation_normal = train_test_split(data, test_size=0.2, random_state=1000)

In [None]:
mean = train_normal["price"].mean()
std = train_normal["price"].std()

print(f'mean: {mean}, std: {std}')

In [None]:
y_true_normal = validation_normal["price"].values

In [None]:
y_pred_normal = np.random.normal(mean, std, validation_normal.shape[0])
# removing negative values
y_pred_normal[y_pred_normal < 0] = 0

In [None]:
res = {
    'rmsle': np.sqrt(sklearn.metrics.mean_squared_log_error(y_true_normal, y_pred_normal)),
    'mae': sklearn.metrics.mean_absolute_error(y_true_normal, y_pred_normal),
    'mse': sklearn.metrics.mean_squared_error(y_true_normal, y_pred_normal),
    'rmse': np.sqrt(sklearn.metrics.mean_squared_error(y_true_normal, y_pred_normal))
}
res

{'mae': 32.10479458350734,
 'mse': 2454.257906180274,
 'rmse': 49.540467359324275,
 'rmsle': 1.8666666215367753}

# Categorical Encoding

In [None]:
# TODO ensure it is correct or use a well tested alternative like sklearn (found problems with dimensions)
class LabelEncoder:
    """Single-column label encoder that tolerates unseen values.

    Labels are numbered from 1 in the order returned by
    ``value_counts`` on the fitted data. Values that were not seen
    during ``fit`` are encoded as ``unknown`` (0 by default), and codes
    that were never assigned are decoded as ``invUnknown``.
    """

    def __init__(self, unknown=0, invUnknown='unknown'):
        self.leDict = {}    # label -> integer code
        self.invDict = {}   # integer code -> label
        self.unknown = unknown
        self.invUnknown = invUnknown

    def fit(self, data):
        """Learn the label <-> code mapping from ``data`` and return ``self``."""
        # Series.value_counts replaces the deprecated top-level pd.value_counts.
        labels = pd.Series(data).value_counts().index
        codes = range(1, len(labels) + 1)
        self.leDict = dict(zip(labels, codes))
        self.invDict = dict(zip(codes, labels))
        return self

    def transform1(self, item):
        """Return the code of a single label, or ``unknown`` if it was never fitted."""
        return self.leDict.get(item, self.unknown)

    def transform(self, data):
        """Encode every element of ``data`` (an object with ``apply``, e.g. a Series)."""
        return data.apply(self.transform1)

    def inverse_transform1(self, item):
        """Return the label of a single code, or ``invUnknown`` if the code is not assigned."""
        return self.invDict.get(item, self.invUnknown)

    def inverse_transform(self, data):
        """Decode every element of ``data`` (an object with ``apply``, e.g. a Series)."""
        return data.apply(self.inverse_transform1)

In [None]:
cat_le = LabelEncoder()
cat_le.fit(data["category_name"])

data["category_name_l"] = cat_le.transform(data["category_name"])

In [None]:
brand_le = LabelEncoder()
brand_le.fit(data["brand_name"])

data["brand_name_l"] = brand_le.transform(data["brand_name"])

# Test Categorical encoding

In [None]:
test["category_name_l"] = cat_le.transform(test["category_name"])
test["brand_name_l"] = brand_le.transform(test["brand_name"])

# Categorical only

In [None]:
train_cat, validation_cat = train_test_split(data[["item_condition_id", "category_name_l", "brand_name_l", "shipping", "price"]], test_size=0.2, random_state=1000)

In [None]:
def getModelCat():
    """Build the dense regression network used for the categorical-only baseline.

    The model takes one input of shape ``(4,)`` and stacks
    Dense(32, relu) -> Dropout(0.2) -> Dense(16, relu) -> Dense(1, linear).

    Returns:
        The uncompiled Keras ``Model``.
    """
    features = Input(shape=(4,))

    hidden = Dense(32, activation='relu')(features)
    hidden = Dropout(0.2)(hidden)
    hidden = Dense(16, activation='relu')(hidden)

    price = Dense(1, activation='linear')(hidden)

    return Model(inputs=features, outputs=price)

In [None]:
model_cat = getModelCat()
model_cat.summary()

In [None]:
plot_model(model_cat)

In [None]:
model_cat.compile(loss = root_mean_squared_logarithmic_error, optimizer='adam', metrics=['mse', 'mae', tf.keras.metrics.RootMeanSquaredError(), 'mean_squared_logarithmic_error', root_mean_squared_logarithmic_error])

In [None]:
inputA_train_cat = train_cat[["item_condition_id", "category_name_l", "brand_name_l", "shipping"]].values.astype('int32')

In [None]:
inputA_validation_cat = validation_cat[["item_condition_id", "category_name_l", "brand_name_l", "shipping"]].values.astype('int32')

In [None]:
y_train_cat = train_cat["price"].values

In [None]:
y_validation_cat = validation_cat["price"].values

In [None]:
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=2)

history_cat = model_cat.fit(x=inputA_train_cat, y=y_train_cat,
                    epochs=10,
                    verbose=True,
                    validation_data=(inputA_validation_cat, y_validation_cat),
                    callbacks=[callback],
                    batch_size=512)


10/10 2261/2261 [==============================] - 5s 2ms/step - loss: 0.6769 - mse: 1529.0349 - mae: 15.5510 - root_mean_squared_error: 39.1006 - mean_squared_logarithmic_error: 0.4589 - root_mean_squared_logarithmic_error: 0.6769 - val_loss: 0.6703 - val_mse: 1456.1018 - val_mae: 15.3929 - val_root_mean_squared_error: 38.1589 - val_mean_squared_logarithmic_error: 0.4501 - val_root_mean_squared_logarithmic_error: 0.6705


In [None]:
histDf_cat = pd.DataFrame(history_cat.history)
histDf_cat.tail()

In [None]:
histDf_cat.plot(y=["loss", "val_loss"])

# Keras Embedding

In [None]:
# tokenize with keras; it also does some encoding
def tokenizeData(df, description, name, tokenizer=None, texts=None):
    """Convert two text columns of ``df`` into sequences of integer tokens.

    Args:
        df: DataFrame holding the text columns; it is modified in place
            (columns ``item_description_t`` and ``name_t`` are added).
        description: name of the description column to tokenize.
        name: name of the title column to tokenize.
        tokenizer: object exposing ``texts_to_sequences``. When ``None``
            a new Keras ``Tokenizer`` is created and fitted on ``texts``.
        texts: corpus used to fit a new tokenizer; required only when
            ``tokenizer`` is ``None``.

    Returns:
        ``(df, tokenizer)``, so the tokenizer can be reused on other data.

    Raises:
        ValueError: if neither ``tokenizer`` nor ``texts`` is provided.
    """
    if tokenizer is None:
        if texts is None:
            raise ValueError("texts must be provided when no fitted tokenizer is passed")
        tokenizer = Tokenizer()

        print("fit tokenizer")
        tokenizer.fit_on_texts(texts)

    print('tokenize description')
    df["item_description_t"] = tokenizer.texts_to_sequences(df[description])

    print('tokenize name')
    df["name_t"] = tokenizer.texts_to_sequences(df[name])
    return df, tokenizer

In [None]:
texts = np.hstack([data["item_description_clean"], data["name_clean"]])

data_keras, tokenizer = tokenizeData(data ,"item_description_clean" ,"name_clean", None, texts)

In [None]:
#data_keras_bk = data_keras
# NOTE(review): test_keras is first assigned in a later cell of this notebook, so this
# line raises NameError on a fresh top-to-bottom run - confirm the intended cell order.
test_keras_bk = test_keras

In [None]:
data_keras=data_keras[["item_condition_id","shipping","category_name_l","brand_name_l", "item_description_t", "name_t", "price"]]
data_keras

In [None]:
# Tokenize the test set on the cleaned columns, i.e. the same kind of text the
# tokenizer was fitted on and the training set was tokenized from.
test_keras, _ = tokenizeData(test, "item_description_clean", "name_clean", tokenizer)

In [None]:
test_keras=test[["item_condition_id","shipping","category_name_l","brand_name_l", "item_description_t", "name_t"]]
test_keras

In [None]:
train_keras, validation_keras = train_test_split(data_keras, test_size=0.2, random_state=1000)

In [None]:
vocab_size= len(tokenizer.word_index)+1
print(vocab_size)
# clean con trattamento emoji 193300
# clean 246054
# no clean 255431 (considerando tutto anche punteggiatura)

In [None]:
train_keras[["item_description_t","name_t"]]

In [None]:
desc_length_max=75
inputDesc_train_keras = pad_sequences(train_keras["item_description_t"],
                                                  padding='post', maxlen=desc_length_max)

In [None]:
inputDesc_validation_keras = pad_sequences(validation_keras["item_description_t"],
                                                  padding='post', maxlen=desc_length_max)

In [None]:
name_length_max=10
inputName_train_keras = pad_sequences(train_keras["name_t"], padding='post', maxlen=name_length_max)

In [None]:
inputName_validation_keras = pad_sequences(validation_keras["name_t"], padding='post', maxlen=name_length_max)

In [None]:
y_train_keras = train_keras["price"]
y_train_keras

In [None]:
y_validation_keras = validation_keras["price"]
y_validation_keras

In [None]:
inputA_train_keras = train_keras[["item_condition_id", "category_name_l", "brand_name_l", "shipping"]].values.astype('int32')

In [None]:
inputA_validation_keras = validation_keras[["item_condition_id", "category_name_l", "brand_name_l", "shipping"]].values.astype('int32')

In [None]:
name_vocab_size= vocab_size
desc_vocab_size= vocab_size
def getModelKeras(name_length_max, desc_length_max, name_vocab_size, desc_vocab_size):
    """Build the three-input model with trainable Keras embeddings.

    Inputs, in order: a ``(4,)`` feature vector, a name sequence of
    length ``name_length_max`` and a description sequence of length
    ``desc_length_max``. Each sequence goes through its own 50-dimensional
    ``Embedding`` followed by bidirectional LSTM layers and a global max
    pooling; the three branches are concatenated and fed to a small
    dense head with a single linear output.

    Args:
        name_length_max: length of the padded name sequences.
        desc_length_max: length of the padded description sequences.
        name_vocab_size: ``input_dim`` of the name embedding.
        desc_vocab_size: ``input_dim`` of the description embedding.

    Returns:
        The uncompiled Keras ``Model``.
    """
    # Branch 1: the (4,) feature vector is passed through untouched.
    features_in = Input(shape=(4,))

    # Branch 2: listing name.
    name_in = Input(shape=(name_length_max,))
    name_embedded = Embedding(input_dim=name_vocab_size, output_dim=50, input_length=name_length_max)(name_in)
    name_sequence = Bidirectional(LSTM(12, return_sequences=True, dropout=0.2))(name_embedded)
    name_features = GlobalMaxPool1D()(name_sequence)

    # Branch 3: listing description (two stacked bidirectional LSTMs).
    desc_in = Input(shape=(desc_length_max,))
    desc_embedded = Embedding(input_dim=desc_vocab_size, output_dim=50, input_length=desc_length_max)(desc_in)
    desc_sequence = Bidirectional(LSTM(16, return_sequences=True, dropout=0.2))(desc_embedded)
    desc_sequence = Bidirectional(LSTM(8, return_sequences=True, dropout=0.2))(desc_sequence)
    desc_features = GlobalMaxPool1D()(desc_sequence)

    merged = Concatenate()([features_in, name_features, desc_features])

    # Dense regression head.
    head = Dropout(0.2)(merged)
    head = Dense(32, activation='relu')(head)
    head = Dropout(0.2)(head)
    head = Dense(16, activation='relu')(head)
    price = Dense(1, activation='linear')(head)

    return Model(inputs=[features_in, name_in, desc_in], outputs=price)
    

In [None]:
model_keras = getModelKeras(name_length_max, desc_length_max, name_vocab_size, desc_vocab_size)
model_keras.summary()

In [None]:
plot_model(model_keras)

In [None]:
model_keras.compile(loss = root_mean_squared_logarithmic_error, optimizer='adam', metrics=['mse', 'mae', tf.keras.metrics.RootMeanSquaredError(), 'mean_squared_logarithmic_error', root_mean_squared_logarithmic_error])

https://machinelearningmastery.com/clean-text-machine-learning-python/ ultima sezione

In [None]:
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=2)

history_keras = model_keras.fit(x=[inputA_train_keras,inputName_train_keras, inputDesc_train_keras], y=y_train_keras,
                    epochs=10,
                    verbose=True,
                    validation_data=([inputA_validation_keras ,inputName_validation_keras, inputDesc_validation_keras], y_validation_keras),
                    callbacks=[callback],
                    batch_size=512)
# senza pulizia  loss: 0.4100  e val_root_mean_squared_logarithmic_error: 0.4543
# con pulizia leggera (lower)  val_root_mean_squared_logarithmic_error: 0.455 e loss: 0.4
# con pulizia val_root_mean_squared_logarithmic_error: 0.4643 e loss: 0.4269
# con lunghezza embedding 50 e senza pulizia val_root_mean_squared_logarithmic_error: 0.4556 e loss: 0.3675 con 10 epoche 

In [None]:
histDf_keras = pd.DataFrame(history_keras.history)
histDf_keras.tail()

In [None]:
histDf_keras.plot(y=["loss", "val_loss"])

In [None]:
inputA_test_keras = test_keras[["item_condition_id", "category_name_l", "brand_name_l", "shipping"]].values.astype('int32')
inputName_test_keras = pad_sequences(test_keras["name_t"], padding='post', maxlen=name_length_max)
inputDesc_test_keras = pad_sequences(test_keras["item_description_t"], padding='post', maxlen=desc_length_max)
pred_keras = model_keras.predict([inputA_test_keras, inputName_test_keras, inputDesc_test_keras])
pred_keras

In [None]:
import math

def rmsle(y, y_pred):
    """Return the root mean squared logarithmic error between ``y`` and ``y_pred``.

    Both arguments are converted to flat float arrays first, so lists,
    NumPy arrays of shape ``(n,)`` or ``(n, 1)`` and pandas Series with
    any index are all compared position by position.

    Args:
        y: true values.
        y_pred: predicted values, same length as ``y``.

    Returns:
        ``sqrt(mean((log(y_pred + 1) - log(y + 1)) ** 2))`` as a float.

    Raises:
        AssertionError: if the two inputs differ in length.
        ValueError: if the inputs are empty or hold values <= -1, for
            which ``log(value + 1)`` is undefined.
    """
    assert len(y) == len(y_pred)
    actual = np.asarray(y, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    if actual.size == 0:
        raise ValueError("rmsle is undefined for empty inputs")
    if np.any(actual <= -1) or np.any(predicted <= -1):
        raise ValueError("rmsle requires all values to be greater than -1")

    squared_log_diff = (np.log1p(predicted) - np.log1p(actual)) ** 2.0
    return float(np.sqrt(squared_log_diff.mean()))

In [None]:
validation_keras

In [None]:
pred_keras=pred_keras.round()

In [None]:
# Predict on the validation split with the Keras-embedding model (model_keras is the
# model trained above; no variable named `model` is defined in this notebook) and
# round the predicted prices to the nearest integer.
pred_v_keras = model_keras.predict([inputA_validation_keras, inputName_validation_keras, inputDesc_validation_keras])
pred_v_keras = pred_v_keras.round()

In [None]:
len(pred_v_keras)

In [None]:
len(y_validation_keras)

In [None]:
root_mean_squared_logarithmic_error(y_validation_keras, pred_v_keras)

# Glove pretrained

https://nlp.stanford.edu/projects/glove/

In [None]:
#archive_url = 'http://downloads.cs.stanford.edu/nlp/data/wordvecs/glove.6B.zip'
#archive_url = "https://github.com/facebookresearch/fastText/archive/v0.9.2.zip"
#archive_url = "https://www.cs.uic.edu/~hxu/ele_review_qa_300d.tar.gz"
archive_url = 'http://downloads.cs.stanford.edu/nlp/data/wordvecs/glove.840B.300d.zip'
archive_name = 'glove.840B.300d.zip'
remove_archive = False
embedding_fname = 'glove.840B.300d.txt'

In [None]:
if not os.path.isfile(archive_name) and not os.path.isfile(embedding_fname):
  ! wget {archive_url} -O {archive_name}

In [None]:
if not os.path.isfile(embedding_fname):
  ! unzip {archive_name} {embedding_fname}

# remove archive if already extracted
if remove_archive and os.path.isfile(embedding_fname):
  os.remove(archive_name)

Needs a word encoding index (e.g. keras one)

In [None]:
word_index = tokenizer.word_index
print(len(word_index))

In [None]:
path_to_glove_file = os.path.join(
    base_dir, embedding_fname
)

'''embeddings_index = {}
with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))'''

In [None]:
path_to_glove_file

In [None]:
def create_embedding_matrix(filepath, word_index, embedding_dim):
    """Build an embedding matrix from a text file of pretrained vectors.

    Each line of the file is split on whitespace: the first field is the
    word, the rest its vector. Only words present in ``word_index`` are
    used, and only the first ``embedding_dim`` components are kept.
    Lines whose vector cannot be parsed or stored are printed and skipped.

    Args:
        filepath: path of the pretrained vectors file.
        word_index: mapping word -> row index (index 0 is left for padding).
        embedding_dim: number of columns of the returned matrix.

    Returns:
        Array of shape ``(len(word_index) + 1, embedding_dim)``; rows of
        words missing from the file stay at zero.
    """
    vocab_size = len(word_index) + 1  # Adding again 1 because of reserved 0 index (padding)
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    # Pretrained vector files are distributed as UTF-8 text; do not depend on the locale default.
    with open(filepath, encoding="utf-8") as f:
        for line in f:
            word, *vector = line.split()
            if word not in word_index:
                continue
            i = word_index[word]
            try:
                embedding_matrix[i] = np.array(vector, dtype=np.float32)[:embedding_dim]
            except ValueError:
                # Non-numeric fields or too few components: report the line and go on.
                print(word)
                print(vector)
                print("_______________________")

    return embedding_matrix

In [None]:
def create_embedding_matrix(filepath, word_index, embedding_dim):
    """Build an embedding matrix from a text file of pretrained vectors.

    Each line of the file is split on whitespace: the first field is the
    word, the rest its vector. Only words present in ``word_index`` are
    used, and only the first ``embedding_dim`` components are kept.

    Lines whose remaining fields are not all numeric are skipped. Such
    lines are entries whose key itself contains whitespace, so their
    vector does not belong to the first field and must not overwrite
    the row of that word.

    Args:
        filepath: path of the pretrained vectors file.
        word_index: mapping word -> row index (index 0 is left for padding).
        embedding_dim: number of columns of the returned matrix.

    Returns:
        Array of shape ``(len(word_index) + 1, embedding_dim)``; rows of
        words missing from the file stay at zero.

    Raises:
        ValueError: if a matching line holds fewer than ``embedding_dim``
            components.
    """
    vocab_size = len(word_index) + 1  # Adding again 1 because of reserved 0 index (padding)
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    # Pretrained vector files are distributed as UTF-8 text; do not depend on the locale default.
    with open(filepath, encoding="utf-8") as f:
        for line in f:
            word, *vector = line.split()
            if word not in word_index:
                continue
            try:
                coefs = np.array(vector, dtype=np.float32)
            except ValueError:
                # Key made of several whitespace-separated tokens: not a vector for `word`.
                continue
            if coefs.size < embedding_dim:
                raise ValueError(
                    f"vector for {word!r} has {coefs.size} components, "
                    f"expected at least {embedding_dim}"
                )
            embedding_matrix[word_index[word]] = coefs[:embedding_dim]

    return embedding_matrix

In [None]:
embedding_dim = 50
embedding_matrix = create_embedding_matrix(path_to_glove_file, tokenizer.word_index, embedding_dim)

In [None]:
embedding_matrix[10]

In [None]:
embedding_matrix[107098]

In [None]:
nonzero_elements = np.count_nonzero(np.count_nonzero(embedding_matrix, axis=1))
nonzero_elements / vocab_size
# senza pulizia viene coperto il 28% del vocabolario
# con pulizia il 26% senza emoji
# con pulizia con emoji 34%
# 0.3779 senza pulizia con glove più grande (glove.840B.300d)
# 0.467 con pulizia con glove più grande (glove.840B.300d)

In [None]:
'''#glove pretrained embedding
num_tokens = len(word_index) + 2
embedding_dim = 100
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))'''

**IMPORTANTISSIMO: SCEGLIERE CHE LUNGHEZZA USARE PER L'ENCODING DELLE PAROLE DELLE DESCRIZIONI E DEI NOMI, CIOE' SE 8 VALORI ENTRAMBI O MANTENERE 8 E 16 VALORI**

In [None]:
name_vocab_size= vocab_size
desc_vocab_size= vocab_size
def getModelGlove(name_length_max, desc_length_max, name_vocab_size, desc_vocab_size, embedding_matrix=None):
    """Build the three-input model, optionally with pretrained frozen embeddings.

    Same architecture as the Keras-embedding model: a ``(4,)`` feature
    vector plus a name branch and a description branch (Embedding ->
    bidirectional LSTM(s) -> global max pooling), concatenated and fed
    to a dense head with a single linear output.

    Args:
        name_length_max: length of the padded name sequences.
        desc_length_max: length of the padded description sequences.
        name_vocab_size: ``input_dim`` of the name embedding.
        desc_vocab_size: ``input_dim`` of the description embedding.
        embedding_matrix: optional 2-D array of pretrained vectors, one
            row per token index. When given and non-empty, both
            embeddings are initialised with it and frozen
            (``trainable=False``), and their width is its number of
            columns. When ``None`` or empty, trainable 50-dimensional
            embeddings are used instead.

    Returns:
        The uncompiled Keras ``Model``.
    """
    # Use the pretrained vectors only when a non-empty matrix was actually supplied.
    use_pretrained = embedding_matrix is not None and embedding_matrix.size > 0

    def _embedding(vocab_size, sequence_length):
        """Create the embedding layer for one text branch."""
        if use_pretrained:
            return Embedding(input_dim=vocab_size, output_dim=embedding_matrix.shape[1],
                             weights=[embedding_matrix], input_length=sequence_length,
                             trainable=False)
        return Embedding(input_dim=vocab_size, output_dim=50, input_length=sequence_length)

    inputA = Input(shape=(4,))
    Ad = inputA

    inputName = Input(shape=(name_length_max,))
    Ne = _embedding(name_vocab_size, name_length_max)(inputName)
    Nd = Bidirectional(LSTM(12, return_sequences=True, dropout=0.2))(Ne)
    Nd = GlobalMaxPool1D()(Nd)

    inputDesc = Input(shape=(desc_length_max,))
    De = _embedding(desc_vocab_size, desc_length_max)(inputDesc)
    Dd = Bidirectional(LSTM(16, return_sequences=True, dropout=0.2))(De)
    Dd = Bidirectional(LSTM(8, return_sequences=True, dropout=0.2))(Dd)
    Dd = GlobalMaxPool1D()(Dd)

    concat = Concatenate()([Ad, Nd, Dd])

    x = Dropout(0.2)(concat)
    x = Dense(32, activation='relu')(x)
    x = Dropout(0.2)(x)
    x = Dense(16, activation='relu')(x)

    x = Dense(1, activation='linear')(x)

    model = Model(inputs=[inputA, inputName, inputDesc], outputs=x)

    return model
    

In [None]:
model_glove= getModelGlove(name_length_max, desc_length_max, name_vocab_size, desc_vocab_size, embedding_matrix)
model_glove.summary()

In [None]:
plot_model(model_glove)

In [None]:
model_glove.compile(loss = root_mean_squared_logarithmic_error, optimizer='adam', metrics=['mse', 'mae', tf.keras.metrics.RootMeanSquaredError(), 'mean_squared_logarithmic_error', root_mean_squared_logarithmic_error])

In [None]:
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)

history_glove = model_glove.fit(x=[inputA_train_keras,inputName_train_keras, inputDesc_train_keras], y=y_train_keras,
                    epochs=10,
                    verbose=True,
                    validation_data=([inputA_validation_keras ,inputName_validation_keras, inputDesc_validation_keras], y_validation_keras),
                    callbacks=[callback],
                    batch_size=512)
# glove 6M 
# con 10 epoche e senza pulizia: loss: 0.5564 e val_root_mean_squared_logarithmic_error: 0.5392
# con 10 epoche e pulizia emoji etc: loss: 0.5553 e val_root_mean_squared_logarithmic_error: 0.5312

# glove.840B.300d
# con 10 epoche senza pulizia: loss: 0.3678  e val_root_mean_squared_logarithmic_error: 0.4537
# con 10 epoce con pulizia: loss: 0.3817 e val_root_mean_squared_logarithmic_error 0.4604

# BoW

In [None]:
data.head()

In [None]:
train_cvec, validation_cvec = train_test_split(data, test_size=0.2, random_state=1000)

In [None]:
categorical_train_cvec= train_cvec[["item_condition_id","shipping","category_name_l","brand_name_l"]]
print("Train:")
print(categorical_train_cvec.head())
print("\nValidation:")
categorical_validation_cvec= validation_cvec[["item_condition_id","shipping","category_name_l","brand_name_l"]]
print(categorical_validation_cvec.head())

In [None]:
y_train_cvec=train_cvec["price"].values
y_validation_cvec=validation_cvec["price"].values

## Count Vectorizer

In [None]:
vectorizer_desc = CountVectorizer()
vectorizer_desc.fit(data["item_description"].values)

In [None]:
len(vectorizer_desc.vocabulary_)

In [None]:
description_train_cvec = vectorizer_desc.transform(train_cvec["item_description"].values)

In [None]:
description_validation_cvec = vectorizer_desc.transform(validation_cvec["item_description"].values)

In [None]:
type(description_validation_cvec)

In [None]:
vectorizer_name = CountVectorizer()
vectorizer_name.fit(data["name"].values)

In [None]:
len(vectorizer_name.vocabulary_)

In [None]:
name_train_cvec = vectorizer_name.transform(train_cvec["name"].values)
name_validation_cvec = vectorizer_name.transform(validation_cvec["name"].values)

In [None]:
type(name_train_cvec)

In [None]:
def getModel_bow(categorical_train, name_train, description_train):
    """Build the dense model fed by bag-of-words features.

    The width of each input is read from the second dimension of the
    corresponding training matrix. The three inputs are concatenated
    (name, description, categorical) and passed through
    Dropout(0.1) -> Dense(32, relu) -> Dropout(0.1) -> Dense(16, relu)
    -> Dense(1, linear).

    Args:
        categorical_train: 2-D categorical feature matrix (only ``shape[1]`` is used).
        name_train: 2-D vectorized name matrix (only ``shape[1]`` is used).
        description_train: 2-D vectorized description matrix (only ``shape[1]`` is used).

    Returns:
        The uncompiled Keras ``Model`` with inputs ordered as
        ``[name, description, categorical]``.
    """
    # Shapes are passed as explicit 1-tuples: a bare integer is not a
    # valid `shape` for every Keras release.
    inputA = Input(shape=(categorical_train.shape[1],))
    inputName = Input(shape=(name_train.shape[1],))
    inputDesc = Input(shape=(description_train.shape[1],))
    concat = Concatenate()([inputName, inputDesc, inputA])

    x = Dropout(0.1)(concat)
    x = Dense(32, activation='relu')(x)
    x = Dropout(0.1)(x)
    x = Dense(16, activation='relu')(x)

    x = Dense(1, activation='linear')(x)

    model = Model(inputs=[inputName, inputDesc, inputA], outputs=x)

    return model

In [None]:
model_cvec = getModel_bow(categorical_train_cvec, name_train_cvec, description_train_cvec)
model_cvec.summary()

In [None]:
plot_model(model_cvec)

In [None]:
model_cvec.compile(loss = root_mean_squared_logarithmic_error, optimizer='adam', metrics=['mse', 'mae', tf.keras.metrics.RootMeanSquaredError(), 'mean_squared_logarithmic_error', root_mean_squared_logarithmic_error])

In [None]:
history_cvec = model_cvec.fit(x=[name_train_cvec, description_train_cvec, categorical_train_cvec.values],
                    y=y_train_cvec,
                    epochs=5,
                    verbose=True,
                    validation_data=(
                        [name_validation_cvec, description_validation_cvec, categorical_validation_cvec.values]
                        , y_validation_cvec),
                    batch_size=512)

BOW con pulizia:
loss: 0.4549 - mse: 877.1694 - mae: 10.7754 - root_mean_squared_error: 29.6144 - mean_squared_logarithmic_error: 0.2073 - root_mean_squared_logarithmic_error: 0.4549 - val_loss: 0.4572 - val_mse: 823.4496 - val_mae: 10.7306 - val_root_mean_squared_error: 28.6958 - val_mean_squared_logarithmic_error: 0.2095 - val_root_mean_squared_logarithmic_error: 0.4573

BOW senza pulizia words:
loss: 0.4537 - mse: 892.7444 - mae: 10.7434 - root_mean_squared_error: 29.8743 - mean_squared_logarithmic_error: 0.2063 - root_mean_squared_logarithmic_error: 0.4537 - val_loss: 0.4554 - val_mse: 848.4371 - val_mae: 10.7117 - val_root_mean_squared_error: 29.1279 - val_mean_squared_logarithmic_error: 0.2079 - val_root_mean_squared_logarithmic_error: 0.4555

In [None]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')

def plot_history(history):
    """Plot training and validation curves side by side.

    The left panel shows ``root_mean_squared_logarithmic_error`` and its
    ``val_`` counterpart, the right panel ``loss`` and ``val_loss``, all
    read from ``history.history`` and drawn against the epoch number
    (starting at 1).
    """
    records = history.history
    train_rmsle = records['root_mean_squared_logarithmic_error']
    valid_rmsle = records['val_root_mean_squared_logarithmic_error']
    train_loss = records['loss']
    valid_loss = records['val_loss']
    epochs = range(1, len(train_rmsle) + 1)

    panels = (
        (train_rmsle, 'Training rmsle', valid_rmsle, 'Validation rmsle',
         'Training and validation root_mean_squared_logarithmic_error'),
        (train_loss, 'Training loss', valid_loss, 'Validation loss',
         'Training and validation loss'),
    )

    plt.figure(figsize=(12, 5))
    for position, (train, train_label, valid, valid_label, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(epochs, train, 'b', label=train_label)
        plt.plot(epochs, valid, 'r', label=valid_label)
        plt.title(title)
        plt.legend()

In [None]:
# Plot the curves of the bag-of-words model trained above; no variable named
# `history` is defined in this notebook.
plot_history(history_cvec)

## TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# NOTE(review): both assignments copy a column onto itself, so `data` is
# effectively unchanged. Presumably a placeholder for swapping in a cleaned
# version of the text columns; confirm or remove.
data["item_description"] = data["item_description"]
data["name"] = data["name"]

In [None]:
# Learn the TF-IDF vocabulary and IDF weights from every description in `data`.
# NOTE(review): the vectorizer is fitted on all of `data`; if that frame also
# contains the validation rows, their text contributes to the IDF statistics.
# Confirm this is intended.
tfidf_vectorizer_desc = TfidfVectorizer()
tfidf_vectorizer_desc.fit(data["item_description"].values)

In [None]:
# Number of distinct terms in the fitted description vocabulary.
len(tfidf_vectorizer_desc.vocabulary_)

In [None]:
# Encode the train / validation descriptions with the fitted vectorizer.
tfidf_description_train = tfidf_vectorizer_desc.transform(train_cvec["item_description"].values)
tfidf_description_validation = tfidf_vectorizer_desc.transform(validation_cvec["item_description"].values)

In [None]:
# Separate TF-IDF vectorizer for the listing names, fitted on all of `data`.
# NOTE(review): as with the descriptions, confirm that fitting on the full
# frame (rather than the training split only) is intended.
tfidf_vectorizer_name = TfidfVectorizer()
tfidf_vectorizer_name.fit(data["name"])

In [None]:
# Number of distinct terms in the fitted name vocabulary.
len(tfidf_vectorizer_name.vocabulary_)

In [None]:
# Encode the train / validation names with the fitted vectorizer.
tfidf_name_train = tfidf_vectorizer_name.transform(train_cvec["name"])
tfidf_name_validation = tfidf_vectorizer_name.transform(validation_cvec["name"])

In [None]:
def convert_sparse_matrix_to_sparse_tensor(X):
    """Convert a SciPy sparse matrix into a ``tf.SparseTensor``.

    Args:
        X: SciPy sparse matrix (any object exposing ``tocoo()``).

    Returns:
        A ``tf.SparseTensor`` with the shape and stored values of ``X``. The
        indices follow the COO storage order of ``X``; callers that need
        canonically ordered indices should apply ``tf.sparse.reorder``.
    """
    coo = X.tocoo()
    # One (row, col) pair per stored value. np.column_stack replaces np.mat,
    # which was removed in NumPy 2.0; int64 is the index dtype of SparseTensor.
    indices = np.column_stack((coo.row, coo.col)).astype(np.int64)
    return tf.SparseTensor(indices, coo.data, coo.shape)

In [None]:
# Inspect the container type returned by the vectorizer's transform().
type(tfidf_name_train) 

In [None]:
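# The sparse tensors built below are passed through tf.sparse.reorder because
# many sparse ops require sorted indices; see:
#https://stackoverflow.com/questions/61961042/indices201-0-8-is-out-of-order-many-sparse-ops-require-sorted-indices-use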

In [None]:
# Import the submodule explicitly: a bare `import scipy` only exposes
# `scipy.sparse` when something else has already loaded it.
import scipy.sparse

# Wrap the categorical feature arrays as CSR matrices so they can go through
# the same sparse-tensor conversion as the TF-IDF matrices.
new_categorical_train_cvec = scipy.sparse.csr_matrix(categorical_train_cvec.values)
new_categorical_validation_cvec = scipy.sparse.csr_matrix(categorical_validation_cvec.values)

In [None]:
# Categorical features as sparse tensors, with indices put into canonical order.
new_tfidf_categorical_validation=tf.sparse.reorder(convert_sparse_matrix_to_sparse_tensor(new_categorical_validation_cvec))
new_tfidf_categorical_train=tf.sparse.reorder(convert_sparse_matrix_to_sparse_tensor(new_categorical_train_cvec))

In [None]:
# Name TF-IDF matrices as sparse tensors, with indices put into canonical order.
new_tfidf_name_validation=tf.sparse.reorder(convert_sparse_matrix_to_sparse_tensor(tfidf_name_validation))
new_tfidf_name_train=tf.sparse.reorder(convert_sparse_matrix_to_sparse_tensor(tfidf_name_train))

In [None]:
# Type after conversion to a TensorFlow sparse tensor.
type(new_tfidf_name_validation)

In [None]:
# Type before conversion (the vectorizer's own output), for comparison.
type(tfidf_name_validation)

In [None]:
# Description TF-IDF matrices as sparse tensors, with indices put into canonical order.
new_tfidf_description_validation=tf.sparse.reorder(convert_sparse_matrix_to_sparse_tensor(tfidf_description_validation))
new_tfidf_description_train=tf.sparse.reorder(convert_sparse_matrix_to_sparse_tensor(tfidf_description_train))

In [None]:
# Disabled experiment, kept as a bare string so it does not execute:
# wrapping the targets as CSR matrices.
'''new_y_train_cvec=scipy.sparse.csr_matrix(y_train_cvec)
new_y_validation_cvec=scipy.sparse.csr_matrix(y_validation_cvec)'''

In [None]:
# Disabled experiment, kept as a bare string so it does not execute:
# converting the CSR targets to reordered sparse tensors.
'''new_tfidf_y_validation=tf.sparse.reorder(convert_sparse_matrix_to_sparse_tensor(new_y_validation_cvec))
new_tfidf_y_train=tf.sparse.reorder(convert_sparse_matrix_to_sparse_tensor(new_y_train_cvec))'''

In [None]:
# Disabled check, kept as a bare string so it does not execute.
'''type(tf.sparse.reorder(convert_sparse_matrix_to_sparse_tensor(new_y_train_cvec)))'''

In [None]:
def getModel_bow(name_train, description_train, categorical_train):
    """Build a feed-forward regressor over name, description and categorical features.

    Only the second dimension (``shape[1]``) of each argument is read, to size
    the corresponding input layer; the data itself is not consumed here.

    Args:
        name_train: 2-D array-like of name features.
        description_train: 2-D array-like of description features.
        categorical_train: 2-D array-like of categorical features.

    Returns:
        An uncompiled ``Model`` whose inputs are ordered
        ``[name, description, categorical]`` and whose output is one linear unit.
    """
    # Keras documents `shape` as a tuple that excludes the batch dimension;
    # a bare int is not accepted by every Keras version.
    inputA = Input(shape=(categorical_train.shape[1],))
    inputName = Input(shape=(name_train.shape[1],))
    inputDesc = Input(shape=(description_train.shape[1],))
    concat = Concatenate()([inputName, inputDesc, inputA])

    # No dropout directly on the concatenated inputs in this variant.
    x = Dense(32, activation='relu')(concat)
    x = Dropout(0.1)(x)
    x = Dense(16, activation='relu')(x)
    x = Dropout(0.1)(x)
    x = Dense(1, activation='linear')(x)

    model = Model(inputs=[inputName, inputDesc, inputA], outputs=x)

    return model

In [None]:
# getModel_bow only reads shape[1] of its arguments, so the result is the same,
# but pass the training frame to match the other two (training) arguments.
model = getModel_bow(new_tfidf_name_train, new_tfidf_description_train, categorical_train_cvec)
model.summary()

In [None]:
# Render the layer graph of the TF-IDF model.
plot_model(model)

In [None]:
# Train against `root_mean_squared_logarithmic_error` (defined elsewhere in
# this notebook) with Adam, and report the same function plus the standard
# regression errors as metrics.
model.compile(
    loss=root_mean_squared_logarithmic_error,
    optimizer='adam',
    metrics=[
        'mse',
        'mae',
        tf.keras.metrics.RootMeanSquaredError(),
        'mean_squared_logarithmic_error',
        root_mean_squared_logarithmic_error,
    ],
)

In [None]:
# Fit the TF-IDF model on the three sparse inputs (name, description, categorical).
history = model.fit(
    x=[new_tfidf_name_train, new_tfidf_description_train, new_tfidf_categorical_train],
    y=y_train_cvec,
    validation_data=(
        [new_tfidf_name_validation, new_tfidf_description_validation, new_tfidf_categorical_validation],
        y_validation_cvec,
    ),
    epochs=5,
    batch_size=512,
    verbose=True,
)
# 0.4771

In [None]:
# `model` was built with three inputs (name, description, categorical), so all
# three must be supplied; passing only two makes fit() fail.
# NOTE: this continues training the already-fitted `model` for 5 more epochs.
history2 = model.fit(
    x=[new_tfidf_name_train, new_tfidf_description_train, new_tfidf_categorical_train],
    y=y_train_cvec,
    epochs=5,
    verbose=True,
    validation_data=(
        [new_tfidf_name_validation, new_tfidf_description_validation, new_tfidf_categorical_validation],
        y_validation_cvec,
    ),
    batch_size=512,
)

In [None]:
# One row per epoch, one column per recorded metric.
histdf = pd.DataFrame(history2.history)

In [None]:
# Training vs. validation loss per epoch.
histdf.plot(y=["loss", "val_loss"])

TF-IDF with gensim

In [None]:
import gensim
from gensim.utils import simple_preprocess
from gensim import corpora
from gensim.models import TfidfModel

In [None]:
# Display the raw description column.
data["item_description"]

In [None]:
# Tokenize every description with gensim's simple_preprocess (one token list per listing).
doc_tokenized = [simple_preprocess(doc) for doc in data["item_description"].values]

In [None]:
# Tokens of the first description.
doc_tokenized[0]

In [None]:
# Empty token dictionary; it is filled while the bag-of-words corpus is built.
dictionary = corpora.Dictionary()

In [None]:
# Bag-of-words form of each document; allow_update=True lets doc2bow add
# unseen tokens to `dictionary` as it goes.
BoW_corpus = [dictionary.doc2bow(doc, allow_update=True) for doc in doc_tokenized]

In [None]:
# Print, for each of the first ten descriptions, the words it contains together
# with how often each one appears.
for bow in BoW_corpus[:10]:
    word_counts = [[dictionary[token_id], count] for token_id, count in bow]
    print(word_counts)

In [None]:
# Fit a gensim TF-IDF model on the bag-of-words corpus.
tfidf = TfidfModel(BoW_corpus)

In [None]:
# Preview the first ten documents only (same sample as the bag-of-words
# preview); looping over the whole corpus would print every listing.
# Weights are rounded to 2 decimals: rounding to 0 decimals collapses the
# fractional TF-IDF weights to whole numbers and hides their differences.
for doc in tfidf[BoW_corpus[:10]]:
    print([[dictionary[token_id], np.around(weight, decimals=2)] for token_id, weight in doc])

In [None]:
# Store each listing's TF-IDF vector in a new column.
# NOTE(review): this assigns gensim's transformed-corpus object directly;
# presumably pandas expands it to one (token_id, weight) list per row.
# Verify, or wrap it in list(...) to make that explicit.
data["tfidf"]=tfidf[BoW_corpus]

In [None]:
# Length of the stored TF-IDF entry for the row at position 10.
len(data.iloc[10]["tfidf"])

In [None]:
# Hold out 20% of the rows for validation, with a fixed seed.
train_tfidf, validation_tfidf = train_test_split(
    data[[
        "item_condition_id",
        "shipping",
        "category_name_l",
        "brand_name_l",
        "tfidf",
        "price",
    ]],
    test_size=0.2,
    random_state=1000,
)

In [None]:
# Categorical feature columns of each split.
train_tfidf_categorical = train_tfidf[
    ["item_condition_id", "shipping", "category_name_l", "brand_name_l"]
]
validation_tfidf_categorical = validation_tfidf[
    ["item_condition_id", "shipping", "category_name_l", "brand_name_l"]
]

# Per-listing TF-IDF entries of each split.
train_tfidf_description, validation_tfidf_description = train_tfidf["tfidf"], validation_tfidf["tfidf"]

# Regression target of each split.
y_train, y_validation = train_tfidf["price"], validation_tfidf["price"]

In [None]:
# The column holds one sparse (token_id, weight) list per listing, so the
# Series has no `todense`. Expand the first ten documents into a dense
# (documents x vocabulary) array with gensim instead; corpus2dense returns
# terms x documents, hence the transpose.
gensim.matutils.corpus2dense(validation_tfidf_description[0:10], num_terms=len(dictionary)).T

In [None]:
def getModel_bow(categorical_train, description_train):
    """Build a feed-forward regressor over description and categorical features.

    Only the second dimension (``shape[1]``) of each argument is read, to size
    the corresponding input layer, so both arguments must be 2-D.

    NOTE: this redefines the three-argument ``getModel_bow`` declared earlier
    in the notebook.

    Args:
        categorical_train: 2-D array-like of categorical features.
        description_train: 2-D array-like of description features.

    Returns:
        An uncompiled ``Model`` whose inputs are ordered
        ``[description, categorical]`` and whose output is one linear unit.
    """
    # Keras documents `shape` as a tuple that excludes the batch dimension;
    # a bare int is not accepted by every Keras version.
    inputA = Input(shape=(categorical_train.shape[1],))
    inputDesc = Input(shape=(description_train.shape[1],))
    concat = Concatenate()([inputDesc, inputA])

    x = Dropout(0.1)(concat)
    x = Dense(32, activation='relu')(x)
    x = Dropout(0.1)(x)
    x = Dense(16, activation='relu')(x)

    x = Dense(1, activation='linear')(x)

    model = Model(inputs=[inputDesc, inputA], outputs=x)

    return model

In [None]:
# `train_tfidf_description` is a 1-D Series of (token_id, weight) lists, so it
# has no shape[1] for getModel_bow to read. Build a (documents x vocabulary)
# sparse matrix first; corpus2csc returns terms x documents, hence the transpose.
train_tfidf_description_csr = gensim.matutils.corpus2csc(
    train_tfidf_description, num_terms=len(dictionary)
).T.tocsr()

model = getModel_bow(train_tfidf_categorical, train_tfidf_description_csr)
model.summary()

In [None]:
# Shape of the TF-IDF column (a single pandas column, hence one-dimensional).
train_tfidf_description.shape

# Transformers

In [None]:
# Load the pretrained 'distilbert-base-uncased' encoder and mark it as
# non-trainable so it is used as a frozen feature extractor.
# NOTE(review): TFDistilBertModel / DistilBertTokenizerFast are not imported in
# the visible part of this file. Confirm a `from transformers import ...` exists.
pret_model_trans = TFDistilBertModel.from_pretrained('distilbert-base-uncased')
pret_model_trans.trainable = False

# Tokenizer for the same checkpoint.
tokenizer_trans = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased', use_fast=True)

In [None]:
num_batches = 100

# Start offset of every batch: equal steps of rows // num_batches, capped at
# num_batches entries, followed by the total row count as the final end bound
# (so the last batch also takes the remainder rows).
trans_batch_starti = [*range(0, data.shape[0], data.shape[0] // num_batches)][:num_batches]
trans_batch_starti += [data.shape[0]]

In [None]:
# Pair every boundary with its successor to get (start, end) row ranges.
trans_batches = list(zip(trans_batch_starti[:-1], trans_batch_starti[1:]))
# Show each range next to its batch number.
list(enumerate(trans_batches))

In [None]:
# Work on one batch at a time because there is not enough memory to handle the
# whole dataset at once.
# Index of the batch to process: valid values are 0 to len(trans_batches) - 1.
trans_batch_number = 0

trans_batch = trans_batches[trans_batch_number]
trans_batch

In [None]:
# Slice the selected (start, end) row range out of the full frame.
print("Total shape", data.shape)
data_batch = data[trans_batch[0]:trans_batch[1]]
print("Batch shape", data_batch.shape)

# Hold out 20% of the batch for validation, with a fixed seed.
train_trans, validation_trans = train_test_split(data_batch, test_size=0.2, random_state=1000)

In [None]:
# Training target: the listing price.
y_train_trans = train_trans["price"]
y_train_trans

In [None]:
# Validation target: the listing price.
y_validation_trans = validation_trans["price"]
y_validation_trans

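TODO: tokenize the cleaned text, or pass the raw text and let BERT handle everything? (Currently `name` is used as-is, while the description uses `item_description_clean`.)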

In [None]:
# The four non-text features of the training split, as an int32 array.
inputA_train_trans = train_trans[["item_condition_id", "category_name_l", "brand_name_l", "shipping"]].values.astype('int32')

In [None]:
# The same four non-text features for the validation split.
inputA_validation_trans = validation_trans[["item_condition_id", "category_name_l", "brand_name_l", "shipping"]].values.astype('int32')

In [None]:
# Token length to which names / descriptions are padded or truncated.
trans_name_tokenizer_maxlength = 20
trans_desc_tokenizer_maxlength = 350

In [None]:
# Tokenize the training names into fixed-length TensorFlow tensors
# (padded / truncated to trans_name_tokenizer_maxlength).
inputName_train_trans = tokenizer_trans(train_trans["name"].to_list(),
                                  return_tensors="tf",
                                  padding='max_length',
                                  truncation=True,
                                  max_length = trans_name_tokenizer_maxlength)
inputName_train_trans['input_ids'].shape

In [None]:
# Tokenize the validation names with the same settings as the training names.
inputName_validation_trans = tokenizer_trans(validation_trans["name"].to_list(),
                                  return_tensors="tf",
                                  padding='max_length',
                                  truncation=True,
                                  max_length = trans_name_tokenizer_maxlength)
inputName_validation_trans['input_ids'].shape

In [None]:
# (token-id length, attention-mask length) of the name input; read by
# getModel_trans to size its name Input layers.
inputName_shape_trans = (inputName_train_trans['input_ids'].shape[1],
                         inputName_train_trans['attention_mask'].shape[1])

inputName_shape_trans

In [None]:
# Tokenize the cleaned training descriptions into fixed-length TensorFlow
# tensors (padded / truncated to trans_desc_tokenizer_maxlength).
inputDesc_train_trans = tokenizer_trans(train_trans["item_description_clean"].to_list(),
                                  return_tensors="tf",
                                  padding='max_length',
                                  truncation=True,
                                  max_length=trans_desc_tokenizer_maxlength)
inputDesc_train_trans['input_ids'].shape

In [None]:
# Tokenize the cleaned validation descriptions with the same settings.
inputDesc_validation_trans = tokenizer_trans(validation_trans["item_description_clean"].to_list(),
                                  return_tensors="tf",
                                  padding='max_length',
                                  truncation=True,
                                  max_length=trans_desc_tokenizer_maxlength)
inputDesc_validation_trans['input_ids'].shape

In [None]:
# (token-id length, attention-mask length) of the description input; read by
# getModel_trans to size its description Input layers.
inputDesc_shape_trans = (inputDesc_train_trans['input_ids'].shape[1],
                         inputDesc_train_trans['attention_mask'].shape[1])
inputDesc_shape_trans

In [None]:
def getModel_trans():
    """Build the regressor that combines DistilBERT text features with tabular ones.

    Reads the module-level ``pret_model_trans``, ``inputName_shape_trans`` and
    ``inputDesc_shape_trans``. The name and the description each pass through
    the shared pretrained encoder and a global max-pool; both results are
    concatenated with the 4 raw tabular features and fed to a small dense head.

    Returns:
        An uncompiled ``Model`` with inputs ordered ``[tabular, name ids,
        name mask, description ids, description mask]`` and one linear output.
    """
    def text_branch(ids_length, mask_length):
        # Token ids + attention mask -> encoder -> max over the sequence axis.
        ids = Input(shape=(ids_length,), dtype='int32')
        mask = Input(shape=(mask_length,), dtype='int32')
        # [0] picks the first encoder output (presumably the per-token hidden states).
        encoded = pret_model_trans(ids, attention_mask=mask)[0]
        pooled = GlobalMaxPool1D()(encoded)
        return ids, mask, pooled

    tabular_in = Input(shape=(4,))

    name_ids, name_mask, name_vec = text_branch(inputName_shape_trans[0],
                                                inputName_shape_trans[1])
    desc_ids, desc_mask, desc_vec = text_branch(inputDesc_shape_trans[0],
                                                inputDesc_shape_trans[1])

    merged = Concatenate()([tabular_in, name_vec, desc_vec])

    # Dense head: dropout -> 16 -> dropout -> 16 -> 1 (linear).
    hidden = Dropout(0.1)(merged)
    hidden = Dense(16, activation='relu')(hidden)
    hidden = Dropout(0.1)(hidden)
    hidden = Dense(16, activation='relu')(hidden)
    prediction = Dense(1, activation='linear')(hidden)

    return Model(
        inputs=[tabular_in, name_ids, name_mask, desc_ids, desc_mask],
        outputs=prediction,
    )
    

In [None]:
# Build the transformer-based model.
model_trans = getModel_trans()

In [None]:
# Print the layer-by-layer summary.
model_trans.summary()

In [None]:
# Render the layer graph of the transformer-based model.
plot_model(model_trans)

In [None]:
# Train against `root_mean_squared_logarithmic_error` (defined elsewhere in
# this notebook) with Adam, and report the same function plus the standard
# regression errors as metrics.
model_trans.compile(
    loss=root_mean_squared_logarithmic_error,
    optimizer='adam',
    metrics=[
        'mse',
        'mae',
        tf.keras.metrics.RootMeanSquaredError(),
        'mean_squared_logarithmic_error',
        root_mean_squared_logarithmic_error,
    ],
)

In [None]:
# Fit on the current batch. The inputs follow the order declared in
# getModel_trans: tabular, name ids, name mask, description ids, description mask.
history_trans = model_trans.fit(
    x=[
        inputA_train_trans,
        inputName_train_trans['input_ids'],
        inputName_train_trans['attention_mask'],
        inputDesc_train_trans['input_ids'],
        inputDesc_train_trans['attention_mask'],
    ],
    y=y_train_trans,
    validation_data=(
        [
            inputA_validation_trans,
            inputName_validation_trans['input_ids'],
            inputName_validation_trans['attention_mask'],
            inputDesc_validation_trans['input_ids'],
            inputDesc_validation_trans['attention_mask'],
        ],
        y_validation_trans,
    ),
    epochs=5,
    batch_size=512,
    verbose=True,
)