# Load the model and get embeddings

In [1]:
# IPython autoreload in mode 2: imported modules are re-imported before each
# cell executes, so edits to the local helper modules take effect without a
# kernel restart.
%load_ext autoreload
%autoreload 2

In [77]:
import os
import pandas as pd

# Widen pandas' display limits so that long product names and descriptions
# are rendered in full rather than truncated.
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1000
pd.options.display.width = 1000
pd.options.display.max_colwidth = None  # None disables cell-text truncation

In [3]:
import spacy

In [4]:
#keras tokenizer
from keras.preprocessing import text
from keras.preprocessing import sequence # for import pad_sequences

In [5]:
#pytorch tokenizer
import torch
import torch.nn as nn
import torchtext
from torchtext.data import get_tokenizer
from torchtext.vocab import vocab


from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
from pytorch_lightning.callbacks import ModelSummary, LearningRateMonitor

from pytorch_lightning.loggers import WandbLogger


from torch_lr_finder import LRFinder

In [6]:
from collections import Counter, OrderedDict, defaultdict

import json

import plotly.express as px
import plotly.io as pio
# Select the default Plotly renderer for figures shown in this notebook.
pio.renderers.default = 'iframe' # or 'notebook' or 'colab' or 'jupyterlab'
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

In [7]:
from IPython.core.debugger import set_trace

In [8]:
import socket
import re

In [9]:
import os

# Make CUDA kernel launches synchronous so that a device-side error is
# reported at the call that triggered it (debugging aid).
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
#os.environ["WANDB_NOTEBOOK_NAME"] = os.path.join(os.getcwd(), "notebook")

# cuBLAS reproducibility setting, see
# https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility
# The variable cuBLAS reads is CUBLAS_WORKSPACE_CONFIG. The previous
# "UBLAS_WORKSPACE_CONFIG" spelling was a typo, so the setting was never applied.
# It must be set before the first CUDA/cuBLAS call in the process.
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

In [10]:
import joblib

In [11]:
import wandb

In [12]:
import gc
import random
import numpy as np
# Print floats as plain decimals (no scientific notation) in numpy and torch.
np.set_printoptions(suppress=True)
torch.set_printoptions(sci_mode=False)

# Choose the compute device; when CUDA is usable, report the availability
# flag, the current device index, the device count and the name of device 0.
is_cuda = torch.cuda.is_available()
if is_cuda:
    print(
        is_cuda,
        torch.cuda.current_device(),
        torch.cuda.device_count(),
        torch.cuda.get_device_name(0),
        sep="\n",
    )
device = torch.device('cuda' if is_cuda else 'cpu')
print('Using device:', device)

# Seed every random number generator used in the notebook with one value.
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

# Ask cuDNN for deterministic kernels. The stricter
# torch.use_deterministic_algorithms(True) is intentionally left disabled.
torch.backends.cudnn.deterministic = True

True
0
1
Quadro RTX 4000
Using device: cuda


In [13]:
import sys
# Add ./prediction/ to the import path (relative to the working directory);
# presumably this is where the project modules imported below live.
sys.path.append("./prediction/")

import pytorch_dataset as pytorch_dataset
import pytorch_model as pytorch_model
import utils as utils

In [14]:
# Directory the data dump and checkpoint are read from, and the similarity
# bundles are written to.
model_dump_path = "./model"

## Load Model and Data Dump

In [17]:
# Common stem shared by the serialized data dump and the model checkpoint.
file_template = "epoch07-loss0.24-val_loss0.34-rmsle0.429"
data_dump_file = "data_dump_" + file_template + ".dump"
model_file = file_template + ".ckpt"

In [18]:
# Load the serialized data dump named by data_dump_file.
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code;
# only load dumps that come from a trusted source.
data_dump = joblib.load(os.path.join(model_dump_path, data_dump_file))

In [19]:
# Show which entries the dump contains.
data_dump.keys()

dict_keys(['train_set', 'dd_train', 'validation_set', 'model'])

In [20]:
# Dataset object stored with the dump; passed to the project helpers below.
dd_train = data_dump["dd_train"]

In [21]:
# Extract both splits from the dump and give each a fresh 0..n-1 index
# (the old index is discarded).
train, validation = (
    data_dump[split_key].reset_index(drop=True)
    for split_key in ("train_set", "validation_set")
)

In [22]:
# (rows, columns) of the training split.
train.shape

(1185328, 16)

In [23]:
# (rows, columns) of the validation split.
validation.shape

(296333, 16)

In [41]:
# Product names that occur in validation but not in train (set difference).
# NOTE(review): this displays the whole set; consider showing only its size
# or a small sample to keep the notebook output compact.
set(validation["name"]) - set(train["name"])

{'southpole black down jacket sz s',
 'vs pink pastel watercolor bikini top',
 "jordan retro 3's sz 4.5y",
 'express portofino blouse size m',
 'new becca moonstone skin perfector',
 "maurice's jeggings 1",
 'american eagle girls boots size 1y',
 'lularoe russian nesting dolls os htf',
 'fleet foxes cd',
 'bnib le morphe copper spice palette!!',
 'black nude lace peep toe heals/pumps',
 'lululemon minimalist tank',
 "victoria's secret 5 pair panties size xl",
 'vs 3pcs set love spell',
 'vintage coach flats',
 'gold chanel fashion wine marker charms',
 'm 7/8 girls nwt dance danskin top',
 'bundle for angelzlyne',
 'victoria secret bag used',
 'high waisted bikini swim suit',
 'monster high minis wave 3',
 'justice kids size 18',
 'pink ranger movie figure hold',
 'totoro wallet',
 'california waves bikini',
 'dooney & bourke madras shopper',
 'oliver peoples jacey sunglasses polarize',
 'ugly sweater medium',
 'sale blunt razor cut bob lace part wig',
 "minnetonka ankle women's boots 

# Mercari Price Prediction

In [24]:
import category_encoders as ce
import feature_engine.encoding as fe
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer

from sklearn.pipeline import Pipeline

import sklearn.metrics as metrics

## PyTorch Lightning

In [25]:
# Build the dataset object for the validation split with the project helper.
# dd_train is passed alongside - presumably so that encoders and vocabularies
# fitted on train are reused; TODO confirm in pytorch_dataset.
dd_validation = pytorch_dataset.build_test_dataset(dd_train, validation)

target: price_log
train: 1185328
test: (296333, 16)
test set mode
=> target encoding
=> numerical encoding
=> categorical encoding
name vocabulary size 104570
item_description vocabulary size 184473
name vocabulary size 455
item_description vocabulary size 995
target min, max range (-2.7831687470788093, 4.21270471801917)


In [26]:
# Restore the trained PytorchModel from the checkpoint file in model_dump_path.
model = pytorch_model.PytorchModel.load_from_checkpoint(checkpoint_path= os.path.join(model_dump_path, model_file))

"categorical_embedding_dropout":   0.4
"categorical_embedding_size":      [(6, 4), (4522, 178), (12, 6), (115, 23), (866, 71)]
"char_bidirectional":              False
"char_embedding_dimension":        40
"char_recurrent_hidden_size":      50
"char_recurrent_layers":           1
"char_rnn":                        LSTM
"char_vocabulary_size":            {'name': 455, 'item_description': 995}
"final_linear_layer":              True
"final_normalization":             False
"is_target_log":                   True
"learning_rate":                   0.001
"linear_layer_activation":         ReLU(inplace=True)
"linear_layer_normalization":      BatchNorm1d
"linear_layer_skip_connections":   (3, ([1024], [0.3]))
"linear_layers":                   ([512], [0.2])
"loss_function":                   MSELoss()
"metric_to_monitor":               rmsle
"normalization_before_activation": True
"numerical_batch_normalization":   True
"numerical_input_size":            5
"optimizer":                     

In [27]:
# Display the module structure of the restored model.
model

PytorchModel(
  (metric): MeanSquaredError()
  (loss_function): MSELoss()
  (embeds): ModuleList(
    (0): Embedding(6, 4, padding_idx=0)
    (1): Embedding(4522, 178, padding_idx=0)
    (2): Embedding(12, 6, padding_idx=0)
    (3): Embedding(115, 23, padding_idx=0)
    (4): Embedding(866, 71, padding_idx=0)
  )
  (categorical_dropout): Dropout(p=0.4, inplace=False)
  (batch_normalization_numerical): BatchNorm1d(5, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (text_embeddings): ModuleList(
    (0): TextRecurrentLayer(
      (embedding): Embedding(104570, 50, padding_idx=0)
      (rnn): GRU(50, 100, num_layers=2, batch_first=True, bidirectional=True)
      (last_time_step): LastTimeStep()
    )
    (1): TextRecurrentLayer(
      (embedding): Embedding(184473, 50, padding_idx=0)
      (rnn): GRU(50, 100, num_layers=2, batch_first=True, bidirectional=True)
      (last_time_step): LastTimeStep()
    )
  )
  (char_embeddings): ModuleList(
    (0): TextRecurrentLayer(
  

## Try manual input/output

In [28]:
# Sizes recorded on the model; presumably the widths of the linear stack,
# starting with the concatenated input - TODO confirm in pytorch_model.
model.sizes

[787, 1024, 512, 1]

In [29]:
# First entry of model.sizes; passed below as hidden_layer_size when the
# product vectors are extracted.
hidden_layer_size = model.sizes[0]
hidden_layer_size

787

## Hidden Layer (Concatenated Layer) Vectors for the Test and Train Sets

In [None]:
# Hidden-layer product vectors for the validation split, computed by the
# project helper. The result is used below through its "product2vector" entry,
# whose keys have the form "<name>_<row label>".
product_emb_hidden_test = utils.get_product_embeddings_from_hidden_layer(model, validation, dd_train, hidden_layer_size = hidden_layer_size, item_column="name")

In [None]:
# Hidden-layer product vectors for the training split, computed by the
# project helper. The result is used below through its "matrix" and
# "product2vector" entries.
product_emb_hidden_train = utils.get_product_embeddings_from_hidden_layer(model, train, dd_train, hidden_layer_size = hidden_layer_size, item_column="name")

## Similarity using Hidden Layer Representation

In [133]:
# Query product; "speacker" is the spelling used by the validation listing.
search_product = "speacker"

# Index labels of every validation row whose name equals the query exactly.
product_index = validation.loc[validation["name"] == search_product].index.tolist()
product_index

[236399]

In [134]:
product_name = search_product

# Cosine similarity between every training vector and the query vector,
# which is looked up under the key "<name>_<first matching row label>".
cos = cosine_similarity(
    product_emb_hidden_train["matrix"],
    product_emb_hidden_test["product2vector"][f"{search_product}_{product_index[0]}"].reshape(1, -1),
)
print(cos.shape)

# Row positions of the 10 highest similarities, best match first, together
# with the similarity values themselves.
top_n = np.argsort(cos[:, 0])[-10:][::-1]
top_similarity = cos[top_n, 0]
print(top_n, top_similarity)

(1185328, 1)
[ 260223  794340  844688  185231   79511  840326  512089 1125664  544490
  468422] [0.7802913  0.77537956 0.76305055 0.76202467 0.75976259 0.75414294
 0.75188236 0.75011927 0.7453418  0.74497159]


In [135]:
# Validation row(s) of the query product, exported as an HTML table.
tmp = validation[validation["name"] == search_product]

# NOTE(review): Bootstrap's striping class is spelled "table-striped"; the
# class string is kept unchanged here in case a stylesheet relies on it.
html = tmp[
    ["name", "item_condition_id", "category_name", "brand_name", "shipping",
     "item_description", "is_brand_missing", "price"]
].to_html(classes='table table-stripped')

# The context manager closes the file even if the write fails. UTF-8 is set
# explicitly because listings contain emoji and other non-Latin characters
# that a platform default encoding (e.g. cp1252 on Windows) cannot encode.
with open(f"{search_product}_search_hidden_layer.html", "w", encoding="utf-8") as text_file:
    text_file.write(html)

tmp

Unnamed: 0,id,name,item_condition_id,category_name,brand_name,shipping,item_description,desc_len,name_len,subcategory_1,subcategory_2,subcategory_3,is_brand_missing,is_item_description_missing,price,price_log
236399,232689,speacker,1,"Electronics/TV, Audio & Surveillance/Home Speakers & Subwoofers",missing,0.0,pink speacker,2.0,1.0,electronics,"tv, audio & surveillance",home speakers & subwoofers,1.0,0.0,11.0,2.484907


In [136]:
# Check whether the exact query name also occurs in train (the recorded
# output has no rows).
train[train["name"] == search_product]

Unnamed: 0,id,name,item_condition_id,category_name,brand_name,shipping,item_description,desc_len,name_len,subcategory_1,subcategory_2,subcategory_3,is_brand_missing,is_item_description_missing,price,price_log


In [137]:
from IPython.display import HTML

# Training rows at the top-ranked positions, annotated with their similarity.
# Only the first five of the ten matches are kept, and only the columns
# relevant for display.
tmp = (
    train.iloc[top_n]
    .assign(similarity=top_similarity)
    .head()[
        ["name", "item_condition_id", "category_name", "brand_name", "shipping",
         "item_description", "is_brand_missing", "price", "similarity"]
    ]
)
# NOTE(review): Bootstrap's striping class is spelled "table-striped"; the
# class string is kept unchanged here in case a stylesheet relies on it.
html = tmp.to_html(classes='table table-stripped')

# The context manager closes the file even if the write fails. UTF-8 is set
# explicitly because listings contain emoji and other non-Latin characters
# that a platform default encoding (e.g. cp1252 on Windows) cannot encode.
with open(f"{search_product}_hidden_layer.html", "w", encoding="utf-8") as text_file:
    text_file.write(html)

tmp

Unnamed: 0,name,item_condition_id,category_name,brand_name,shipping,item_description,is_brand_missing,price,similarity
260223,waterproof speaker,3,"Electronics/TV, Audio & Surveillance/Home Speakers & Subwoofers",missing,0.0,perfect for the shower!,1.0,14.0,0.780291
794340,light up speaker,1,"Electronics/TV, Audio & Surveillance/Home Speakers & Subwoofers",missing,0.0,light up bluetooth speaker loud and works great great condition lights up in many different colors really entertaining,1.0,11.0,0.77538
844688,wireless shower speaker,1,"Electronics/TV, Audio & Surveillance/Home Speakers & Subwoofers",missing,0.0,aqua sound wireless shower speaker you can: answer/end calls built in mic for calls speaker bluetooth hookup to play music from your phone new never used,1.0,11.0,0.763051
185231,water speakers,2,"Electronics/TV, Audio & Surveillance/Home Speakers & Subwoofers",missing,0.0,comes with plug for the wall and aux cord used maybe once just sitting on my dresser,1.0,14.0,0.762025
79511,google home,1,"Electronics/TV, Audio & Surveillance/Home Speakers & Subwoofers",missing,0.0,never used only taken out of the box has the cord.,1.0,71.0,0.759763


In [71]:
import joblib

# Bundle the two splits with the hidden-layer vectors so that a similarity
# search can be served from one file. No unknown-product list is stored for
# this variant, hence the empty list.
product_similarity_emb_hidden = dict(
    train=train,
    test=validation,
    product_train_embedding_matrix=product_emb_hidden_train["matrix"],
    products_test_vector_dict=product_emb_hidden_test["product2vector"],
    products_train_vector_dict=product_emb_hidden_train["product2vector"],
    unknown_products=[],
)

# Last expression: joblib.dump returns the list of written file names.
joblib.dump(
    product_similarity_emb_hidden,
    os.path.join(model_dump_path, "product_similarity_emb_hidden.joblib"),
)


['./model\\product_similarity_emb_hidden']

## Product Embeddings

In [49]:
# Display the text sub-modules of the model (one entry per text column,
# per the recorded output).
model.text_embeddings

ModuleList(
  (0): TextRecurrentLayer(
    (embedding): Embedding(104570, 50, padding_idx=0)
    (rnn): GRU(50, 100, num_layers=2, batch_first=True, bidirectional=True)
    (last_time_step): LastTimeStep()
  )
  (1): TextRecurrentLayer(
    (embedding): Embedding(184473, 50, padding_idx=0)
    (rnn): GRU(50, 100, num_layers=2, batch_first=True, bidirectional=True)
    (last_time_step): LastTimeStep()
  )
)

In [51]:
# The embedding lookup is keyed by product name, so products made of the same
# words share a single entry. Keep one row per distinct name (the last
# occurrence) and renumber the index.
validation_products_unique = (
    validation
    .drop_duplicates(subset="name", keep="last")
    .reset_index(drop=True)
)
train_products_unique = (
    train
    .drop_duplicates(subset="name", keep="last")
    .reset_index(drop=True)
)

print(f"validation size: {validation_products_unique.shape[0]}")
print(f"train size: {train_products_unique.shape[0]}")

validation size: 254660
train size: 928071


In [None]:
# Embedding-layer vectors for the de-duplicated validation names, computed by
# the project helper. The result is used below through its "product2vector"
# and "unknown" entries.
product_emb_emb_test = utils.get_product_embeddings_from_embedding_layer(model, validation_products_unique["name"],  dd_train, item_column="name")

In [53]:
# Names the helper reported under "unknown" - presumably names for which no
# embedding could be built (e.g. no in-vocabulary token); TODO confirm in utils.
product_emb_emb_test["unknown"]

['lizzz',
 'j❤️j',
 'bambiblu',
 'mbiki',
 'quibbler',
 'waxvac',
 'complications',
 'jachelle',
 '❗️vsx tank❗️xs',
 'tennispuma',
 "yvonne's",
 'baketball',
 'urbandecayallnighterwaterprooffoundation',
 'uggzzz',
 'cv012817',
 'itscosmeticscccreamsamplea',
 'xirma',
 'choker‼️',
 'jeffreystarcosmeticsskinfrost-icecold',
 'mediastinum',
 'limecrimechinadoll/sugarpillheartbreake',
 'tomahawk',
 'ғᴏʀ ᴋᴇɴᴛᴜᴄᴋʏ85',
 '♡♡nars bundle♡♡',
 '⭐️barcelona jersey⭐️',
 'lesliexcloset',
 "bow's",
 'biankas',
 'ardi',
 'vercace perfumev',
 'fcm',
 'redick',
 '☆vampress☆',
 '✨eyeshadow✨',
 'happy775',
 'bellossum',
 '4theluvoflouis',
 '**cherishbundle**',
 '⚡elizabeth ameral⚡',
 "dj'set",
 'chelsea1',
 'haleyedmunds (:',
 'cupcakes91',
 'ams1991',
 'ruben',
 'helloween',
 'fruiiityslime',
 'bjtownsend',
 'vibrams!',
 'girlactorsneakerhead',
 'heebwife',
 'danellco',
 'ᐯᗩᑎᔕ ᗩᑎᑕᕼoᖇ ᗷᑌᖇgᑌᑎᗪy tᗩᑎk',
 'eyeluminator',
 'sunsilk',
 'maryr40',
 'wheedle',
 'caylyn tamayo',
 'xim4',
 'reselling',
 'bridgetollb

In [None]:
# Embedding-layer vectors for the de-duplicated training names. The result is
# used below through its "matrix" and "product2vector" entries.
product_emb_emb_train = utils.get_product_embeddings_from_embedding_layer(model, train_products_unique["name"], dd_train, item_column="name")

In [None]:
# Keys of the validation name -> vector mapping; the plain product name is
# used as the lookup key in the search below.
product_emb_emb_test["product2vector"].keys()

In [139]:
# Query product, looked up by its plain name in the validation vectors.
search_product = product_name = "express portofino blouse size m"

# Cosine similarity between every training vector and the query vector.
cos = cosine_similarity(
    product_emb_emb_train["matrix"],
    product_emb_emb_test["product2vector"][product_name].reshape(1, -1),
)
print(cos.shape)

# Row positions of the 10 highest similarities, best match first, together
# with the similarity values themselves.
top_n = np.argsort(cos[:, 0])[-10:][::-1]
top_similarity = cos[top_n, 0]
print(top_n, top_similarity)

(928071, 1)
[431918 244818 342449 347572 349672 361556 588136 923594 555160 381739] [0.76878326 0.69302587 0.6872537  0.67367575 0.67263427 0.65950813
 0.65465092 0.65143845 0.65067749 0.64973632]


In [140]:
# De-duplicated validation row of the query product, exported as an HTML table.
tmp = validation_products_unique[validation_products_unique["name"] == search_product]

html = tmp[
    ["name", "item_condition_id", "category_name", "brand_name", "shipping",
     "item_description", "is_brand_missing", "price"]
].to_html()#(classes='table table-stripped')

# The context manager closes the file even if the write fails. UTF-8 is set
# explicitly because listings contain emoji and other non-Latin characters
# that a platform default encoding (e.g. cp1252 on Windows) cannot encode.
with open(f"{search_product}_search_word_embedding.html", "w", encoding="utf-8") as text_file:
    text_file.write(html)

tmp

Unnamed: 0,id,name,item_condition_id,category_name,brand_name,shipping,item_description,desc_len,name_len,subcategory_1,subcategory_2,subcategory_3,is_brand_missing,is_item_description_missing,price,price_log
78925,1210035,express portofino blouse size m,2,Women/Tops & Blouses/Blouse,express,1.0,portofino blouse from express. only worn a couple time and in like new condition. size medium. free shipping!,18.0,5.0,women,tops & blouses,blouse,0.0,0.0,9.0,2.302585


In [141]:
# De-duplicated training rows at the top-ranked positions, annotated with
# their similarity. Only the first five of the ten matches are kept.
tmp = train_products_unique.iloc[top_n].assign(similarity=top_similarity).head()

# The exported table is limited to the display columns; tmp keeps all columns.
html = tmp[
    ["name", "item_condition_id", "category_name", "brand_name", "shipping",
     "item_description", "is_brand_missing", "price", "similarity"]
].to_html()#(classes='table table-stripped')

# The context manager closes the file even if the write fails. UTF-8 is set
# explicitly because listings contain emoji and other non-Latin characters
# that a platform default encoding (e.g. cp1252 on Windows) cannot encode.
with open(f"{search_product}_word_embedding.html", "w", encoding="utf-8") as text_file:
    text_file.write(html)

tmp

Unnamed: 0,id,name,item_condition_id,category_name,brand_name,shipping,item_description,desc_len,name_len,subcategory_1,subcategory_2,subcategory_3,is_brand_missing,is_item_description_missing,price,price_log,similarity
431918,930847,express portofino blouse,3,Women/Tops & Blouses/Button Down Shirt,express,0.0,this shirt was hardly worn in great condition . it's a coral pink but the picture doesn't do it justice ! very soft and silky these shirts usually retail for 49.90 & up,33.0,3.0,women,tops & blouses,button down shirt,0.0,0.0,18.0,2.944439,0.768783
244818,750417,"new women's plus size blouse, size 18",2,Women/Tops & Blouses/Blouse,susan graver,0.0,"new, no tags. women's blouse, black with blue flowers. it's a 2 piece, see last 2 pictures",17.0,7.0,women,tops & blouses,blouse,0.0,0.0,15.0,2.772589,0.693026
342449,1313888,women's plus size lace blouse,1,Women/Tops & Blouses/Blouse,missing,1.0,"blouse has accent studs across the front, with lace throughout the top. the blouse has a lining underneath the lace everywhere except the sleeves. great with skirts or pants available in sizes 0x, 1x,3x ask for your size, bundle and save.",41.0,5.0,women,tops & blouses,blouse,1.0,0.0,13.0,2.639057,0.687254
347572,1394180,plus size lace blouse,2,Women/Tops & Blouses/Blouse,missing,1.0,mocha color lace over same color attached cami,8.0,4.0,women,tops & blouses,blouse,1.0,0.0,18.0,2.944439,0.673676
349672,423085,(m) white express sleeveless portofino,2,Women/Tops & Blouses/Blouse,express,0.0,express portofino - sleeveless tank size medium - would fit size small in flowy style. nwot - perfect condition never worn. tank top button down dress shirt work blouse,29.0,5.0,women,tops & blouses,blouse,0.0,0.0,17.0,2.890372,0.672634


In [72]:
import joblib

# Bundle the two splits with the embedding-layer vectors and the list of
# names reported as unknown, so that a similarity search can be served from
# one file.
product_similarity_emb_emb = dict(
    train=train,
    test=validation,
    product_train_embedding_matrix=product_emb_emb_train["matrix"],
    products_test_vector_dict=product_emb_emb_test["product2vector"],
    products_train_vector_dict=product_emb_emb_train["product2vector"],
    unknown_products=product_emb_emb_test["unknown"],
)

# Last expression: joblib.dump returns the list of written file names.
joblib.dump(
    product_similarity_emb_emb,
    os.path.join(model_dump_path, "product_similarity_emb_emb.joblib"),
)

['./model\\product_similarity_emb_emb.joblib']