In [4]:
import random
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from ast import literal_eval
from model.loss import Pairwise_ranking_loss
from model.drmm import Gen_DRMM_Model
from model.pacrr import Gen_PACRR_Model
from model.pacrr_drmm import Gen_PACRR_DRMM_Model
from model.callback import LossHistory, _metric

from utility.utility import ndcg, mAP_score, highlight, history_plot
from IPython.core.display import display, HTML

pd.options.mode.chained_assignment = None

display(HTML("<style>.container { width:90% !important; }</style>"))
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', 99)
pd.set_option('display.max_rows', 50)

%matplotlib inline

ModuleNotFoundError: No module named 'model'

In [2]:
# Read the pairwise training table. The listed columns are stored in the CSV
# as Python literals, so each one is parsed back with ast.literal_eval.
df = pd.read_csv(
    './data/paccr_drmm_all.csv',
    converters=dict.fromkeys(
        [
            "positive_hist",
            "negative_hist",
            "query_idf",
            "negative_sim_matrix",
            "positive_sim_matrix",
            "idf_softmax",
        ],
        literal_eval,
    ),
)

# Keep only the query key and the model-input columns, in this order.
df = df[[
    'query_preprocessed',
    'positive_hist',
    'negative_hist',
    'query_idf',
    'negative_sim_matrix',
    'positive_sim_matrix',
    'idf_softmax',
]]

In [3]:
# Read the evaluation table; the literal-encoded columns are parsed back
# with ast.literal_eval while loading.
test = pd.read_csv(
    './data/paccr_drmm_all_test.csv',
    converters=dict.fromkeys(
        ["hist", "query_idf", "sim_matrix", "idf_softmax"],
        literal_eval,
    ),
)

# Binarise the graded label: a median relevance of 2 or less becomes 0,
# anything else becomes 1.
test['binary_relevance'] = test['median_relevance'].apply(
    lambda score: int(not score <= 2)
)

In [4]:
# Split at query level: DEV_QUERY_COUNT distinct queries are held out for
# validation and every remaining query is used for training.
#
# A private, seeded generator makes the split identical after every
# "Restart & Run All" without touching the global `random` state; with an
# unseeded draw the dev set (and therefore val_loss / early stopping)
# changed on each kernel restart.
DEV_QUERY_COUNT = 40
SPLIT_SEED = 42

# unique() is evaluated once and reused for both sides of the split.
all_queries = list(df['query_preprocessed'].unique())

# random.Random.sample raises ValueError if fewer than DEV_QUERY_COUNT
# distinct queries are available.
dev_q = set(random.Random(SPLIT_SEED).sample(all_queries, DEV_QUERY_COUNT))
train_q = set(all_queries) - dev_q

In [5]:
# Materialise the two partitions by matching each row's query against the
# query sets chosen in the split cell.
train = df.loc[df['query_preprocessed'].isin(train_q)]
dev = df.loc[df['query_preprocessed'].isin(dev_q)]

In [6]:
# Model inputs for the training partition, keyed by input name.
# The first three columns are converted to dense float32 tensors; the last
# three go through tf.ragged.constant with ragged_rank=1
# (presumably because their rows differ in length — confirm against the data).
metadata = {
    **{
        column: tf.constant(train[column].tolist(), dtype=tf.float32)
        for column in ('negative_sim_matrix', 'positive_sim_matrix', 'idf_softmax')
    },
    **{
        column: tf.ragged.constant(train[column], dtype=tf.float32, ragged_rank=1)
        for column in ('query_idf', 'positive_hist', 'negative_hist')
    },
}

In [7]:
# Model inputs for the dev partition, with the same keys and conversions as
# the training inputs: three dense float32 tensors followed by three ragged
# tensors built with ragged_rank=1.
metadata_dev = {
    **{
        column: tf.constant(dev[column].tolist(), dtype=tf.float32)
        for column in ('negative_sim_matrix', 'positive_sim_matrix', 'idf_softmax')
    },
    **{
        column: tf.ragged.constant(dev[column], dtype=tf.float32, ragged_rank=1)
        for column in ('query_idf', 'positive_hist', 'negative_hist')
    },
}

In [8]:
# Hyper-parameters passed positionally, in this order, to Gen_PACRR_Model and
# Gen_PACRR_DRMM_Model further down.
# NOTE(review): the meanings are not visible in this notebook. Presumably
# firstk = leading document terms kept, lq = query length, lg = largest
# n-gram kernel, nf = number of filters, ns = k-max pooling size.
# Confirm in model/pacrr.py.
firstk, lq, lg, nf, ns = 8, 6, 5, 32, 2

In [9]:
# --- DRMM: build, compile, train -------------------------------------------
drmm = Gen_DRMM_Model()
drmm.compile(
    optimizer=tf.keras.optimizers.Adagrad(learning_rate=.1),
    loss=Pairwise_ranking_loss,
)

# Callbacks whose recorded state is read later by history_plot.
history_drmm = LossHistory()
drmm_metric = _metric(test)

# Notebook-global training settings; the plotting cell after this one reads
# `batch_size`, so it must run before another training cell overwrites it.
total_epoch_count = 100
batch_size = 256

# The targets are all-zero placeholders, one per row
# (presumably Pairwise_ranking_loss ignores y_true — confirm in model/loss.py).
# Training stops once val_loss has not improved for 4 epochs and the best
# weights are restored.
drmm.fit(
    x=metadata,
    y=tf.constant([0.] * len(train)),
    validation_data=(metadata_dev, tf.constant([0.] * len(dev))),
    shuffle=True,
    epochs=total_epoch_count,
    batch_size=batch_size,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss', patience=4, restore_best_weights=True
        ),
        history_drmm,
        drmm_metric,
    ],
)

Epoch 1/100


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100


<tensorflow.python.keras.callbacks.History at 0x1960cf72648>

In [11]:
# Visualise the DRMM run from its two callbacks (LossHistory and _metric).
# `batch_size` is the notebook-global value set by the DRMM training cell (256);
# re-running another training cell first would pass a different value here.
# NOTE(review): what exactly is drawn is defined in utility.utility.history_plot — confirm there.
history_plot(history_drmm, drmm_metric, batch_size, df=train)

In [None]:
# --- PACRR: build, compile, train ------------------------------------------
pacrr = Gen_PACRR_Model(firstk, lq, lg, nf, ns)
pacrr.compile(
    optimizer=tf.keras.optimizers.Adagrad(learning_rate=.1),
    loss=Pairwise_ranking_loss,
)

# Callbacks whose recorded state is read later by history_plot.
history_pacrr = LossHistory()
pacrr_metric = _metric(test)

# Notebook-global training settings; these overwrite the values left behind
# by the previous training cell and are read by the next plotting cell.
total_epoch_count = 30
batch_size = 128

# The targets are all-zero placeholders, one per row
# (presumably Pairwise_ranking_loss ignores y_true — confirm in model/loss.py).
# Training stops once val_loss has not improved for 4 epochs and the best
# weights are restored.
pacrr.fit(
    x=metadata,
    y=tf.constant([0.] * len(train)),
    validation_data=(metadata_dev, tf.constant([0.] * len(dev))),
    shuffle=True,
    epochs=total_epoch_count,
    batch_size=batch_size,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss', patience=4, restore_best_weights=True
        ),
        history_pacrr,
        pacrr_metric,
    ],
)

In [None]:
# Visualise the PACRR run from its two callbacks (LossHistory and _metric).
# `batch_size` is the notebook-global value set by the PACRR training cell (128);
# re-running another training cell first would pass a different value here.
# NOTE(review): what exactly is drawn is defined in utility.utility.history_plot — confirm there.
history_plot(history_pacrr, pacrr_metric, batch_size, df=train)

In [None]:
# --- PACRR-DRMM: build, compile, train -------------------------------------
pacrr_drmm = Gen_PACRR_DRMM_Model(firstk, lq, lg, nf, ns)
pacrr_drmm.compile(
    optimizer=tf.keras.optimizers.Adagrad(learning_rate=.1),
    loss=Pairwise_ranking_loss,
)

# Callbacks whose recorded state is read later by history_plot.
history_pacrr_drmm = LossHistory()
pacrr_drmm_metric = _metric(test)

# Notebook-global training settings; these overwrite the values left behind
# by the previous training cell and are read by the next plotting cell.
total_epoch_count = 30
batch_size = 128

# The targets are all-zero placeholders, one per row
# (presumably Pairwise_ranking_loss ignores y_true — confirm in model/loss.py).
# Training stops once val_loss has not improved for 4 epochs and the best
# weights are restored.
pacrr_drmm.fit(
    x=metadata,
    y=tf.constant([0.] * len(train)),
    validation_data=(metadata_dev, tf.constant([0.] * len(dev))),
    shuffle=True,
    epochs=total_epoch_count,
    batch_size=batch_size,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss', patience=4, restore_best_weights=True
        ),
        history_pacrr_drmm,
        pacrr_drmm_metric,
    ],
)

In [None]:
# Visualise the PACRR-DRMM run from its two callbacks (LossHistory and _metric).
# `batch_size` is the notebook-global value set by the PACRR-DRMM training cell (128);
# re-running another training cell first would pass a different value here.
# NOTE(review): what exactly is drawn is defined in utility.utility.history_plot — confirm there.
history_plot(history_pacrr_drmm, pacrr_drmm_metric, batch_size, df=train)

In [None]:
# Qualitative check: score every test document of one randomly chosen dev
# query with all three models and show them ranked by the PACRR-DRMM score.
#
# To inspect a fixed query instead, replace the two lines below with e.g.
#   ndcg_test = test[test['query'] == 'acoustic guitar clamp'].head(20)
#
# random.sample() does not accept a set (deprecated in Python 3.9, TypeError
# from 3.11), so the query is drawn from a list copy of dev_q.
ndcg_query = random.choice(list(dev_q))
# Explicit copy: the frame is extended in place with .insert() below.
# NOTE(review): dev_q comes from the training table; if `test` holds no rows
# for the chosen query this frame is empty — confirm the two files share queries.
ndcg_test = test[test['query_preprocessed'] == ndcg_query].copy()

# The models take a positive and a negative input. Each test row has a single
# document, so the same sim_matrix / hist is supplied on both sides
# (presumably only one side's score is read back by predict — confirm in model/).
metadata_ndcg = {
    'negative_sim_matrix': tf.constant(ndcg_test['sim_matrix'].tolist(), dtype=tf.float32),
    'positive_sim_matrix': tf.constant(ndcg_test['sim_matrix'].tolist(), dtype=tf.float32),
    'idf_softmax': tf.constant(ndcg_test['idf_softmax'].tolist(), dtype=tf.float32),
    'query_idf': tf.ragged.constant(ndcg_test['query_idf'], dtype=tf.float32, ragged_rank=1),
    'positive_hist': tf.ragged.constant(ndcg_test['hist'], dtype=tf.float32, ragged_rank=1),
    'negative_hist': tf.ragged.constant(ndcg_test['hist'], dtype=tf.float32, ragged_rank=1),
}

# Add one score column per model at column position 5.
ndcg_test.insert(5, 'rel_drmm', drmm.predict(metadata_ndcg)[:len(ndcg_test)], True)
ndcg_test.insert(5, 'rel_pacrr', pacrr.predict(metadata_ndcg)[:len(ndcg_test)], True)
ndcg_test.insert(5, 'rel_pacrr_drmm', pacrr_drmm.predict(metadata_ndcg)[:len(ndcg_test)], True)

# Ranking by PACRR-DRMM score, highest first, rendered as a styled table.
pacrr_drmm_pred = ndcg_test.sort_values(by=['rel_pacrr_drmm'], axis=0, ascending=False)[
    ['query', 'product_title', 'median_relevance', 'rel_pacrr_drmm', 'rel_pacrr', 'rel_drmm']
]
pacrr_drmm_pred.style.apply(highlight, axis=1).set_properties(subset=['query', "product_title"], **{'text-align': 'left'}).hide_index()

In [None]:
# Same sampled query, ranked by the PACRR score (highest first).
pacrr_pred = ndcg_test.sort_values(by='rel_pacrr', ascending=False)[
    ['query', 'product_title', 'median_relevance', 'rel_pacrr_drmm', 'rel_pacrr', 'rel_drmm']
]
(
    pacrr_pred.style
    .apply(highlight, axis=1)
    .set_properties(subset=['query', 'product_title'], **{'text-align': 'left'})
    .hide_index()
)

In [None]:
# Same sampled query, ranked by the DRMM score (highest first).
drmm_pred = ndcg_test.sort_values(by='rel_drmm', ascending=False)[
    ['query', 'product_title', 'median_relevance', 'rel_pacrr_drmm', 'rel_pacrr', 'rel_drmm']
]
(
    drmm_pred.style
    .apply(highlight, axis=1)
    .set_properties(subset=['query', 'product_title'], **{'text-align': 'left'})
    .hide_index()
)