In [1]:
import time
import datetime
import numpy as np
import pandas as pd
import tensorflow as tf
import ast
import random
from ast import literal_eval

from utility.utility import generate_pairwise_dataset

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 50)

In [2]:
class Dense(tf.keras.layers.Layer):
    def __init__(self, units, input_dims=5, **kwargs):
        super(Dense, self).__init__(name='Linear', **kwargs)
        self._supports_ragged_inputs = True
        self.units = units
        self.input_dims = input_dims

    def build(self, input_shape):
        initializer = tf.keras.initializers.he_normal()
        self.w = self.add_weight(
            shape=(self.input_dims, self.units),
            initializer=initializer,
            trainable=True)
        
        self.b = self.add_weight(
            shape=(self.units,), 
            initializer=tf.zeros_initializer, 
            trainable=True)
        
    def call(self, inputs):
        return tf.ragged.map_flat_values(tf.matmul, inputs, self.w) + self.b

In [24]:
class Word_Matching_Network(tf.keras.Model):
    def __init__(self):
        super(Word_Matching_Network, self).__init__(name='Word_Matching_Network')
        self._supports_ragged_inputs = True        
        self.Layer1 = Dense(5, input_dims=30)
        self.BN1 = tf.keras.layers.BatchNormalization()
        self.Layer2 = Dense(5, input_dims=5)
        self.BN2 = tf.keras.layers.BatchNormalization()
        self.Layer3 = Dense(1, input_dims=5)
        self.BN3 = tf.keras.layers.BatchNormalization()

    def call(self, inputs):
        x = self.Layer1(inputs)
        x = tf.ragged.map_flat_values(tf.nn.relu, x)
        x = tf.ragged.map_flat_values(self.BN1, x)
        
        x = self.Layer2(x)
        x = tf.ragged.map_flat_values(tf.nn.relu, x)
        x = tf.ragged.map_flat_values(self.BN2, x)
        
        x = self.Layer3(x)
        x = tf.ragged.map_flat_values(tf.nn.relu, x)
        return x

In [25]:
class Gating_Network(tf.keras.layers.Layer):
    def __init__(self):
        super(Gating_Network, self).__init__()
        self._supports_ragged_inputs = True

    def build(self, input_shape):
        initializer = tf.keras.initializers.RandomUniform(minval=0, maxval=1)
        self.w = self.add_weight(
            shape=(1, 1),
            name='Gating_weight',
            initializer=initializer,
            trainable=True)

#     def ragged_softmax(self, logits):
#         print(logits)
#         numerator = tf.exp(logits)
#         print(numerator)
#         denominator = tf.reduce_sum(numerator, axis=1)
#         print(denominator)
#         softmax = tf.math.divide_no_nan(numerator, tf.reshape(denominator, shape=(logits.shape[0], -1)))
#         return softmax
        
    def call(self, idf):
        g = tf.math.multiply(idf, self.w)
        softmax = tf.ragged.map_flat_values(tf.nn.softmax, g)
#         softmax = self.ragged_softmax(g)
        return softmax

In [26]:
class Score_Aggregation(tf.keras.layers.Layer):
    def __init__(self):
        super(Score_Aggregation, self).__init__(name='Score_Aggregation')
        self._supports_ragged_inputs = True

    def call(self, Z, g):
        score = tf.ragged.map_flat_values(tf.reshape, Z, shape=(-1, ))
        gating = g
        s_g_sum = tf.math.multiply(gating, score)
        return tf.math.reduce_sum(s_g_sum, axis=1)

In [27]:
class DRMM(tf.keras.Model):
    def __init__(self):
        super(DRMM, self).__init__(name='DRMM')
        self._supports_ragged_inputs = True
        self.Word_Matching_Network = Word_Matching_Network()
        self.Gating_Network = Gating_Network()
        self.Score_Aggregation = Score_Aggregation()

    def call(self, inputs, idf):
        Z = self.Word_Matching_Network(inputs)
        G = self.Gating_Network(idf)
        score = self.Score_Aggregation(Z, G)
        return score

In [28]:
class Pairwise_DRMM(tf.keras.Model):
    def __init__(self):
        super(Pairwise_DRMM, self).__init__(name='Pairwise_DRMM')
        self.drmm = DRMM()
        
    def call(self, inputs):
        positive_hist = inputs['positive_hist']
        negative_hist = inputs['negative_hist']
        query_idf = inputs['query_idf']
        
        positive = self.drmm(positive_hist, query_idf)
        negative = self.drmm(negative_hist, query_idf)
        
        return tf.concat([positive, negative], axis=0) 
    
    def predict(self, inputs):
        hist = inputs['hist']
        query_idf = inputs['query_idf']
        score = self.drmm(hist, query_idf)
        return score

In [8]:
def Pairwise_ranking_loss(y_true, y_pred):
    '''
    ignore y_true
    '''
    positive_score = tf.keras.layers.Lambda(lambda x: x[:len(x)//2], output_shape= (1,))(y_pred)
    negative_score = tf.keras.layers.Lambda(lambda x: x[len(x)//2:], output_shape= (1,))(y_pred)

    return tf.keras.backend.mean(tf.math.maximum(0., 1 - positive_score + negative_score))

In [9]:
def ndcg(rel_pred, p=None, form="linear"):
    if p==None:
        p = len(rel_pred)
    if p > len(rel_pred):
        rel_pred = np.append(rel_pred, [0]*(p - len(rel_pred)))
    
    rel_true = np.sort(rel_pred)[::-1]
    discount = 1 / (np.log2(np.arange(p) + 2))

    if form == "linear":
        idcg = np.sum(rel_true[:p] * discount)
        dcg = np.sum(rel_pred[:p] * discount)
    elif form == "exponential" or form == "exp":
        idcg = np.sum([2**x - 1 for x in rel_true[:p]] * discount)
        dcg = np.sum([2**x - 1 for x in rel_pred[:p]] * discount)
    else:
        raise ValueError("Only supported for two formula, 'linear' or 'exp'")
    
    return dcg / idcg

In [10]:
test = pd.read_csv('./data/paccr_drmm_bert_test_all.csv', converters={"query_idf"          : literal_eval,
                                                                      "idf_softmax"        : literal_eval,
                                                                      "sim_matrix"         : literal_eval,
                                                                      "query_token"        : literal_eval,
                                                                      "product_title_token": literal_eval,
                                                                      "token_ids"          : literal_eval,
                                                                      "drmm_hist"          : literal_eval,
                                                                      'token'              : literal_eval})

test['binary_relevance'] = test['median_relevance'].apply(lambda x: 0 if x <= 2 else 1)

In [11]:
df = generate_pairwise_dataset(test)
df.reset_index(inplace=True, drop=True)

HBox(children=(FloatProgress(value=0.0, max=261.0), HTML(value='')))




In [11]:
# df = pd.read_csv('./data/paccr_drmm_.csv', converters={"positive_hist": literal_eval, 
#                                                        "negative_hist": literal_eval,
#                                                        "query_idf": literal_eval})


# df = df[['query_len', 'query_preprocessed', 'positive_hist', 'negative_hist', 'query_idf']]

In [12]:
# test = pd.read_csv('./data/paccr_drmm_test.csv', converters={"hist": literal_eval,
#                                                              "query_idf": literal_eval})

In [12]:
dev_q = set(random.sample(list(df['query'].unique()), 40))
train_q = set(df['query'].unique()) - dev_q

In [13]:
# dev_q = set(random.sample(list(df['query_preprocessed'].unique()), 40))
# train_q = set(df['query_preprocessed'].unique()) - dev_q

In [14]:
train = pd.concat([df.groupby('query').get_group(name) for name in train_q]).sample(frac=1).reset_index(drop=True)
dev = pd.concat([df.groupby('query').get_group(name) for name in dev_q]).sample(frac=1).reset_index(drop=True)

In [15]:
# train = pd.concat([df.groupby('query_preprocessed').get_group(name) for name in train_q]).sample(frac=1).reset_index(drop=True)
# dev = pd.concat([df.groupby('query_preprocessed').get_group(name) for name in dev_q]).sample(frac=1).reset_index(drop=True)

In [16]:
metadata = {'query_idf'          : tf.ragged.constant(train['query_idf'], dtype=tf.float32, ragged_rank=1, name='query_idf'),
            'positive_hist'      : tf.ragged.constant(train['drmm_hist_P'], dtype=tf.float32, ragged_rank=1, name='positive_hist'),
            'negative_hist'      : tf.ragged.constant(train['drmm_hist_N'], dtype=tf.float32, ragged_rank=1, name='negative_hist')}

In [17]:
metadata_dev = {'query_idf'          : tf.ragged.constant(dev['query_idf'], dtype=tf.float32, ragged_rank=1, name='query_idf'),
                'positive_hist'      : tf.ragged.constant(dev['drmm_hist_P'], dtype=tf.float32, ragged_rank=1, name='positive_hist'),
                'negative_hist'      : tf.ragged.constant(dev['drmm_hist_N'], dtype=tf.float32, ragged_rank=1, name='negative_hist')}

In [15]:
# metadata = {'query_idf': tf.ragged.constant(train.query_idf, dtype=tf.float32, ragged_rank=1),
#             'positive_hist': tf.ragged.constant(train.positive_hist, dtype=tf.float32, ragged_rank=1),
#             'negative_hist': tf.ragged.constant(train.negative_hist, dtype=tf.float32, ragged_rank=1)}

In [16]:
# metadata_dev = {'query_idf': tf.ragged.constant(dev.query_idf, dtype=tf.float32, ragged_rank=1),
#                 'positive_hist': tf.ragged.constant(dev.positive_hist, dtype=tf.float32, ragged_rank=1),
#                 'negative_hist': tf.ragged.constant(dev.negative_hist, dtype=tf.float32, ragged_rank=1)}

In [17]:
ds = tf.data.Dataset.from_tensor_slices(metadata)
ds = ds.shuffle(buffer_size=len(train))

In [18]:
batchs = 128
ds = ds.batch(batchs).repeat()
example_batch = next(iter(ds))

In [40]:
# inputs = {'negative_hist':tf.keras.Input(shape=(None, 30), ragged=True), 
#           'positive_hist':tf.keras.Input(shape=(None, 30), ragged=True), 
#           'query_idf':tf.keras.Input(shape=(None,), ragged=True)}

# tf.keras.backend.clear_session()
# output = Pairwise_DRMM()(inputs)

# model = tf.keras.Model(inputs=inputs, outputs=output)

# total_epoch_count = 100
# batch_size = 256
# learning_rate= .1
# model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=learning_rate),
#               loss=Pairwise_ranking_loss)

# model.fit(x=metadata, y=tf.constant([0.]*len(train)), 
#           validation_data=(metadata_dev, tf.constant([0.]*len(dev))),
#           shuffle=True,
#           epochs=total_epoch_count,
#           batch_size=batch_size,
#           callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)])

In [30]:
tf.keras.backend.clear_session()
model = Pairwise_DRMM()
learning_rate = .1
print_step = 1
n=20
optimizer = tf.keras.optimizers.Adagrad(learning_rate=learning_rate)
print(Pairwise_ranking_loss(y_true=None, y_pred=model(example_batch)))

tf.Tensor(0.99999857, shape=(), dtype=float32)


In [29]:
loss_sum = 0
ndcg_sum = 0
step_history = []
loss_history = []
loss_history_dev = []
ndcg_history = []

start = time.time()
for step, batch_train in enumerate(ds):
    
    with tf.GradientTape() as tape:
        logits = model(batch_train)
        loss_value = Pairwise_ranking_loss(y_true=None, y_pred=logits)
        loss_sum += loss_value 
        
    grads = tape.gradient(loss_value, model.trainable_weights)
    optimizer.apply_gradients(zip(grads, model.trainable_weights))

    if step % print_step == 0:
        current_loss_average = float(loss_sum)/print_step
        if step ==0:
            current_loss_average = loss_sum
        current_ndcg_average = float(ndcg_sum)/print_step
        
        logits_dev = model(metadata_dev)
        current_loss_average_dev = Pairwise_ranking_loss(y_true=None, y_pred=logits_dev)
        loss_sum = 0
        
        for q in dev_q:
            ndcg_test = test[test['query_preprocessed'] == q]
            metadata_ndcg = {'query_idf': tf.ragged.constant(ndcg_test['query_idf'], dtype=tf.float32, ragged_rank=1),
                             'hist': tf.ragged.constant(ndcg_test['hist'], dtype=tf.float32, ragged_rank=1)}
                
            ndcg_test['rel'] = model.predict(metadata_ndcg).numpy()
            rel_pred = list(ndcg_test.sort_values(by=['rel'], axis=0, ascending=False)['median_relevance']-1)
            ndcg_sum += ndcg(rel_pred, p=n, form="exp")
            
        current_ndcg_average = ndcg_sum/len(dev_q)
        step_history.append(step)
        loss_history.append(current_loss_average)
        loss_history_dev.append(current_loss_average_dev)
        ndcg_history.append(current_ndcg_average)
        
        print("Training loss at step %d: %.5f, dev_loss : %.5f, nDCG@20 : %.5f"% (step, 
                                                                  current_loss_average, 
                                                                  current_loss_average_dev,
                                                                  current_ndcg_average))
        
        print("Seen so far: %s train samples, learning rate: %.4f" % ((step + 1) * batchs, learning_rate))
        ndcg_sum = 0
        start = time.time()
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Training loss at step 0: 0.99992, dev_loss : 1.00000, nDCG@20 : 0.74149
Seen so far: 128 train samples, learning rate: 0.1000
Training loss at step 1: 0.99977, dev_loss : 1.00000, nDCG@20 : 0.74148
Seen so far: 256 train samples, learning rate: 0.1000
Training loss at step 2: 0.99989, dev_loss : 1.00000, nDCG@20 : 0.74148
Seen so far: 384 train samples, learning rate: 0.1000
Training loss at step 3: 0.99986, dev_loss : 1.00000, nDCG@20 : 0.74080
Seen so far: 512 train samples, learning rate: 0.1000
Training loss at step 4: 0.99971, dev_loss : 1.00000, nDCG@20 : 0.74080
Seen so far: 640 train samples, learning rate: 0.1000
Training loss at step 5: 1.00001, dev_loss : 1.00000, nDCG@20 : 0.74081
Seen so far: 768 train samples, learning rate: 0.1000
Training loss at step 6: 0.99997, dev_loss : 1.00000, nDCG@20 : 0.74080
Seen so far: 896 train samples, learning rate: 0.1000
Training loss at step 7: 0.99983, dev_loss : 1.00000, nDCG@20 : 0.74081
Seen so far: 1024 train samples, learning rate

KeyboardInterrupt: 