In [1]:
import pprint

%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_ranking as tfr
import tensorflow_recommenders as tfrs

  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)
  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)


In [2]:
class DCN(tfrs.Model):
    """Deep & Cross Network that regresses the ``weighted`` engagement label.

    Every input feature is mapped through a lookup layer and a learned
    embedding; the embeddings are concatenated, optionally passed through a
    single cross layer, then through a stack of ReLU dense layers and a final
    one-unit dense layer that produces the score.

    Note: the lookup vocabularies are read from the module-level
    ``vocabularies`` dict, which must be populated before instantiation.

    Args:
        use_cross_layer: If true, insert a ``tfrs.layers.dcn.Cross`` layer
            between the concatenated embeddings and the deep layers.
        deep_layer_sizes: Iterable of unit counts, one per dense ReLU layer.
        projection_dim: Forwarded to the cross layer's ``projection_dim``
            argument (only used when ``use_cross_layer`` is true).
    """

    def __init__(self, use_cross_layer, deep_layer_sizes, projection_dim=None):
        super().__init__()

        self.embedding_dimension = 32
        int_features = ["feature1", "feature2", "feature3", "feature4", "feature5", "feature6", "feature7", "feature8"]
        str_features = ["zpid_hash"]
        self._all_features = int_features + str_features
        self._embeddings = {}

        # Compute embeddings for string features.
        for feature_name in str_features:
            vocabulary = vocabularies[feature_name]
            self._embeddings[feature_name] = tf.keras.Sequential([
                tf.keras.layers.StringLookup(
                    vocabulary=vocabulary, mask_token=None),
                # +1 leaves room for the index beyond the known vocabulary.
                tf.keras.layers.Embedding(len(vocabulary) + 1,
                                          self.embedding_dimension),
            ])

        # Compute embeddings for int features.
        for feature_name in int_features:
            vocabulary = vocabularies[feature_name]
            self._embeddings[feature_name] = tf.keras.Sequential([
                tf.keras.layers.IntegerLookup(
                    vocabulary=vocabulary, mask_value=None),
                tf.keras.layers.Embedding(len(vocabulary) + 1,
                                          self.embedding_dimension),
            ])

        if use_cross_layer:
            self._cross_layer = tfrs.layers.dcn.Cross(
                projection_dim=projection_dim,
                kernel_initializer="glorot_uniform")
        else:
            self._cross_layer = None

        self._deep_layers = [
            tf.keras.layers.Dense(layer_size, activation="relu")
            for layer_size in deep_layer_sizes
        ]

        self._logit_layer = tf.keras.layers.Dense(1)

        # Regression objective: MSE loss, reported as RMSE.
        self.task = tfrs.tasks.Ranking(
            loss=tf.keras.losses.MeanSquaredError(),
            metrics=[tf.keras.metrics.RootMeanSquaredError("RMSE")]
        )

    def call(self, features):
        """Score a batch.

        Args:
            features: Mapping from feature name to a batch of raw values; it
                must contain every name in ``self._all_features``.

        Returns:
            The output of the final one-unit dense layer.
        """
        # Concatenate embeddings
        embeddings = []
        for feature_name in self._all_features:
            embedding_fn = self._embeddings[feature_name]
            embeddings.append(embedding_fn(features[feature_name]))

        x = tf.concat(embeddings, axis=1)

        # Build Cross Network
        if self._cross_layer is not None:
            x = self._cross_layer(x)

        # Build Deep Network
        for deep_layer in self._deep_layers:
            x = deep_layer(x)

        return self._logit_layer(x)

    def compute_loss(self, features, training=False):
        """Compute the ranking-task loss for one batch.

        The label is read from ``features["weighted"]`` and withheld from the
        model inputs. The caller's mapping is left untouched (the previous
        implementation popped the label out of it, so reusing the same batch
        dict a second time raised ``KeyError``).

        Args:
            features: Mapping holding the input features plus ``"weighted"``.
            training: Accepted for API compatibility; not used here.

        Returns:
            The value returned by ``self.task`` for these labels and scores.

        Raises:
            KeyError: If ``"weighted"`` is missing from ``features``.
        """
        labels = features["weighted"]
        model_inputs = {
            name: value for name, value in features.items()
            if name != "weighted"
        }
        scores = self(model_inputs)
        return self.task(
            labels=labels,
            predictions=scores
        )

In [3]:
import pandas as pd

In [4]:
# Load the training interactions; the path is relative to the notebook's working directory.
df_train = pd.read_csv("capstone/merged_train.csv")

In [37]:
# Load the held-out interactions; the path is relative to the notebook's working directory.
df_test = pd.read_csv("capstone/merged_test.csv")

In [38]:
# Rename columns positionally. This assignment fails unless the frame has exactly
# 17 columns; it presumably matches the CSV's column order — TODO confirm.
df_test.columns = ["tracing_id", "zpid_hash", "feature1", "feature2", "feature3", "feature4", "feature5", "feature6", "feature7", "feature8","user_id_hash", "timestamp_session", "user_session_id", "order", "submit", "fav", "click"]

In [39]:
# Map the True/False interaction flags to 1/0 so they can be combined arithmetically.
df_test["click"] = df_test["click"].replace({True: 1, False: 0})
df_test["submit"] = df_test["submit"].replace({True: 1, False: 0})
df_test["fav"] = df_test["fav"].replace({True: 1, False: 0})

In [40]:
df_test

Unnamed: 0,tracing_id,zpid_hash,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,user_id_hash,timestamp_session,user_session_id,order,submit,fav,click
0,62ca195944401fed240b385fa47bd155,f48d6ed4c5403495655995a3fd12f41841ec319,4166,833,83,3333,83,166,1833,3,591451d586ff228f1766748cb9ed2ba92847c237,2022-07-10 00:12:09.977,1d6fdd41f1f74ed9b13a7c0de133ac11_2022_07_10_00_1,3,0,0,0
1,62ca195944401fed240b385fa47bd155,1c45210433d89c96c20fbc333f45b72ad2de69f,4166,833,83,3333,675,166,1833,3,591451d586ff228f1766748cb9ed2ba92847c237,2022-07-10 00:12:09.977,1d6fdd41f1f74ed9b13a7c0de133ac11_2022_07_10_00_1,17,0,0,0
2,62ca195944401fed240b385fa47bd155,656785af4f2359802b72f5df73affbc55176ea0b,4166,500,83,3333,83,375,1833,3,591451d586ff228f1766748cb9ed2ba92847c237,2022-07-10 00:12:09.977,1d6fdd41f1f74ed9b13a7c0de133ac11_2022_07_10_00_1,1,0,0,0
3,62ca195944401fed240b385fa47bd155,f723dabd44f495359b60e657f3df4600f1640b2,4166,833,83,3333,83,166,1833,3,591451d586ff228f1766748cb9ed2ba92847c237,2022-07-10 00:12:09.977,1d6fdd41f1f74ed9b13a7c0de133ac11_2022_07_10_00_1,13,0,0,0
4,62ca195944401fed240b385fa47bd155,fcd9950e83bd2444d4b95b2b71c138e5181ff1d4,4166,833,83,3333,83,166,1833,3,591451d586ff228f1766748cb9ed2ba92847c237,2022-07-10 00:12:09.977,1d6fdd41f1f74ed9b13a7c0de133ac11_2022_07_10_00_1,26,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12576027,62cb671bac41c164218b997cdd95a92a,c5ed3b7e0f9a0c7fb7c7a954fa7c4fc7957843d6,724,6896,6586,7241,4948,4982,6620,6586,4eef169b8a206fb50a0392cd528f8e7ce5ab834f,2022-07-10 23:56:11.441,33ffbeaefa3d4d168e1221a0b74b1d04_2022_07_10_23_1,0,0,0,0
12576028,62cb671bac41c164218b997cdd95a92a,7fd3d4cdda78ac39a73d55363aa8511aee95ce5e,724,344,6586,7241,4948,8,6620,6586,4eef169b8a206fb50a0392cd528f8e7ce5ab834f,2022-07-10 23:56:11.441,33ffbeaefa3d4d168e1221a0b74b1d04_2022_07_10_23_1,1,0,0,0
12576029,62cb671bac41c164218b997cdd95a92a,1703c8dfaaba798e61d9a221bf257ba1096f50d2,724,344,6586,689,4948,8,6620,6586,4eef169b8a206fb50a0392cd528f8e7ce5ab834f,2022-07-10 23:56:11.441,33ffbeaefa3d4d168e1221a0b74b1d04_2022_07_10_23_1,4,0,0,0
12576030,62cb671bac41c164218b997cdd95a92a,3121985799f9b8c40be808c24a820b4f60722c48,724,344,6586,7241,34,8,6620,6586,4eef169b8a206fb50a0392cd528f8e7ce5ab834f,2022-07-10 23:56:11.441,33ffbeaefa3d4d168e1221a0b74b1d04_2022_07_10_23_1,3,0,0,1


In [41]:
# Regression target: submit counts 1.0, favourite 0.5, click 0.2.
df_test["weighted"] = (
    df_test["submit"]
    + 0.5 * df_test["fav"]
    + 0.2 * df_test["click"]
)

In [10]:
# Keep only the model features and the label. The drop is in place, so re-running
# this cell raises KeyError because the columns are already gone.
# NOTE(review): "tracing_id" is dropped here, yet later evaluation cells group
# df_test-derived frames by "tracing_id" — those only work from a kernel state
# where this cell was not run on the same frame.
df_test.drop(columns=[ "user_session_id","user_id_hash", "timestamp_session", "order", "fav", "submit", "click", "tracing_id"], inplace=True)

In [11]:
df_test

Unnamed: 0,zpid_hash,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,weighted
0,f48d6ed4c5403495655995a3fd12f41841ec319,4166,833,83,3333,83,166,1833,3,0.0
1,1c45210433d89c96c20fbc333f45b72ad2de69f,4166,833,83,3333,675,166,1833,3,0.0
2,656785af4f2359802b72f5df73affbc55176ea0b,4166,500,83,3333,83,375,1833,3,0.0
3,f723dabd44f495359b60e657f3df4600f1640b2,4166,833,83,3333,83,166,1833,3,0.0
4,fcd9950e83bd2444d4b95b2b71c138e5181ff1d4,4166,833,83,3333,83,166,1833,3,0.0
...,...,...,...,...,...,...,...,...,...,...
12576027,c5ed3b7e0f9a0c7fb7c7a954fa7c4fc7957843d6,724,6896,6586,7241,4948,4982,6620,6586,0.0
12576028,7fd3d4cdda78ac39a73d55363aa8511aee95ce5e,724,344,6586,7241,4948,8,6620,6586,0.0
12576029,1703c8dfaaba798e61d9a221bf257ba1096f50d2,724,344,6586,689,4948,8,6620,6586,0.0
12576030,3121985799f9b8c40be808c24a820b4f60722c48,724,344,6586,7241,34,8,6620,6586,0.2


In [12]:
# Training-set preparation: rename columns, encode the flags as 1/0, build the
# weighted label, then drop everything that is not a model feature or the label.
df_train.columns = [
    "tracing_id", "zpid_hash",
    "feature1", "feature2", "feature3", "feature4",
    "feature5", "feature6", "feature7", "feature8",
    "user_id_hash", "timestamp_session", "user_session_id", "order",
    "submit", "fav", "click",
]
df_train["click"] = df_train["click"].replace({True: 1, False: 0})
df_train["fav"] = df_train["fav"].replace({True: 1, False: 0})
df_train["submit"] = df_train["submit"].replace({True: 1, False: 0})
df_train["weighted"] = (
    df_train["submit"]
    + 0.5 * df_train["fav"]
    + 0.2 * df_train["click"]
)
df_train.drop(
    columns=[
        "user_session_id", "user_id_hash", "timestamp_session", "order",
        "fav", "submit", "click", "tracing_id",
    ],
    inplace=True,
)


In [13]:
# Columns that receive a lookup + embedding in the model.
feature_names = [
    "zpid_hash",
    "feature1", "feature2", "feature3", "feature4",
    "feature5", "feature6", "feature7", "feature8",
]

# Sorted distinct training values per feature; read as a global by DCN.__init__.
vocabularies = {
    name: np.unique(df_train[name].tolist())
    for name in feature_names
}

In [15]:
# dict(df_train) maps each column name to its Series, so every dataset element
# is a {column name: scalar} dict for one row.
dataset = tf.data.Dataset.from_tensor_slices(dict(df_train))

In [16]:
# Same row-wise dict layout as the training dataset, built from the test frame.
test = tf.data.Dataset.from_tensor_slices(dict(df_test))

In [17]:
# Project every element onto the label plus the nine model inputs.
dataset = dataset.map(lambda x: {
    key: x[key]
    for key in (
        "weighted",
        "feature1", "feature2", "feature3", "feature4",
        "feature5", "feature6", "feature7", "feature8",
        "zpid_hash",
    )
})

In [18]:
# Project every test element onto the label plus the nine model inputs.
test = test.map(lambda x: {
    key: x[key]
    for key in (
        "weighted",
        "feature1", "feature2", "feature3", "feature4",
        "feature5", "feature6", "feature7", "feature8",
        "zpid_hash",
    )
})

In [19]:
# Batch before caching so the cached elements are already-batched tensors.
# No shuffle is applied, so batches follow the DataFrame's row order.
cached_train = dataset.batch(8192).cache()

In [20]:
# NOTE(review): 4069 looks like a transposition of 4096 — confirm. Harmless for
# correctness, since batch size does not change evaluation results.
cached_test = test.batch(4069).cache()

In [21]:
def run_models(use_cross_layer, deep_layer_sizes, projection_dim=None, num_runs=5):
    """Train and evaluate ``num_runs`` fresh DCN models and summarise test RMSE.

    Reads the module-level ``learning_rate``, ``epochs``, ``cached_train`` and
    ``cached_test``; they must be defined before this is called.

    Args:
        use_cross_layer: Forwarded to ``DCN``.
        deep_layer_sizes: Forwarded to ``DCN``.
        projection_dim: Forwarded to ``DCN``.
        num_runs: Number of independently initialised models to train (>= 1).

    Returns:
        Dict with keys ``"model"`` (list of trained models), ``"mean"`` and
        ``"stdv"`` (mean / standard deviation of the per-run test RMSE) and
        ``"predict"`` (test-set predictions of the LAST trained model only).

    Raises:
        ValueError: If ``num_runs`` is less than 1. Previously this surfaced as
            an UnboundLocalError on ``predictions``.
    """
    if num_runs < 1:
        raise ValueError("num_runs must be at least 1.")

    models = []
    rmses = []

    for _ in range(num_runs):
        model = DCN(use_cross_layer=use_cross_layer,
                    deep_layer_sizes=deep_layer_sizes,
                    projection_dim=projection_dim)
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate))
        models.append(model)

        model.fit(cached_train, epochs=epochs, verbose=False)
        metrics = model.evaluate(cached_test, return_dict=True)
        print(metrics)
        rmses.append(metrics["RMSE"])

    # Only the last model's predictions were ever returned, so predict once
    # here instead of running a full test-set inference pass on every run.
    predictions = models[-1].predict(cached_test)

    mean, stdv = np.average(rmses), np.std(rmses)

    return {"model": models, "mean": mean, "stdv": stdv, "predict": predictions}

In [22]:
# Training hyperparameters; run_models reads both as module-level globals.
epochs = 8
learning_rate = 0.001

In [None]:
# Cross layer with the default projection_dim=None, followed by two 192-unit deep layers.
# NOTE(review): same arguments as the later `dcn_result` run — one of them is probably redundant.
dcn_lr_result = run_models(use_cross_layer=True,
                           deep_layer_sizes=[192, 192])

In [None]:
dcn_lr_result

In [None]:
predictions = dcn_lr_result['predict']

In [26]:
predictions

array([[0.04821152],
       [0.01270333],
       [0.02027745],
       ...,
       [0.00518483],
       [0.21039227],
       [0.2069574 ]], dtype=float32)

In [46]:
# Baseline without a cross layer: three 192-unit deep layers only.
dnn_result = run_models(use_cross_layer=False,
                        deep_layer_sizes=[192, 192, 192])

{'RMSE': 0.05385212600231171, 'loss': 0.0014648736687377095, 'regularization_loss': 0, 'total_loss': 0.0014648736687377095}
{'RMSE': 0.054738450795412064, 'loss': 0.0013800723245367408, 'regularization_loss': 0, 'total_loss': 0.0013800723245367408}
{'RMSE': 0.054417356848716736, 'loss': 0.0014462133403867483, 'regularization_loss': 0, 'total_loss': 0.0014462133403867483}
{'RMSE': 0.05415380746126175, 'loss': 0.0016184627311304212, 'regularization_loss': 0, 'total_loss': 0.0016184627311304212}
{'RMSE': 0.054048679769039154, 'loss': 0.0013210074976086617, 'regularization_loss': 0, 'total_loss': 0.0013210074976086617}


In [47]:
predictions_dnn = dnn_result['predict']

In [48]:
predictions_dnn

array([[0.03641737],
       [0.01125306],
       [0.01983773],
       ...,
       [0.00343527],
       [0.00426978],
       [0.1861428 ]], dtype=float32)

In [23]:
# DCN: one cross layer followed by two 192-unit deep layers.
dcn_result = run_models(use_cross_layer=True,
                        deep_layer_sizes=[192, 192])

{'RMSE': 0.05233219638466835, 'loss': 0.001599677256308496, 'regularization_loss': 0, 'total_loss': 0.001599677256308496}
{'RMSE': 0.05392654240131378, 'loss': 0.00149285770021379, 'regularization_loss': 0, 'total_loss': 0.00149285770021379}
{'RMSE': 0.052635014057159424, 'loss': 0.0015930675435811281, 'regularization_loss': 0, 'total_loss': 0.0015930675435811281}
{'RMSE': 0.05293837934732437, 'loss': 0.0014416857156902552, 'regularization_loss': 0, 'total_loss': 0.0014416857156902552}
{'RMSE': 0.052001651376485825, 'loss': 0.0015332164475694299, 'regularization_loss': 0, 'total_loss': 0.0015332164475694299}


In [24]:
predictions_dcn = dcn_result['predict']

In [25]:
predictions_dcn

array([[0.02129615],
       [0.0259221 ],
       [0.01616249],
       ...,
       [0.00198418],
       [0.01206164],
       [0.192159  ]], dtype=float32)

In [27]:
max(predictions_dcn)

array([1.7415208], dtype=float32)

In [4]:
# Reload predictions saved by the `df_pred.to_csv("pred_dcn.csv")` cell, which appears
# further down in this notebook — on a fresh Restart & Run All this file must already exist.
df_1 = pd.read_csv("pred_dcn.csv")

In [5]:
df_1

Unnamed: 0.1,Unnamed: 0,tracing_id,zpid_hash,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,weighted,pred,rank
0,7193405,62cacd2576d26a82ebae7a7c3266a3f3,bbe162c9a84395c04e624bbf1ed3a42ea2eb6292,3846,769,76,3846,258,153,153,76,1.7,0.027654,36.0
1,7422659,62ca4372abb4be53a0fa928c1e480fb4,2f6e6e425a11524623696f48de067c5991428b3f,9411,637,5,8611,9,447,11,5,1.7,0.150375,27.0
2,1540576,62cacaa18460644957aecac2dc6d7911,b2616eb4e46146da804cc2ff50fc0fd1c66b55a,4545,909,90,1818,90,181,181,90,1.7,0.016162,26.0
3,4827285,62ca26512a45c23f07ca621cbeb6e8b6,25556fcd6fb95efd3154d0c19ff7465238a726b,2895,6289,7059,7263,2493,1339,87,7041,1.7,0.290849,25.0
4,10417621,62ca9af507ef0e57fd4035fa395cf7e8,31ac6c878ae86641c121f47998be8acde8b8fd56,6666,58,2865,3539,66,133,2339,1458,1.7,0.090829,39.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12576027,4343957,62ca768d7787b2848d262188a8f55bb9,cec422251b7469d2e0945ce5852249014757b827,4545,1818,90,2727,90,181,181,90,0.0,0.010115,19.0
12576028,4343958,62ca768d7787b2848d262188a8f55bb9,902a9086130c58e79cd57409b9dad1f3d3f2ee8,4545,909,90,2727,90,181,181,90,0.0,0.004656,5.0
12576029,4343959,62ca768d7787b2848d262188a8f55bb9,3cafeb918c00c149b301a83c9433cb4e62ed9010,4545,1818,90,2727,90,181,181,90,0.0,0.009386,18.0
12576030,4343961,62ca768d7787b2848d262188a8f55bb9,67e05e81948e42955ae5ea40338394f62a338cf2,4545,909,90,2727,90,181,181,90,0.0,0.006129,10.0


In [6]:
# Order rows by ascending true label; the per-query rank lists built later inherit this order.
# NOTE(review): the lists later fed to ndcg_at_k hold predicted ranks in ascending-label
# order, whereas ndcg_at_k's docstring expects relevance scores in rank order — confirm
# this is the intended metric.
df_pred = df_1.sort_values("weighted", ascending = True)

In [31]:
# Rank items within each query (tracing_id) by predicted score, 1 = highest;
# method="first" breaks ties by row order.
df_pred["rank"] = df_pred.groupby("tracing_id")["pred"].rank(method = "first", ascending = False)

In [51]:
# Drop rows whose predicted score is not strictly positive.
# NOTE(review): "rank" is not recomputed here, so if the ranking cell ran first the
# surviving ranks may have gaps — confirm the intended order of these two cells.
df_pred = df_pred[df_pred["pred"]>0]

In [52]:
# One row per tracing_id holding the list of its items' ranks, in df_pred's current row order.
pred = df_pred.groupby("tracing_id")["rank"].apply(list).to_frame().reset_index()

In [53]:
pred

Unnamed: 0,tracing_id,rank
0,62ca193dc36929bb30f091105b8e5bd1,"[36.0, 7.0, 23.0, 39.0, 27.0, 21.0, 35.0, 18.0..."
1,62ca193dcfc19c8d05fb5d9ef28d608e,"[8.0, 24.0, 4.0, 6.0, 30.0, 26.0, 33.0, 25.0, ..."
2,62ca193dd0a6155c1595dd3ab61852ae,"[11.0, 21.0, 10.0, 34.0, 12.0, 26.0, 17.0, 19...."
3,62ca193dd4eff6309eb1cd23e3d0df62,"[32.0, 39.0, 21.0, 30.0, 15.0, 13.0, 24.0, 4.0..."
4,62ca193dd968bf1881b0fdf86252a0f1,"[9.0, 3.0, 4.0, 5.0, 31.0, 40.0, 37.0, 26.0, 2..."
...,...,...
393483,62cb67ebcdcd215bd2b3713c1c99c152,"[8.0, 4.0, 22.0, 15.0, 17.0, 5.0, 12.0, 10.0, ..."
393484,62cb67eeb312a541990e794692ce2a4a,"[8.0, 17.0, 10.0, 21.0, 9.0, 15.0, 32.0, 20.0,..."
393485,62cb67ef76d830cf4b0c83d553cd0043,"[4.0, 17.0, 16.0, 10.0, 1.0, 12.0, 37.0, 21.0,..."
393486,62cb67f1265d7ed45bd56d5bf981af4a,"[23.0, 6.0, 20.0, 12.0, 19.0, 5.0, 10.0, 14.0,..."


In [60]:
# Mean NDCG@20 over all per-query rank lists. Requires ndcg_at_k to be defined
# first (its definition cell sits further down in this notebook).
ndcg_20 = [ndcg_at_k(ranks, 20) for ranks in pred["rank"]]
sum(ndcg_20)/len(ndcg_20)

NameError: name 'ndcg_at_k' is not defined

In [132]:
# Mean NDCG@10 over all per-query rank lists.
ndcg_10 = [ndcg_at_k(ranks, 10) for ranks in pred["rank"]]
sum(ndcg_10)/len(ndcg_10)

0.6341175758043659

In [133]:
# Mean NDCG@5 over all per-query rank lists.
ndcg_5 = [ndcg_at_k(ranks, 5) for ranks in pred["rank"]]
sum(ndcg_5)/len(ndcg_5)

0.5976326299220915

In [134]:
# Mean NDCG@2 over all per-query rank lists.
ndcg_2 = [ndcg_at_k(ranks, 2) for ranks in pred["rank"]]
sum(ndcg_2)/len(ndcg_2)

0.5693574899361376

In [135]:
# Mean NDCG@1 over all per-query rank lists.
ndcg_1 = [ndcg_at_k(ranks, 1) for ranks in pred["rank"]]
sum(ndcg_1)/len(ndcg_1)

0.5518604978043662

In [113]:
# Attach the DNN scores to the test rows. This relies on the predictions being in
# df_test's row order; the visible input pipeline applies no shuffle.
df_test["pred"] = predictions_dnn

In [114]:
# Order rows by ascending true label; the per-query rank lists built later inherit this order.
df_pred_1 = df_test.sort_values("weighted", ascending = True)

In [115]:
# Rank items within each query by predicted score, 1 = highest; ties broken by row order.
# Needs a "tracing_id" column, i.e. a df_test on which the earlier drop of "tracing_id" was not applied.
df_pred_1["rank"] = df_pred_1.groupby("tracing_id")["pred"].rank(method = "first", ascending = False)

In [116]:
# One row per tracing_id holding the list of its items' ranks, in df_pred_1's row order.
pred_1 = df_pred_1.groupby("tracing_id")["rank"].apply(list).to_frame().reset_index()

In [118]:
# Mean NDCG@20 for the DNN baseline.
ndcg_20 = [ndcg_at_k(ranks, 20) for ranks in pred_1["rank"]]
sum(ndcg_20)/len(ndcg_20)

0.6954248973636556

In [119]:
# Mean NDCG@10 for the DNN baseline.
ndcg_10 = [ndcg_at_k(ranks, 10) for ranks in pred_1["rank"]]
sum(ndcg_10)/len(ndcg_10)

0.630166495869624

In [120]:
# Mean NDCG@5 for the DNN baseline.
ndcg_5 = [ndcg_at_k(ranks, 5) for ranks in pred_1["rank"]]
sum(ndcg_5)/len(ndcg_5)

0.5936862969332165

In [121]:
# Mean NDCG@2 for the DNN baseline.
ndcg_2 = [ndcg_at_k(ranks, 2) for ranks in pred_1["rank"]]
sum(ndcg_2)/len(ndcg_2)

0.5652670174362652

In [122]:
# Mean NDCG@1 for the DNN baseline.
ndcg_1 = [ndcg_at_k(ranks, 1) for ranks in pred_1["rank"]]
sum(ndcg_1)/len(ndcg_1)

0.5479988747423393

In [42]:
# Attach the DCN scores to the test rows, replacing any "pred" column already present.
# This relies on the predictions being in df_test's row order (no shuffle in the pipeline).
df_test["pred"] = predictions_dcn

In [43]:
df_test

Unnamed: 0,tracing_id,zpid_hash,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,user_id_hash,timestamp_session,user_session_id,order,submit,fav,click,weighted,pred
0,62ca195944401fed240b385fa47bd155,f48d6ed4c5403495655995a3fd12f41841ec319,4166,833,83,3333,83,166,1833,3,591451d586ff228f1766748cb9ed2ba92847c237,2022-07-10 00:12:09.977,1d6fdd41f1f74ed9b13a7c0de133ac11_2022_07_10_00_1,3,0,0,0,0.0,0.021296
1,62ca195944401fed240b385fa47bd155,1c45210433d89c96c20fbc333f45b72ad2de69f,4166,833,83,3333,675,166,1833,3,591451d586ff228f1766748cb9ed2ba92847c237,2022-07-10 00:12:09.977,1d6fdd41f1f74ed9b13a7c0de133ac11_2022_07_10_00_1,17,0,0,0,0.0,0.025922
2,62ca195944401fed240b385fa47bd155,656785af4f2359802b72f5df73affbc55176ea0b,4166,500,83,3333,83,375,1833,3,591451d586ff228f1766748cb9ed2ba92847c237,2022-07-10 00:12:09.977,1d6fdd41f1f74ed9b13a7c0de133ac11_2022_07_10_00_1,1,0,0,0,0.0,0.016162
3,62ca195944401fed240b385fa47bd155,f723dabd44f495359b60e657f3df4600f1640b2,4166,833,83,3333,83,166,1833,3,591451d586ff228f1766748cb9ed2ba92847c237,2022-07-10 00:12:09.977,1d6fdd41f1f74ed9b13a7c0de133ac11_2022_07_10_00_1,13,0,0,0,0.0,0.007282
4,62ca195944401fed240b385fa47bd155,fcd9950e83bd2444d4b95b2b71c138e5181ff1d4,4166,833,83,3333,83,166,1833,3,591451d586ff228f1766748cb9ed2ba92847c237,2022-07-10 00:12:09.977,1d6fdd41f1f74ed9b13a7c0de133ac11_2022_07_10_00_1,26,0,0,0,0.0,0.013885
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12576027,62cb671bac41c164218b997cdd95a92a,c5ed3b7e0f9a0c7fb7c7a954fa7c4fc7957843d6,724,6896,6586,7241,4948,4982,6620,6586,4eef169b8a206fb50a0392cd528f8e7ce5ab834f,2022-07-10 23:56:11.441,33ffbeaefa3d4d168e1221a0b74b1d04_2022_07_10_23_1,0,0,0,0,0.0,0.005514
12576028,62cb671bac41c164218b997cdd95a92a,7fd3d4cdda78ac39a73d55363aa8511aee95ce5e,724,344,6586,7241,4948,8,6620,6586,4eef169b8a206fb50a0392cd528f8e7ce5ab834f,2022-07-10 23:56:11.441,33ffbeaefa3d4d168e1221a0b74b1d04_2022_07_10_23_1,1,0,0,0,0.0,0.001836
12576029,62cb671bac41c164218b997cdd95a92a,1703c8dfaaba798e61d9a221bf257ba1096f50d2,724,344,6586,689,4948,8,6620,6586,4eef169b8a206fb50a0392cd528f8e7ce5ab834f,2022-07-10 23:56:11.441,33ffbeaefa3d4d168e1221a0b74b1d04_2022_07_10_23_1,4,0,0,0,0.0,0.001984
12576030,62cb671bac41c164218b997cdd95a92a,3121985799f9b8c40be808c24a820b4f60722c48,724,344,6586,7241,34,8,6620,6586,4eef169b8a206fb50a0392cd528f8e7ce5ab834f,2022-07-10 23:56:11.441,33ffbeaefa3d4d168e1221a0b74b1d04_2022_07_10_23_1,3,0,0,1,0.2,0.012062


In [44]:
# Drop the session/interaction columns but keep "tracing_id", which the per-query
# ranking below groups on. In place: re-running raises KeyError.
df_test.drop(columns=[ "user_session_id","user_id_hash", "timestamp_session", "order", "fav", "submit", "click"], inplace=True)

In [45]:
df_test

Unnamed: 0,tracing_id,zpid_hash,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,weighted,pred
0,62ca195944401fed240b385fa47bd155,f48d6ed4c5403495655995a3fd12f41841ec319,4166,833,83,3333,83,166,1833,3,0.0,0.021296
1,62ca195944401fed240b385fa47bd155,1c45210433d89c96c20fbc333f45b72ad2de69f,4166,833,83,3333,675,166,1833,3,0.0,0.025922
2,62ca195944401fed240b385fa47bd155,656785af4f2359802b72f5df73affbc55176ea0b,4166,500,83,3333,83,375,1833,3,0.0,0.016162
3,62ca195944401fed240b385fa47bd155,f723dabd44f495359b60e657f3df4600f1640b2,4166,833,83,3333,83,166,1833,3,0.0,0.007282
4,62ca195944401fed240b385fa47bd155,fcd9950e83bd2444d4b95b2b71c138e5181ff1d4,4166,833,83,3333,83,166,1833,3,0.0,0.013885
...,...,...,...,...,...,...,...,...,...,...,...,...
12576027,62cb671bac41c164218b997cdd95a92a,c5ed3b7e0f9a0c7fb7c7a954fa7c4fc7957843d6,724,6896,6586,7241,4948,4982,6620,6586,0.0,0.005514
12576028,62cb671bac41c164218b997cdd95a92a,7fd3d4cdda78ac39a73d55363aa8511aee95ce5e,724,344,6586,7241,4948,8,6620,6586,0.0,0.001836
12576029,62cb671bac41c164218b997cdd95a92a,1703c8dfaaba798e61d9a221bf257ba1096f50d2,724,344,6586,689,4948,8,6620,6586,0.0,0.001984
12576030,62cb671bac41c164218b997cdd95a92a,3121985799f9b8c40be808c24a820b4f60722c48,724,344,6586,7241,34,8,6620,6586,0.2,0.012062


In [102]:
# Order rows by ascending true label; the per-query rank lists built later inherit this order.
# NOTE(review): the lists later fed to ndcg_at_k hold predicted ranks in ascending-label
# order, whereas ndcg_at_k's docstring expects relevance scores in rank order — confirm
# this is the intended metric.
df_pred = df_test.sort_values("weighted", ascending = True)

In [103]:
# Rank items within each query (tracing_id) by predicted score, 1 = highest;
# method="first" breaks ties by row order.
df_pred["rank"] = df_pred.groupby("tracing_id")["pred"].rank(method = "first", ascending = False)

In [48]:
# Persist the ranked predictions. The index is written too (to_csv default), which is
# why an "Unnamed: 0" column appears when this file is read back.
df_pred.to_csv("pred_dcn.csv")

In [104]:
# One row per tracing_id holding the list of its items' ranks, in df_pred's current row order.
pred = df_pred.groupby("tracing_id")["rank"].apply(list).to_frame().reset_index()

In [105]:
pred

Unnamed: 0,tracing_id,rank
0,62ca193dc36929bb30f091105b8e5bd1,"[32.0, 27.0, 15.0, 30.0, 29.0, 17.0, 10.0, 6.0..."
1,62ca193dcfc19c8d05fb5d9ef28d608e,"[5.0, 27.0, 13.0, 19.0, 35.0, 14.0, 38.0, 2.0,..."
2,62ca193dd0a6155c1595dd3ab61852ae,"[3.0, 28.0, 35.0, 1.0, 8.0, 37.0, 36.0, 12.0, ..."
3,62ca193dd4eff6309eb1cd23e3d0df62,"[27.0, 40.0, 32.0, 3.0, 12.0, 21.0, 33.0, 29.0..."
4,62ca193dd968bf1881b0fdf86252a0f1,"[1.0, 39.0, 33.0, 17.0, 34.0, 38.0, 18.0, 27.0..."
...,...,...
393507,62cb67ebcdcd215bd2b3713c1c99c152,"[18.0, 4.0, 17.0, 14.0, 5.0, 7.0, 9.0, 8.0, 19..."
393508,62cb67eeb312a541990e794692ce2a4a,"[30.0, 10.0, 29.0, 8.0, 1.0, 13.0, 14.0, 17.0,..."
393509,62cb67ef76d830cf4b0c83d553cd0043,"[31.0, 17.0, 25.0, 27.0, 22.0, 15.0, 14.0, 23...."
393510,62cb67f1265d7ed45bd56d5bf981af4a,"[7.0, 19.0, 11.0, 12.0, 13.0, 6.0, 22.0, 24.0,..."


In [106]:
def mean_reciprocal_rank(rs):
    """Average, over queries, of 1 / (position of the first relevant item).

    Positions are 1-based and relevance is binary: any nonzero entry counts
    as relevant. A query with no relevant item contributes 0.
    Example from http://en.wikipedia.org/wiki/Mean_reciprocal_rank
    >>> rs = [[0, 0, 1], [0, 1, 0], [1, 0, 0]]
    >>> mean_reciprocal_rank(rs)
    0.61111111111111105
    >>> rs = np.array([[0, 0, 0], [0, 1, 0], [1, 0, 0]])
    >>> mean_reciprocal_rank(rs)
    0.5
    >>> rs = [[0, 0, 0, 1], [1, 0, 0], [1, 0, 0]]
    >>> mean_reciprocal_rank(rs)
    0.75
    Args:
        rs: Iterable of per-query relevance sequences (list or numpy), each
            in rank order (first element is the top-ranked item)
    Returns:
        Mean reciprocal rank
    """
    reciprocal_ranks = []
    for relevances in rs:
        # Indices of the relevant (nonzero) entries, in ascending order.
        hit_positions = np.asarray(relevances).nonzero()[0]
        if hit_positions.size:
            reciprocal_ranks.append(1. / (hit_positions[0] + 1))
        else:
            reciprocal_ranks.append(0.)
    return np.mean(reciprocal_ranks)


In [55]:
# Ground-truth ordering: rows sorted by true label, highest first.
df_ground = df_test.sort_values("weighted", ascending = False)

In [56]:
# One row per tracing_id holding its item ids (zpid_hash) in descending-label order.
ground = df_ground.groupby("tracing_id")["zpid_hash"].apply(list).to_frame().reset_index()

In [75]:
# Join ground truth and predictions on the query id; with default suffixes, overlapping
# columns from `ground` (left) get "_x" and those from `pred` (right) get "_y".
# NOTE(review): later cells rebind `ground` and `pred` to plain lists, after which
# re-running this cell fails ('list' object has no attribute 'merge').
merged = ground.merge(pred, how="inner", on="tracing_id")

AttributeError: 'list' object has no attribute 'merge'

In [83]:
# NOTE(review): if `merged` came from ground.merge(pred, ...), the "_x" column is the
# LEFT frame (ground truth), so this assignment looks swapped — confirm.
# Also rebinds `pred` from a DataFrame to a list, which breaks cells that index pred["rank"].
pred = merged['zpid_hash_x'].to_list()

In [84]:
# NOTE(review): if `merged` came from ground.merge(pred, ...), the "_y" column is the
# RIGHT frame (predictions), so this assignment looks swapped — confirm.
# Also rebinds `ground` from a DataFrame to a list, which breaks the merge cell on re-run.
ground = merged['zpid_hash_y'].to_list()

In [77]:
merged

Unnamed: 0,tracing_id,zpid_hash_x,zpid_hash_y
0,62ca193dc36929bb30f091105b8e5bd1,"[21844, 402364, 895156, 719288, 108578, 81243,...","[719288, 572133, 2769, 869974, 302856, 21844, ..."
1,62ca193dcfc19c8d05fb5d9ef28d608e,"[6135, 155093, 956426, 377344, 659494, 857493,...","[6135, 534394, 956426, 73279, 654907, 264840, ..."
2,62ca193dd0a6155c1595dd3ab61852ae,"[584906, 255226, 511356, 184942, 357189, 70236...","[250126, 290210, 810054, 750810, 204631, 88168..."
3,62ca193dd4eff6309eb1cd23e3d0df62,"[230161, 349413, 178709, 57155, 87085, 655063,...","[453507, 580258, 349413, 397299, 918005, 43093..."
4,62ca193dd968bf1881b0fdf86252a0f1,"[602989, 746236, 337223, 846028, 620880, 55142...","[436734, 602989, 763919, 243217, 620880, 40497..."
...,...,...,...
393507,62cb67ebcdcd215bd2b3713c1c99c152,"[692645, 594822, 108315, 576950, 586674, 51920...","[566149, 909685, 477748, 519203, 96491, 803197..."
393508,62cb67eeb312a541990e794692ce2a4a,"[90562, 449701, 227910, 909632, 649214, 393767...","[765353, 137202, 909632, 659860, 449701, 30897..."
393509,62cb67ef76d830cf4b0c83d553cd0043,"[738094, 821660, 284918, 810972, 727046, 41051...","[191127, 632397, 738094, 810972, 821660, 69547..."
393510,62cb67f1265d7ed45bd56d5bf981af4a,"[311175, 499425, 112898, 144927, 432374, 40690...","[112898, 415758, 311175, 878569, 40690, 291263..."


In [51]:
import numpy as np

def dcg_at_k(r, k, method=0):
    """Score is discounted cumulative gain (dcg)
    Relevance is positive real values.  Can use binary
    as the previous methods.
    Example from
    http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
    >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
    >>> dcg_at_k(r, 1)
    3.0
    >>> dcg_at_k(r, 1, method=1)
    3.0
    >>> dcg_at_k(r, 2)
    5.0
    >>> dcg_at_k(r, 2, method=1)
    4.2618595071429155
    >>> dcg_at_k(r, 10)
    9.6051177391888114
    >>> dcg_at_k(r, 11)
    9.6051177391888114
    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
        method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]
                If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]
    Returns:
        Discounted cumulative gain (0.0 when no scores fall within the top k)
    Raises:
        ValueError: If ``method`` is neither 0 nor 1 (only checked when at
            least one score is considered)
    """
    # np.asfarray was removed in NumPy 2.0; an explicit float conversion gives
    # the same float64 array on every NumPy version.
    r = np.asarray(r, dtype=float)[:k]
    if r.size:
        if method == 0:
            # The first two positions are undiscounted; position i >= 2 is divided by log2(i).
            return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
        elif method == 1:
            # Position i (1-based) is divided by log2(i + 1).
            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
        else:
            raise ValueError('method must be 0 or 1.')
    return 0.


def ndcg_at_k(r, k, method=0):
    """Normalized discounted cumulative gain (ndcg) of the top ``k`` results.

    The DCG of ``r`` is divided by the DCG of the same scores sorted in
    descending order (the ideal ordering).  Relevance is positive real
    values; binary relevance also works.
    Example from
    http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
    >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
    >>> ndcg_at_k(r, 1)
    1.0
    >>> r = [2, 1, 2, 0]
    >>> ndcg_at_k(r, 4)
    0.9203032077642922
    >>> ndcg_at_k(r, 4, method=1)
    0.96519546960144276
    >>> ndcg_at_k([0], 1)
    0.0
    >>> ndcg_at_k([1], 2)
    1.0
    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
        method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]
                If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]
    Returns:
        Normalized discounted cumulative gain; 0.0 when the ideal DCG is zero
    """
    ideal_order = sorted(r, reverse=True)
    ideal_dcg = dcg_at_k(ideal_order, k, method)
    if ideal_dcg:
        return dcg_at_k(r, k, method) / ideal_dcg
    # A zero ideal DCG means nothing relevant is reachable; avoid dividing by zero.
    return 0.

In [52]:
# Per-query NDCG@20; expects `pred` to be the per-tracing_id frame with a "rank" column.
ndcg_20 = [ndcg_at_k(ranks, 20) for ranks in pred["rank"]]

In [53]:
sum(ndcg_20)/len(ndcg_20)

0.7336023204358162

In [54]:
# Per-query NDCG@10; expects `pred` to be the per-tracing_id frame with a "rank" column.
ndcg_10 = [ndcg_at_k(ranks, 10) for ranks in pred["rank"]]

In [55]:
sum(ndcg_10)/len(ndcg_10)

0.6923964842676369

In [56]:
# Per-query NDCG@5; expects `pred` to be the per-tracing_id frame with a "rank" column.
ndcg_5 = [ndcg_at_k(ranks, 5) for ranks in pred["rank"]]

In [57]:
sum(ndcg_5)/len(ndcg_5)

0.6901610319756734

In [None]:
# Per-query NDCG@2; expects `pred` to be the per-tracing_id frame with a "rank" column.
ndcg_2 = [ndcg_at_k(ranks, 2) for ranks in pred["rank"]]

In [None]:
sum(ndcg_2)/len(ndcg_2)

In [None]:
# Per-query NDCG@1; expects `pred` to be the per-tracing_id frame with a "rank" column.
ndcg_1 = [ndcg_at_k(ranks, 1) for ranks in pred["rank"]]