<a href="https://colab.research.google.com/github/RecoHut-Projects/recohut/blob/master/tutorials/modeling/T973437_matching_models_ml1m_tf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Candidate selection (item matching) models in TensorFlow on ML-1M

## **Step 1 - Set up the environment**

### **1.1 Install libraries**

In [None]:
# Pin TensorFlow to the exact version this notebook was run with (2.5.0).
!pip install tensorflow==2.5.0

In [None]:
# Install the recohut helper library straight from its GitHub repository.
# NOTE(review): pip's `-b` looks like the old build-directory flag rather than a
# branch/tag selector; if tag v0.0.5 is what is intended, the VCS form would be
# `...recohut.git@v0.0.5` — confirm the tag exists before changing this line.
!pip install -q -U git+https://github.com/RecoHut-Projects/recohut.git -b v0.0.5

  Building wheel for recohut (setup.py) ... [?25l[?25hdone


### **1.2 Download datasets**

In [None]:
# Fetch the MovieLens-1M archive from GroupLens and unpack it in the working
# directory; Args.file expects the result at /content/ml-1m/ratings.dat.
!wget -q --show-progress https://files.grouplens.org/datasets/movielens/ml-1m.zip
!unzip ml-1m.zip

### **1.3 Import libraries**

In [None]:
import os
import numpy as np
import pandas as pd
from time import time
from tqdm import tqdm

import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy

In [None]:
# transforms
from recohut.transforms.datasets.movielens import create_ml_1m_dataset
from recohut.transforms.datasets.movielens import create_implicit_ml_1m_dataset

# models
from recohut.models.tf.bpr import BPR
from recohut.models.tf.ncf import NCF
from recohut.models.tf.caser import Caser
from recohut.models.tf.sasrec import SASRec
from recohut.models.tf.attrec import AttRec

### **1.4 Set params**

In [None]:
# Reduce TensorFlow's native log verbosity and restrict CUDA to device 0.
# NOTE(review): these are set after `import tensorflow` in the import cell above;
# confirm they still take effect, or move them ahead of the import.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [None]:
class Args:
    """
    Hyperparameter bundle shared by every experiment in this notebook.

    All attributes are first set to common defaults; a handful of them are
    then replaced depending on ``model``. Model names without an entry in the
    override table (including the default ``'bpr'``) keep the defaults.

    :param model: model name; ``'ncf'``, ``'caser'``, ``'sasrec'`` and
        ``'attrec'`` carry model-specific overrides
    """

    def __init__(self, model='bpr'):
        # ---- data and evaluation ----
        self.file = '/content/ml-1m/ratings.dat'
        self.epochs = 2
        self.trans_score = 1
        self.test_neg_num = 100
        # ---- embedding / optimisation ----
        self.embed_dim = 64
        self.mode = 'inner'  # dist
        self.embed_reg = 1e-6
        self.K = 10
        self.learning_rate = 0.001
        self.batch_size = 512
        # ---- MLP tower ----
        self.hidden_units = [256, 128, 64]
        self.activation = 'relu'
        self.dropout = 0.2
        # ---- sequence / convolution / attention settings ----
        self.maxlen = 200
        self.hor_n = 8
        self.hor_h = 2
        self.ver_n = 4
        self.blocks = 2
        self.num_heads = 1
        self.ffn_hidden_unit = 64
        self.norm_training = True
        self.causality = False
        self.gamma = 0.5
        self.w = 0.5

        # Model-specific values that replace the defaults above.
        per_model = {
            'ncf': {'embed_dim': 32},
            'caser': {'embed_dim': 50},
            'sasrec': {'embed_dim': 50, 'embed_reg': 0},
            'attrec': {'maxlen': 5, 'embed_dim': 100, 'batch_size': 1024},
        }
        for name, patch in per_model.items():
            if model == name:
                vars(self).update(patch)
                break

## **Step 2 - Training & Evaluation**

In [None]:
def getHit(df, ver=1, K=None):
    """
    Compute the hit indicator (HR@K) for one group of scored candidates.

    Rows are ranked by ``pred_y`` in descending order and the position of the
    first row with ``true_y == 1`` is compared against the cutoff.

    :param df: DataFrame with the columns ``pred_y`` (score) and ``true_y`` (label)
    :param ver: evaluation variant; only ``1`` is implemented, any other value
        falls through and returns ``None`` (unchanged legacy behaviour)
    :param K: ranking cutoff; when omitted, the module-level ``_K`` set by
        ``evaluate_model(..., ver=2)`` is used
    :return: 1 if the first positive row ranks inside the top K, otherwise 0
        (also 0 when the group contains no positive row)
    """
    if ver == 1:
        if K is None:
            # Legacy path: the cutoff is published through a module global.
            K = _K
        ranked = df.sort_values('pred_y', ascending=False).reset_index(drop=True)
        positive_positions = ranked.index[(ranked.true_y == 1).to_numpy()]
        if len(positive_positions) == 0:
            # No positive to find: count it as a miss instead of raising IndexError.
            return 0
        return 1 if positive_positions[0] < K else 0


def getNDCG(df, K=None):
    """
    Compute NDCG@K for one group of scored candidates.

    Rows are ranked by ``pred_y`` in descending order; with ``i`` the zero-based
    position of the first row whose ``true_y == 1``, the score is
    ``log(2) / log(i + 2)`` when ``i < K`` and 0 otherwise.

    :param df: DataFrame with the columns ``pred_y`` (score) and ``true_y`` (label)
    :param K: ranking cutoff; when omitted, the module-level ``_K`` set by
        ``evaluate_model(..., ver=2)`` is used
    :return: NDCG contribution of the group as a float (0.0 when the group
        contains no positive row)
    """
    if K is None:
        # Legacy path: the cutoff is published through a module global.
        K = _K
    ranked = df.sort_values('pred_y', ascending=False).reset_index(drop=True)
    positive_positions = ranked.index[(ranked.true_y == 1).to_numpy()]
    if len(positive_positions) == 0:
        # No positive to find: contribute zero instead of raising IndexError.
        return 0.
    i = positive_positions[0]
    if i < K:
        return np.log(2) / np.log(i + 2)
    return 0.


def evaluate_model(model, test, K, ver=1, mode=None):
    """
    Evaluate a model with HR@K and NDCG@K.

    Two protocols are supported:

    * ``ver=1`` - ``model.predict(test)`` returns a 2-D score matrix and the
      rank of column 0 within each row is scored (column 0 is presumably the
      held-out positive - confirm against the dataset builder). Scores are
      negated when ``mode == 'inner'`` so that a larger score ranks first;
      otherwise a smaller value ranks first.
    * ``ver=2`` - ``test`` is an ``(X, y)`` pair; predictions are grouped by
      ``X[0]`` (used as the user id) and scored per group with ``getHit`` and
      ``getNDCG``.

    :param model: object exposing ``predict``
    :param test: test set in the layout required by the chosen ``ver``
    :param K: top K cutoff
    :param ver: evaluation protocol, 1 or 2
    :param mode: scoring mode for ``ver=1``; defaults to the notebook-level
        ``args.mode`` when omitted
    :return: hit rate, ndcg
    :raises ValueError: if ``ver`` is neither 1 nor 2
    """
    global _K

    if ver == 1:
        if mode is None:
            # Backward-compatible fallback to the notebook's global settings.
            mode = args.mode
        scores = model.predict(test)
        pred_y = -scores if mode == 'inner' else scores
        # Double argsort turns scores into ranks; keep the rank of column 0.
        rank = pred_y.argsort().argsort()[:, 0]
        if len(rank) == 0:
            return 0.0, 0.0
        hits = rank < K
        hr = float(hits.sum()) / len(rank)
        ndcg = float((1 / np.log2(rank[hits] + 2)).sum()) / len(rank)
        return hr, ndcg

    if ver == 2:
        # getHit / getNDCG read the cutoff from this module global when they
        # are called without an explicit K (as groupby.apply does below).
        _K = K
        test_X, test_y = test
        pred_y = model.predict(test_X)
        test_df = pd.DataFrame(test_y, columns=['true_y'])
        test_df['user_id'] = test_X[0]
        test_df['pred_y'] = pred_y
        tg = test_df.groupby('user_id')
        hit_rate = tg.apply(getHit).mean()
        ndcg = tg.apply(getNDCG).mean()
        return hit_rate, ndcg

    raise ValueError('unsupported evaluation version: {!r}'.format(ver))

### **2.1 BPR**

In [None]:
# Default hyperparameters; Args defines no model-specific overrides for 'bpr'.
# `args` is also read as a global by evaluate_model (ver=1) for the scoring mode.
args = Args(model='bpr')

In [None]:
# ========================== Create dataset =======================
feature_columns, train, val, test = create_ml_1m_dataset(args.file, args.trans_score, args.embed_dim, args.test_neg_num)

# ============================Build Model==========================
# The model is built and compiled inside the MirroredStrategy scope.
mirrored_strategy = tf.distribute.MirroredStrategy()
with mirrored_strategy.scope():
    model = BPR(feature_columns, args.mode, args.embed_reg)
    model.summary()
    # =========================Compile============================
    # No loss is passed to compile(); presumably BPR registers its own loss
    # internally - confirm in recohut.models.tf.bpr.
    model.compile(optimizer=Adam(learning_rate=args.learning_rate))

results = []
for epoch in range(1, args.epochs + 1):
    # ===========================Fit==============================
    # One epoch per fit() call so that evaluation can be interleaved.
    t1 = time()
    model.fit(
        train,
        None,
        validation_data=(val, None),
        epochs=1,
        batch_size=args.batch_size,
    )
    # ===========================Test==============================
    t2 = time()
    if epoch % 2 == 0:  # evaluate on every second epoch only
        hit_rate, ndcg = evaluate_model(model, test, args.K)
        # Take both timings once so the printed and the logged values agree.
        fit_time = t2 - t1
        eval_time = time() - t2
        print('Iteration %d Fit [%.1f s], Evaluate [%.1f s]: HR = %.4f, NDCG = %.4f'
                % (epoch, fit_time, eval_time, hit_rate, ndcg))
        results.append([epoch, fit_time, eval_time, hit_rate, ndcg])

# ========================== Write Log ===========================
pd.DataFrame(results, columns=['Iteration', 'fit_time', 'evaluate_time', 'hit_rate', 'ndcg'])\
    .to_csv('BPR_log_dim_{}_mode_{}_K_{}.csv'.format(args.embed_dim, args.mode, args.K), index=False)



100%|██████████| 6040/6040 [00:29<00:00, 203.83it/s]


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 1, 64)        386624      input_4[0][0]           

### **2.2 NCF**

In [None]:
# Hyperparameters for NCF: the defaults with embed_dim overridden to 32 by Args.
# `args` is also read as a global by evaluate_model (ver=1) for the scoring mode.
args = Args(model='ncf')

In [None]:
# ========================== Create dataset =======================
feature_columns, train, val, test = create_ml_1m_dataset(args.file, args.trans_score, args.embed_dim, args.test_neg_num)

# ============================Build Model==========================
# The model is built and compiled inside the MirroredStrategy scope.
mirrored_strategy = tf.distribute.MirroredStrategy()
with mirrored_strategy.scope():
    model = NCF(feature_columns, args.hidden_units, args.dropout, args.activation, args.embed_reg)
    model.summary()
    # =========================Compile============================
    # No loss is passed to compile(); presumably NCF registers its own loss
    # internally - confirm in recohut.models.tf.ncf.
    model.compile(optimizer=Adam(learning_rate=args.learning_rate))

results = []
for epoch in range(1, args.epochs + 1):
    # ===========================Fit==============================
    # One epoch per fit() call so that evaluation can be interleaved.
    t1 = time()
    model.fit(
        train,
        None,
        validation_data=(val, None),
        epochs=1,
        batch_size=args.batch_size,
    )
    # ===========================Test==============================
    t2 = time()
    if epoch % 2 == 0:  # evaluate on every second epoch only
        hit_rate, ndcg = evaluate_model(model, test, args.K)
        # Take both timings once so the printed and the logged values agree.
        fit_time = t2 - t1
        eval_time = time() - t2
        print('Iteration %d Fit [%.1f s], Evaluate [%.1f s]: HR = %.4f, NDCG = %.4f'
                % (epoch, fit_time, eval_time, hit_rate, ndcg))
        results.append([epoch, fit_time, eval_time, hit_rate, ndcg])
# ========================== Write Log ===========================
pd.DataFrame(results, columns=['Iteration', 'fit_time', 'evaluate_time', 'hit_rate', 'ndcg'])\
    .to_csv('NCF_log_dim_{}__K_{}.csv'.format(args.embed_dim, args.K), index=False)



100%|██████████| 6040/6040 [00:30<00:00, 201.08it/s]


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)
Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, 1, 32)        193312      input_7[0][0]                    
__________________________________________________________________________________________________
input_9 (InputLayer)            [(None, 1)]          0                                   

### **2.3 Caser**

In [None]:
# Hyperparameters for Caser: the defaults with embed_dim overridden to 50 by Args.
args = Args(model='caser')

In [None]:
# ========================== Create dataset =======================
feature_columns, train, val, test = create_implicit_ml_1m_dataset(args.file, args.trans_score, args.embed_dim, args.maxlen)
# Train and validation splits arrive as (features, labels) pairs.
train_X, train_y = train
val_X, val_y = val

# ============================Build Model==========================
model = Caser(feature_columns, args.maxlen, args.hor_n, args.hor_h, args.ver_n, args.dropout, args.activation, args.embed_reg)
model.summary()
# =========================Compile============================
model.compile(loss=BinaryCrossentropy(), optimizer=Adam(learning_rate=args.learning_rate))

results = []
for epoch in range(1, args.epochs + 1):
    # ===========================Fit==============================
    # One epoch per fit() call so that evaluation can be interleaved.
    t1 = time()
    model.fit(
        train_X,
        train_y,
        validation_data=(val_X, val_y),
        epochs=1,
        batch_size=args.batch_size,
    )
    # ===========================Test==============================
    t2 = time()
    if epoch % 2 == 0:  # evaluate on every second epoch only
        # ver=2: per-user grouped evaluation on the (X, y) test pair.
        hit_rate, ndcg = evaluate_model(model, test, args.K, ver=2)
        # Take both timings once so the printed and the logged values agree.
        fit_time = t2 - t1
        eval_time = time() - t2
        print('Iteration %d Fit [%.1f s], Evaluate [%.1f s]: HR = %.4f, NDCG= %.4f'
                % (epoch, fit_time, eval_time, hit_rate, ndcg))
        # `epoch` is already 1-based; log the same iteration number that is printed.
        results.append([epoch, fit_time, eval_time, hit_rate, ndcg])

# ============================Write============================
pd.DataFrame(results, columns=['Iteration', 'fit_time', 'evaluate_time', 'hit_rate', 'ndcg'])\
    .to_csv('Caser_log_maxlen_{}_dim_{}_hor_n_{}_ver_n_{}_K_{}_.csv'
            .format(args.maxlen, args.embed_dim, args.hor_n, args.ver_n, args.K), index=False)



100%|██████████| 6040/6040 [00:24<00:00, 246.55it/s]


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 200)]        0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 200, 50)      197650      input_1[0][0]                    
__________________________________________________________________________________________________
tf.compat.v1.transpose (TFOpLam (None, 50, 200)      0           embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 199, 8)       808         embedding_1[0][0]                
______________________________________________________________________________________________

## **Closure**

For more details, you can refer to https://github.com/RecoHut-Stanzas/S021355.

<a href="https://github.com/RecoHut-Stanzas/S021355/blob/main/reports/S021355.ipynb" alt="S021355_Report"> <img src="https://img.shields.io/static/v1?label=report&message=active&color=green" /></a> <a href="https://github.com/RecoHut-Stanzas/S021355" alt="S021355"> <img src="https://img.shields.io/static/v1?label=code&message=github&color=blue" /></a>

In [None]:
# Print an environment stamp (author, last-updated timestamp, machine details and
# versions of the imported packages) so the run can be reproduced later.
!pip install -q watermark
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d

Author: Sparsh A.

Last updated: 2021-12-20 15:45:40

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.4.104+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

pandas    : 1.1.5
tensorflow: 2.5.0
IPython   : 5.5.0
numpy     : 1.19.5



---

**END**