ver 6.0 
Supervised contrastive

# Enviroment setup

In [1]:
!pip install tensorflow-addons
# !pip install recommenders
!pip install "dask[dataframe]" --upgrade

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow-addons
  Downloading tensorflow_addons-0.17.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 10.1 MB/s 
Installing collected packages: tensorflow-addons
Successfully installed tensorflow-addons-0.17.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fsspec>=0.6.0
  Downloading fsspec-2022.5.0-py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 7.5 MB/s 
Collecting partd>=0.3.10
  Downloading partd-1.2.0-py3-none-any.whl (19 kB)
Collecting locket
  Downloading locket-1.0.0-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: locket, partd, fsspec
Successfully installed fsspec-2022.5.0 locket-1.0.0 partd-1.2.0


# Libs

In [2]:
import pandas as pd
import numpy as np
import random
from matplotlib import pyplot as plt
import gc
import math
import datetime, time
from joblib import Parallel, delayed
from tqdm import tqdm

import tensorflow as tf
from tensorflow.keras import layers
import tensorflow.keras.backend as K
import tensorflow_addons as tfa
from tensorflow_addons.losses import TripletSemiHardLoss 
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split

from sklearn.decomposition import PCA
import seaborn as sns
from sklearn.manifold import TSNE

import dask.dataframe as dd

# from recommenders.datasets.python_splitters import python_random_split
# from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
# from recommenders.models.cornac.cornac_utils import predict_ranking

In [3]:
tf.__version__

'2.8.2'

In [4]:
itemCol = 'movieId'
userCol = 'userId'

In [6]:
# DGX setup
# fpath = "./ml-20m"

#colab setup
from google.colab import drive
drive.mount('/content/gdrive')
fpath = "/content/gdrive/MyDrive/RECOMMENDER_STUDIES/data/ml-20m"

Mounted at /content/gdrive


In [7]:
# Loading movie
movies = pd.read_csv(fpath+'/movies.csv')
movies["year"]=movies["title"].apply(lambda x: x[-5:-1])
movies["genres"] = movies["genres"].apply(lambda x: ' ' if x == '(no genres listed)' else ' '.join(x.split('|')) )
movies["title"]= movies["title"].apply(lambda x: x[0:-7])
movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure Animation Children Comedy Fantasy,1995
1,2,Jumanji,Adventure Children Fantasy,1995
2,3,Grumpier Old Men,Comedy Romance,1995
3,4,Waiting to Exhale,Comedy Drama Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [8]:
# user-wise train-test split
def user_wise_split(userCol, test_size=0.25):
    df = pd.read_csv(fpath+'/ratings.csv')
    df["y"] = df["rating"]/2.5-1
    all_user = df[userCol].drop_duplicates()
    train_user, test_user = train_test_split(all_user, test_size=test_size)

    train_ratings = df[df[userCol].isin(train_user)]
    test_ratings = df[df[userCol].isin(test_user)]
    return train_ratings, test_ratings

train, test= user_wise_split(userCol,test_size= 0.25)

In [9]:
top_k_item = 20000
wu_size = 200
max_item = wu_size

top_items = train.groupby(itemCol).count().sort_values(by=userCol, ascending=False).head(top_k_item).index

# Class  model

In [38]:
class TimeTrachker():
    """Tracking runing time"""
    def start(self):
        self.start_time = time.time()

    def check(self, des = ''):
        try:
            if self.check_time is None:
                self.check_time = self.start_time
        except:
            self.check_time = self.start_time
        self.end_time = time.time()
        dur = self.end_time - self.check_time
        print(des + " duration: ", dur)

        self.check_time = time.time()
        return dur

    def stop(self, des = ''):
        self.end_time = time.time()
        dur = self.end_time - self.start_time
        print(des + " duration: ", dur)
        self.start_time = time.time()
        self.check_time = None
        return dur

timer = TimeTrachker()

In [39]:
def get_interaction_set(interaction, max_item = None, top_k_item = None):
    """
    Input:
        interaction: df[userCol, itemCol, y]: dữ liệu đầu vào
        max_item: int: item num limit
    Output:
        df, itemCol: list, y: list, itemCol_str: string, userCol as index
        list item sắp xếp theo giảm dần độ lớn rating
    """
    items = interaction.groupby(itemCol).count().sort_values(by=userCol, ascending=False)
    if top_k_item is not None:
        top_items = items.head(top_k_item).index
        interaction = interaction[interaction[itemCol].isin(top_items)]
    else:
        top_items = items.index

    # Sắp xếp item theo thứ tự giảm dần rating (về sau cắt padding sẽ ưu tiên giữ lại item có rating cao)
    rindex = interaction.groupby(userCol)["y"].transform(lambda grp: grp.sort_values(ascending=False).index)
    interaction = interaction.reindex(rindex)
    
    # Chuyển thành warm-up set theo từng user
    interaction = interaction.groupby("userId").agg({itemCol:list, "y":list})

    # Giới hạn độ dài warm_up size
    if max_item is not None:
        interaction[itemCol] = interaction[itemCol].apply(lambda x: x[0:max_item])
        interaction["y"] = interaction["y"].apply(lambda x: x[0:max_item])

    return interaction, top_items

In [40]:
# Bulding model
class Efficient_Rec(tf.keras.Model):
    def __init__(self, encoder, wu_size= 200, use_tf_function=False):
        super().__init__()
        self.use_tf_function = use_tf_function
        self.encoder = encoder
        self.wu_size = wu_size

    # from v2.2: chỉ groupby, không padding
    @staticmethod
    def get_interaction_set(interaction, max_item = None, top_k_item = None):
        """
        Input:
            interaction: df[userCol, itemCol, y]: dữ liệu đầu vào
            max_item: int: item num limit
        Output:
            df, itemCol: list, y: list, itemCol_str: string, userCol as index
            list item sắp xếp theo giảm dần độ lớn rating
        """
        items = interaction.groupby(itemCol).count().sort_values(by=userCol, ascending=False)
        if top_k_item is not None:
            top_items = items.head(top_k_item).index
            interaction = interaction[interaction[itemCol].isin(top_items)]
        else:
            top_items = items.index

        # Sắp xếp item theo thứ tự giảm dần rating (về sau cắt padding sẽ ưu tiên giữ lại item có rating cao)
        rindex = interaction.groupby(userCol)["y"].transform(lambda grp: grp.sort_values(ascending=False).index)
        interaction = interaction.reindex(rindex)
        
        # Chuyển thành warm-up set theo từng user
        interaction = interaction.groupby("userId").agg({itemCol:list, "y":list})

        # Giới hạn độ dài warm_up size
        if max_item is not None:
            interaction[itemCol] = interaction[itemCol].apply(lambda x: x[0:max_item])
            interaction["y"] = interaction["y"].apply(lambda x: x[0:max_item])

        return interaction, top_items

    def _preprocess(self, inputs, padding_size = 100):
        """
        Padding về wu_size và mask_size, convert list of items => string of items
        batch_inputs: df: itemStr, y"""

        def padding_list(list_item, wu_size, value=0, is_padding=True):
            series_item1 = list_item[0:wu_size]
            if is_padding:
                series_item1 = series_item1+[value]*(wu_size-len(series_item1))
            return series_item1

        items_list, ratings_list = inputs

        items   = items_list.apply(lambda x: ' '.join(list([str(i) for i in x])))
        ratings =   np.stack( ratings_list.apply(lambda x: padding_list( x, padding_size  ) ) )

        return items, ratings


    @staticmethod
    def get_top_cluster(scores, interaction_list):
        interaction_list_ = interaction_list.copy()
        idx = np.argsort(-scores.transpose(),axis=0)[:cluster_num]
        interaction_list_["clusters"] = [list(i) for i in idx.transpose()]
        interaction_list_["scores"] = [ list(scores[i][ind]) for i, ind in enumerate(idx.transpose()) ]
        return interaction_list_

    def minibatch_clustering(self, interaction_list, batch_size= 512):
        chunks = [interaction_list[i:i+batch_size] for i in range(0,interaction_list.shape[0],batch_size)]
        preds = []
        for chunk in chunks:
            pred = self.encoder(self._preprocess( [chunk[itemCol], chunk["y"]], padding_size = wu_size )).numpy()
            preds.append( pred )

        return np.concatenate(preds)

    @staticmethod
    def chunk_explode(ratings, batch_size = 1024**2):
        chunks = [ratings[i:i+batch_size] for i in range(0,ratings.shape[0],batch_size)]
        explodes = []

        # Todo: convert for loop to parallel
        def chunk_process(chunk):
            explode = chunk.explode(["clusters", "scores"])
            explode["contribute_score"] = explode["scores"].astype("float64")*explode["y"]
            explode = explode.groupby(["clusters", "movieId"]).agg({
                "contribute_score": ["mean", "count"]
            }).reset_index()
            explode.columns = ["clusters", "movieId", "mean", "count"]

            return explode

        explodes = Parallel(n_jobs = -1, verbose = 1)(
                    delayed(chunk_process)(chunk) for chunk in tqdm(chunks))
        # combine results
        gr = pd.concat(explodes, axis = 0)
        gr["product"] = gr["mean"]*gr["count"]
        gr = gr.groupby(["clusters", "movieId"]).sum().reset_index()
        gr["contribute_score"] = gr["product"]/gr["count"]
        gr = gr[["clusters", "movieId", "contribute_score"]]
        return gr

    def get_shortlist(self, ratings, interaction_list= None, limit = 500, cluster_num = 5, batch_size=512):
        timer.start()
        if interaction_list is None:
            interaction_list_, _ = self.get_interaction_set(  ratings, max_item = max_item, top_k_item = top_k_item  )
        else:
            interaction_list_ = interaction_list.copy()

        scores = self.minibatch_clustering(interaction_list_, batch_size=batch_size)

        # Limit number of cluster for each user
        interaction_list_ = self.get_top_cluster(scores, interaction_list_)
        timer.check(des = "Get cluster")

        # Get shortlist for each cluster
        ratings_ = ratings.copy().set_index("userId")
        ratings_ = ratings_[ratings_["y"]>0]
        ratings_ = ratings_.join(interaction_list_, rsuffix="_l", how = "inner")
        timer.check(des = "Join interaction")
        ratings_ = self.chunk_explode(ratings_, batch_size= 1024*200)
        timer.check(des = "Chunk explode")

        ratings_["rank"] = ratings_.groupby("clusters")["contribute_score"].rank(method='first', ascending=False)
        ratings_ = ratings_[(ratings_["rank"] <= limit)&(ratings_["contribute_score"]>0)]
        timer.check(des = "Chunk explode")

        self.shortlist = ratings_

        timer.stop(des = "Total")

    @staticmethod
    def left_anti_user_item_join( left, right ):
        """Fast left anijoin 2 dataframe by one column"""
        wu_key = left[userCol].astype('str')+"&"+left[itemCol].astype('str')
        blacklist = right[userCol].astype('str')+"&"+right[itemCol].astype('str')

        key_diff = set(wu_key).difference(blacklist)
        where_diff = wu_key.isin(key_diff)
        output = left[where_diff]

        return output


    def batch_get_recommend(self, warm_up= None, historical_ratings = None, top_k = 10, is_remove_interacted = True, 
        batch_size=1024, reduce_method="random", using_rapids= False):
        """
        reduce_method: str, 'random' or 'mean'
        """
        print("historical_ratings shape ", historical_ratings.shape)
        timer.start()
        if warm_up is None:
            warm_up, _ = self.get_interaction_set( 
                         historical_ratings
                        , max_item = max_item
                        , top_k_item = top_k_item )

        scores = self.minibatch_clustering(warm_up, batch_size=512)
        wu = self.get_top_cluster( scores, warm_up)

        wu = wu.explode(["clusters", "scores"]).reset_index()[[userCol, "clusters", "scores"]]
        timer.check(des = "Get cluster for user")
        print("wu shape ", wu.shape)
        
        user_num = wu[userCol].drop_duplicates().shape[0]
        print("user_num ", user_num)

        # cudf.from_pandas
        if using_rapids:
            wu = dd.from_pandas(wu)
            historical_ratings = dd.from_pandas(historical_ratings.copy())

        chunks = [wu[wu[userCol].isin(range(i,i+batch_size))] 
                  for i in range(0,user_num,batch_size)]

        his_chunks = [historical_ratings[historical_ratings[userCol].isin(range(i,i+batch_size))] 
                        for i in range(0,user_num,batch_size)]
       
        shortlist = self.shortlist

        def batch_join_process(chunk, his_chunk):
            timer2 = TimeTrachker()
            timer2.start()

            wu = chunk.merge(shortlist, on="clusters", how='inner')
            print('''wu = chunk.merge(shortlist, on="clusters", how='inner') : wu shape: ''', wu.shape)
            wu["matched_score"] = wu["scores"]*wu["contribute_score"]

            if reduce_method=="mean":
                wu = wu.groupby([userCol, itemCol]).agg({"matched_score":'mean'}).reset_index()
            
            if reduce_method=="random":
                # using drop_duplicates for boost speed, may reduce accuracy.
                wu = wu.drop_duplicates(subset=["userId", "movieId"])

            print('''after reduce wu shape: ''', wu.shape)
            timer2.check(des = "reduce")

            if is_remove_interacted:
                wu = self.left_anti_user_item_join( wu, historical_ratings )

            print('''after remove interacted wu shape: ''', wu.shape)
            timer2.check(des = "remove interacted item")

            wu["rank"] = wu.groupby(userCol)["matched_score"].rank(method='first', ascending=False)
            wu = wu[wu["rank"]<= top_k]

            timer2.stop(des = "all chunk processing time")

            return wu 
        
        timer.check(des = "Prepare to join")
        wus = Parallel(n_jobs = -1, backend= 'threading', verbose = 1)(
                    delayed(batch_join_process)(chunk, his_chunk) for chunk, his_chunk in tqdm(zip(chunks, his_chunks)))
        
        gc.collect()

        timer.check(des = "Join")

        output = pd.concat(wus, axis=0)

        timer.stop(des = "Total")
        return output

    def release_cache(self):
        self.interaction_list = None 
        gc.collect()

# Xây dựng encoder model
Encoder =  interaction embedding + user feature embedding<br> 
interaction embedding = sum( interaction embedding các item i)<br> 
interaction embedding item i = rating x (embedding id sản phẩm + embedding item feature)<br>



In [41]:
%%time
# Vectorize (encode + padding) item list
max_vocab_size = len(top_items) # nếu số item có <= top_k_item => lấy số lượng item max
items_str = ' '.join([str(i) for i in top_items])
itemStr = itemCol+"_str"

vectorizer = layers.TextVectorization( max_tokens= top_k_item, split='whitespace', output_sequence_length= wu_size, name = 'vectorizer')
vectorizer.adapt( [items_str] ) 

CPU times: user 4.95 s, sys: 355 ms, total: 5.3 s
Wall time: 5.67 s


In [42]:
class Broadcasting_Multiply(tf.keras.layers.Layer):
    """Nhân 2 layers khác shape với nhau, trong đó:
    inputs=[layer1, layer2]
    layer1.shape = (None, n_item, n_feature)
    layer2.shape = (None, n_item)
    (Chú ý đúng thứ tự)
    """

    def call(self, inputs):
        x, y = inputs
        deno = tf.expand_dims(tf.cast(tf.math.count_nonzero(y, axis=1), tf.float32), -1)
        #we add the extra dimension:
        y = K.expand_dims(y, axis=-1)
        #we replicate the elements
        y = K.repeat_elements(y, rep=x.shape[2], axis=-1)

        return x * y, deno

In [43]:
# Xây dựng mạng
embedding_size = 173
reps_size = 132
cluster_num = 43

@tf.function
def avg_layer(z):
    t = K.sum(z[0], axis=1)/z[1]
    t = tf.clip_by_value( t, -1, 1 )
    t = tf.where(tf.math.is_nan(t), tf.zeros_like(t), t)
    return t


def interaction_embedding():

    input_wi = layers.Input(shape=(1,), name='input_wi')
    wi = vectorizer(input_wi)
    wi = layers.Embedding(input_dim= max_vocab_size, output_dim= embedding_size, mask_zero= True, name='ei')(wi)
    # wi = layers.Dense(embedding_size, activation='sigmoid', use_bias = False, name='di')(wi)
    wi = layers.Dense(embedding_size, activation='relu', use_bias = False, name='di1')(wi)
    wi = layers.Dense(embedding_size, activation='relu', use_bias = False, name='di2')(wi)
    # wi = layers.Dense(embedding_size, activation='sigmoid', use_bias = False, name='di3')(wi)

    wr = layers.Input(shape=(wu_size,), name='warm_up_ratings')

    ireps = Broadcasting_Multiply(name='mul')([wi, wr])
    uprofile = layers.Lambda(lambda z: avg_layer(z) )(ireps)

    uprofile = layers.Dense( reps_size, activation='relu', name='du1')(uprofile)
    uprofile = layers.Dense( reps_size, activation='relu', name='du2')(uprofile)
    # uprofile = layers.Dense( reps_size, activation='relu', name='du3')(uprofile)
    # uprofile = layers.BatchNormalization(name='norm')(uprofile)
    uprofile = layers.LayerNormalization(name='norm')(uprofile)
    # uprofile = layers.Dense( reps_size, activation='relu', name='du4')(uprofile)
    uprofile = layers.Dense(cluster_num, activation='sigmoid', name='clustering')(uprofile)
    
    
    model = tf.keras.Model(inputs= [input_wi, wr], outputs=[uprofile])
    return model

In [44]:
# # Example of layer interaction embedding step by step
# input_wi = ["15 25 65 20 84",  # 5 items
#             "51 54 45 21 24 83 81 76 74 75 72 48 29 38",# 14 items
#             " ", # 0 item
#             ] 

# tvectorizer = layers.TextVectorization( max_tokens= 17, split='whitespace', output_sequence_length= 10)
# tvectorizer.adapt( input_wi ) 

# wi = tvectorizer(input_wi)
# print("afer TextVectorization layer \n",wi)
# wi = layers.Embedding(input_dim= 17, output_dim= 4, mask_zero= True, name='ei')(wi)
# print("afer Embedding layer \n",wi)
# wi = layers.Dense(3, activation='sigmoid',  use_bias = False, name='di')(wi)

# wr = np.array([[0.5, 0.1, -0.5, 1, 0.25, 0, 0, 0, 0, 0], [0.25, 0.15, 0.5, 1, 0.25, 0.5, 0.1, -0.9, 0.4, -0.3], [0,0,0,0,0,0,0,0,0,0]])

# ireps = Broadcasting_Multiply(name='mul')([wi, wr])
# print("afer Multiply layer \n",ireps)
# uprofile = layers.Lambda(lambda z: avg_layer(z) )(ireps)
# print("afer Average layer \n",uprofile)

# uprofile = layers.Dense( reps_size, activation='relu', name='du2')(uprofile)
# uprofile = layers.LayerNormalization(name='norm')(uprofile)
# uprofile = layers.Dense(5, activation='sigmoid', name='clustering')(uprofile)
# print("afer Sigmoid layer \n",uprofile)

In [45]:
# Kiểm tra tham số
# interaction_embedding().summary()

In [46]:
# tf.keras.utils.plot_model( interaction_embedding() ,show_shapes=True, show_dtype=True, show_layer_names=True )

# Evaluate model results

In [47]:
def model_evaluate(model, movies, df):
    dfu, ttop_items = get_interaction_set( df
                    , max_item = max_item
                    , top_k_item = top_k_item )
    group_scores = model.encoder(model._preprocess( [dfu[itemCol], dfu["y"]], padding_size = wu_size )).numpy()

    print("SAMPLE INTERACTION EMBEDDING")
    print( np.max(group_scores), np.mean(group_scores), np.min(group_scores) )
    print( group_scores[0:3] )

    print("FEATURE PLOT")
    feature_plot( movies, df, group_scores)
    
    print("CLUSTER CHECKING")
    check_cluster(group_scores)
    
    print("SPECTROGRAM PLOT")
    plot_spectrogram(group_scores)

def get_label(movies, df, is_encode = False):
    movies["genres_list"] = movies["genres"].apply(lambda x: x.split(' '))
    movie_genres = movies.explode("genres_list")
    gr = df.merge(movie_genres, on="movieId").groupby(["userId", "genres_list"])["movieId"].count().reset_index()
    gr["rank"] = gr.groupby("userId")["movieId"].rank(method='first', ascending=False)

    labels = gr[gr["rank"] ==1].set_index("userId")
    # labels["pred_max_ind"] = np.argmax(group_scores, axis=1)

    if is_encode:
        label_enc = LabelEncoder()
        labels["label"] = label_enc.fit_transform(labels["genres_list"])

    return labels

def feature_plot( movies, df, group_scores ):
    tlabels = get_label(movies, df)

    tsne = PCA(n_components=2, random_state=123)
    # tsne = TSNE(n_components=2, random_state=123)
    z = tsne.fit_transform(group_scores) 

    df = pd.DataFrame()
    df["y"] = tlabels["genres_list"]
    df["comp-1"] = z[:,0]
    df["comp-2"] = z[:,1]

    plt.rcParams["figure.figsize"] = (8,8)
    sns.scatterplot(x="comp-1", y="comp-2", hue=df.y.tolist(),
                    palette="Paired" ,#sns.color_palette("hls", 3),
                    data=df)#.set(title="Iris data T-SNE projection") 
    plt.show()

def check_cluster(group_scores):
    # Kiểm tra số user trong mỗi cụm có bị vón cục
    ugs= np.argmax(group_scores, axis=1)
    for i in range(50):
        print(i,': ', np.sum(ugs==i) )

def plot_spectrogram(group_scores):
    # Sort theo user_group + draw sigmoid/softmax layer
    plt.rcParams["figure.figsize"] = (10,10)
    k =100
    a = group_scores
    ind = np.argmax(group_scores, axis=1)
    plt.imshow( a[np.argsort(ind)][0:k] )
    plt.show()


In [48]:
def model_plot(model, movies, df):
    dfu, ttop_items = get_interaction_set( df
                    , max_item = max_item
                    , top_k_item = top_k_item )
    group_scores = model.encoder(model._preprocess( [dfu[itemCol], dfu["y"]], padding_size = wu_size )).numpy()
    print("FEATURE PLOT")
    feature_plot( movies, df, group_scores)
    

# Warm start user

In [49]:
%%time
u_train_from = 0
u_train_to = u_train_from + 130000
u_test = u_train_to + 5000

def get_labeled_data(df):
    interact_df, _ = get_interaction_set( 
                     df
                    , max_item = max_item
                    , top_k_item = top_k_item )
    labels = get_label( movies, df, is_encode = True)
    interact_df["label"] = labels["label"]
    return interact_df[["movieId","y","label"]]
try:
    train_warm
except:
    # if exists, do not rerun
    train_warm =  get_labeled_data( train[(train[userCol]>u_train_from)&(train[userCol]<u_train_to)] )
    train_warm["rating_num"] = train_warm.apply(lambda x: len(x["y"]), axis=1)
    # pretrain with warm start user
    train_warm = train_warm[train_warm["rating_num"]>=100]

print( train_warm.shape )

(37042, 4)
CPU times: user 1.59 ms, sys: 0 ns, total: 1.59 ms
Wall time: 1.61 ms


In [50]:
# x = train[["userId", "rating"]].groupby("userId").count()
# x["rating_num_clip"] = x["rating"].clip(0, 700).apply(lambda x: int(x/20)*20)
# x.groupby("rating_num_clip").count().plot()

In [51]:
# x.groupby("rating_num_clip").count()

In [52]:
gc.collect()

1009

# supervised constrastive 

In [53]:
class SupervisedContrastiveLoss(tf.keras.losses.Loss):
    def __init__(self, temperature=1, name=None):
        super(SupervisedContrastiveLoss, self).__init__(name=name)
        self.temperature = temperature

    def __call__(self, labels, feature_vectors, sample_weight=None):
        # Normalize feature vectors
        feature_vectors_normalized = tf.math.l2_normalize(feature_vectors, axis=1)
        # Compute logits
        logits = tf.divide(
            tf.matmul(
                feature_vectors_normalized, tf.transpose(feature_vectors_normalized)
            ),
            self.temperature,
        )
        return tfa.losses.npairs_loss(tf.squeeze(labels), logits)

In [54]:
# Thực hiện training
def _supervised_constrastive_train_step(self, inputs):
    items_pd, ratings_pd, labels = inputs["movieId"], inputs["y"], inputs["label"]
    items, ratings = self._preprocess((items_pd, ratings_pd), wu_size)

    with tf.GradientTape() as tape:
        # Interaction embedding
        vec = self.encoder([items, ratings])

        average_loss = self.loss(labels, vec)

    # Apply an optimization step
    variables = self.trainable_variables 
    gradients = tape.gradient(average_loss, variables)
    
    # Gradient clipping
    # gradients = [None if gradient is None else tf.clip_by_value(gradient, -0.1, 0.1)
    #              for gradient in gradients]
    self.optimizer.apply_gradients(zip(gradients, variables))

    # Return a dict mapping metric names to current value
    return {'batch_loss': average_loss}

Efficient_Rec._supervised_constrastive_train_step = _supervised_constrastive_train_step

In [55]:
# Thực hiện minibatch training
def _spv_constrastive_train_minibatch_step(self, inputs, batch_size):
    df = inputs.copy()
    chunks = [df[i:i+batch_size] for i in range(0,df.shape[0],batch_size)]
    losses = []
    for chunk in chunks:
        loss = self._supervised_constrastive_train_step(chunk)
        losses.append(loss["batch_loss"].numpy())
        print(loss)
        gc.collect()
    return np.mean(losses)

Efficient_Rec._spv_constrastive_train_minibatch_step = _spv_constrastive_train_minibatch_step

In [56]:
# Compile model
model = Efficient_Rec( encoder = interaction_embedding(), 
                      wu_size = wu_size,
                      use_tf_function=False)
model.compile(
    optimizer=tf.optimizers.Adam(learning_rate = 0.001),
    loss= TripletSemiHardLoss() # SupervisedContrastiveLoss(temperature = 0.05),
)

In [None]:
# Load trained model
latest = tf.train.latest_checkpoint("/content/gdrive/MyDrive/RECOMMENDER_STUDIES/data/")
model.encoder.load_weights(latest)
test_set = train[(train[userCol]>10000)&(train[userCol]<12500)]
model_plot(model, movies, test_set )

In [58]:
%%time
# Train new model
# epochs= 6
# test_set = train[(train[userCol]>10000)&(train[userCol]<12500)]
# model_plot(model, movies, test_set )
# for n in range(epochs):
#   print(n, "/", epochs, ": ", model._spv_constrastive_train_minibatch_step(train_warm.sample(frac=1.), batch_size=512))
#   model_plot(model, movies, test_set )
#   gc.collect()

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.39 µs


In [59]:
# model.encoder.save_weights("/content/gdrive/MyDrive/RECOMMENDER_STUDIES/data/encoder_v6",save_format='tf')

In [60]:
# model_evaluate(model, movies, 
#                warm_up_mask[(warm_up_mask[userCol]>u_train_to)&(warm_up_mask[userCol]<=u_test)])

# Pick item pipeline

In [62]:
%%time 
model.get_shortlist( 
    ratings = train[train[userCol].isin(train_warm.index)],
    interaction_list = train_warm,
    limit = 250, 
    cluster_num = 5
)


Get cluster duration:  42.458805561065674
Join interaction duration:  2.2462124824523926



  0%|          | 0/45 [00:00<?, ?it/s][A[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.

3it [26:12, 524.28s/it]

  9%|▉         | 4/45 [00:16<03:21,  4.90s/it][A
 13%|█▎        | 6/45 [00:27<03:16,  5.03s/it][A
 18%|█▊        | 8/45 [00:37<03:10,  5.15s/it][A
 22%|██▏       | 10/45 [00:48<03:01,  5.19s/it][A
 27%|██▋       | 12/45 [00:58<02:50,  5.17s/it][A
 31%|███       | 14/45 [01:11<02:52,  5.57s/it][A
 36%|███▌      | 16/45 [01:21<02:36,  5.39s/it][A
 40%|████      | 18/45 [01:31<02:22,  5.28s/it][A
 44%|████▍     | 20/45 [01:41<02:11,  5.24s/it][A
 49%|████▉     | 22/45 [01:52<02:01,  5.28s/it][A
 53%|█████▎    | 24/45 [02:02<01:48,  5.17s/it][A
 58%|█████▊    | 26/45 [02:12<01:38,  5.20s/it][A
 62%|██████▏   | 28/45 [02:23<01:28,  5.19s/it][A
 67%|██████▋   | 30/45 [02:33<01:17,  5.19s/it][A
 71%|███████   | 32/45 [02:45<01:10,  5.39s/it][A
 76%|███████▌  | 34/45 [02:55<00:58,  5.34s/it][A
 80%|████████  | 36/45 [03:06<00:47,  5.27s

Chunk explode duration:  251.65278577804565
Chunk explode duration:  0.40967369079589844
Total duration:  296.7740309238434
CPU times: user 1min 31s, sys: 7.52 s, total: 1min 39s
Wall time: 4min 57s


In [63]:
gc.collect()

2146

In [64]:
model.shortlist

Unnamed: 0,clusters,movieId,contribute_score,rank
1509,0,1559,0.999828,50.0
1517,0,1568,0.999903,20.0
1749,0,1830,0.999891,33.0
3136,0,3226,0.999931,2.0
3144,0,3234,0.999901,22.0
...,...,...,...,...
1002222,42,129415,0.656138,209.0
1002231,42,129478,0.997053,3.0
1002276,42,129913,0.713123,153.0
1002291,42,130034,0.760635,104.0


In [65]:
test_warm_up, test_mask = train_test_split(test, test_size= 0.5)

In [66]:
%%time 
# interaction_list = train_warm[train_warm.index<=3000]
top_k = 50
is_remove_interacted = True

y_pred = model.batch_get_recommend(
        historical_ratings= test_warm_up, 
        top_k = top_k, is_remove_interacted = True, batch_size=1024*5,
        reduce_method="random", using_rapids= False
    )

historical_ratings shape  (2511611, 5)
Get cluster for user duration:  44.69536876678467
wu shape  (1488832, 3)
user_num  34624
Prepare to join duration:  0.685844898223877


0it [00:00, ?it/s][Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.


wu = chunk.merge(shortlist, on="clusters", how='inner') : wu shape:  (13985750, 6)
wu = chunk.merge(shortlist, on="clusters", how='inner') : wu shape:  (14534000, 6)
after reduce wu shape:  (2111523, 7)
reduce duration:  66.78919076919556
after reduce wu shape:  (2194296, 7)
reduce duration:  70.59790992736816
after remove interacted wu shape:  (2111236, 7)
remove interacted item duration:  25.917268753051758
after remove interacted wu shape:  (2194102, 7)
remove interacted item duration:  22.672724962234497


4it [01:41, 25.35s/it]

all chunk processing time duration:  101.36686682701111
all chunk processing time duration:  101.632404088974
wu = chunk.merge(shortlist, on="clusters", how='inner') : wu shape:  (13717000, 6)
wu = chunk.merge(shortlist, on="clusters", how='inner') : wu shape:  (13523500, 6)
after reduce wu shape:  (2041734, 7)
reduce duration:  67.334712266922
after reduce wu shape:  (2070948, 7)
reduce duration:  77.73314476013184
after remove interacted wu shape:  (2041523, 7)
remove interacted item duration:  33.21745538711548
after remove interacted wu shape:  (2070760, 7)
remove interacted item duration:  23.669610738754272


7it [03:30, 30.13s/it]

all chunk processing time duration:  109.26224088668823





all chunk processing time duration:  109.75094270706177
wu = chunk.merge(shortlist, on="clusters", how='inner') : wu shape:  (13577250, 6)
wu = chunk.merge(shortlist, on="clusters", how='inner') : wu shape:  (14265250, 6)
after reduce wu shape:  (2049849, 7)
reduce duration:  61.96706676483154
after reduce wu shape:  (2153721, 7)
reduce duration:  67.2908284664154
after remove interacted wu shape:  (2049621, 7)
remove interacted item duration:  25.530102014541626
after remove interacted wu shape:  (2153516, 7)
remove interacted item duration:  24.927603483200073
all chunk processing time duration:  92.65264630317688
all chunk processing time duration:  97.42126655578613
wu = chunk.merge(shortlist, on="clusters", how='inner') : wu shape:  (13158000, 6)
after reduce wu shape:  (1986552, 7)
reduce duration:  34.46856331825256
after remove interacted wu shape:  (1986358, 7)
remove interacted item duration:  13.789148569107056
all chunk processing time duration:  52.362091302871704


[Parallel(n_jobs=-1)]: Done   7 out of   7 | elapsed:  5.9min finished


Join duration:  356.72791600227356
Total duration:  402.19811058044434
CPU times: user 7min 5s, sys: 13.2 s, total: 7min 18s
Wall time: 6min 42s


In [67]:
gc.collect()

50

In [68]:
y_pred

Unnamed: 0,userId,clusters,scores,movieId,contribute_score,rank,matched_score
0,3,11,0.999995,1559,0.999996,47.0,0.99999
1,3,11,0.999995,1568,0.999996,44.0,0.99999
2,3,11,0.999995,1830,0.999996,35.0,0.999991
3,3,11,0.999995,3226,0.999997,17.0,0.999992
4,3,11,0.999995,3234,0.999996,36.0,0.999991
...,...,...,...,...,...,...,...
11930599,35790,13,0.822624,96594,0.588666,50.0,0.48425
11930624,35790,13,0.822624,102280,0.596065,34.0,0.490337
11930695,35790,13,0.822624,118250,0.589992,39.0,0.485342
12926587,31975,8,0.998603,95477,0.418382,39.0,0.417798


In [69]:
y_true = test_mask
y_true["is_fav"] = y_true["rating"].apply(lambda x: 1 if x>2.5 else 0)

In [70]:
def evaluate_model(y_true, y_pred):
    """
    y_true: dataframe: user_id, item_id, is_fav (1 true, 0 false)
    y_pred: dataframe: user_id, item_id
    return:
    precision@k, recall@k
    """
    y_pred2 = y_pred.copy()
    y_pred2["is_rec"] = 1
    y_true2 = y_true.copy()

    total1 = y_true2.merge(y_pred2, on=[userCol, itemCol], how = 'left')
    total1["rec_fav"] = total1["is_rec"].fillna(0) * total1["is_fav"]

    # Precision
    p = total1["rec_fav"].sum() / total1["is_rec"].sum()

    # Recall
    r = total1["rec_fav"].sum() / total1["is_fav"].sum()
    return p, r

In [71]:
precision, recall = evaluate_model(y_true, y_pred)

In [72]:
print("Precision :", precision)

Precision : 0.75


In [73]:
recall

1.4498230732576268e-06

# END HERE

In [74]:
1/0

ZeroDivisionError: ignored

In [None]:
def evaluate_rs(y_true, y_pred, is_return_df = False):
    """
    y_true: dataframe: user_id, item_id, y (rating normailised), only favorite item
    y_pred: dataframe: user_id, item_id (just top k item)
    return:
    precision@k, recall@k
    """
    total1 = y_true.merge(y_pred, on=[userCol, itemCol], how = 'outer', suffixes=('_t', '_p'))
    total1['is_pt'] = total1.apply(lambda x: 0 if (np.isnan(x["y"]) or np.isnan(x["rank"]) ) else 1,axis=1)
    total = total1.groupby(userCol).agg({
        "y":'count',
        "rank":'count',
        "is_pt": 'sum'
        })
    total.columns = ["true_num", "predict_num", "pred_true_num"]
    total["macro_p"] = total["pred_true_num"]/total["predict_num"]
    total["macro_r"] = total["pred_true_num"]/total["true_num"]

    total = total[total["predict_num"]>0]

    macro_p = total[total["predict_num"]>0]["macro_p"].mean()
    macro_r = total[total["true_num"]>0]["macro_r"].mean()

    micro_p = total["pred_true_num"].sum()/total["predict_num"].sum()
    micro_r = total["pred_true_num"].sum()/total["true_num"].sum()

    print("macro_p: ", macro_p, "; macro_r :", macro_r)
    print("micro_p: ", micro_p, "; micro_r :", micro_r)

    if is_return_df:
        return (macro_p, macro_r, micro_p, micro_r), total

    return macro_p, macro_r, micro_p, micro_r

In [None]:
%%time
eval_map = map_at_k(y_true, y_pred, col_user = userCol, col_item = itemCol ,col_prediction='rating', col_rating="rating", k=top_k)
eval_ndcg = ndcg_at_k(y_true, y_pred, col_user = userCol, col_item = itemCol ,col_prediction='rating', col_rating="rating", k=top_k)
eval_precision = precision_at_k(y_true, y_pred, col_user = userCol, col_item = itemCol ,col_prediction='rating', col_rating="rating", k=top_k)
eval_recall = recall_at_k(y_true, y_pred, col_user = userCol, col_item = itemCol, col_prediction='rating', col_rating="rating", k=top_k)

print('K = %f' % top_k)
print(
    "MAP:\t%f" % eval_map,
      "NDCG:\t%f" % eval_ndcg,
      "Precision@K:\t%f" % eval_precision,
      "Recall@K:\t%f" % eval_recall, sep='\n')

In [None]:
%%time
eval_metrics, eval_df = evaluate_rs(y_true, y_pred, is_return_df= True)