# MovieLense

In [26]:
import tensorflow as tf
import tensorflow_datasets as tfds
import pandas as pd

In [27]:
def get_data(ds_name = 'movielens/100k-ratings'):
  def genre_id_to_text(genre_ids):
    return [genres[id] for id in genre_ids]

  ds=  tfds.load(ds_name, split='all')
  df = tfds.as_dataframe(ds)

  for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].apply(
            lambda x: x.decode('utf-8') if isinstance(x, bytes) else x
        )

  genres = ['Action' ,'Adventure' ,'Animation',
        'Children' ,'Comedy' ,'Crime' ,'Documentary' ,'Drama' ,'Fantasy',
        'Film-Noir' ,'Horror' ,'IMAX' ,'Musical' ,'Mystery' ,'Romance' ,'Sci-Fi',
        'Thriller' ,'War' ,'Western', 'unknown']
  df['genres_id'] = df['movie_genres']
  df['movie_genres'] = df['movie_genres'].apply(genre_id_to_text)

  return df


In [28]:
data = get_data()

2024-11-17 15:18:34.820690: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [29]:
data.head()

Unnamed: 0,bucketized_user_age,movie_genres,movie_id,movie_title,raw_user_age,timestamp,user_gender,user_id,user_occupation_label,user_occupation_text,user_rating,user_zip_code,genres_id
0,45.0,[Drama],357,One Flew Over the Cuckoo's Nest (1975),46.0,879024327,True,138,4,doctor,4.0,53211,[7]
1,25.0,"[Comedy, Romance]",709,Strictly Ballroom (1992),32.0,875654590,True,92,5,entertainment,2.0,80525,"[4, 14]"
2,18.0,[Comedy],412,"Very Brady Sequel, A (1996)",24.0,882075110,True,301,17,student,4.0,55439,[4]
3,50.0,"[Crime, Drama]",56,Pulp Fiction (1994),50.0,883326919,True,60,4,healthcare,4.0,6472,"[5, 7]"
4,50.0,"[Horror, Thriller]",895,Scream 2 (1997),55.0,891409199,True,197,18,technician,3.0,75094,"[10, 16]"


In [30]:
df = pd.DataFrame(data, columns=['movie_id','user_id','user_rating'])
negative_ratings = df[ df['user_rating'] < 4 ].index 

df.drop(negative_ratings, inplace = True) 

In [31]:
df = df.drop(columns=['user_rating'])

In [32]:
from sklearn.model_selection import train_test_split


In [33]:
config = {
    #embedding computation
    'cleora_n_iter': 5,
    'cleora_dim': 1024,
    
    #dataset preparation
    'train_test_split': 0.2,
    
    'batch_size': 256,
    'test_batch_size': 1000,
    'epochs': [10],
    'alpha': [1e-4],
}

In [34]:
train, test = train_test_split(df, test_size=config['train_test_split'])

In [35]:
train.shape, test.shape

((44300, 2), (11075, 2))

In [36]:
len(train.groupby('user_id'))

942

In [37]:
grouped_train = train.groupby('user_id')
for n, (name, group) in enumerate(grouped_train):
    if name=='1':
        continue
    print(group)
    break


      movie_id user_id
83324      664      10
97779      435      10
58473      617      10
51550      484      10
62751      519      10
...        ...     ...
14375      192      10
32654      371      10
51593      651      10
92579      133      10
606        182      10

[140 rows x 2 columns]


In [38]:
ml_cleora_input_clique_filename = "../movielens/cleora_input_clique.txt"
ml_cleora_input_star_filename = "../movielens/cleora_input_star.txt"
ml_lp_train_filename = "../movielens/lp_train.txt"
ml_lp_test_filename = "../movielens/lp_test.txt"
output_dir = '../output'

In [None]:
with open(ml_cleora_input_clique_filename, "w") as f_cleora_clique, open(
    ml_cleora_input_star_filename, "w"
) as f_cleora_star, open(ml_lp_train_filename, "w") as f_train:
    grouped_train = train.groupby("user_id")
    for n, (name, group) in enumerate(grouped_train):
        group_list = group["movie_id"].tolist()
        group_elems = list(map(str, group_list))
        f_cleora_clique.write("{} {}\n".format(name, " ".join(group_elems)))
        f_cleora_star.write("{}\t{}\n".format(n, name))
        for elem in group_elems:
            f_train.write("{}\t{}\n".format(name, elem))
            f_cleora_star.write("{}\t{}\n".format(n, elem))

In [40]:
with open(ml_lp_test_filename, "w") as f_test:
    grouped_test = test.groupby('user_id')
    for name, group in grouped_test:
        group_list = group['movie_id'].tolist()
        group_elems = list(map(str, group_list))
        for elem in group_elems:
            f_test.write("{}\t{}\n".format(name, elem))

In [41]:
CLEORA_BINARY = '../src/cleora-v1.2.3-x86_64-unknown-linux-gnu'

In [42]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.utils import shuffle
from tqdm import tqdm
import pandas as pd
from collections import Counter
import random
import os

In [43]:
import subprocess


def columns2output_filename(output_dir, columns):
    columns_split = columns.split()
    if len(columns_split) == 1 and 'reflexive' in columns:
        column_name = columns.split('::')[-1]
        return os.path.join(output_dir, f'emb__{column_name}__{column_name}.out')

    column_names = [i.split('::')[-1] for i in columns_split]
    return os.path.join(output_dir, 'emb__' + '__'.join(column_names) + '.out')


def train_cleora(dim, n_iter, columns, input_filename, output_dir):
    command = [CLEORA_BINARY,
                '--columns', columns,
                '--dimension', str(dim), 
                '-n', str(n_iter), 
                '--input', input_filename, 
                '-o', output_dir]
    subprocess.run(command, check=True)
    return columns2output_filename(output_dir, columns)

In [44]:
%%time
cleora_output_star_filename = train_cleora(config['cleora_dim'], config['cleora_n_iter'], "transient::cluster_id StarNode", ml_cleora_input_star_filename, output_dir)

[0m[38;5;8m[[0m2024-11-17T14:18:38Z [0m[32mINFO [0m cleora[0m[38;5;8m][0m Reading args...
[src/main.rs:222] &config = Configuration {
    produce_entity_occurrence_count: true,
    embeddings_dimension: 1024,
    max_number_of_iteration: 5,
    seed: None,
    prepend_field: false,
    log_every_n: 10000,
    in_memory_embedding_calculation: true,
    input: [
        "../movielens/cleora_input_star.txt",
    ],
    file_type: Tsv,
    output_dir: Some(
        "../output",
    ),
    output_format: TextFile,
    relation_name: "emb",
    columns: [
        Column {
            name: "cluster_id",
            transient: true,
            complex: false,
            reflexive: false,
            ignored: false,
        },
        Column {
            name: "StarNode",
            transient: false,
            complex: false,
            reflexive: false,
            ignored: false,
        },
    ],
}
[0m[38;5;8m[[0m2024-11-17T14:18:38Z [0m[32mINFO [0m cleora[0m[38;5;8m

[0m[38;5;8m[[0m2024-11-17T14:18:38Z [0m[32mINFO [0m cleora::embedding[0m[38;5;8m][0m Done iter: 1. Dims: 1024, entities: 2382, num data points: 90388.
[0m[38;5;8m[[0m2024-11-17T14:18:38Z [0m[32mINFO [0m cleora::embedding[0m[38;5;8m][0m Done iter: 2. Dims: 1024, entities: 2382, num data points: 90388.
[0m[38;5;8m[[0m2024-11-17T14:18:38Z [0m[32mINFO [0m cleora::embedding[0m[38;5;8m][0m Done iter: 3. Dims: 1024, entities: 2382, num data points: 90388.
[0m[38;5;8m[[0m2024-11-17T14:18:39Z [0m[32mINFO [0m cleora::embedding[0m[38;5;8m][0m Done iter: 4. Dims: 1024, entities: 2382, num data points: 90388.
[0m[38;5;8m[[0m2024-11-17T14:18:39Z [0m[32mINFO [0m cleora::embedding[0m[38;5;8m][0m Done propagating.
[0m[38;5;8m[[0m2024-11-17T14:18:39Z [0m[32mINFO [0m cleora::embedding[0m[38;5;8m][0m Start saving embeddings.


CPU times: user 4.9 ms, sys: 4.12 ms, total: 9.02 ms
Wall time: 564 ms


[0m[38;5;8m[[0m2024-11-17T14:18:39Z [0m[32mINFO [0m cleora::embedding[0m[38;5;8m][0m Done saving embeddings.
[0m[38;5;8m[[0m2024-11-17T14:18:39Z [0m[32mINFO [0m cleora::embedding[0m[38;5;8m][0m Finalizing embeddings calculations!
[0m[38;5;8m[[0m2024-11-17T14:18:39Z [0m[32mINFO [0m cleora[0m[38;5;8m][0m Finished in 0 sec


In [45]:
%%time
cleora_output_clique_filename = train_cleora(config['cleora_dim'], config['cleora_n_iter'], "complex::reflexive::CliqueNode", ml_cleora_input_clique_filename, output_dir)

CPU times: user 8.13 ms, sys: 1.02 ms, total: 9.15 ms
Wall time: 20 ms


[0m[38;5;8m[[0m2024-11-17T14:18:39Z [0m[32mINFO [0m cleora[0m[38;5;8m][0m Reading args...
[src/main.rs:222] &config = Configuration {
    produce_entity_occurrence_count: true,
    embeddings_dimension: 1024,
    max_number_of_iteration: 5,
    seed: None,
    prepend_field: false,
    log_every_n: 10000,
    in_memory_embedding_calculation: true,
    input: [
        "../movielens/cleora_input_clique.txt",
    ],
    file_type: Tsv,
    output_dir: Some(
        "../output",
    ),
    output_format: TextFile,
    relation_name: "emb",
    columns: [
        Column {
            name: "CliqueNode",
            transient: false,
            complex: true,
            reflexive: true,
            ignored: false,
        },
    ],
}
[0m[38;5;8m[[0m2024-11-17T14:18:39Z [0m[32mINFO [0m cleora[0m[38;5;8m][0m Starting calculation...
[src/pipeline.rs:25] &sparse_matrices = [
    SparseMatrix {
        col_a_id: 0,
        col_a_name: "CliqueNode",
        col_b_id: 1,
      

In [46]:
def read_embeddings(input_file):
    df_full = pd.read_csv(input_file, delimiter = " ", skiprows=[0], header=None, 
                     index_col=0)
    df_full = df_full.drop([1], axis=1)  
    return df_full

In [47]:
def read_train_test(embeddings):
    valid_idx = embeddings.index.to_numpy()
    train = np.loadtxt(ml_lp_train_filename, delimiter="\t", dtype=np.int32)
    test = np.loadtxt(ml_lp_test_filename, delimiter="\t", dtype=np.int32)
    
    #valid pairs of nodes
    train = train[np.isin(train[:,0], valid_idx) & np.isin(train[:,1], valid_idx)]
    test = test[np.isin(test[:,0], valid_idx) & np.isin(test[:,1], valid_idx)]
    
    #negatives for testset: top 100 most common nodes
    all_idx = train.flatten()
    ctr = Counter(all_idx)
    negatives = ctr.most_common(100)
    negatives = [ n[0] for n in negatives ]
    
    adjacency_dict = dict()
    for inp, out in np.vstack([test, train]):
        if inp not in adjacency_dict:
            adjacency_dict[inp] = set()
        adjacency_dict[inp].add(out)

    return train, test, negatives, adjacency_dict, valid_idx

In [48]:
batch_size = config['batch_size']
test_batch_size = config['test_batch_size']

In [49]:
for algo in [cleora_output_star_filename, cleora_output_clique_filename]:
    embeddings = read_embeddings(algo)
    train_1, test_1, negatives, adjacency_dict, valid_idx = read_train_test(embeddings)
    #for faster operation, draw only 1000 test examples
    test_ex = random.sample(list(test_1), 1000)
    
    #these are the 10.000 most common nodes selected as negatives for each valid testing node pair
    df_neg = embeddings.loc[negatives]
    neg_ids = set(df_neg.index)

    epoch = max(config['epochs'])
    for a in config['alpha']:
        #create a binary classifier outputting whether a node pair represents a valid edge (1) or not a valid edge (0)
        clf = SGDClassifier(random_state=0, loss='log_loss', alpha=a)
        for e in range(0, epoch):
            np.random.shuffle(train_1)
            
            for idx in tqdm(range(0,train_1.shape[0],batch_size)):
                #ones = real pairs of nodes
                #zeros = fake pairs of nodesF
                ones=train_1[idx:min(idx+batch_size,train_1.shape[0]),:]
                
                ones_emb_in = embeddings.loc[ones[:,0]].to_numpy()
                ones_emb_out = embeddings.loc[ones[:,1]].to_numpy()
                #Hadamard
                ones = np.multiply(ones_emb_in,ones_emb_out)
                
                id_train_0_in = np.random.choice(valid_idx, size=len(ones), replace=True)
                id_train_0_out = np.random.choice(valid_idx, size=len(ones), replace=True)
    
                zeros_emb_in = embeddings.loc[id_train_0_in].to_numpy()
                zeros_emb_out = embeddings.loc[id_train_0_out].to_numpy()
                #Hadamard
                zeros = np.multiply(zeros_emb_in, zeros_emb_out)
    
                x_train = np.vstack([ones, zeros])
                y_train = [1]*len(ones) + [0]*len(ones)

                clf.partial_fit(x_train, y_train, classes=[0,1])

            if e+1 in config['epochs']:
                mrr = 0.0
                hr = 0.0
                for n, ex in enumerate(test_ex):
                    l = ex[0]
                    r = ex[1]

                    emb_l = embeddings.loc[l].to_numpy().reshape([1, -1])
                    emb_r = np.vstack((df_neg.to_numpy(), embeddings.loc[r].to_numpy()))
        
                    full_ex = np.hstack([np.repeat(emb_l, len(emb_r), axis=0), emb_r])
                    hadamard = np.multiply(emb_l, emb_r)
                    preds = clf.predict_proba(hadamard)[:,1]
                    preds = np.array(preds)

                    #do not punish for high scores of items from trainset and others from testset
                    forbidden_ex = adjacency_dict[l]
                    df_mask = [0 if (elem in forbidden_ex) else 1 for elem in neg_ids]
                    #last elem is always valid
                    df_mask.append(1)
                    preds *= df_mask
            
                    ranking = (-preds).argsort()
                    rank = np.isin(ranking, 100).nonzero()[0][0]+1
                    mrr += 1/rank
                    hr += (rank <= 10)
                    
                    if (n+1)%100 == 0:
                        print('mrr ', mrr/(n+1), ' hr@10 ', hr/(n+1))

                print('algo: {} epochs: {} lr: {}, mrr: {}, hr@10: {}'.format(algo, str(e+1), a, mrr/len(test_ex), hr/len(test_ex)))

100%|██████████| 174/174 [00:04<00:00, 35.06it/s]
100%|██████████| 174/174 [00:04<00:00, 36.21it/s]
100%|██████████| 174/174 [00:04<00:00, 38.45it/s]
100%|██████████| 174/174 [00:04<00:00, 37.81it/s]
100%|██████████| 174/174 [00:04<00:00, 35.50it/s]
100%|██████████| 174/174 [00:04<00:00, 35.90it/s]
100%|██████████| 174/174 [00:04<00:00, 35.75it/s]
100%|██████████| 174/174 [00:04<00:00, 34.94it/s]
100%|██████████| 174/174 [00:04<00:00, 35.19it/s]
100%|██████████| 174/174 [00:04<00:00, 35.82it/s]


mrr  0.12701636835752506  hr@10  0.19
mrr  0.1262138867292136  hr@10  0.18
mrr  0.11318428150756174  hr@10  0.16333333333333333
mrr  0.11941422410424049  hr@10  0.1775
mrr  0.10413082097642093  hr@10  0.158
mrr  0.10915006535860101  hr@10  0.16333333333333333
mrr  0.10808217234436701  hr@10  0.16428571428571428
mrr  0.1035582738313365  hr@10  0.15875
mrr  0.10178279588794283  hr@10  0.15555555555555556
mrr  0.10646457830168778  hr@10  0.162
algo: ../output/emb__cluster_id__StarNode.out epochs: 10 lr: 0.0001, mrr: 0.10646457830168778, hr@10: 0.162


EmptyDataError: No columns to parse from file

In [None]:
test.shape

(11075, 2)