In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv

In [None]:
# Load the movie table through the csv module, so every column arrives as a raw string.
# newline='' hands line-break handling to the csv reader itself, which the csv
# documentation requires for files whose quoted fields may contain line breaks.
with open('preprocessed_movie.csv', 'r', encoding='utf-8', newline='') as file:
    reader = csv.DictReader(file)
    df_movies = pd.DataFrame(reader)
df_ratings = pd.read_csv('rating.csv')
# DictReader yields strings only, so the id column is cast to int64 explicitly
# (presumably to match the integer ids parsed from rating.csv -- verify).
df_movies['movieId'] = df_movies['movieId'].astype('int64')

In [93]:
def parse_embeddings(embedding_str):
    """Parse a bracketed, whitespace-separated vector string into a 1-D float array.

    Parameters
    ----------
    embedding_str : object
        Text such as ``"[0.1 0.2 0.3]"``, possibly spread over several lines.
        The value is passed through ``str()`` before parsing.

    Returns
    -------
    numpy.ndarray
        The parsed numbers in order; an empty array when the text holds no numbers.

    Raises
    ------
    ValueError
        If any token cannot be converted by ``float``.
    """
    # Trim outer whitespace first, otherwise a leading/trailing blank would
    # shield the brackets from the bracket strip.
    text = str(embedding_str).strip().strip('[]')
    # split() with no argument treats every whitespace run -- line breaks
    # included -- as one separator and never yields empty tokens, so numbers on
    # either side of a line break stay separate.
    return np.array([float(token) for token in text.split()])

# Decode every stored embedding string into a NumPy vector, element by element.
df_movies['embedding'] = df_movies['embedding'].map(parse_embeddings)

In [94]:
# Binary relevance label: 1 when the rating is 3 or higher, otherwise 0.
# The comparison is vectorized over the whole column instead of calling a
# Python lambda once per row; the resulting values and int64 dtype are the same.
df_ratings['is_relevant'] = (df_ratings['rating'] >= 3).astype('int64')

Dataset with reduced dimensionality

In [95]:
# movieId -> embedding vector with exactly `dim` components.
embeddings_dict = {}
dim = 409

# Walk the two needed columns in lockstep; iterrows() would build a full Series
# for every row only to read these two fields.
for movie_id, emb in zip(df_movies['movieId'], df_movies['embedding']):
    if emb.shape == (dim,):
        embeddings_dict[movie_id] = emb
    else:
        # Report the offending movie and fall back to a zero vector so that
        # every movie still has an entry of the expected size.
        print(f"Improper dimensionality: {movie_id}: {emb.shape}")
        embeddings_dict[movie_id] = np.zeros(dim)

Filter the dataset to the most active users: keep only users whose number of reviews is at or above the 99.99th percentile

In [96]:
# Count how many ratings each user has submitted.
user_review_counts = df_ratings.groupby('userId')['rating'].count()

# The cut-off is the 99.99th percentile of those per-user counts.
threshold = user_review_counts.quantile(0.9999)
print(f"Threshold for the amount of user's reviews: {threshold}")

# Ids of the users whose count reaches the cut-off.
is_heavy_user = user_review_counts.ge(threshold)
filtered_users = user_review_counts.loc[is_heavy_user].index

# NOTE: df_ratings is overwritten here, so this cell is not re-runnable as is --
# a second run finds no 'timestamp' column left to drop.
keep_rows = df_ratings['userId'].isin(filtered_users)
df_ratings = df_ratings.loc[keep_rows].drop(columns=['timestamp'])

Threshold for the amount of user's reviews: 4253.794399996928


About 20,000,000 -> 78,086 rating entries after filtering (the unfiltered row index runs past 19,000,000)

In [97]:
# Show the ratings that remain after the user filter (pandas abbreviates the rendered rows).
df_ratings

Unnamed: 0,userId,movieId,rating,is_relevant
1225775,8405,1,5.0,1
1225776,8405,2,3.0,1
1225777,8405,3,2.5,0
1225778,8405,5,1.5,0
1225779,8405,6,4.0,1
...,...,...,...,...
19068382,131904,124292,3.0,1
19068383,131904,124867,3.5,1
19068384,131904,128488,1.5,0
19068385,131904,129340,3.5,1


In [98]:
# userId -> (ids of movies the user rated as relevant, ids rated as irrelevant)
stats = {}
for user_id, group in df_ratings.groupby('userId'):
    relevant_movies = group[group['is_relevant'] == 1]['movieId'].tolist()
    irrelevant_movies = group[group['is_relevant'] == 0]['movieId'].tolist()
    stats[user_id] = (relevant_movies, irrelevant_movies)


def _split_half(movie_ids, rng):
    """Shuffle the distinct ids and return ``(train_ids, test_ids)`` as disjoint lists.

    The train part holds ``int(n * 0.5)`` of the ``n`` distinct ids; the test
    part holds the rest, so together they cover every id exactly once.
    """
    # De-duplicate (keeping first-seen order) so an id can never land in both halves.
    unique_ids = list(dict.fromkeys(movie_ids))
    if not unique_ids:
        return [], []
    shuffled = rng.permutation(unique_ids).tolist()
    cut = int(len(shuffled) * 0.5)
    return shuffled[:cut], shuffled[cut:]


# Local, seeded generator: the split is identical on every Restart & Run All
# and does not disturb NumPy's global random state.
split_rng = np.random.default_rng(42)

train = {}
test = {}
for user_id, (relevant, irrelevant) in stats.items():
    # 50% for train set and 50% for test set.
    # The previous np.random.choice call sampled WITH replacement (its default),
    # which put duplicates into the train lists and left far fewer than half of
    # the distinct movies there; a shuffled cut gives a true disjoint 50/50 split.
    train_relevant, test_relevant = _split_half(relevant, split_rng)
    train_irrelevant, test_irrelevant = _split_half(irrelevant, split_rng)

    train[user_id] = (train_relevant, train_irrelevant)
    test[user_id] = (test_relevant, test_irrelevant)


In [99]:
# userId -> profile vector; filled (and overwritten) by build_profile().
profiles = {}


def _weighted_profile(user_id, movie_ids, ratings_dict, embeddings_dict, sign):
  """Return the weight-normalised sum of the embeddings of one user's movies.

  Each usable movie contributes its embedding scaled by ``sign * (rating - 3)``.
  A movie is skipped entirely -- adding nothing to the sum or to the total
  weight -- when it has no rating entry for this user, no embedding entry, or
  an embedding whose shape is not ``(dim,)``.

  The sum is divided by the total weight when that weight is positive;
  otherwise the raw sum is returned unchanged.
  """
  total = np.zeros(dim)
  total_weight = 0
  for movie_id in movie_ids:
    rating = ratings_dict.get((user_id, movie_id))
    emb = embeddings_dict.get(movie_id)
    # Check the embedding BEFORE counting the weight, so a skipped movie does
    # not inflate the normaliser. The exact-shape test also rejects vectors of
    # the wrong length, which would otherwise fail to broadcast (or, at
    # length 1, broadcast silently).
    if rating is None or emb is None or emb.shape != (dim,):
      continue
    weight = sign * (rating - 3)
    total += emb * weight
    total_weight += weight
  return total / total_weight if total_weight > 0 else total


def build_profile(lambda_param):
  """Fill the module-level ``profiles`` dict with one vector per user in ``train``.

  For each user the profile is ``good - lambda_param * bad``, where ``good`` is
  the average of the embeddings of the user's relevant training movies weighted
  by ``rating - 3`` and ``bad`` is the average over the irrelevant training
  movies weighted by ``3 - rating``.

  Parameters
  ----------
  lambda_param : float
      Multiplier applied to the irrelevant-movie average before it is
      subtracted from the relevant-movie average.

  Returns
  -------
  None
      Results are written into ``profiles``; existing entries for the same
      users are replaced.

  Notes
  -----
  Reads the module-level ``df_ratings``, ``df_movies``, ``train`` and ``dim``.
  """
  # (userId, movieId) -> rating. Column-wise zip avoids building a Series per
  # row as iterrows() does.
  ratings_dict = dict(zip(zip(df_ratings['userId'], df_ratings['movieId']),
                          df_ratings['rating']))
  # movieId -> embedding vector as stored in df_movies.
  embeddings_dict = dict(zip(df_movies['movieId'], df_movies['embedding']))

  for user_id, (relevant, irrelevant) in train.items():
    good_profile = _weighted_profile(user_id, relevant, ratings_dict, embeddings_dict, 1)
    # sign=-1 turns (rating - 3) into (3 - rating) for the disliked movies.
    bad_profile = _weighted_profile(user_id, irrelevant, ratings_dict, embeddings_dict, -1)

    profiles[user_id] = good_profile - lambda_param * bad_profile