In [1]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
import pandas as pd
from collections import Counter

In [None]:
movies = pd.read_csv("data/ml-latest/movies.csv")
ratings = pd.read_csv("data/ml-latest/ratings.csv")

In [68]:
movieidx = movies.merge(ratings[['movieId', 'rating']].groupby('movieId').aggregate('mean'), on='movieId').set_index('movieId')
titletomovie = movies[['movieId', 'title']].set_index('title').to_dict()['movieId']
genres = movies[['movieId', 'genres']].set_index('movieId').to_dict()['genres']

In [3]:
movies.columns, ratings.columns

(Index(['movieId', 'title', 'genres'], dtype='object'),
 Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object'))

In [4]:
moviesByUser = ratings.groupby("userId")["movieId"].apply(list)

In [6]:
from gensim.models import Word2Vec

model = Word2Vec(window = 10, sg = 1, hs = 0,
                 negative = 10, # for negative sampling
                 alpha=0.03, min_alpha=0.0007, min_count=50,
                 seed = 14, workers=20)
model.build_vocab(moviesByUser, progress_per=200)
model.train(moviesByUser, total_examples = model.corpus_count, 
            epochs=10, report_delay=1)
print(model)

2023-10-05 00:27:01,759 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=100, alpha=0.03>', 'datetime': '2023-10-05T00:27:01.759545', 'gensim': '4.3.2', 'python': '3.8.18 (default, Aug 25 2023, 13:20:30) \n[GCC 11.4.0]', 'platform': 'Linux-6.2.0-33-generic-x86_64-with-glibc2.35', 'event': 'created'}
2023-10-05 00:27:01,760 : INFO : collecting all words and their counts
2023-10-05 00:27:01,761 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-10-05 00:27:01,764 : INFO : PROGRESS: at sentence #200, processed 20019 words, keeping 4842 word types
2023-10-05 00:27:01,767 : INFO : PROGRESS: at sentence #400, processed 37930 words, keeping 6572 word types
2023-10-05 00:27:01,771 : INFO : PROGRESS: at sentence #600, processed 61595 words, keeping 8559 word types
2023-10-05 00:27:01,774 : INFO : PROGRESS: at sentence #800, processed 75407 words, keeping 9091 word types
2023-10-05 00:27:01,779 : INFO : PROGRESS: at sentence #1000, processe

Word2Vec<vocab=16116, vector_size=100, alpha=0.03>


In [78]:
model.save("movielens.model")

2023-10-05 01:49:10,090 : INFO : Word2Vec lifecycle event {'fname_or_handle': 'movielens.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2023-10-05T01:49:10.090350', 'gensim': '4.3.2', 'python': '3.8.18 (default, Aug 25 2023, 13:20:30) \n[GCC 11.4.0]', 'platform': 'Linux-6.2.0-33-generic-x86_64-with-glibc2.35', 'event': 'saving'}
2023-10-05 01:49:10,091 : INFO : not storing attribute cum_table
2023-10-05 01:49:10,105 : INFO : saved movielens.model


In [49]:
def similar_products(v, n = 6):
    # extract most similar products for the input vector
    ms = model.wv.most_similar(v, topn= n+1)[1:]
    
    # extract name and similarity score of the similar products
    new_ms = []
    for j in ms:
        # pair = (products_dict[j[0]][0], j[1])
        new_ms.append(j[0])
        
    return new_ms 

def bought_along(product):
    others = [p for buys in moviesByUser if product in buys for p in buys]
    return dict(Counter(others))

def match(pid):
    return movies['title'][movies['title'].str.contains(pid)]

In [83]:
pid = "Transformers (2007)"
if type(pid) is not int:
    pid=titletomovie[pid]
# print(match(pid))
n=20
# bought_with = bought_along(pid)
# top_bought=sorted(bought_with, key=lambda k:bought_with[k], reverse=True)[:n]
recommend = similar_products(pid, n)
print(movieidx.loc[pid])
movieidx.loc[list(recommend)].sort_values("rating", ascending=False).reset_index(
    drop=True
).
# print(bought_with)

Unnamed: 0,title,genres,rating
0,"Bourne Ultimatum, The (2007)",Action|Crime|Thriller,3.908913
1,Stardust (2007),Adventure|Comedy|Fantasy|Romance,3.799042
2,Harry Potter and the Order of the Phoenix (2007),Adventure|Drama|Fantasy|IMAX,3.762763
3,Mr. Brooks (2007),Crime|Drama|Thriller,3.640303
4,Sicko (2007),Documentary|Drama,3.600277
5,Rescue Dawn (2006),Action|Adventure|Drama|War,3.586145
6,"Kingdom, The (2007)",Action|Drama|Thriller,3.512522
7,Pirates of the Caribbean: At World's End (2007),Action|Adventure|Comedy|Fantasy,3.395484
8,"Simpsons Movie, The (2007)",Animation|Comedy,3.389664
9,"Librarian: Quest for the Spear, The (2004)",Action|Adventure|Comedy|Fantasy|Romance,3.306452
