In [5]:
import os
import pandas as pd
import torch
from tqdm.auto import tqdm
from MLP import MLP

In [6]:
dir = './data/'
df_ratings = pd.read_csv(dir + 'ratings.csv', usecols=['userId', 'movieId', 'rating'])
df_movies = pd.read_csv(dir + 'movies.csv', usecols=['movieId', 'title', 'genres'])

### Pre-trained model load

In [7]:
os.listdir('./model/')[2]

'MLP_layer_[100,50,20,10]_dropout_0.0_lr_0.001_epoch_20.pth'

In [8]:
MODEL_PATH = './model/'
MODEL_STATE_PATH = './model_state/'

# You can select any pre-trained model.
model = torch.load(MODEL_PATH + os.listdir(MODEL_PATH)[2], map_location=torch.device('cpu'))
model.load_state_dict(torch.load(MODEL_STATE_PATH + os.listdir(MODEL_STATE_PATH)[0], map_location=torch.device('cpu')))

<All keys matched successfully>

In [9]:
model

MLP(
  (user_embedding): Embedding(611, 50)
  (item_embedding): Embedding(193610, 50)
  (fc_layers): ModuleList(
    (0): Linear(in_features=100, out_features=50, bias=True)
    (1): Linear(in_features=50, out_features=20, bias=True)
    (2): Linear(in_features=20, out_features=10, bias=True)
  )
  (bn_layers): ModuleList(
    (0): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (1): BatchNorm1d(20, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): BatchNorm1d(10, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (output_layer): Linear(in_features=10, out_features=1, bias=True)
)

### Will use item_embedding for recommendation

In [10]:
embedding = model.state_dict()['item_embedding.weight'].cpu()
embedding.size()

torch.Size([193610, 50])

### Preprocess for suitable recommendation format

In [11]:
# Since there is a difference between total # of movies and # of movies seen by users,
# remove them beforehand.
union_movies = pd.merge(pd.DataFrame(df_ratings['movieId'].unique(), columns=['movieId']), 
                        pd.DataFrame(df_movies['movieId']),
                        how='outer', indicator=True)

# Filter movies not rated by any users.
# [1076, 2939, 3338, 3456, 4194, 5721, 6668, 6849, 7020, 7792, 8765, 25855, 26085, 30892, 32160, 32371, 34482, 85565]
unseen_movieIds = union_movies[union_movies['_merge'] != 'both']['movieId'].values.tolist()

# Remove non-rated movies from df_movies.
df_movies = df_movies.drop(df_movies[df_movies['movieId'].isin(unseen_movieIds)].index)

# For comfortable indexing, add existing index as a new column (movieId_num).
df_movies = df_movies.reset_index(drop=True).reset_index().rename(columns = {'index' : 'movieId_num'})

# Merge df_ratings & df_movies on movieId.
df_sample = pd.merge(df_ratings, df_movies, on='movieId')

# Define dictionary to remember user's seen movie's movieId.
items = {}

# Loop over unique users
for user_id in df_ratings['userId'].unique():
    # Extract user's seen movies.
    user_ratings_sample = df_sample[df_sample['userId'] == user_id]
    
    # Add seen movies to items.
    # items are look like : {'userId' : 'seen_movies'}
    items[user_id] = ' '.join(user_ratings_sample['movieId_num'].astype('str'))

# Using items, make DataFrame in the form of 'userId' - 'seen_movies'.
# Here, 'seen_movies' mean actual index of movieId in df_movies, where 'reset_index' method was performed. 
total = pd.DataFrame.from_dict(items, orient='index')
total = total.reset_index()
total = total.rename(columns = {'index' : 'userId', 0 : 'seen_movies_trueID'})
total['seen_movies_trueID'] = total['seen_movies_trueID'].apply(lambda x: x.split())
total['seen_movies_trueID'] = total['seen_movies_trueID'].apply(lambda x: list(map(int, x))) # torch 입력을 위해 str을 int로 변환

total

Unnamed: 0,userId,seen_movies_trueID
0,1,"[0, 2, 5, 43, 46, 62, 89, 97, 124, 130, 136, 1..."
1,2,"[291, 2670, 277, 1283, 4607, 5294, 6236, 6298,..."
2,3,"[461, 973, 1189, 1492, 1552, 1566, 2761, 30, 5..."
3,4,"[43, 201, 224, 257, 384, 398, 485, 510, 520, 5..."
4,5,"[0, 46, 97, 257, 275, 307, 325, 398, 461, 508,..."
...,...,...
605,606,"[0, 43, 46, 62, 97, 130, 190, 197, 201, 224, 2..."
606,607,"[0, 97, 224, 257, 275, 367, 398, 418, 461, 485..."
607,608,"[0, 2, 43, 46, 62, 97, 136, 184, 190, 197, 224..."
608,609,"[0, 97, 197, 257, 314, 398, 418, 508, 509, 277..."


In [12]:
# For user id, return a list of topn recommended (most similar) movies.
def makeRecommendation(userId, topn):
    user_rating = total[total['userId'] == USER_ID]
    user_seen_movies = user_rating['seen_movies_trueID'].values[0]

    # 해당 사용자가 본 영화 목록 get
    # Get user's seen movies.
    user_movie_df = df_movies.loc[df_movies['movieId_num'].isin(user_seen_movies)]
    
    # For user's seen movies, make integrated embedding vector.
    # Embedding size is 'layer[0]/2'. (pre-trained model's embedding size is 50.)
    user_embedding = torch.zeros(50)
    for movie in user_seen_movies:
        user_embedding = torch.add(user_embedding, embedding[movie])
        user_embedding = user_embedding / len(user_seen_movies)

    # Calculate similarity between total item embedding and user's item embedding.
    cosine = torch.nn.CosineSimilarity(dim=0)
    similarity_list = []
    for i in tqdm(range(embedding.shape[0]), desc=f'Making Recommendation...'):
        output = cosine(user_embedding, embedding[i])
        similarity_list.append(output)
    similarity_list = np.array(similarity_list)
    
    # Obtained index corresponds to movieId_num in df_movies.
    # Get topn similarity list.
    topn_index = np.argpartition(similarity_list, -TOPN)[-TOPN:].tolist()
    topn_sim = similarity_list[topn_index]

    # Make recommendation as dataframe.
    recommend_df = df_movies.loc[df_movies['movieId'].isin(topn_index)]
    recommend_df = recommend_df.assign(similarity = topn_sim)
    recommend_df = recommend_df.drop(columns=['movieId_num']).reset_index(drop=True)
    recommend_df = recommend_df.sort_values(by=['similarity'], ascending=False)
    recommend_df.reset_index(drop=True, inplace=True)
    
    return recommend_df

# Get user's seen movies. (with genres)
def user_seen_movies(userId):
    user_rating = total[total['userId'] == userId]
    user_seen_movies = user_rating['seen_movies_trueID'].values[0]
    
    user_movie_df = df_movies.loc[df_movies['movieId_num'].isin(user_seen_movies)]
    
    # Shuffle seen_movies list randomly.
    user_movie_df = user_movie_df.sample(frac=1).reset_index(drop=True)
    user_movie_df = user_movie_df.drop(columns=['movieId_num']).reset_index(drop=True)
    
    # Get user's seen movies's genre as list.
    genre_list = pd.concat([user_movie_df, user_movie_df['genres'].str.get_dummies(sep='|')], axis=1)
    genre_list = genre_list.columns.tolist()
    genre_list = genre_list[3:]
    
    print(f"Genre of user {userId} seen movies are : " + ', '.join(genre_list))
    
    # Only show top 10 movies.
    # (Since user_movie_df is shuffled, result may be differ all times.)
    return user_movie_df.head(10)

In [13]:
USER_ID = 555
TOPN = 10

In [14]:
user_seen_movies(USER_ID)

Genre of user 555 seen movies are : Action, Adventure, Animation, Children, Comedy, Crime, Documentary, Drama, Fantasy, Film-Noir, Horror, IMAX, Musical, Mystery, Romance, Sci-Fi, Thriller, War, Western


Unnamed: 0,movieId,title,genres
0,2986,RoboCop 2 (1990),Action|Crime|Sci-Fi|Thriller
1,441,Dazed and Confused (1993),Comedy
2,2817,Aces: Iron Eagle III (1992),Action
3,3497,Max Dugan Returns (1983),Comedy
4,555,True Romance (1993),Crime|Thriller
5,24,Powder (1995),Drama|Sci-Fi
6,2641,Superman II (1980),Action|Sci-Fi
7,2450,Howard the Duck (1986),Adventure|Comedy|Sci-Fi
8,3051,Anywhere But Here (1999),Comedy|Drama
9,1665,Bean (1997),Comedy


In [15]:
makeRecommendation(USER_ID, TOPN)

Making Recommendation...:   0%|          | 0/193610 [00:00<?, ?it/s]

Unnamed: 0,movieId,title,genres,similarity
0,50842,"Boss of It All, The (Direktøren for det hele) ...",Comedy|Drama,1.0
1,86320,Melancholia (2011),Drama|Sci-Fi,0.999863
2,55061,Electroma (2006),Drama|Sci-Fi,0.999833
3,112852,Guardians of the Galaxy (2014),Action|Adventure|Sci-Fi,0.999818
4,75816,Women in Trouble (2009),Comedy,0.999788
5,96488,Searching for Sugar Man (2012),Documentary,0.999683
6,6810,Sleeping with the Enemy (1991),Drama|Thriller,0.999675
7,2749,"Morning After, The (1986)",Drama|Mystery,0.999663
8,2445,At First Sight (1999),Drama,0.99966
9,1848,"Borrowers, The (1997)",Adventure|Children|Comedy|Fantasy,0.999638
