<a href="https://colab.research.google.com/github/shellylin100/implement-Implementing-a-Collaborative-Filtering-Movie-Recommender-from-Implicit-Feedback/blob/main/implement_Implementing_a_Collaborative_Filtering_Movie_Recommender_from_Implicit_Feedback.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://medium.com/analytics-vidhya/implementation-of-a-movies-recommender-from-implicit-feedback-6a810de173ac

https://grouplens.org/datasets/movielens/1m/

In [13]:
#! pip install implicit

In [14]:
import os
import sys
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix, save_npz, load_npz, vstack, hstack, lil_matrix
import implicit
import pickle
from implicit.evaluation import train_test_split, precision_at_k, mean_average_precision_at_k

In [15]:
def load_data():
    '''load the MovieLens 1m dataset in a Pandas dataframe'''
    ratings = pd.read_csv('ratings.dat', delimiter='::', header=None, 
        names=['user_id', 'movie_id', 'rating', 'timestamp'], 
        usecols=['user_id', 'movie_id', 'rating'], engine='python')

    return ratings

In [16]:
ratings = load_data()

In [17]:
ratings

Unnamed: 0,user_id,movie_id,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
...,...,...,...
132964,858,3099,3
132965,858,968,5
132966,858,1641,5
132967,858,2453,5


https://docs.scipy.org/doc/scipy/reference/sparse.html

csr_matrix: Compressed Sparse Row format

The Scipy sparse matrix format is csr_matrix((data), (rows, cols)).

In [18]:
def sparse_matrices(df):
    '''creates the sparse user-item and item-user matrices'''

    # using a scalar value (40) to convert ratings from a scale (1-5) to a like/click/view (1)
    alpha = 40

    sparse_user_item = csr_matrix( ([alpha]*len(df['movie_id']), (df['user_id'], df['movie_id']) )) # SVD
    # transposing the item-user matrix to create a user-item matrix
    sparse_item_user = sparse_user_item.T.tocsr() #transpose it
    # save the matrices for recalculating user on the fly 
    save_npz("sparse_user_item.npz", sparse_user_item)
    save_npz("sparse_item_user.npz", sparse_item_user)

    return sparse_user_item, sparse_item_user

In [19]:
ratings
# 6041 user
# 3953 movies
# 1000209 ratings

Unnamed: 0,user_id,movie_id,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5
...,...,...,...
132964,858,3099,3
132965,858,968,5
132966,858,1641,5
132967,858,2453,5


In [20]:
sparse_matrices(ratings)

(<859x3953 sparse matrix of type '<class 'numpy.longlong'>'
 	with 132969 stored elements in Compressed Sparse Row format>,
 <3953x859 sparse matrix of type '<class 'numpy.longlong'>'
 	with 132969 stored elements in Compressed Sparse Row format>)

In [21]:
# vars()

In [22]:
__name__

#__main__ 是預設名稱沒有意義

'__main__'

In [23]:
a, b = sparse_matrices(ratings)

In [24]:
# sparse_user_item table
a.toarray()

array([[ 0,  0,  0, ...,  0,  0,  0],
       [ 0, 40,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ...,  0, 40, 40],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0, 40,  0, ...,  0,  0,  0]], dtype=int64)

In [25]:
# sparse_item_user
b.toarray()

array([[ 0,  0,  0, ...,  0,  0,  0],
       [ 0, 40,  0, ...,  0,  0, 40],
       [ 0,  0,  0, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ..., 40,  0,  0],
       [ 0,  0,  0, ..., 40,  0,  0]], dtype=int64)

In [26]:
df = pd.read_csv('movies.dat', delimiter='::', header=None, names=['movie_id', 'title', 'genre'], engine='python')

In [27]:
df.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [28]:
[df[df['movie_id'] == i].to_dict('records')[0] for i in ratings['movie_id'][:1]]

[{'genre': 'Drama',
  'movie_id': 1193,
  'title': "One Flew Over the Cuckoo's Nest (1975)"}]

In [29]:
df[df['movie_id'] == 1].to_dict('records')[0]

{'genre': "Animation|Children's|Comedy",
 'movie_id': 1,
 'title': 'Toy Story (1995)'}

In [30]:
df[df['movie_id'].isin(ratings['movie_id'][:10])].to_dict('records') #records沒有index

[{'genre': "Animation|Children's|Musical",
  'movie_id': 594,
  'title': 'Snow White and the Seven Dwarfs (1937)'},
 {'genre': "Animation|Children's|Musical",
  'movie_id': 661,
  'title': 'James and the Giant Peach (1996)'},
 {'genre': 'Musical|Romance', 'movie_id': 914, 'title': 'My Fair Lady (1964)'},
 {'genre': "Adventure|Children's|Drama|Musical",
  'movie_id': 919,
  'title': 'Wizard of Oz, The (1939)'},
 {'genre': 'Drama',
  'movie_id': 1193,
  'title': "One Flew Over the Cuckoo's Nest (1975)"},
 {'genre': 'Action|Adventure|Comedy|Romance',
  'movie_id': 1197,
  'title': 'Princess Bride, The (1987)'},
 {'genre': 'Action|Adventure|Drama',
  'movie_id': 1287,
  'title': 'Ben-Hur (1959)'},
 {'genre': "Animation|Children's|Comedy",
  'movie_id': 2355,
  'title': "Bug's Life, A (1998)"},
 {'genre': 'Comedy|Drama',
  'movie_id': 2804,
  'title': 'Christmas Story, A (1983)'},
 {'genre': 'Drama', 'movie_id': 3408, 'title': 'Erin Brockovich (2000)'}]

In [31]:
def map_movies(movie_ids):
    '''takes a list of movie_ids and returns a list of dictionaries with movies information'''
    df = pd.read_csv('movies.dat', delimiter='::', header=None, 
        names=['movie_id', 'title', 'genre'], engine='python')

    # add years to a new column 'year' and remove them from the movie title
    df['year'] = df['title'].str[-5:-1]
    df['title'] = df['title'].str[:-6]

    # creates an ordered list of dictionaries with the movie information for all movie_ids 
    mapped_movies = [df[df['movie_id'] == i].to_dict('records')[0] for i in movie_ids]

    return mapped_movies


def map_users(user_ids):
  '''takes a list of user_ids and returns a list of dictionaries with user information'''
  df = pd.read_csv('users.dat', delimiter='::', header=None, 
      names=['user_id', 'gender', 'agerange', 'occupation', 'timestamp'], engine='python')
  df = df.drop(['timestamp'], axis=1)

  mapped_users = [df[df['user_id'] == i].to_dict('records')[0] for i in user_ids]

  return mapped_users

In [32]:
def most_similar_items(item_id, n_similar=10):
  '''computes the most similar items'''
  with open('model.sav', 'rb') as pickle_in:
      model = pickle.load(pickle_in)

  similar, _ = zip(*model.similar_items(item_id, n_similar)[1:])

  return map_movies(similar)

In [40]:
def most_similar_users(user_id, n_similar=10):
  '''computes the most similar users'''
  sparse_user_item = load_npz("sparse_user_item.npz")

  with open('model.sav', 'rb') as pickle_in:
      model = pickle.load(pickle_in)

  # similar users gives back [(users, scores)]
  # we want just the users and not the first one, because that is the same as the original user
  similar, _ = zip(*model.similar_users(user_id, n_similar)[1:])

  # orginal users items
  original_user_items = list(sparse_user_item[user_id].indices)

  # # this maps back user_ids to their information, which is useful for visualisation
  similar_users_info = map_users(similar)
  # # now we want to add the items that a similar used has rated
  for user_info in mapped:
      # we create a list of items that correspond to the simillar user ids
      # then compare that in a set operation to the original user items
      # as a last step we add it as a key to the user information dictionary
      user_info['items'] = set(list(sparse_user_item[user_info['user_id']].indices)) & set(original_user_items)

  return similar_users_info

In [36]:
def model():
  '''computes p@k and map@k evaluation metrics and saves model'''
  sparse_item_user = load_npz("sparse_item_user.npz")

  train, test = train_test_split(sparse_item_user, train_percentage=0.8)

  model = implicit.als.AlternatingLeastSquares(factors=100, 
      regularization=0.1, iterations=20, calculate_training_loss=False, use_gpu=False) # don't use gpu
  model.fit(train)

  with open('model.sav', 'wb') as pickle_out:
      pickle.dump(model, pickle_out)

  train, test = train.T.tocsr(), test.T.tocsr()
  
  p_at_k = precision_at_k(model, train_user_items=train, 
      test_user_items=test, K=10)
  m_at_k = mean_average_precision_at_k(model, train, test, K=10)

  return p_at_k, m_at_k

In [37]:
p_at_k, m_at_k = model()



  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/859 [00:00<?, ?it/s]

  0%|          | 0/859 [00:00<?, ?it/s]

In [42]:
most_similar_items(1)

[{'genre': 'Comedy|Romance',
  'movie_id': 1265,
  'title': 'Groundhog Day ',
  'year': '1993'},
 {'genre': 'Comedy|Drama',
  'movie_id': 1784,
  'title': 'As Good As It Gets ',
  'year': '1997'},
 {'genre': 'Comedy|Romance',
  'movie_id': 2396,
  'title': 'Shakespeare in Love ',
  'year': '1998'},
 {'genre': "Animation|Children's|Comedy",
  'movie_id': 3114,
  'title': 'Toy Story 2 ',
  'year': '1999'},
 {'genre': 'Action|Adventure|Sci-Fi',
  'movie_id': 2094,
  'title': 'Rocketeer, The ',
  'year': '1991'},
 {'genre': 'Comedy',
  'movie_id': 3253,
  'title': "Wayne's World ",
  'year': '1992'},
 {'genre': 'Comedy',
  'movie_id': 1923,
  'title': "There's Something About Mary ",
  'year': '1998'},
 {'genre': 'Comedy',
  'movie_id': 1136,
  'title': 'Monty Python and the Holy Grail ',
  'year': '1974'},
 {'genre': "Animation|Children's|Musical|Romance",
  'movie_id': 48,
  'title': 'Pocahontas ',
  'year': '1995'}]

In [43]:
def recommend(user_id):
  '''recommend N items to user'''
  sparse_user_item = load_npz("sparse_user_item.npz")

  with open('model.sav', 'rb') as pickle_in:
      model = pickle.load(pickle_in)

  recommended, _ =  zip(*model.recommend(user_id, sparse_user_item))

  return recommended, map_movies(recommended)

In [46]:
recommend(1)

((364, 2078, 2858, 1198, 318, 34, 1282, 1259, 1688, 593),
 [{'genre': "Animation|Children's|Musical",
   'movie_id': 364,
   'title': 'Lion King, The ',
   'year': '1994'},
  {'genre': "Animation|Children's|Comedy|Musical",
   'movie_id': 2078,
   'title': 'Jungle Book, The ',
   'year': '1967'},
  {'genre': 'Comedy|Drama',
   'movie_id': 2858,
   'title': 'American Beauty ',
   'year': '1999'},
  {'genre': 'Action|Adventure',
   'movie_id': 1198,
   'title': 'Raiders of the Lost Ark ',
   'year': '1981'},
  {'genre': 'Drama',
   'movie_id': 318,
   'title': 'Shawshank Redemption, The ',
   'year': '1994'},
  {'genre': "Children's|Comedy|Drama",
   'movie_id': 34,
   'title': 'Babe ',
   'year': '1995'},
  {'genre': "Animation|Children's|Musical",
   'movie_id': 1282,
   'title': 'Fantasia ',
   'year': '1940'},
  {'genre': 'Adventure|Comedy|Drama',
   'movie_id': 1259,
   'title': 'Stand by Me ',
   'year': '1986'},
  {'genre': "Animation|Children's|Musical",
   'movie_id': 1688,
   '

In [52]:
def recommend_all_users():
  '''recommend N items to all users'''
  sparse_user_item = load_npz("sparse_user_item.npz")

  with open('model.sav', 'rb') as pickle_in:
      model = pickle.load(pickle_in)

  # numpy array with N recommendations for each user
  # remove first array, because those are the columns
  all_recommended = model.recommend_all(user_items=sparse_user_item, N=10, 
      recalculate_user=False, filter_already_liked_items=True)[1:]

  # create a new Pandas Dataframe with user_id, 10 recommendations, for all users
  df = pd.read_csv('users.dat', delimiter='::', header=None, 
      names=['user_id', 'gender', 'agerange', 'occupation', 'timestamp'], engine='python')
  df = df.drop(['gender', 'agerange', 'occupation', 'timestamp'], axis=1)
  df[['rec1', 'rec2', 'rec3', 'rec4', 'rec5', 'rec6', 'rec7', 'rec8', 'rec9', 'rec10']] = pd.DataFrame(all_recommended)
  df.to_pickle("all_recommended.pkl")

  '''melt dataframe into SQL format for Django model
  melted = df.melt(id_vars=['user_id'], var_name='order', value_name='recommendations',
      value_vars=['rec1', 'rec2', 'rec3', 'rec4', 'rec5', 'rec6', 'rec7', 'rec8', 'rec9', 'rec10'])
  melted['order'] = melted.order.str[3:]
  print(melted.sort_values(by=['user_id', 'order']))
  melted.to_pickle('all_recommended_melted.pkl')
  '''

  return df

In [53]:
recommendation = recommend_all_users()

  0%|          | 0/859 [00:00<?, ?it/s]

In [55]:
recommendation[0:20]

Unnamed: 0,user_id,rec1,rec2,rec3,rec4,rec5,rec6,rec7,rec8,rec9,rec10
0,1,364.0,2078.0,2858.0,1198.0,318.0,34.0,1282.0,1259.0,1688.0,593.0
1,2,1580.0,1721.0,1233.0,3360.0,377.0,1676.0,3178.0,1254.0,474.0,316.0
2,3,2628.0,3175.0,2716.0,2174.0,1552.0,3108.0,1220.0,2791.0,380.0,1246.0
3,4,2571.0,1200.0,589.0,110.0,1291.0,858.0,457.0,1610.0,1374.0,2858.0
4,5,1179.0,235.0,337.0,2396.0,2686.0,858.0,1249.0,1188.0,2336.0,1193.0
5,6,357.0,2087.0,661.0,919.0,904.0,1196.0,480.0,1294.0,2080.0,2145.0
6,7,1210.0,2858.0,260.0,1198.0,2058.0,2628.0,3555.0,1527.0,21.0,1370.0
7,8,802.0,509.0,1569.0,590.0,593.0,2628.0,1729.0,1884.0,3578.0,1268.0
8,9,3578.0,2997.0,380.0,32.0,2321.0,1517.0,589.0,440.0,3481.0,2542.0
9,10,2406.0,3033.0,471.0,17.0,1036.0,3396.0,910.0,141.0,2463.0,1569.0


In [56]:
def recalculate_user(user_ratings):
  '''adds new user and its liked items to sparse matrix and returns recalculated recommendations'''

  alpha = 40
  m = load_npz('sparse_user_item.npz')
  n_users, n_movies = m.shape

  ratings = [alpha for i in range(len(user_ratings))]

  m.data = np.hstack((m.data, ratings))
  m.indices = np.hstack((m.indices, user_ratings))
  m.indptr = np.hstack((m.indptr, len(m.data)))
  m._shape = (n_users+1, n_movies)

  # recommend N items to new user
  with open('model.sav', 'rb') as pickle_in:
      model = pickle.load(pickle_in)
  recommended, _ =  zip(*model.recommend(n_users, m, recalculate_user=True))
  
  return recommended, map_movies(recommended)