<a href="https://colab.research.google.com/github/puchake/recommender-systems/blob/master/recommender_workshop.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#@title install_modules

!pip install surprise

Collecting surprise
  Downloading https://files.pythonhosted.org/packages/61/de/e5cba8682201fcf9c3719a6fdda95693468ed061945493dea2dd37c5618b/surprise-0.1-py2.py3-none-any.whl
Collecting scikit-surprise (from surprise)
[?25l  Downloading https://files.pythonhosted.org/packages/4d/fc/cd4210b247d1dca421c25994740cbbf03c5e980e31881f10eaddf45fdab0/scikit-surprise-1.0.6.tar.gz (3.3MB)
[K    100% |████████████████████████████████| 3.3MB 7.9MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/ec/c0/55/3a28eab06b53c220015063ebbdb81213cd3dcbb72c088251ec
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.0.6 surprise-0.1


In [0]:
#@title download_data

!wget --output-document=ml-latest-small.zip http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
!wget --output-document=ml-100k.zip http://files.grouplens.org/papers/ml-100k.zip
!wget --output-document=ml-1m.zip http://files.grouplens.org/papers/ml-1m.zip 
!unzip ml-latest-small.zip
!unzip ml-100k.zip
!unzip ml-1m.zip

In [0]:
#@title load_modules

import numpy as np
import surprise as sp
import pandas as pd
from sklearn.linear_model import Ridge, LinearRegression

In [0]:
#@title define_functions

def load_movielens_dataset(
  ratings_path, movies_path, ratings_cols, movies_cols, ratings_sep, movies_sep,
  skiprows, user_id_col, movie_id_col
):
  ratings = pd.read_csv(
      ratings_path, names=ratings_cols, delimiter=ratings_sep, 
      skiprows=skiprows, engine='python')
  movies = pd.read_csv(
      movies_path, names=movies_cols, delimiter=movies_sep, skiprows=skiprows,
      engine='python')
  n_users = ratings[user_id_col].nunique()
  n_movies = ratings[movie_id_col].nunique()
  n_ratings = len(ratings.index)
  return ratings, movies, n_users, n_movies, n_ratings


def l2_distance(point, points):
  return np.linalg.norm(points - point, axis=1)


def find_closest_points(point, points, n=10):
  distances = l2_distance(point, points)
  sorted_indices = np.argsort(distances)
  return sorted_indices[:n]


def calc_sparsity(n_ratings, n_users, n_items):
  return 1 - n_ratings / n_users / n_items


def find_ratings_decomposition(ratings, n_factors=100):
  reader = sp.Reader(rating_scale=(0.5, 5.0))
  data = sp.Dataset.load_from_df(ratings, reader=reader)
  algo = sp.SVD(n_factors=n_factors)
  #sp.model_selection.cross_validate(
  #    algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
  
  trainset = data.build_full_trainset()
  algo.fit(trainset)
  
  return algo.qi, algo.pu, algo


In [0]:
# Set of x_1, x_2 for example movies.
movies_features = np.array(
  [[1.0, 0.0],
   [0.9, 0.1],
   [0.1, 1.0],
   [0.0, 1.0]]
)

# Set of ratings for movies by users. Users are in columns. -1 means no rating.
ratings = np.array(
  [[ 5.0, -1.0,  1.0, -1.0],
   [-1.0,  4.5, -1.0,  2.0],
   [-1.0, -1.0,  4.5,  5.0],
   [ 1.0,  2.0,  5.0, -1.0]]
)

# Set of preferences of users for x_1 and x_2 feature of the movies.
users_preferences = []

for user_id in [0, 1, 2, 3]:
  valid_ratings_indices = np.where(ratings[:, user_id] != -1.0)
  valid_ratings = ratings[valid_ratings_indices, user_id][0]
  rated_movies = movies_features[valid_ratings_indices]
    
  preferencje = np.linalg.lstsq(rated_movies, valid_ratings, rcond=None)[0]
  users_preferences.append(preferencje)
  
user_preferences = np.array(users_preferences)
print("Found users' preferences:")
print(user_preferences, "\n")

predicted_ratings = np.dot(movies_features, user_preferences.T)
print("Predicted ratings:")
print(predicted_ratings, "\n")

comp_i = np.where(ratings != -1.0)
print(np.mean(np.power(ratings[comp_i] - predicted_ratings[comp_i], 2)))

Found users' preferences:
[[5.         1.        ]
 [4.77777778 2.        ]
 [0.97014925 4.70149254]
 [1.68539326 4.83146067]] 

Predicted ratings:
[[5.         4.77777778 0.97014925 1.68539326]
 [4.6        4.5        1.34328358 2.        ]
 [1.5        2.47777778 4.79850746 5.        ]
 [1.         2.         4.70149254 4.83146067]] 

0.0199004975124378


In [0]:
# Set of x_1, x_2 for example movies.
movies_features = []

# Set of ratings for movies by users. Users are in columns. -1 means no rating.
ratings = np.array(
  [[ 5.0, -1.0,  1.0, -1.0],
   [-1.0,  4.5, -1.0,  2.0],
   [-1.0, -1.0,  4.5,  5.0],
   [ 1.0,  2.0,  5.0, -1.0]]
)

# Set of preferences of users for x_1 and x_2 feature of the movies.
users_preferences = np.array(
  [[5.0       , 1.0       ],
   [4.77777778, 2.0       ],
   [0.97014925, 4.70149254],
   [1.68539326, 4.83146067]] 
)

for movie_id in [0, 1, 2, 3]:
  valid_ratings_indices = np.where(ratings[movie_id, :] != -1.0)
  valid_ratings = ratings[movie_id, valid_ratings_indices][0]
  rated_users_preferences = users_preferences[valid_ratings_indices]
  
  cechy = np.linalg.lstsq(
      rated_users_preferences, valid_ratings, rcond=None)[0]
  movies_features.append(cechy)
  
movies_features = np.array(movies_features)
print("Found movies' features:")
print(movies_features, "\n")

predicted_ratings = np.dot(movies_features, user_preferences.T)
print("Predicted ratings:")
print(predicted_ratings, "\n")

comp_i = np.where(ratings != -1.0)
print(np.mean(np.power(ratings[comp_i] - predicted_ratings[comp_i], 2)))

Found movies' features:
[[ 0.9986755   0.00662252]
 [ 0.9         0.1       ]
 [ 0.54559586  0.84455958]
 [-0.01984558  1.06577666]] 

Predicted ratings:
[[5.         4.78469463 1.         1.71515738]
 [4.6        4.5        1.34328358 2.        ]
 [3.57253888 4.29585494 4.5        5.        ]
 [0.96654876 2.03673555 4.99148784 5.11581041]] 

0.00028232698865541264


In [0]:
# Set of x_1, x_2 for example movies.
movies_features = np.array(
  [[37.1, 1.1],
   [37.1, 1.1],
   [37.1, 1.1],
   [37.1, 1.1]]
)

# Set of ratings for movies by users. Users are in columns. -1 means no rating.
ratings = np.array(
  [[ 5.0, -1.0,  1.0, -1.0],
   [-1.0,  4.5, -1.0,  2.0],
   [-1.0, -1.0,  4.5,  5.0],
   [ 1.0,  2.0,  5.0, -1.0]]
)

# Set of preferences of users for x_1 and x_2 feature of the movies.
users_preferences = []

for i in range(100):
  users_preferences = []
  for user_id in [0, 1, 2, 3]:
    valid_ratings_indices = np.where(ratings[:, user_id] > -1.0)
    valid_ratings = ratings[valid_ratings_indices, user_id][0]
    rated_movies = movies_features[valid_ratings_indices]

    clf = Ridge(alpha=0.5, fit_intercept=False)
    clf.fit(rated_movies, valid_ratings)
    users_preferences.append(clf.coef_)

  users_preferences = np.array(users_preferences)

  movies_features = []
  
  for movie_id in [0, 1, 2, 3]:
    valid_ratings_indices = np.where(ratings[movie_id, :] > -1.0)
    valid_ratings = ratings[movie_id, valid_ratings_indices][0]
    rated_users_preferences = users_preferences[valid_ratings_indices]

    clf = Ridge(alpha=0.5, fit_intercept=False)
    clf.fit(rated_users_preferences, valid_ratings)
    movies_features.append(clf.coef_)

  movies_features = np.array(movies_features)
  predicted_ratings = np.dot(movies_features, users_preferences.T)
  comp_i = np.where(ratings > -1.0)
  print(np.mean(np.power(ratings[comp_i] - predicted_ratings[comp_i], 2)))
  

movies_features = np.array(movies_features)
print("Found movies' features:")
print(movies_features, "\n")
  
users_preferences = np.array(users_preferences)
print("Found users' preferences:")
print(users_preferences, "\n")

predicted_ratings = np.dot(movies_features, users_preferences.T)
print("Predicted ratings:")
print(predicted_ratings, "\n")

comp_i = np.where(ratings > -1.0)
print(np.mean(np.power(ratings[comp_i] - predicted_ratings[comp_i], 2)))

13.13713222992114
2.4066305940175203
2.249214032238978
2.187705773092429
2.161149954469215
2.1489994180176044
2.143323936363335
2.1384289725994434
0.4349520211368014
0.2378970853318673
0.16099099086943194
0.13003044319642607
0.1178204949506318
0.11273655955048761
0.11056941903163038
0.1096853377062117
0.10938934755696562
0.10936556572005936
0.10946379510935048
0.1096106407951437
0.10977000530999266
0.1099245579957567
0.11006661543349289
0.11019346928531136
0.11030492309501977
0.11040197469591055
0.11048611229316702
0.11055894587627171
0.11062202326580393
0.11067674735899086
0.11072434781856484
0.11076588092612243
0.11080224291741587
0.11083418873089526
0.11086235187576424
0.1108872632675755
0.1109093680772902
0.11092904029219518
0.11094659502601302
0.11096229877651975
0.11097637789102976
0.11098902550930329
0.11100040723618008
0.11101066576754594
0.11101992466124316
0.11102829141352673
0.11103585997364226
0.11104271280478858
0.11104892257919832
0.11105455357804145
0.11105966285293217
0

In [0]:
#@title load_movielens_data

(msmall_ratings, msmall_movies, 
 msmall_n_users, msmall_n_movies, 
 msmall_n_ratings) = load_movielens_dataset(
    'ml-latest-small/ratings.csv', 'ml-latest-small/movies.csv', 
    ['user_id', 'movie_id', 'rating', 'time'], ['movie_id', 'title', 'genre'],
    ',', ',', 1, 'user_id', 'movie_id')

(m100k_ratings, m100k_movies, 
 m100k_n_users, m100k_n_movies, 
 m100k_n_ratings) = load_movielens_dataset(
    'ml-100k/u.data', 'ml-100k/u.item', 
    ['user_id', 'movie_id', 'rating', 'time'], 
    ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url'],
    '\t', '|', 0, 'user_id', 'movie_id')

(m1m_ratings, m1m_movies, 
 m1m_n_users, m1m_n_movies, 
 m1m_n_ratings) = load_movielens_dataset(
    'ml-1m/ratings.dat', 'ml-1m/movies.dat', 
    ['user_id', 'movie_id', 'rating', 'time'], ['movie_id', 'title', 'genres'],
    '::', '::', 0, 'user_id', 'movie_id')

print(msmall_n_users, msmall_n_movies, msmall_n_ratings, calc_sparsity(msmall_n_ratings, msmall_n_users, msmall_n_movies))
print(m100k_n_users, m100k_n_movies, m100k_n_ratings, calc_sparsity(m100k_n_ratings, m100k_n_users, m100k_n_movies))
print(m1m_n_users, m1m_n_movies, m1m_n_ratings, calc_sparsity(m1m_n_ratings, m1m_n_users, m1m_n_movies))


610 9724 100836 0.9830003169443864
943 1682 100000 0.9369533063577546
6040 3706 1000209 0.9553163743776871


In [0]:
#@title Default title text
items, preferences, algo = find_ratings_decomposition(m1m_ratings[["user_id", "movie_id", "rating"]])
 print(items.shape, preferences.shape, algo)

(3706, 100) (6040, 100) <surprise.prediction_algorithms.matrix_factorization.SVD object at 0x7f23441c1b70>


In [0]:
print(m1m_movies.iloc[0])

In [0]:
print(items[algo.trainset.to_inner_iid(1)])

In [0]:
toy_story_index = algo.trainset.to_inner_iid(1)
close_indices = find_closest_points(items[toy_story_index], items)
for i in close_indices:
  print(m1m_movies.loc[m1m_movies['movie_id'] == algo.trainset.to_raw_iid(i)])

   movie_id             title                       genres
0         1  Toy Story (1995)  Animation|Children's|Comedy
      movie_id               title                       genres
3045      3114  Toy Story 2 (1999)  Animation|Children's|Comedy
      movie_id                 title                       genres
2286      2355  Bug's Life, A (1998)  Animation|Children's|Comedy
     movie_id           title                               genres
584       588  Aladdin (1992)  Animation|Children's|Comedy|Musical
      movie_id                           title            genres
1180      1198  Raiders of the Lost Ark (1981)  Action|Adventure
      movie_id              title  genres
2415      2484  Tinseltown (1998)  Comedy
      movie_id                       title            genres
1132      1148  Wrong Trousers, The (1993)  Animation|Comedy
      movie_id                                        title      genres
2828      2897  And the Ship Sails On (E la nave va) (1984)  Comedy|War
     mov

In [0]:
find_closest_points([1, 2], np.array([[1, 2], [1, 2], [2, 4]]))

array([0, 1, 2])

In [0]:
#@title Default title text
# Set of x_1, x_2 for example movies.
movies_features = np.array(
  [[1.0, 0.0],
   [0.9, 0.1],
   [0.1, 1.0],
   [0.0, 1.0]]
)

# Set of ratings for movies by users. Users are in columns. -1 means no rating.
ratings = np.array(
  [[ 5.0, -1.0,  1.0, -1.0],
   [-1.0,  4.5, -1.0,  2.0],
   [-1.0, -1.0,  4.5,  5.0],
   [ 1.0,  2.0,  5.0, -1.0]]
)

# Set of preferences of users for x_1 and x_2 feature of the movies.
users_preferences = []

for user_id in [0, 1, 2, 3]:
  valid_ratings_indices = np.where(ratings[:, user_id] != -1.0)
  valid_ratings = ratings[valid_ratings_indices, user_id][0]
  rated_movies = movies_features[valid_ratings_indices]
  
  user_preferences = np.linalg.lstsq(rated_movies, valid_ratings, rcond=None)[0]
  users_preferences.append(user_preferences)
  
user_preferences = np.array(users_preferences)
print("Found users' preferences:")
print(user_preferences, "\n")

predicted_ratings = np.dot(movies_features, user_preferences.T)
print("Predicted ratings:")
print(predicted_ratings, "\n")

In [0]:
#@title Default title text
# Set of x_1, x_2 for example movies.
movies_features = []

# Set of ratings for movies by users. Users are in columns. -1 means no rating.
ratings = np.array(
  [[ 5.0, -1.0,  1.0, -1.0],
   [-1.0,  4.5, -1.0,  2.0],
   [-1.0, -1.0,  4.5,  5.0],
   [ 1.0,  2.0,  5.0, -1.0]]
)

# Set of preferences of users for x_1 and x_2 feature of the movies.
users_preferences = np.array(
  [[5.0       , 1.0       ],
   [4.77777778, 2.0       ],
   [0.97014925, 4.70149254],
   [1.68539326, 4.83146067]] 
)

for movie_id in [0, 1, 2, 3]:
  valid_ratings_indices = np.where(ratings[:, movie_id] != -1.0)
  valid_ratings = ratings[valid_ratings_indices, movie_id][0]
  rated_users_preferences = users_preferences[valid_ratings_indices]
  
  movie_features = np.linalg.lstsq(
      rated_users_preferences, valid_ratings, rcond=None)[0]
  movies_features.append(movie_features)
  
movies_features = np.array(movies_features)
print("Found movies' features:")
print(movies_features, "\n")

predicted_ratings = np.dot(movies_features, user_preferences.T)
print("Predicted ratings:")
print(predicted_ratings, "\n")

Found movies' features:
[[ 1.0305     -0.1525    ]
 [ 0.9         0.1       ]
 [ 0.00632135  0.99472651]
 [-0.02909091  1.06949495]] 

Predicted ratings:
[[5.         4.6185     0.28276119 1.        ]
 [4.6        4.5        1.34328358 2.        ]
 [1.02633325 2.01965502 4.68283191 4.81663597]
 [0.9240404  2.         5.         5.11819316]] 

