In [7]:
!pip install matplotlib

Collecting matplotlib
  Using cached matplotlib-3.3.4-cp36-cp36m-manylinux1_x86_64.whl (11.5 MB)
Collecting cycler>=0.10
  Using cached cycler-0.11.0-py3-none-any.whl (6.4 kB)
Collecting pillow>=6.2.0
  Using cached Pillow-8.4.0-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
Collecting kiwisolver>=1.0.1
  Using cached kiwisolver-1.3.1-cp36-cp36m-manylinux1_x86_64.whl (1.1 MB)
Installing collected packages: pillow, kiwisolver, cycler, matplotlib
Successfully installed cycler-0.11.0 kiwisolver-1.3.1 matplotlib-3.3.4 pillow-8.4.0


In [28]:
import pandas as pd
import numpy as np
from zipfile import ZipFile
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from pathlib import Path
import matplotlib.pyplot as plt

uncomment if you need to download the data (you shouldn't need to)

In [9]:
# !wget https://files.grouplens.org/datasets/movielens/ml-1m.zip
# !unzip ml-1m.zip

--2023-05-12 16:11:19--  https://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5917549 (5.6M) [application/zip]
Saving to: ‘ml-1m.zip’


2023-05-12 16:11:19 (27.6 MB/s) - ‘ml-1m.zip’ saved [5917549/5917549]



set up all of the data (see below)

In [67]:
base_path = './ml-1m/'
users = 'users.dat'
ratings = 'ratings.dat'
movies = 'movies.dat'

ratings_file = base_path + ratings
ratings = pd.read_fwf(ratings_file,header=None)

In [68]:
ratings = ratings[0].str.split('::',expand=True)
ratings.columns=['UserID','MovieID','Rating','Timestamp']

In [69]:
ratings

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716
1000205,6040,1094,5,956704
1000206,6040,562,5,9567047
1000207,6040,1096,4,956715


format the data into integer encodings

In [70]:
user_ids = ratings["UserID"].unique().tolist()
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
userencoded2user = {i: x for i, x in enumerate(user_ids)}

In [71]:
movie_ids = ratings["MovieID"].unique().tolist()
movie2movie_encoded = {x: i for i, x in enumerate(movie_ids)}
movie_encoded2movie = {i: x for i, x in enumerate(movie_ids)}
ratings["user"] = ratings["UserID"].map(user2user_encoded)
ratings["movie"] = ratings["MovieID"].map(movie2movie_encoded)

In [72]:
num_users = len(user2user_encoded)
num_movies = len(movie_encoded2movie)
ratings["Rating"] = ratings["Rating"].values.astype(np.float32)
# min and max ratings will be used to normalize the ratings later
min_rating = min(ratings["Rating"])
max_rating = max(ratings["Rating"])

print(
    "Number of users: {}, Number of Movies: {}, Min rating: {}, Max rating: {}".format(
        num_users, num_movies, min_rating, max_rating
    )
)

Number of users: 6040, Number of Movies: 3706, Min rating: 1.0, Max rating: 5.0


In [76]:
embedding_size = 64

In [83]:
user_list = np.arange(num_users)
movie_list = np.arange(num_movies)

In [90]:
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(num_users, embedding_size, embeddings_initializer="he_normal",
    embeddings_regularizer=keras.regularizers.l2(1e-6),))

model.compile('rmsprop', 'mse')
user_output_array = model.predict(user_list)
print(user_output_array.shape)

(6040, 1, 64)


In [91]:
user_output_array[0]

array([[-0.00201827,  0.00220001,  0.01370253,  0.00446202,  0.02656988,
        -0.00189669, -0.00050636,  0.02338769, -0.01209566, -0.00760534,
         0.03552527,  0.00922816,  0.01937199, -0.02198987, -0.0314478 ,
        -0.00733486, -0.02317738,  0.00191434, -0.02590405,  0.0094655 ,
        -0.00390186,  0.03269314,  0.0043453 ,  0.01694017,  0.00199145,
         0.03558206, -0.00380436, -0.01307326, -0.02428009,  0.00618998,
        -0.00656862,  0.00146781, -0.01443519, -0.03294548,  0.00474   ,
        -0.01284825,  0.00871075,  0.02515688, -0.01708282,  0.02950909,
         0.01293394,  0.02505983,  0.02693277,  0.01311709, -0.01407256,
        -0.0381386 , -0.00298436, -0.01358492, -0.02323157,  0.02953929,
        -0.00817657, -0.02075671,  0.00374398, -0.00760533,  0.01281405,
        -0.01005399, -0.00862692,  0.01758536,  0.01023781,  0.0028743 ,
         0.0304056 , -0.01757058,  0.01101611, -0.01850758]],
      dtype=float32)

In [92]:
movie_model = tf.keras.Sequential()
movie_model.add(tf.keras.layers.Embedding(num_movies, embedding_size, embeddings_initializer="he_normal",
    embeddings_regularizer=keras.regularizers.l2(1e-6),))

movie_model.compile('rmsprop', 'mse')
movie_output_array = movie_model.predict(movie_list)
print(movie_output_array.shape)

(3706, 1, 64)


In [93]:
movie_output_array[0]

array([[-0.00164271,  0.03639622,  0.03193511,  0.00403752, -0.03799168,
         0.02667775, -0.04245339, -0.00683967, -0.02962334, -0.03985883,
         0.01220991,  0.01975949, -0.0270016 ,  0.01240345, -0.02689143,
        -0.02647365, -0.00732935, -0.02047732, -0.00193214, -0.01083197,
         0.00936381,  0.00692771,  0.05150883, -0.01427614, -0.01325178,
        -0.02982868, -0.0055328 ,  0.05086528,  0.00240237,  0.01065781,
        -0.02471501, -0.01326261, -0.04487265, -0.0120981 ,  0.03848959,
         0.02643524,  0.01983233, -0.03209684, -0.01454915,  0.05056847,
        -0.01472603,  0.03134091, -0.01475613,  0.04692023, -0.0288157 ,
         0.01581022,  0.02604872,  0.01094291, -0.00632021,  0.00514529,
         0.00335008, -0.01018547,  0.00810121, -0.00649632, -0.01765954,
         0.00545621, -0.03073187, -0.02030215, -0.00044466, -0.02714276,
         0.00035615,  0.00495957, -0.03475268,  0.01304822]],
      dtype=float32)