In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Dense, Flatten, Embedding, Concatenate
from tensorflow.keras.models import Model

In [2]:
!wget -nc http://files.grouplens.org/datasets/movielens/ml-20m.zip
!unzip -n ml-20m.zip

--2022-11-26 11:56:30--  http://files.grouplens.org/datasets/movielens/ml-20m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 198702078 (189M) [application/zip]
Saving to: ‘ml-20m.zip’


2022-11-26 11:56:32 (104 MB/s) - ‘ml-20m.zip’ saved [198702078/198702078]

Archive:  ml-20m.zip
   creating: ml-20m/
  inflating: ml-20m/genome-scores.csv  
  inflating: ml-20m/genome-tags.csv  
  inflating: ml-20m/links.csv        
  inflating: ml-20m/movies.csv       
  inflating: ml-20m/ratings.csv      
  inflating: ml-20m/README.txt       
  inflating: ml-20m/tags.csv         


In [3]:
# Load the ratings table (userId, movieId, rating, timestamp) and preview
# the first five rows.
df = pd.read_csv('ml-20m/ratings.csv')
df.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [4]:
# Distribution of the target: how often each rating value occurs,
# most frequent first.
df['rating'].value_counts(sort=True, ascending=False)

4.0    5561926
3.0    4291193
5.0    2898660
3.5    2200156
4.5    1534824
2.0    1430997
2.5     883398
1.0     680732
1.5     279252
0.5     239125
Name: rating, dtype: int64

In [5]:
# Compare the number of distinct ids with the largest id for users and movies:
# a max far above the distinct count means the raw ids are not contiguous.
print(*(df[col].nunique() for col in ('userId', 'movieId')))
print(*(df[col].max() for col in ('userId', 'movieId')))

138493 26744
138493 131262


In [6]:
# Map the raw ids onto integer category codes so they can index embedding rows.
df['new_userId'] = df['userId'].astype('category').cat.codes
df['new_movieId'] = df['movieId'].astype('category').cat.codes

# Sanity check: the distinct count should be exactly max + 1 for each column,
# i.e. the new ids cover 0..n-1 with no gaps.
print(*(df[col].nunique() for col in ('new_userId', 'new_movieId')))
print(*(df[col].max() for col in ('new_userId', 'new_movieId')))

138493 26744
138492 26743


In [7]:
# Number of distinct users / movies; these become the embedding table sizes.
num_user, num_movie = (
    df[col].nunique() for col in ('new_userId', 'new_movieId')
)

In [8]:
# Embedding-based rating regressor: look up a learned vector for the user and
# for the movie, join them, and regress the rating with a small dense network.

# Length of each learned embedding vector.
dim = 10

# User branch: one integer id per sample -> embedding lookup.
user = Input(shape=(1,))
user_emb = Embedding(input_dim=num_user, output_dim=dim)(user)

# Movie branch, built the same way.
movie = Input(shape=(1,))
movie_emb = Embedding(input_dim=num_movie, output_dim=dim)(movie)

# Each lookup has shape (batch, 1, dim); flatten to (batch, dim) before joining.
u_emb = Flatten()(user_emb)
m_emb = Flatten()(movie_emb)

# Dense head on the concatenated embeddings, ending in a single linear unit
# that outputs the predicted rating.
x = Concatenate()([u_emb, m_emb])
x = Dense(units=100, activation='relu')(x)
x = Dense(units=20, activation='relu')(x)
x = Dense(units=1)(x)

model = Model(inputs=[user, movie], outputs=x)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 embedding (Embedding)          (None, 1, 10)        1384930     ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, 1, 10)        267440      ['input_2[0][0]']                
                                                                                              

In [9]:
# Keep only the target and the re-indexed id columns.
# Rebinding `df` (rather than dropping in place) together with errors='ignore'
# makes this cell safe to re-run: the in-place version raised KeyError on a
# second execution because the columns had already been removed.
df = df.drop(columns=['userId', 'movieId', 'timestamp'], errors='ignore')
df.head()

Unnamed: 0,rating,new_userId,new_movieId
0,3.5,0,1
1,3.5,0,28
2,3.5,0,31
3,3.5,0,46
4,3.5,0,49


In [10]:
# `train_test_split` is imported in the notebook's import cell, so every
# dependency is visible up front.

# Features are the two re-indexed id columns; the target is the rating.
X = df[['new_userId', 'new_movieId']]
y = df['rating']

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# The two parts must partition the data exactly.
assert len(X_train) + len(X_test) == len(X)

# The model takes user ids and movie ids as two separate inputs, so pull the
# columns apart for each split.
train_user, train_movie = X_train['new_userId'], X_train['new_movieId']
test_user, test_movie = X_test['new_userId'], X_test['new_movieId']

In [11]:
# Regress ratings with mean squared error, optimised by Adam.
model.compile(loss='mse', optimizer='adam')

# Train for 10 epochs in batches of 4096, evaluating on the held-out split
# after every epoch; verbose=2 prints one summary line per epoch.
history = model.fit(
    x=[train_user, train_movie],
    y=y_train,
    validation_data=([test_user, test_movie], y_test),
    epochs=10,
    batch_size=4096,
    verbose=2,
)

Epoch 1/10
3907/3907 - 21s - loss: 0.9112 - val_loss: 0.7078 - 21s/epoch - 5ms/step
Epoch 2/10
3907/3907 - 17s - loss: 0.6860 - val_loss: 0.6812 - 17s/epoch - 4ms/step
Epoch 3/10
3907/3907 - 17s - loss: 0.6609 - val_loss: 0.6721 - 17s/epoch - 4ms/step
Epoch 4/10
3907/3907 - 17s - loss: 0.6396 - val_loss: 0.6583 - 17s/epoch - 4ms/step
Epoch 5/10
3907/3907 - 17s - loss: 0.6202 - val_loss: 0.6528 - 17s/epoch - 4ms/step
Epoch 6/10
3907/3907 - 17s - loss: 0.6051 - val_loss: 0.6471 - 17s/epoch - 4ms/step
Epoch 7/10
3907/3907 - 17s - loss: 0.5917 - val_loss: 0.6454 - 17s/epoch - 4ms/step
Epoch 8/10
3907/3907 - 17s - loss: 0.5816 - val_loss: 0.6428 - 17s/epoch - 4ms/step
Epoch 9/10
3907/3907 - 16s - loss: 0.5742 - val_loss: 0.6430 - 16s/epoch - 4ms/step
Epoch 10/10
3907/3907 - 17s - loss: 0.5683 - val_loss: 0.6431 - 17s/epoch - 4ms/step
