In [1]:
import pandas as pd
import numpy as np

In [2]:
# Mount Google Drive at /gdrive (Colab only) so the CSV files under
# /gdrive/MyDrive/... used by the following cells can be read.
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


### Data Load

In [3]:
# Load the movie table from Drive and preview its first two rows.
movies = pd.read_csv('/gdrive/MyDrive/playdata/movie_recommendation/data/movies.csv')
movies.head(2)

Unnamed: 0,movie_id,title,release_date,imdb_url,all_genres
0,0,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...,Animation-Children-Comedy
1,1,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...,Action-Adventure-Thriller


In [4]:
# Load the user table from Drive and show its last rows.
users = pd.read_csv('/gdrive/MyDrive/playdata/movie_recommendation/data/users_update2.csv')
users.tail()

Unnamed: 0,user_id,age,sex,occupation
940,940,20,M,student
941,941,48,F,librarian
942,942,22,M,student
943,943,25,M,heonter
944,944,27,M,home protector


In [5]:
# Load the ratings table (one row per user/movie rating) and show its last rows.
ratings = pd.read_csv('/gdrive/MyDrive/playdata/movie_recommendation/data/ratings_update2.csv')
ratings.tail()

Unnamed: 0,rating_id,user_id,movie_id,rating
100015,100016,944,285,4
100016,100017,944,287,4
100017,100018,944,0,4
100018,100019,944,299,4
100019,100020,944,120,4


- train, test set

In [6]:
# Helper that splits a DataFrame into train/test sets.
def split_dataframe(df, holdout_fraction=0.1, random_state=None):
  """Randomly split `df` into disjoint train and test DataFrames.

  Args:
    df: DataFrame to split. Train rows are selected by index label, so the
      index is expected to be unique.
    holdout_fraction: fraction of rows sampled (without replacement) into
      the test set.
    random_state: optional seed (or RandomState/Generator) forwarded to
      `DataFrame.sample`. Pass an int to get the same split on every run;
      the default None keeps the original unseeded behaviour.

  Returns:
    A `(train, test)` tuple of DataFrames.
  """
  test = df.sample(frac=holdout_fraction, replace=False, random_state=random_state)
  # Everything whose index label was not drawn into the test set is training data.
  train = df[~df.index.isin(test.index)]

  return train, test

In [7]:
# The id columns are fed to the Embedding layers as lookup indices,
# so make sure they are integer-typed.
ratings['user_id'] = ratings['user_id'].astype(int)
ratings['movie_id'] = ratings['movie_id'].astype(int)

# split_dataframe samples without an explicit random_state, in which case
# pandas draws from NumPy's global RNG. Seeding it here makes the train/test
# split identical on every Restart & Run All.
np.random.seed(42)
train_ratings, test_ratings = split_dataframe(ratings)
train_ratings.shape, test_ratings.shape

((90018, 4), (10002, 4))

### Matrix Factorization

In [8]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, dot

- Model 생성

In [None]:
# Row counts of the user and movie tables.
len(users), len(movies)

(944, 1682)

In [9]:
n_latent_factors = 30  # dimensionality of the user and movie embeddings

# An Embedding is looked up directly by id, so its input_dim must be at least
# max(id) + 1. The table row count alone is only correct when ids are exactly
# 0..n-1, so also cover the largest id that actually appears in the ratings.
n_users = max(users.shape[0], int(ratings['user_id'].max()) + 1)
n_movies = max(movies.shape[0], int(ratings['movie_id'].max()) + 1)

user_input = Input(shape=[1], name='user')
movie_input = Input(shape=[1], name='movie')

user_embedding = Embedding(input_dim=n_users
                           , output_dim=n_latent_factors
                           , name='user_embedding'
                           )(user_input)

movie_embedding = Embedding(input_dim=n_movies
                            , output_dim=n_latent_factors
                            , name='movie_embedding'
                            )(movie_input)

# Drop the length-1 axis so each id maps to a flat latent vector.
user_vec = Flatten(name='flatten_users')(user_embedding)
movie_vec = Flatten(name='flatten_movies')(movie_embedding)

# Predicted rating = dot product of the movie and user latent vectors.
product = dot([movie_vec, user_vec], axes=1)
# Given a (user, movie) id pair, the model outputs the predicted rating.
model = Model(inputs=[user_input, movie_input], outputs=product)

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 movie (InputLayer)             [(None, 1)]          0           []                               
                                                                                                  
 user (InputLayer)              [(None, 1)]          0           []                               
                                                                                                  
 movie_embedding (Embedding)    (None, 1, 30)        50460       ['movie[0][0]']                  
                                                                                                  
 user_embedding (Embedding)     (None, 1, 30)        28350       ['user[0][0]']                   
                                                                                              

- compile

In [10]:
# Train with the Adam optimizer, minimising mean squared error on the ratings.
model.compile(optimizer='adam', loss='mse')

- fit

In [11]:
# Model inputs follow the order [user, movie] declared when the Model was built.
train_inputs = [train_ratings['user_id'], train_ratings['movie_id']]
val_inputs = [test_ratings['user_id'], test_ratings['movie_id']]

# batch_size equals the number of training rows, i.e. the whole training set
# is one batch; the held-out test split is used as validation data.
history = model.fit(
    x=train_inputs,
    y=train_ratings['rating'],
    validation_data=(val_inputs, test_ratings['rating']),
    batch_size=len(train_ratings),
    epochs=500,
    verbose=1,
)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

In [None]:
# import pickle
# with open('CFModel_01.pickle','wb') as fw:
#     pickle.dump(model, fw)

In [None]:
# with open('CFModel_01.pickle', 'rb') as f:
#     CFModel_01 = pickle.load(f)

In [12]:
# Save the trained model to an .h5 file in the current working directory.
model.save('CFModel03.h5')

In [None]:
# Reload a previously saved model from disk and print its architecture.
# NOTE(review): this loads 'CFModel02.h5', whereas the model trained in this
# notebook is saved as 'CFModel03.h5' — confirm that loading the older file is
# intentional; on a fresh runtime this cell fails unless that file already exists.
import tensorflow as tf
CFModel02 = tf.keras.models.load_model('CFModel02.h5')
CFModel02.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 movie (InputLayer)             [(None, 1)]          0           []                               
                                                                                                  
 user (InputLayer)              [(None, 1)]          0           []                               
                                                                                                  
 movie_embedding (Embedding)    (None, 1, 30)        50460       ['movie[0][0]']                  
                                                                                                  
 user_embedding (Embedding)     (None, 1, 30)        28320       ['user[0][0]']                   
                                                                                              