<a href="https://colab.research.google.com/github/spexlee/ML_practice/blob/main/NCF_MovieLens_test_%EC%9D%B4%EC%98%88%EB%B9%88.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**서강대학교 정보통신대학원 데이터 사이언스 전공 이예빈**

## NCF

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import Input, Embedding, Flatten, Concatenate, Dense
from keras.callbacks import EarlyStopping

# 데이터셋 불러오기 (MovieLens 100K 데이터셋 사용)
!wget http://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip ml-100k.zip

# 데이터 로딩
df = pd.read_csv('ml-100k/u.data', sep='\t', names=['user', 'item', 'rating', 'timestamp'])

# 사용자 ID와 아이템 ID에 대해 레이블 인코딩
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()
df['user'] = user_encoder.fit_transform(df['user'])
df['item'] = item_encoder.fit_transform(df['item'])

# 훈련 데이터와 테스트 데이터로 분할
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# 훈련 데이터와 테스트 데이터 준비
train_users, train_items, train_ratings = train_data['user'].values, train_data['item'].values, train_data['rating'].values
test_users, test_items, test_ratings = test_data['user'].values, test_data['item'].values, test_data['rating'].values

n_users = df['user'].nunique()
n_items = df['item'].nunique()


--2023-12-04 14:13:37--  http://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip’


2023-12-04 14:13:39 (3.45 MB/s) - ‘ml-100k.zip’ saved [4924029/4924029]

Archive:  ml-100k.zip
   creating: ml-100k/
  inflating: ml-100k/allbut.pl       
  inflating: ml-100k/mku.sh          
  inflating: ml-100k/README          
  inflating: ml-100k/u.data          
  inflating: ml-100k/u.genre         
  inflating: ml-100k/u.info          
  inflating: ml-100k/u.item          
  inflating: ml-100k/u.occupation    
  inflating: ml-100k/u.user          
  inflating: ml-100k/u1.base         
  inflating: ml-100k/u1.test         
  inflating: ml-100k/u2.base         
  inflating: ml-100k/u2.test         
  inflating: ml-100k/u3.base    

In [2]:
# Width of each learned user / item embedding vector.
embedding_size = 50

# Two inputs: one integer user ID and one integer item ID per sample.
user_input, item_input = Input(shape=(1,)), Input(shape=(1,))

# Embedding tables sized by the number of distinct users and items.
user_embedding = Embedding(n_users, embedding_size)(user_input)
item_embedding = Embedding(n_items, embedding_size)(item_input)

# Flatten each embedding output into a single vector per sample.
user_vector, item_vector = Flatten()(user_embedding), Flatten()(item_embedding)

# Join the two vectors and pass them through one hidden layer; the final
# single-unit layer has no activation and emits the predicted rating.
concat = Concatenate()([user_vector, item_vector])
dense = Dense(units=128, activation='relu')(concat)
output = Dense(units=1)(dense)

# Assemble the two-input model and compile it with Adam and an MSE loss.
model = Model(inputs=[user_input, item_input], outputs=output)
model.compile(loss='mean_squared_error', optimizer='adam')


In [3]:
# Early stopping: watch the validation loss, allow 3 epochs without improvement,
# and ask Keras to restore the weights from the best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train on the training split, holding out 10% of it for validation.
# The returned History object is bound to `history` so the per-epoch losses
# remain available (e.g. `history.history['val_loss']`) and the cell no longer
# ends by echoing an uninformative `<History at 0x...>` repr.
history = model.fit(
    [train_users, train_items],
    train_ratings,
    epochs=5,
    batch_size=128,
    verbose=1,
    validation_split=0.1,
    callbacks=[early_stopping],
)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7e6517c1be20>

In [4]:
# Report the compiled loss of the trained model on the held-out test split.
model.evaluate(x=[test_users, test_items], y=test_ratings)




0.882853627204895

### Keras Functional API 사용

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import Input, Embedding, Flatten, Concatenate, Dense, Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# MovieLens 데이터셋 다운로드
!wget http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
!unzip ml-latest-small.zip

# 데이터 로딩
df = pd.read_csv('ml-latest-small/ratings.csv')

# 레이블 인코딩
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()
df['userId'] = user_encoder.fit_transform(df['userId'])
df['movieId'] = item_encoder.fit_transform(df['movieId'])

# 데이터셋 분할
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# 입력 및 출력 데이터 준비
train_users, train_movies, train_ratings = train_data['userId'].values, train_data['movieId'].values, train_data['rating'].values
test_users, test_movies, test_ratings = test_data['userId'].values, test_data['movieId'].values, test_data['rating'].values

n_users = df['userId'].nunique()
n_movies = df['movieId'].nunique()


--2023-12-04 14:23:48--  http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘ml-latest-small.zip’


2023-12-04 14:23:50 (514 KB/s) - ‘ml-latest-small.zip’ saved [978202/978202]

Archive:  ml-latest-small.zip
   creating: ml-latest-small/
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  


In [8]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense

# Width of each learned user / movie embedding vector.
embedding_size = 50

# Input layers: one integer user ID and one integer movie ID per sample.
user_input = Input(shape=(1,), name='user_input')
movie_input = Input(shape=(1,), name='movie_input')

# User embedding.
# `input_length` is intentionally not passed: the sequence length is already
# fixed by `Input(shape=(1,))`, and the argument is deprecated in Keras 3.
user_embedding = Embedding(
    input_dim=n_users,
    output_dim=embedding_size,
    name='user_embedding',
)(user_input)
user_vector = Flatten(name='flatten_user')(user_embedding)

# Movie embedding (same construction as the user branch).
movie_embedding = Embedding(
    input_dim=n_movies,
    output_dim=embedding_size,
    name='movie_embedding',
)(movie_input)
movie_vector = Flatten(name='flatten_movie')(movie_embedding)

# Concatenate the two flattened embedding vectors.
concat = Concatenate()([user_vector, movie_vector])

# Dense head: one hidden ReLU layer, then a single linear unit that emits the
# predicted rating.
dense = Dense(128, activation='relu')(concat)
output = Dense(1)(dense)

# Build the two-input model.
model = Model(inputs=[user_input, movie_input], outputs=output)

# Compile with the Adam optimizer and a mean-squared-error loss.
model.compile(optimizer='adam', loss='mean_squared_error')


In [9]:
# Fit for 5 epochs on the training split, then report the loss on the test split.
model.fit(
    x=[train_users, train_movies],
    y=train_ratings,
    batch_size=64,
    epochs=5,
    verbose=1,
)
model.evaluate(x=[test_users, test_movies], y=test_ratings)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


0.7685096859931946

### TensorFlow Subclassing API

In [11]:
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

class NCFModel(tf.keras.Model):
    """Rating regressor that embeds a user ID and an item ID and feeds both to a small MLP.

    The user and item embeddings are flattened, concatenated, passed through a
    128-unit ReLU layer and reduced to a single linear output.

    Args:
        num_users: Size of the user embedding table (number of distinct user IDs).
        num_items: Size of the item embedding table (number of distinct item IDs).
        embedding_size: Dimensionality of each embedding vector.
        **kwargs: Forwarded unchanged to ``tf.keras.Model.__init__``.
    """

    def __init__(self, num_users, num_items, embedding_size, **kwargs):
        super().__init__(**kwargs)
        # `input_length` is not passed: it is deprecated in Keras 3 and is not
        # needed for the embedding lookup.
        self.user_embedding = tf.keras.layers.Embedding(num_users, embedding_size)
        self.item_embedding = tf.keras.layers.Embedding(num_items, embedding_size)
        # Create the Flatten layers once here rather than instantiating new
        # layer objects on every forward pass inside `call`.
        self.user_flatten = tf.keras.layers.Flatten()
        self.item_flatten = tf.keras.layers.Flatten()
        self.concat = tf.keras.layers.Concatenate()
        self.dense1 = tf.keras.layers.Dense(128, activation='relu')
        self.dense2 = tf.keras.layers.Dense(1)

    def call(self, inputs):
        """Run the forward pass.

        Args:
            inputs: A pair ``(user_input, item_input)`` of integer ID tensors.

        Returns:
            The output of the final single-unit dense layer (the predicted rating).
        """
        user_input, item_input = inputs
        user_vector = self.user_flatten(self.user_embedding(user_input))
        item_vector = self.item_flatten(self.item_embedding(item_input))
        concat = self.concat([user_vector, item_vector])
        x = self.dense1(concat)
        output = self.dense2(x)
        return output

# Build the subclassed model with 50-dimensional embeddings and compile it
# with the Adam optimizer and a mean-squared-error loss.
model = NCFModel(num_users=n_users, num_items=n_movies, embedding_size=50)
model.compile(loss='mean_squared_error', optimizer='adam')


In [12]:
# Fit for 5 epochs on the training split, then report the loss on the test split.
model.fit(
    x=[train_users, train_movies],
    y=train_ratings,
    batch_size=64,
    epochs=5,
    verbose=1,
)
model.evaluate(x=[test_users, test_movies], y=test_ratings)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


0.7582489848136902