In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import font_manager, rc
from sklearn.preprocessing import LabelEncoder
import regex as re
import tensorflow as tf
import tensorflow_recommenders as tfrs
from typing import Dict, Text  # Dict 임포트 추가
font_path = "C:/Windows/Fonts/malgun.ttf"
# font = font_manager.FontProperties(fname = font_path).get_name()
# rc('font', family = font)

In [2]:
#pip install tensorflow_recommenders

In [3]:
train1 = pd.read_csv("train.csv")
train=train1.copy()
train.shape

(871393, 10)

In [3]:
display(train.head(3))

Unnamed: 0,ID,User-ID,Book-ID,Book-Rating,Age,Location,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,TRAIN_000000,USER_00000,BOOK_044368,8,23.0,"sackville, new brunswick, canada",Road Taken,Rona Jaffe,2001.0,Mira
1,TRAIN_000001,USER_00000,BOOK_081205,8,23.0,"sackville, new brunswick, canada",Macbeth (New Penguin Shakespeare),William Shakespeare,1981.0,Penguin Books
2,TRAIN_000002,USER_00000,BOOK_086781,0,23.0,"sackville, new brunswick, canada",Waverley (Penguin English Library),Walter Scott,1981.0,Penguin Books


In [4]:
import tensorflow as tf
import numpy as np
import pandas as pd
from typing import Dict, Text
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
# 데이터 불러오기
train1 = pd.read_csv("train.csv")
train_data=train1.copy()
# 필요한 열만 선택
train_data = train_data[["User-ID", "Book-Rating", "Book-Title"]]


# TensorFlow 데이터셋으로 변환
ratings = tf.data.Dataset.from_tensor_slices({
        "book_title": train_data["Book-Title"].values,
        "user_id": train_data["User-ID"].values,
        "book_rating": train_data["Book-Rating"].values
})
# 데이터셋 분할 비율 설정
train_size = int(len(train_data) * 0.8)
test_size = len(train_data) - train_size

# 훈련 및 테스트 세트로 분할
train = ratings.take(train_size)
test = ratings.skip(train_size)

# 도서 제목과 사용자 ID 추출
unique_user_ids = np.unique(train_data["User-ID"].values)
unique_book_titles = np.unique(train_data["Book-Title"].values)

# 정보 출력
print("First 10 unique book titles:", list(unique_book_titles)[:10])
print("Number of unique book titles:", len(unique_book_titles))
print("Number of unique user ids:", len(unique_user_ids))

First 10 unique book titles: [' A Light in the Storm: The Civil War Diary of Amelia Martin, Fenwick Island, Delaware, 1861 (Dear America)', ' Always Have Popsicles', " Apple Magic (The Collector's series)", ' Ask Lily (Young Women of Faith: Lily Series, Book 5)', ' Beyond IBM: Leadership Marketing and Finance for the 1990s', ' Clifford Visita El Hospital (Clifford El Gran Perro Colorado)', ' Deceived', ' Earth Prayers From around the World: 365 Prayers, Poems, and Invocations for Honoring the Earth', ' Final Fantasy Anthology: Official Strategy Guide (Brady Games)', ' Flight of Fancy: American Heiresses (Zebra Ballad Romance)']
Number of unique book titles: 217829
Number of unique user ids: 83256


## TFRS - RetrievalModel

In [16]:
class RetrievalModel(tf.keras.Model):
  def __init__(self):
    super().__init__()
    embedding_dimension = 32 # 임베딩 벡터 차원
    # 순차적으로 쌓기 위해(문자열-> 정수인덱스)
    self.user_model = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
          vocabulary=unique_user_ids, mask_token=None), # 모든 토큰 처리
      
      tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension) # 추가적인 임베딩 생성
    ])

    self.book_model = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
          vocabulary=unique_book_titles, mask_token=None),
      tf.keras.layers.Embedding(len(unique_book_titles) + 1, embedding_dimension)
    ])

In [17]:
# 추천 책에 대한 임베딩 생성을 위한 데이터 셋 준비
class BookRetrievalModel(tfrs.Model):

  def __init__(self):
    super().__init__()

    self.retrieval_model = RetrievalModel() #북 추천 모델
    self.task = tfrs.tasks.Retrieval(
      metrics = tfrs.metrics.FactorizedTopK(
        candidates = tf.data.Dataset.from_tensor_slices(train_data['Book-Title']).batch(128).map(self.retrieval_model.book_model) # 책 제목을 개별적인 텐서로 나눔
      )
    )
  # loss 값 계산
  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor: # 학습모드 여부
    
    user_embeddings = self.retrieval_model.user_model(features["user_id"]) # 사용자 ID에 대한 사용자 임베딩을 추출
    positive_book_embeddings = self.retrieval_model.book_model(features["book_title"]) # 책 제목에 대한 사용자 임베딩을 추출

    return self.task(user_embeddings, positive_book_embeddings)

In [18]:
retrieval_model = BookRetrievalModel()
retrieval_model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [19]:
cached_train = train.cache()
cached_test = test.cache()

In [21]:
retrieval_model.fit(cached_train, epochs=3)

Epoch 1/3


     2/697114 [..............................] - ETA: 151:05:43 - factorized_top_k/top_1_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_5_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_10_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_50_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_100_categorical_accuracy: 0.0000e+00 - loss: 0.0000e+00 - regularization_loss: 0.0000e+00 - total_loss: 0.0000e+00

KeyboardInterrupt: 

## TFRS - Ranking model

In [30]:
import tensorflow as tf
import numpy as np
import pandas as pd
from typing import Dict, Text
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
import tensorflow_recommenders as tfrs

# 데이터 불러오기
train1 = pd.read_csv("train.csv")
train_data = train1.copy()

# 필요한 열만 선택
train_data = train_data[["User-ID", "Book-Rating", "Book-Title"]]

# Book-Rating 값에 따른 클래스 분포 확인
class_distribution = train_data["Book-Rating"].value_counts()
print("Class Distribution:\n", class_distribution)

# 오버샘플링 적용
oversampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(train_data.drop(columns=["Book-Rating"]), train_data["Book-Rating"])

# 샘플링 결과 확인
print("Resampled Class Distribution:\n", pd.Series(y_resampled).value_counts())

# 사용자 ID와 책 제목에 StringLookup 레이어 적용
user_ids_vocabulary = np.unique(X_resampled["User-ID"])
book_titles_vocabulary = np.unique(X_resampled["Book-Title"])

user_ids_lookup = tf.keras.layers.StringLookup(vocabulary=user_ids_vocabulary, mask_token=None)
book_titles_lookup = tf.keras.layers.StringLookup(vocabulary=book_titles_vocabulary, mask_token=None)

# TensorFlow 데이터셋으로 변환
ratings = tf.data.Dataset.from_tensor_slices({
        "user_id": user_ids_lookup(X_resampled["User-ID"]),
        "book_title": book_titles_lookup(X_resampled["Book-Title"]),
        "book_rating": y_resampled
})

# 데이터셋 분할 비율 설정
train_size = int(len(X_resampled) * 0.8)
test_size = len(X_resampled) - train_size

# 훈련 및 테스트 세트로 분할
train = ratings.take(train_size)
test = ratings.skip(train_size)

# RankingModel 정의
class RankingModel(tf.keras.Model):
    def __init__(self, user_embedding_dimension=64, book_embedding_dimension=64):
        super().__init__()
        
        self.user_embeddings = tf.keras.Sequential([
            tf.keras.layers.Embedding(len(user_ids_vocabulary) + 1, user_embedding_dimension)
        ])
        
        self.book_embeddings = tf.keras.Sequential([
            tf.keras.layers.Embedding(len(book_titles_vocabulary) + 1, book_embedding_dimension)
        ])

        # predictions
        self.ratings = tf.keras.Sequential([
            # Learn multiple dense layers.
            tf.keras.layers.Dense(256, activation="relu"),
            tf.keras.layers.Dense(64, activation="relu"),
            # Make rating predictions in the final layer.
            tf.keras.layers.Dense(1)
        ])

    def call(self, inputs):
        user_id, book_title = inputs

        user_embedding = self.user_embeddings(user_id)
        book_embedding = self.book_embeddings(book_title)

        return self.ratings(tf.concat([user_embedding, book_embedding], axis=1)) # 최종 평점 예측

# BookRankingModel 정의
class BookRankingModel(tfrs.models.Model):
    def __init__(self, user_embedding_dimension=64, book_embedding_dimension=64):
        super().__init__()
        self.ranking_model = RankingModel(user_embedding_dimension, book_embedding_dimension)
        self.task = tfrs.tasks.Ranking(
            loss=tf.keras.losses.MeanSquaredError(), 
            metrics=[tf.keras.metrics.RootMeanSquaredError()]
        )

    def call(self, features):
        return self.ranking_model(
            (features["user_id"], features["book_title"])
        )

    def compute_loss(self, features, training=False):
        labels = features["book_rating"]
        rating_predictions = self(features)
        return self.task(labels=labels, predictions=rating_predictions)

# BookRankingModel 클래스 초기화
ranking_model = BookRankingModel()
ranking_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01))

cached_train = train.batch(4096).cache()
cached_test = test.batch(4096).cache()

# 모델 훈련
history = ranking_model.fit(cached_train, epochs=5, validation_data=cached_test)

# 손실 값 확인
train_loss = history.history['loss']
val_loss = history.history['val_loss']


Class Distribution:
 Book-Rating
0     548804
8      76971
10     60024
7      55852
9      50494
5      38416
6      26670
4       6462
3       4374
2       2019
1       1307
Name: count, dtype: int64
Resampled Class Distribution:
 Book-Rating
8     548804
0     548804
5     548804
9     548804
7     548804
6     548804
10    548804
2     548804
3     548804
4     548804
1     548804
Name: count, dtype: int64
Epoch 1/5




[1m1180/1180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 57ms/step - loss: 3.0934 - regularization_loss: 0.0000e+00 - root_mean_squared_error: 2.7057 - total_loss: 3.0934 - val_loss: 4.2893 - val_regularization_loss: 0.0000e+00 - val_root_mean_squared_error: 1.4911 - val_total_loss: 4.2893
Epoch 2/5
[1m1180/1180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 53ms/step - loss: 3.1173 - regularization_loss: 0.0000e+00 - root_mean_squared_error: 2.7213 - total_loss: 3.1173 - val_loss: 5.8159 - val_regularization_loss: 0.0000e+00 - val_root_mean_squared_error: 1.7911 - val_total_loss: 5.8159
Epoch 3/5
[1m1180/1180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 54ms/step - loss: 3.9179 - regularization_loss: 0.0000e+00 - root_mean_squared_error: 3.1897 - total_loss: 3.9179 - val_loss: 4.3978 - val_regularization_loss: 0.0000e+00 - val_root_mean_squared_error: 1.5081 - val_total_loss: 4.3978
Epoch 4/5
[1m1180/1180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

In [40]:
# 테스트 데이터셋에서 예측 평점 및 실제 평점 가져오기
predicted_ratings = ranking_model.predict(cached_test).flatten()
actual_ratings = np.concatenate([sample["book_rating"].numpy() for sample in cached_test])

# 데이터프레임 생성
result = pd.DataFrame({'predict_rating': predicted_ratings, 'actual_ranking': actual_ratings})

# 결과 출력
print(result)

[1m  1/295[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 1ms/step

[1m295/295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
         predict_rating  actual_ranking
0              7.999862               8
1              8.407794               8
2              7.977655               8
3              8.222039               8
4              8.007927               8
...                 ...             ...
1207364        8.037879              10
1207365        8.767473              10
1207366        8.032455              10
1207367        6.193080              10
1207368        7.496315              10

[1207369 rows x 2 columns]


In [47]:
# 테스트 데이터셋에서 상위 5개 도서를 평가하는 코드
user_rand_index = np.where(unique_user_ids == 'USER_00000')[0][0]
test_ratings = {}

for sample in test.take(5):
    user_id = tf.convert_to_tensor([sample['user_id'].numpy()])
    book_title = tf.convert_to_tensor([sample['book_title'].numpy()])
    rating_predictions = ranking_model({
        'user_id': user_id,
        'book_title': book_title
    })
    
    book_title_str = unique_book_titles[sample["book_title"].numpy()]  # 실제 책 제목 추출
    test_ratings[book_title_str] = rating_predictions.numpy().flatten()[0]

print("사용자 {}를 위한 상위 5개 권장 제품: ".format(unique_user_ids[user_rand_index]))
for title in sorted(test_ratings, key=test_ratings.get, reverse=True):
    print(title)

사용자 USER_00000를 위한 상위 5개 권장 제품: 
The Pulse of Enterprise: Timeframe Ad 1800-1850 (Time Frame)
Woman Without A Name (Harlequin Intimate Moments, No 751)
And the Coyotes Howled: Family Adventures in Pleasant Valley
The Seven Years in Tibet: Screenplay and Story Behind the Film (Newmarket Pictorial Moviebook)
Mornings A Seven (Harlequin Desire, No 659)


In [48]:
train1[train1['User-ID'] == 'USER_00000']

Unnamed: 0,ID,User-ID,Book-ID,Book-Rating,Age,Location,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,TRAIN_000000,USER_00000,BOOK_044368,8,23.0,"sackville, new brunswick, canada",Road Taken,Rona Jaffe,2001.0,Mira
1,TRAIN_000001,USER_00000,BOOK_081205,8,23.0,"sackville, new brunswick, canada",Macbeth (New Penguin Shakespeare),William Shakespeare,1981.0,Penguin Books
2,TRAIN_000002,USER_00000,BOOK_086781,0,23.0,"sackville, new brunswick, canada",Waverley (Penguin English Library),Walter Scott,1981.0,Penguin Books
3,TRAIN_000003,USER_00000,BOOK_098622,0,23.0,"sackville, new brunswick, canada",Mother Earth Father Sky,Sue Harrison,1991.0,Avon
4,TRAIN_000004,USER_00000,BOOK_180810,8,23.0,"sackville, new brunswick, canada",She Who Remembers,Linda Lay Shuler,1989.0,Signet Book
5,TRAIN_000005,USER_00000,BOOK_206799,5,23.0,"sackville, new brunswick, canada",Neuromancer (Remembering Tomorrow),William Gibson,1995.0,Ace Books
6,TRAIN_000006,USER_00000,BOOK_239414,9,23.0,"sackville, new brunswick, canada",The Little Prince,Antoine de Saint-ExupÃ©ry,1982.0,Harvest Books
7,TRAIN_000007,USER_00000,BOOK_269070,0,23.0,"sackville, new brunswick, canada",Forests of the Heart (Newford),Charles de Lint,2001.0,Tor Books
