- 실습 데이터 : https://www.kaggle.com/datasets/zygmunt/goodbooks-10k

# Steps Overview:

1. 데이터 로드: pandas를 사용하여 데이터를 로드하고 전처리합니다.
2. 사용자-아이템 행렬 생성: 이 행렬은 사용자들이 아이템(책)에 대해 부여한 평점을 나타냅니다.
3. 아이템 간 유사도 계산: 코사인 유사도를 사용하여 평점을 기반으로 유사한 아이템을 찾습니다.
4. 추천 생성: 유사한 아이템(이웃)을 기반으로 아이템을 추천합니다.

In [1]:
# 구글 마운트
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import csv

# 1. 데이터 로드

In [3]:
# Load the datasets : 원본 데이터를 불러오는 경우, similarity계산시에 matrix가 굉장히 커서 out-of-memory가 발생합니다.
# Colab Pro인 경우 원본 데이터를 불러와서 사용해보세요 !
ratings = pd.read_csv("/content/drive/MyDrive/Work/24 DCC/samples/ratings.csv")
books = pd.read_csv("/content/drive/MyDrive/Work/24 DCC/samples/books.csv")
book_tags = pd.read_csv('/content/drive/MyDrive/Work/24 DCC/samples/book_tags.csv')
tags = pd.read_csv('/content/drive/MyDrive/Work/24 DCC/samples/tags.csv')
to_read = pd.read_csv('/content/drive/MyDrive/Work/24 DCC/samples/to_read.csv')

In [4]:
pd.set_option("display.max_columns", None)
books.head(5)

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,title,language_code,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,"The Hunger Games (The Hunger Games, #1)",eng,4.34,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,Harry Potter and the Sorcerer's Stone (Harry P...,eng,4.44,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,"Twilight (Twilight, #1)",en-US,3.57,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,To Kill a Mockingbird,eng,4.25,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,The Great Gatsby,eng,3.89,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [5]:
ratings.head(5)

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3


In [6]:
ratings.info() # infos about samples, features and datatypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 0 to 98
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   user_id  99 non-null     int64
 1   book_id  99 non-null     int64
 2   rating   99 non-null     int64
dtypes: int64(3)
memory usage: 2.4 KB


# 2. 사용자-아이템 행렬 생성

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
# 평점 데이터를 사용하여 사용자-아이템 행렬 생성
user_item_matrix = ratings.pivot_table(index='user_id', columns='book_id', values='rating')
user_item_matrix

book_id,2,5,8,13,14,18,21,23,24,26,27,28,32,33,35,36,42,45,47,50,55,58,65,70,84,86,87,102,103,111,113,123,145,158,184,194,200,219,222,249,255,258,260,264,268,287,297,301,315,325,337,350,362,373,388,413,476,479,492,493,495,529,575,614,640,660,693,772,778,867,964,1117,1237,1296,1432,1796,1937,1967,2172,2318,2584,2686,2732,2738,3020,3638,3753,4081,4622,5425,5556,6195,6351,8519,9114,9296
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1
1,,,,,,,,,,,,,,,,,,,3.0,,,,,,,,,,,,,,,,,,,,,,,5.0,,,3.0,,,,,,,,,,,,,,,,,,,,,,,,,3.0,,,,,,5.0,,,,,,,,3.0,,3.0,,,,,3.0,,,,,
2,,,,,,,,,,4.0,,,,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5.0,,,,,5.0,3.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.0,,5.0,,,,,5.0,4.0,,,,,,5.0,,5.0
4,5.0,4.0,4.0,4.0,,5.0,5.0,5.0,5.0,3.0,5.0,3.0,4.0,3.0,5.0,5.0,5.0,4.0,,4.0,4.0,4.0,4.0,4.0,4.0,4.0,5.0,5.0,5.0,2.0,4.0,4.0,2.0,4.0,4.0,,4.0,4.0,4.0,4.0,2.0,,,3.0,,3.0,4.0,,,5.0,4.0,3.0,,4.0,4.0,4.0,3.0,,4.0,,4.0,,3.0,,3.0,3.0,5.0,,,,4.0,3.0,5.0,3.0,,,,,4.0,,,,,,,,,,,,,,,,,
6,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4.0,,,
8,,,,,5.0,,,,,,,,,,,,,,,,5.0,,,,,,,,,,,,,,,4.0,,,,,,,,,,,,,,,,,5.0,,,,,4.0,,5.0,,5.0,,4.0,,,,4.0,5.0,,,,,,3.0,,3.0,3.0,,,5.0,,5.0,,5.0,,,,5.0,5.0,,4.0,,,5.0,


In [10]:
# 결측값을 0으로 채움 (평점이 없는 경우 평점이 주어지지 않은 것으로 가정)
user_item_matrix.fillna(0, inplace=True)
user_item_matrix

book_id,2,5,8,13,14,18,21,23,24,26,27,28,32,33,35,36,42,45,47,50,55,58,65,70,84,86,87,102,103,111,113,123,145,158,184,194,200,219,222,249,255,258,260,264,268,287,297,301,315,325,337,350,362,373,388,413,476,479,492,493,495,529,575,614,640,660,693,772,778,867,964,1117,1237,1296,1432,1796,1937,1967,2172,2318,2584,2686,2732,2738,3020,3638,3753,4081,4622,5425,5556,6195,6351,8519,9114,9296
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,5.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,5.0,4.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,5.0
4,5.0,4.0,4.0,4.0,0.0,5.0,5.0,5.0,5.0,3.0,5.0,3.0,4.0,3.0,5.0,5.0,5.0,4.0,0.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,5.0,5.0,5.0,2.0,4.0,4.0,2.0,4.0,4.0,0.0,4.0,4.0,4.0,4.0,2.0,0.0,0.0,3.0,0.0,3.0,4.0,0.0,0.0,5.0,4.0,3.0,0.0,4.0,4.0,4.0,3.0,0.0,4.0,0.0,4.0,0.0,3.0,0.0,3.0,3.0,5.0,0.0,0.0,0.0,4.0,3.0,5.0,3.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,4.0,0.0,5.0,0.0,5.0,0.0,4.0,0.0,0.0,0.0,4.0,5.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,3.0,3.0,0.0,0.0,5.0,0.0,5.0,0.0,5.0,0.0,0.0,0.0,5.0,5.0,0.0,4.0,0.0,0.0,5.0,0.0


In [11]:
# 아이템 기반 필터링을 위해 행렬을 전치하여 아이템-사용자 행렬로 변환 (사용자 기반 필터링은 어떻게 해야할까요 ?)
item_user_matrix = user_item_matrix.T
item_user_matrix

user_id,1,2,4,6,8
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,0.0,0.0,5.0,0.0,0.0
5,0.0,0.0,4.0,0.0,0.0
8,0.0,0.0,4.0,0.0,0.0
13,0.0,0.0,4.0,0.0,0.0
14,0.0,0.0,0.0,0.0,5.0
...,...,...,...,...,...
6195,0.0,0.0,0.0,0.0,4.0
6351,0.0,0.0,0.0,4.0,0.0
8519,0.0,5.0,0.0,0.0,0.0
9114,0.0,0.0,0.0,0.0,5.0


# 3. 아이템 간 유사도 계산

In [12]:
# 아이템 간 코사인 유사도 행렬 계산
item_similarity_matrix = cosine_similarity(item_user_matrix)
item_similarity_matrix

array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 1.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 1., 0., 1.]])

In [13]:
# 유사도 행렬을 DataFrame으로 변환하여 쉽게 조작할 수 있도록 함
item_similarity_df = pd.DataFrame(item_similarity_matrix, index=item_user_matrix.index, columns=item_user_matrix.index)

# 아이템 유사도 행렬의 일부를 표시하여 확인
item_similarity_df.head()

book_id,2,5,8,13,14,18,21,23,24,26,27,28,32,33,35,36,42,45,47,50,55,58,65,70,84,86,87,102,103,111,113,123,145,158,184,194,200,219,222,249,255,258,260,264,268,287,297,301,315,325,337,350,362,373,388,413,476,479,492,493,495,529,575,614,640,660,693,772,778,867,964,1117,1237,1296,1432,1796,1937,1967,2172,2318,2584,2686,2732,2738,3020,3638,3753,4081,4622,5425,5556,6195,6351,8519,9114,9296
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1
2,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.6,1.0,1.0,1.0,0.6,1.0,1.0,1.0,1.0,0.0,1.0,0.624695,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.6,1.0,1.0,1.0,0.6,1.0,1.0,1.0,1.0,0.0,1.0,0.624695,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.6,1.0,1.0,1.0,0.6,1.0,1.0,1.0,1.0,0.0,1.0,0.624695,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.6,1.0,1.0,1.0,0.6,1.0,1.0,1.0,1.0,0.0,1.0,0.624695,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.780869,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0


# 4. 추천

In [14]:
import numpy as np

# 추천할 사용자를 선택
user_id = 4  # 예시 사용자 ID

# 아이템 유사도와 사용자의 과거 평점을 기반으로 미평가된 아이템의 평점 예측
predicted_ratings = {} # {key : value} = {item_id 1: predicted rating 1, item_id2 : predicted_rating 2}

In [15]:
# 사용자의 평점 가져오기
user_ratings = user_item_matrix.loc[user_id]
user_ratings

Unnamed: 0_level_0,4
book_id,Unnamed: 1_level_1
2,5.0
5,4.0
8,4.0
13,4.0
14,0.0
...,...
6195,0.0
6351,0.0
8519,0.0
9114,0.0


In [17]:
# 사용자가 아직 평가하지 않은 아이템 찾기
unrated_items = user_ratings[user_ratings == 0].index
unrated_items

Index([  14,   47,  194,  258,  260,  268,  301,  315,  362,  479,  493,  529,
        614,  772,  778,  867, 1432, 1796, 1937, 1967, 2318, 2584, 2686, 2732,
       2738, 3020, 3638, 3753, 4081, 4622, 5425, 5556, 6195, 6351, 8519, 9114,
       9296],
      dtype='int64', name='book_id')

In [18]:
# unrated_items의 모든 아이템에 대해 예측 평점 계산
for item in unrated_items:
    # 각 미평가된 아이템에 대한 유사도 계산
    similar_items = item_similarity_df[item]

    # 유사도 값의 상위 20%를 기준으로 선택
    similar_items_sorted = similar_items.sort_values(ascending=False)
    top_n_items = int(len(similar_items_sorted) * (20 / 100))
    top_similar_items = similar_items_sorted.iloc[:top_n_items]

    # 유사 아이템 간의 유사도를 가중치로 하여 사용자의 평점을 가중 평균하여 예측 평점 계산
    weighted_ratings_sum = 0 # 분자
    similarity_sum = 0 # 분모

    for item_id in top_similar_items.index:
        # 사용자가 평가한 아이템만 고려
        if user_ratings[item_id] > 0:
            similarity = top_similar_items[item_id]
            rating = user_ratings[item_id]
            weighted_ratings_sum += similarity * rating
            similarity_sum += similarity

    if similarity_sum != 0:
        predicted_rating = weighted_ratings_sum / similarity_sum
    else:
        predicted_rating = 0

    # 예측된 평점 저장
    predicted_ratings[item] = predicted_rating

In [20]:
# 예측된 평점을 정렬하여 상위 추천 항목 가져오기
top_recommendations = sorted(predicted_ratings.items(), key=lambda x: x[1], reverse=True)
top_recommendations

[(260, 3.0000000000000004),
 (301, 3.0000000000000004),
 (315, 3.0000000000000004),
 (2318, 3.0000000000000004),
 (2686, 3.0000000000000004),
 (3753, 3.0000000000000004),
 (4081, 3.0000000000000004),
 (8519, 3.0000000000000004),
 (9296, 3.0000000000000004),
 (14, 0),
 (47, 0),
 (194, 0),
 (258, 0),
 (268, 0),
 (362, 0),
 (479, 0),
 (493, 0),
 (529, 0),
 (614, 0),
 (772, 0),
 (778, 0),
 (867, 0),
 (1432, 0),
 (1796, 0),
 (1937, 0),
 (1967, 0),
 (2584, 0),
 (2732, 0),
 (2738, 0),
 (3020, 0),
 (3638, 0),
 (4622, 0),
 (5425, 0),
 (5556, 0),
 (6195, 0),
 (6351, 0),
 (9114, 0)]

In [22]:
# 사용자가 이미 평가한 항목을 제외하고 추천된 책 ID 출력
recommendations = [item[0] for item in top_recommendations]

print("사용자 {}에게 추천하는 책: {}".format(user_id, recommendations[:5]))

사용자 4에게 추천하는 책: [260, 301, 315, 2318, 2686]


# 5. 평가

#### Q. 어떤 유사도를 가진 추천이 가장 성능이 좋을까요 ?

In [26]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

actual_ratings = []  # 실제 평점을 저장할 리스트
predicted_ratings = []  # 예측된 평점을 저장할 리스트

# 모든 사용자와 사용자가 평가한 아이템에 대해 반복
for user_id in user_item_matrix.index:
    user_ratings = user_item_matrix.loc[user_id]  # 특정 사용자의 평점 데이터를 가져옴

    for item in user_ratings[user_ratings > 0].index:  # 사용자가 평가한 각 아이템에 대해 반복
        # 실제 평점을 가져옴
        actual_rating = user_ratings[item]
        actual_ratings.append(actual_rating)  # 실제 평점을 리스트에 추가

        # 유사도 행렬에서 해당 아이템과 다른 아이템 간의 유사도를 가져옴
        similar_items = item_similarity_df[item]

        weighted_ratings_sum = 0  # 가중합을 초기화
        similarity_sum = 0  # 유사도 합을 초기화

        # 사용자가 평가한 다른 아이템들에 대해 반복
        for rated_item in user_ratings[user_ratings > 0].index:
            if rated_item != item:  # 현재 아이템과 다른 아이템에 대해서만 계산
                similarity = similar_items[rated_item]  # 두 아이템 간의 유사도
                rating = user_ratings[rated_item]  # 사용자가 평가한 다른 아이템의 평점

                weighted_ratings_sum += similarity * rating  # 유사도와 평점의 곱을 가중합에 추가
                similarity_sum += abs(similarity)  # 유사도의 절대값을 유사도 합에 추가

        if similarity_sum != 0:
            predicted_rating = weighted_ratings_sum / similarity_sum  # 가중평균을 사용하여 예측 평점을 계산
        else:
            predicted_rating = 0  # 유사도 합이 0인 경우 예측 평점을 0으로 설정

        predicted_ratings.append(predicted_rating)  # 예측된 평점을 리스트에 추가

# MSE와 MAE 계산
mse = mean_squared_error(actual_ratings, predicted_ratings)  # 실제 평점과 예측된 평점 간의 평균 제곱 오차를 계산
mae = mean_absolute_error(actual_ratings, predicted_ratings)  # 실제 평점과 예측된 평점 간의 평균 절대 오차를 계산

# MSE와 MAE 출력
print("MSE: {}, MAE: {}".format(mse, mae))  # 결과를 출력

MSE: 0.8490741677765032, MAE: 0.6848150750937727


In [27]:
print(f"MSE: {mse:.4f}, MAE: {mae:.4f}")

MSE: 0.8491, MAE: 0.6848


In [28]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

# 유사도 함수들을 이용하여 유사도 행렬 생성
cosine_sim = cosine_similarity(item_user_matrix)
euclidean_sim = 1 / (1 + euclidean_distances(item_user_matrix))  # 거리에서 유사도로 변환
pearson_sim = np.corrcoef(item_user_matrix)

# 유사도 행렬을 DataFrame으로 변환
cosine_sim_df = pd.DataFrame(cosine_sim, index=item_user_matrix.index, columns=item_user_matrix.index)
euclidean_sim_df = pd.DataFrame(euclidean_sim, index=item_user_matrix.index, columns=item_user_matrix.index)
pearson_sim_df = pd.DataFrame(pearson_sim, index=item_user_matrix.index, columns=item_user_matrix.index)

# 평가를 위한 함수 정의
def evaluate_similarity(similarity_df):
    actual_ratings = []
    predicted_ratings = []

    # 모든 사용자와 사용자가 평가한 아이템에 대해 반복
    for user_id in user_item_matrix.index:
        user_ratings = user_item_matrix.loc[user_id]

        for item in user_ratings[user_ratings > 0].index:
            # 실제 평점
            actual_rating = user_ratings[item]
            actual_ratings.append(actual_rating)

            # 유사도 행렬을 사용하여 평점을 예측
            similar_items = similarity_df[item]

            weighted_ratings_sum = 0
            similarity_sum = 0

            # 사용자가 평가한 다른 아이템들에 대해 반복
            for rated_item in user_ratings[user_ratings > 0].index:
                if rated_item != item:
                    similarity = similar_items[rated_item]
                    rating = user_ratings[rated_item]

                    weighted_ratings_sum += similarity * rating
                    similarity_sum += abs(similarity)

            if similarity_sum != 0:
                predicted_rating = weighted_ratings_sum / similarity_sum
            else:
                predicted_rating = 0

            predicted_ratings.append(predicted_rating)

    # MSE와 MAE 계산
    mse = mean_squared_error(actual_ratings, predicted_ratings)
    mae = mean_absolute_error(actual_ratings, predicted_ratings)

    return mse, mae

# 각 유사도 함수에 대해 평가 수행
cosine_mse, cosine_mae = evaluate_similarity(cosine_sim_df)
euclidean_mse, euclidean_mae = evaluate_similarity(euclidean_sim_df)
pearson_mse, pearson_mae = evaluate_similarity(pearson_sim_df)

# 결과 출력
print(f"코사인 유사도 - MSE: {cosine_mse}, MAE: {cosine_mae}")
print(f"유클리드 거리 - MSE: {euclidean_mse}, MAE: {euclidean_mae}")
print(f"피어슨 상관 계수 - MSE: {pearson_mse}, MAE: {pearson_mae}")

코사인 유사도 - MSE: 0.8490741677765032, MAE: 0.6848150750937727
유클리드 거리 - MSE: 0.5553086462879866, MAE: 0.4946830412512544
피어슨 상관 계수 - MSE: 0.8484604934430481, MAE: 0.6829695546809509


# 6. (Optional) Content-Based Recommendation
- 텍스트 데이터를 활용하고 싶어요.
1. 데이터 준비
우선 title, original_title, authors 열을 사용하여 TF-IDF 벡터화를 수행합니다. 만약 original_title이 결측값일 경우, title로 대체할 수 있습니다.

2. TF-IDF 벡터화 및 유사도 계산
텍스트 데이터를 TF-IDF로 벡터화한 후, 코사인 유사도를 계산하여 책 간의 유사도를 측정합니다.

3. 추천 시스템 구현
유사도를 기반으로 특정 책과 유사한 책들을 추천하는 시스템을 구축합니다.

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# title 또는 original_title을 결합하여 텍스트 데이터를 생성
books['combined_text'] = books['title'] + " " + books['original_title'].fillna('') + " " + books['authors']

# Step 1: TF-IDF 벡터화
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(books['combined_text'])

# Step 2: 코사인 유사도를 사용하여 유사도 행렬 계산
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# 유사도 행렬을 DataFrame으로 변환 (가독성을 위해 book_id를 인덱스와 컬럼으로 사용)
cosine_sim_df = pd.DataFrame(cosine_sim, index=books['book_id'], columns=books['book_id'])

In [30]:
# Step 3: 특정 책과 유사한 책을 추천하는 함수
def recommend_books(book_id, cosine_sim_df, books_df, top_n=5):
    # 선택한 책과 다른 책들 간의 유사도를 가져옴
    similar_books = cosine_sim_df[book_id].sort_values(ascending=False)

    # 본인을 제외한 상위 N개의 유사한 책을 선택
    top_books = similar_books.iloc[1:top_n+1]

    # 추천할 책의 제목을 반환
    recommended_titles = books_df.loc[books_df['book_id'].isin(top_books.index), 'title']
    return recommended_titles

In [31]:
# 예시: 특정 book_id와 유사한 책 추천
book_id_to_recommend = 1  # 예시로 사용할 book_id (실제로 사용시 적절한 ID로 대체)
recommended_books = recommend_books(book_id_to_recommend, cosine_sim_df, books, top_n=3)
print(f"'{books.loc[books['book_id'] == book_id_to_recommend, 'title'].values[0]}'와 유사한 추천 책들:")
print(recommended_books)

'The Hunger Games (The Hunger Games, #1)'와 유사한 추천 책들:
16    Catching Fire (The Hunger Games, #2)
19       Mockingjay (The Hunger Games, #3)
75                   Sense and Sensibility
Name: title, dtype: object
