# 책 추천 시스템

In [1]:
import pandas as pd
import numpy as np
import warnings; warnings.filterwarnings('ignore')

### 0. 데이터

책 데이터

In [2]:
books = pd.read_csv('./data/books/books.csv', encoding='ISO-8859-1')
print(books.shape)
books.head(2)

(10000, 23)


Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPrÃ©",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...


평점 데이터

In [3]:
ratings = pd.read_csv('./data/books/ratings.csv', encoding='ISO-8859-1')
print(ratings.shape)
ratings.head(2)

(981756, 3)


Unnamed: 0,book_id,user_id,rating
0,1,314,5
1,1,439,3


책 태그

In [4]:
book_tags = pd.read_csv('./data/books/book_tags.csv', encoding='ISO-8859-1')
print(book_tags.shape)
book_tags.head(2)

(999912, 3)


Unnamed: 0,goodreads_book_id,tag_id,count
0,1,30574,167697
1,1,11305,37174


태그 정보

In [5]:
tags = pd.read_csv('./data/books/tags.csv')
print(tags.shape)
tags.tail(2)

(34252, 2)


Unnamed: 0,tag_id,tag_name
34250,34250,ＳＥＲＩＥＳ
34251,34251,ｆａｖｏｕｒｉｔｅｓ


책 태그와 태그정보 merge

In [6]:
tags_join_df = pd.merge(book_tags, tags, left_on='tag_id', right_on='tag_id', how='inner')
tags_join_df.head(2)

Unnamed: 0,goodreads_book_id,tag_id,count,tag_name
0,1,30574,167697,to-read
1,2,30574,24549,to-read


유저가 읽은 책

In [7]:
to_read = pd.read_csv('./data/books/to_read.csv')
print(to_read.shape)
to_read.head(2)

(912705, 2)


Unnamed: 0,user_id,book_id
0,1,112
1,1,235


TF-IDF Vectorize
- authors로 Tfidf 수행

In [8]:
books['authors'][:5]

0                 Suzanne Collins
1    J.K. Rowling, Mary GrandPrÃ©
2                 Stephenie Meyer
3                      Harper Lee
4             F. Scott Fitzgerald
Name: authors, dtype: object

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(books['authors'])
tfidf_matrix

<10000x14742 sparse matrix of type '<class 'numpy.float64'>'
	with 43235 stored elements in Compressed Sparse Row format>

In [10]:
tfidf_matrix.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

linear_kernel을 통한 유사도 측정
- linear_kernel : 코사인 유사도를 구하는 다른 방법

In [11]:
from sklearn.metrics.pairwise import linear_kernel

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

authors 기준 유사한 책 찾기

원하는 책의 인덱스 찾기

In [12]:
titles = books['title']
indices = pd.Series(books.index, index=books['title'])
indices['The Hobbit']

6

해당 책의 유사도 값 호출

In [13]:
cosine_sim[indices['The Hobbit']]

array([0., 0., 0., ..., 0., 0., 0.])

가장 유사한 책의 인덱스 찾기

In [14]:
# 유사도 결과 인덱스를 가진 list형으로 변형
sim_scores = list(enumerate(cosine_sim[indices['The Hobbit']]))
sim_scores = sorted(sim_scores, key=lambda x : x[1], reverse=True)
sim_scores[:3]

[(6, 1.0), (18, 1.0), (154, 1.0)]

작가로 유사한 책 찾기

In [15]:
sim_scores_10 = sim_scores[1: 11]
book_indices = [i[0] for i in sim_scores_10]
titles.iloc[book_indices]

18      The Fellowship of the Ring (The Lord of the Ri...
154            The Two Towers (The Lord of the Rings, #2)
160     The Return of the King (The Lord of the Rings,...
188     The Lord of the Rings (The Lord of the Rings, ...
963     J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...
4975        Unfinished Tales of NÃºmenor and Middle-Earth
2308                               The Children of HÃºrin
610              The Silmarillion (Middle-Earth Universe)
8271                   The Complete Guide to Middle-Earth
1128     The History of the Hobbit, Part One: Mr. Baggins
Name: title, dtype: object

Tag 추가

In [16]:
books_with_tags = pd.merge(books, tags_join_df, left_on='book_id', right_on='goodreads_book_id', how='inner')
books_with_tags.head(2)

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url,goodreads_book_id,tag_id,count,tag_name
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,2767052,30574,11314,to-read
1,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,2767052,11305,10836,fantasy


TfidfVectorize
- tag

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf1 = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix1 = tf1.fit_transform(books_with_tags['tag_name'].head(10000))
tfidf_matrix1

<10000x1381 sparse matrix of type '<class 'numpy.float64'>'
	with 18605 stored elements in Compressed Sparse Row format>

In [18]:
from sklearn.metrics.pairwise import linear_kernel

cosine_sim1 = linear_kernel(tfidf_matrix1, tfidf_matrix1)
cosine_sim1

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

tag 기준 유사한 책 찾기

추천책 반환 함수

In [19]:
titles1 = books['title']
indices1 = pd.Series(books.index, index=books['title'])

def tags_recommendations(title, top_n=10):
    idx = indices1[title]
    sim_scores = list(enumerate(cosine_sim1[idx]))
    sim_scores = sorted(sim_scores, key=lambda x : x[1], reverse=True)
    sim_scores = sim_scores[1:top_n+1]
    book_indices = [i[0] for i in sim_scores]

    return titles1.iloc[book_indices]

tags_recommendations('The Hobbit', 15)

16               Catching Fire (The Hunger Games, #2)
31                                    Of Mice and Men
107      Confessions of a Shopaholic (Shopaholic, #1)
125                         Dune (Dune Chronicles #1)
149                                      The Red Tent
206            One for the Money (Stephanie Plum, #1)
214                                  Ready Player One
231               The Gunslinger (The Dark Tower, #1)
253            Shiver (The Wolves of Mercy Falls, #1)
313                           Inkheart (Inkworld, #1)
325                                    White Oleander
405    The New Drawing on the Right Side of the Brain
412                              The Three Musketeers
425                           A Confederacy of Dunces
505                       The One (The Selection, #3)
Name: title, dtype: object

corpus(author + Tag) 추가

각 영화의 태그를 합쳐 문자열로 구성

In [20]:
temp_df = books_with_tags.groupby('book_id')['tag_name'].apply(' '.join).reset_index()
temp_df.head(2)

Unnamed: 0,book_id,tag_name
0,1,to-read fantasy favorites currently-reading yo...
1,2,to-read fantasy favorites currently-reading yo...


books에 merge

In [21]:
books = pd.merge(books, temp_df, left_on='book_id', right_on='book_id', how='inner')
books.head(2)

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url,tag_name
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,to-read fantasy favorites currently-reading yo...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPrÃ©",1997.0,Harry Potter and the Philosopher's Stone,...,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...,to-read fantasy favorites currently-reading yo...


저자 이름과 태그 합치기

In [22]:
books['corpus'] = (pd.Series(books[['authors', 'tag_name']].fillna('').values.tolist())).str.join(' ')
books['corpus'][:3]

0    Suzanne Collins to-read fantasy favorites curr...
1    J.K. Rowling, Mary GrandPrÃ© to-read fantasy f...
2    Stephenie Meyer to-read fantasy favorites curr...
Name: corpus, dtype: object

TfidfVectorize
- corpus

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf_corpus = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix_corpus = tf_corpus.fit_transform(books['corpus'])
cosine_sim_corpus = linear_kernel(tfidf_matrix_corpus, tfidf_matrix_corpus)

titles = books['title']
indices = pd.Series(books.index, index=books['title'])
indices

title
The Hunger Games (The Hunger Games, #1)                                                         0
Harry Potter and the Sorcerer's Stone (Harry Potter, #1)                                        1
Twilight (Twilight, #1)                                                                         2
To Kill a Mockingbird                                                                           3
The Great Gatsby                                                                                4
                                                                                             ... 
Bayou Moon (The Edge, #2)                                                                    9995
Means of Ascent (The Years of Lyndon Johnson, #2)                                            9996
The Mauritius Command                                                                        9997
Cinderella Ate My Daughter: Dispatches from the Frontlines of the New Girlie-Girl Culture    9998
The First Worl

In [25]:
def corpus_recommendations(title, top_n=10):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim_corpus[idx]))
    sim_scores = sorted(sim_scores, key=lambda x : x[1], reverse=True)
    sim_scores = sim_scores[1:top_n+1]
    book_indices = [i[0] for i in sim_scores]

    return titles.iloc[book_indices]

In [26]:
corpus_recommendations('The Hobbit', 15)

188     The Lord of the Rings (The Lord of the Rings, ...
154            The Two Towers (The Lord of the Rings, #2)
160     The Return of the King (The Lord of the Rings,...
18      The Fellowship of the Ring (The Lord of the Ri...
610              The Silmarillion (Middle-Earth Universe)
4975        Unfinished Tales of NÃºmenor and Middle-Earth
2308                               The Children of HÃºrin
963     J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...
465                             The Hobbit: Graphic Novel
8271                   The Complete Guide to Middle-Earth
1366    The Once and Future King (The Once and Future ...
1321              The Last Unicorn (The Last Unicorn, #1)
53      The Hitchhiker's Guide to the Galaxy (Hitchhik...
367             The Subtle Knife (His Dark Materials, #2)
61            The Golden Compass (His Dark Materials, #1)
Name: title, dtype: object

In [27]:
corpus_recommendations('Twilight (Twilight, #1)', 15)

51                                 Eclipse (Twilight, #3)
48                                New Moon (Twilight, #2)
991                    The Twilight Saga (Twilight, #1-4)
833                         Midnight Sun (Twilight, #1.5)
731     The Short Second Life of Bree Tanner: An Eclip...
1618    The Twilight Saga Complete Collection  (Twilig...
4087    The Twilight Saga: The Official Illustrated Gu...
2020             The Twilight Collection (Twilight, #1-3)
72                                The Host (The Host, #1)
219     Twilight: The Complete Illustrated Movie Compa...
55                           Breaking Dawn (Twilight, #4)
3074    Twilight: The Graphic Novel, Vol. 1 (Twilight:...
1802    The Awakening / The Struggle (The Vampire Diar...
2393    The Fury / Dark Reunion (The Vampire Diaries, ...
418                   Blood Promise (Vampire Academy, #4)
Name: title, dtype: object