# 1. Goodbooks-10k 
- Link : https://www.kaggle.com/zygmunt/goodbooks-10k

In [None]:
import pandas as pd
import numpy as np
import plotnine 
from plotnine import *
import os, sys, gc
from tqdm.notebook import tqdm
import warnings 
warnings.filterwarnings('ignore')

In [None]:
path = '../input/t-academy-recommendation2/books/'
print(os.listdir(path))

In [None]:
# Load each CSV of the dataset from the input directory into its own DataFrame.
books, book_tags, train, test, tags, to_read = (
    pd.read_csv(path + csv_name)
    for csv_name in (
        "books.csv",
        "book_tags.csv",
        "train.csv",
        "test.csv",
        "tags.csv",
        "to_read.csv",
    )
)

In [None]:
# Book ids are handled as strings throughout the notebook (they are later used
# as dictionary keys and as Word2Vec tokens), so cast them once up front.
for frame in (train, test, books):
    frame['book_id'] = frame['book_id'].astype(str)

In [None]:
# Popularity baseline: the 500 book ids with the largest `books_count`.
# NOTE(review): in the public goodbooks-10k data `books_count` looks like the
# number of editions, not the number of ratings -- confirm this is the intended
# popularity signal.
popular_rec_model = (
    books
    .sort_values(by='books_count', ascending=False)
    .loc[:, 'book_id']
    .values[:500]
)

In [None]:
# Ground truth for evaluation: user_id -> list of the distinct book ids that
# user has in the test set.
sol = test.groupby(['user_id'])['book_id'].agg({'unique'}).reset_index()

# `sol` has exactly one row per user after the groupby, so the mapping can be
# built in a single pass instead of re-filtering the frame for every user.
gt = {
    user: list(book_ids)
    for user, book_ids in zip(sol['user_id'].values, sol['unique'].values)
}

In [None]:
# One row per distinct user that appears in the training set.
rec_df = pd.DataFrame({'user_id': train['user_id'].unique()})

## TF-IDF를 이용한 Contents Based Model 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn every book title into a TF-IDF vector, ignoring English stop words.
# Row i of `tfidf_matrix` is the vector for the title in row i of `books`.
# NOTE(review): assumes `books['title']` has no missing values -- confirm.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(books['title'])
print(tfidf_matrix.shape)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Title-to-title cosine similarity: entry [i, j] compares the titles in rows i
# and j of `books`. The result has one row and one column per title, so its
# memory use grows quadratically with the number of books.
cosine_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
cosine_matrix.shape

In [None]:
# NOTE: the dictionary names are kept because other cells use them, but they
# read backwards relative to their contents -- rely on the comments below.

# book2id: row position in `books` -> title.
book2id = dict(enumerate(books['title']))

# id2book: title -> row position. When a title occurs more than once, the last
# row with that title wins.
id2book = {title: row for row, title in book2id.items()}

# bookid2book: title -> book_id (again, the last row wins for a repeated title).
bookid2book = dict(zip(books['title'].values, books['book_id'].values))

In [None]:
books['title'].head()

In [None]:
# Example: the ten titles most similar to "Twilight", excluding the book itself.
idx = id2book['Twilight (Twilight, #1)']
sim_scores = sorted(
    (
        (book2id[row], score)
        for row, score in enumerate(cosine_matrix[idx])
        if row != idx
    ),
    key=lambda pair: pair[1],
    reverse=True,
)
sim_scores[0:10]

0. 학습셋에서 제목이 있는 경우에 대해서만 진행
1. 각 유저별로 읽은 책의 목록을 수집 
2. 읽은 책과 유사한 책 추출 
3. 모든 책에 대해서 유사도를 더한 값을 계산 
4. 3에서 유사도가 가장 높은 순서대로 추출 

In [None]:
# Attach each book's title to the training interactions; rows whose book_id has
# no match in `books` get a missing title.
train = train.merge(books[['book_id', 'title']], on='book_id', how='left')
train.head()

In [None]:
# 0. Only keep the training rows that have a title.
tf_train = train.dropna(subset=['title']).reset_index(drop=True)
# Row position of the title inside the TF-IDF / cosine matrices.
tf_train['idx2title'] = tf_train['title'].map(lambda title: id2book[title])
tf_train.head()

In [None]:
# Title row position -> book_id, taken from the training rows (a later row
# overwrites an earlier one for the same position).
idx2title2book = dict(zip(tf_train['idx2title'].values, tf_train['book_id'].values))

In [None]:
# 1. Collect the list of books (as title row positions) read by each user,
#    then look at a single example user.
user = 7
read_list = tf_train.groupby(['user_id'])['idx2title'].agg({'unique'}).reset_index()
seen = read_list.loc[read_list['user_id'] == user, 'unique'].values[0]
seen

In [None]:
# 2. 읽은 책과 유사한 책 추출 
## 343번째 책과 다른 책들간의 유사도 
cosine_matrix[343]

In [None]:
# 2. Extract books similar to the ones the user has read.
# 3. For every book, accumulate its similarity to each read book, so the final
#    vector is the combined similarity to the user's whole reading list.
total_cosine_sim = np.zeros(len(book2id))
for title_row in seen:
    np.add(total_cosine_sim, cosine_matrix[title_row], out=total_cosine_sim)

In [None]:
# 4. Rank the candidates from step 3 by combined similarity, highest first,
#    leaving out the books the user has already read.
sim_scores = sorted(
    (
        (row, score)
        for row, score in enumerate(total_cosine_sim)
        if row not in seen
    ),
    key=lambda pair: pair[1],
    reverse=True,
)
sim_scores[0:5]

In [None]:
book2id[4809]

In [None]:
bookid2book[book2id[4809]]

In [None]:
tf_train['user_id'].unique()

In [None]:
tf_train.head()

In [None]:
## Build TF-IDF (content-based) recommendations for every user in the training set.
total_rec_list = {}

read_list1 = train.groupby(['user_id'])['book_id'].agg({'unique'}).reset_index()
read_list2 = tf_train.groupby(['user_id'])['idx2title'].agg({'unique'}).reset_index()

# Per-user lookups built once, so the loop does not re-filter the frames (or
# recompute `.unique()`) for every single user.
read_books_by_user = dict(zip(read_list1['user_id'].values, read_list1['unique'].values))
read_titles_by_user = dict(zip(read_list2['user_id'].values, read_list2['unique'].values))

for user in tqdm(train['user_id'].unique()):
    rec_list = []
    # Book ids the user already has in the training set; never recommend these.
    read_ids = set(read_books_by_user[user])

    # Users with at least one titled book get content-based recommendations.
    if user in read_titles_by_user:
        # 1. Title row positions of the books this user has read.
        seen = read_titles_by_user[user]
        seen_rows = set(seen)

        # 2-3. Sum, for every book, its similarity to each book the user read.
        total_cosine_sim = np.zeros(len(book2id))
        for book_ in seen:
            total_cosine_sim += cosine_matrix[book_]

        # 4. Rank the unread titles by combined similarity, highest first.
        scored = [
            (row, score)
            for row, score in enumerate(total_cosine_sim)
            if row not in seen_rows
        ]
        scored.sort(key=lambda pair: pair[1], reverse=True)

        # Store plain book ids (not (id, score) tuples) so the evaluator can
        # match them against the ground truth. Several rows can share a title
        # and therefore map to the same book id, so skip repeats as well as
        # books the user has already read.
        picked = set()
        for row, _ in scored:
            book_id = bookid2book[book2id[row]]
            if book_id in read_ids or book_id in picked:
                continue
            picked.add(book_id)
            rec_list.append(book_id)
            if len(rec_list) == 200:
                break

    # Everyone else falls back to the popularity baseline.
    else:
        rec_list = [rec for rec in popular_rec_model[0:400] if rec not in read_ids]

    total_rec_list[user] = rec_list[0:200]

In [None]:
import six
import math

# https://github.com/kakao-arena/brunch-article-recommendation/blob/master/evaluate.py

class evaluate():
    """Ranking metrics for a set of per-user recommendation lists.

    Args:
        recs: mapping of user -> ordered list of recommended item ids.
        gt: mapping of user -> iterable of the item ids the user actually has.
        topn: nominal list length; used as the label in the printed report and
            as the normaliser of the entropy-diversity score. The MAP and NDCG
            computations use each recommendation list as given and do not
            truncate it to `topn`.
    """

    def __init__(self, recs, gt, topn=100):
        self.recs = recs
        self.gt = gt
        self.topn = topn

    def _ndcg(self):
        """Return the mean NDCG over the users that can be evaluated.

        A user is skipped when it has no recommendations or no ground-truth
        items. Returns 0.0 when no user can be evaluated.
        """
        total, n_users = 0.0, 0
        for u, seen in self.gt.items():
            # A set gives O(1) membership tests and drops duplicate items.
            seen = set(seen)
            rec = self.recs.get(u, [])
            if not rec or not seen:
                continue

            # Ideal DCG: every one of the first min(|seen|, |rec|) slots is a hit.
            idcg = sum(
                1.0 / math.log(i + 2, 2)
                for i in range(min(len(seen), len(rec)))
            )
            # Actual DCG: a hit at 0-based position i contributes 1 / log2(i + 2).
            dcg = sum(
                1.0 / math.log(i + 2, 2)
                for i, r in enumerate(rec)
                if r in seen
            )
            total += dcg / idcg
            n_users += 1
        # Avoid ZeroDivisionError when nothing could be evaluated.
        return total / n_users if n_users else 0.0

    def _map(self):
        """Return the mean average precision over the users that can be evaluated.

        A user is skipped when it has no recommendations or no ground-truth
        items. Returns 0.0 when no user can be evaluated.
        """
        total, n_users = 0.0, 0
        for u, seen in self.gt.items():
            seen = set(seen)
            rec = self.recs.get(u, [])
            if not rec or not seen:
                continue

            _ap, correct = 0.0, 0.0
            for i, r in enumerate(rec):
                if r in seen:
                    correct += 1
                    # Precision at this hit's (1-based) rank.
                    _ap += correct / (i + 1.0)
            _ap /= min(len(seen), len(rec))
            total += _ap
            n_users += 1
        # Avoid ZeroDivisionError when nothing could be evaluated.
        return total / n_users if n_users else 0.0

    def _entropy_diversity(self):
        """Return the entropy of the item frequencies across all recommendation lists.

        Frequencies are normalised by ``len(recs) * topn``. Returns 0.0 when
        that normaliser is zero (no users, or ``topn == 0``).
        """
        sz = float(len(self.recs)) * self.topn
        if sz == 0:
            return 0.0
        freq = {}
        for rec in self.recs.values():
            for r in rec:
                freq[r] = freq.get(r, 0) + 1
        return -sum(v / sz * math.log(v / sz) for v in freq.values())

    def _evaluate(self):
        """Print MAP, NDCG and entropy diversity, each labelled with `topn`."""
        print('MAP@%s: %s' % (self.topn, self._map()))
        print('NDCG@%s: %s' % (self.topn, self._ndcg()))
        print('EntDiv@%s: %s' % (self.topn, self._entropy_diversity()))

In [None]:
# Score the current `total_rec_list` against the test-set ground truth `gt`
# and print MAP / NDCG / entropy diversity.
evaluate_func = evaluate(recs=total_rec_list, gt = gt, topn=200)
evaluate_func._evaluate()

 ## Word2vec을 이용한 추천시스템 
 - Tag간의 유사도 
 - 제목간의 유사도 
 - 책의 읽은 순서를 통한 유사도 

In [None]:
# One row per user (indexed by user_id) with a `unique` column holding the
# distinct book ids that user has in the training set.
agg = train.groupby(['user_id'])['book_id'].agg({'unique'})
agg.head()

In [None]:
# Word2Vec is fed string tokens, so every book id is converted to str.
# Each user's list of books becomes one "sentence".
sentence = [
    [str(book_id) for book_id in user_books]
    for user_books in agg['unique'].values
]

In [None]:
# Train the Word2Vec model on the per-user book "sentences".
# NOTE(review): `size` and `iter` are the gensim 3.x keyword names; gensim 4
# renamed them to `vector_size` and `epochs` -- confirm the installed version.
from gensim.models import Word2Vec
embedding_model = Word2Vec(sentence, size=20, window = 5, 
                           min_count=1, workers=4, iter=200, sg=1)

In [None]:
embedding_model.wv.most_similar(positive=['4893'], topn=10)

In [None]:
## Build Word2Vec-based recommendations for every user in the training set.
total_rec_list = {}

read_list = train.groupby(['user_id'])['book_id'].agg({'unique'}).reset_index()
# Per-user lookup built from the `read_list` computed in this cell, so the cell
# no longer depends on a `read_list1` left over from a different cell and does
# not re-filter the frame for every user.
read_books_by_user = dict(zip(read_list['user_id'].values, read_list['unique'].values))

# The neighbours of a book do not depend on the user, so query each book once.
neighbour_cache = {}

for user in tqdm(train['user_id'].unique()):
    seen = read_books_by_user[user]
    seen_ids = set(seen)

    # Candidate book -> sum of its similarity to every book the user has read.
    word2vec_dict = {}
    for book in seen:
        if book not in neighbour_cache:
            neighbour_cache[book] = embedding_model.wv.most_similar(positive=[book], topn=300)
        for candidate, score in neighbour_cache[book]:
            if candidate not in seen_ids:
                word2vec_dict[candidate] = word2vec_dict.get(candidate, 0.0) + score

    # Highest accumulated similarity first.
    ranked = sorted(word2vec_dict.items(), key=lambda pair: pair[1], reverse=True)
    rec_list = [candidate for candidate, _ in ranked]
    total_rec_list[user] = rec_list[0:200]

In [None]:
# Score the current `total_rec_list` against the test-set ground truth `gt`
# and print MAP / NDCG / entropy diversity.
evaluate_func = evaluate(recs=total_rec_list, gt = gt, topn=200)
evaluate_func._evaluate()

### 태그를 통한 유사도 계산 

In [None]:
# Normalise the column names and use string ids so they line up with the
# string book ids used elsewhere in the notebook.
# NOTE(review): the first column is renamed to `book_id` here; if the raw file
# stores a different identifier (e.g. a Goodreads id) it will not match the
# `book_id` values in `train` -- confirm against the CSV header.
book_tags.columns = ['book_id', 'tag_id', 'count']
for id_col in ('book_id', 'tag_id'):
    book_tags[id_col] = book_tags[id_col].astype(str)

tags['tag_id'] = tags['tag_id'].astype(str)

# Bring in the remaining columns of `tags` (e.g. the tag name) for every pair.
book_tags = book_tags.merge(tags, how='left', on='tag_id')
book_tags.head()

In [None]:
# One row per book with a `unique` column holding its distinct tag names.
# This rebinds `agg`, which previously held the per-user book lists.
agg = book_tags.groupby(['book_id'])['tag_name'].agg({'unique'}).reset_index()
agg.head()

In [None]:
# Similarity between tags: each book's list of tag names becomes one
# "sentence" of string tokens.
sentence = [
    [str(tag_name) for tag_name in book_tag_names]
    for book_tag_names in agg['unique'].values
]

In [None]:
from gensim.models import doc2vec
# Doc2Vec model over the per-book tag lists (one document per book).
# NOTE(review): `size` is the gensim 3.x keyword (gensim 4: `vector_size`).
# NOTE(review): `hs=1` and `negative=10` are both set, and `min_alpha` equals
# `alpha` -- confirm these are intentional.
doc_vectorizer = doc2vec.Doc2Vec(
    dm=0,            # PV-DBOW / default 1
    dbow_words=1,    # train word vectors alongside the DBOW doc vectors / default 0
    window=10,        # distance between the predicted word and context words
    size=100,        # vector size
    alpha=0.025,     # initial learning rate
    seed=1234,
    min_count=5,    # ignore words with a lower frequency
    min_alpha=0.025, # minimum learning rate
    workers=4,   # number of worker threads
    hs = 1,          # hierarchical softmax / default 0
    negative = 10   # negative sampling / default 5
)

In [None]:
from collections import namedtuple

# Lightweight (words, tags) record: `words` is a book's tag names and `tags`
# is a one-element list holding that book's id.
TaggedDocument = namedtuple('TaggedDocument', 'words tags')
tagged_train_docs = [
    TaggedDocument(words=tag_names, tags=[book_id])
    for tag_names, book_id in zip(agg['unique'].values, agg['book_id'].values)
]

In [None]:
# Build the model vocabulary from the tagged documents, then print the model
# summary string.
doc_vectorizer.build_vocab(tagged_train_docs)
print(str(doc_vectorizer))

In [None]:
# Train the document vectors.
from time import time

start = time()

# Five outer rounds; each round calls train() for `doc_vectorizer.iter` epochs
# and then lowers the learning rate by hand.
# NOTE(review): `doc_vectorizer.iter` is a gensim 3.x attribute. Gensim's own
# documentation discourages calling train() repeatedly with a manually managed
# alpha -- consider a single train() call with the desired epoch count.
for epoch in tqdm(range(5)):
    doc_vectorizer.train(tagged_train_docs, total_examples=doc_vectorizer.corpus_count, epochs=doc_vectorizer.iter)
    doc_vectorizer.alpha -= 0.002 # decrease the learning rate
    doc_vectorizer.min_alpha = doc_vectorizer.alpha # fix the learning rate, no decay

# Single-call alternative, left disabled:
#doc_vectorizer.train(tagged_train_docs, total_examples=doc_vectorizer.corpus_count, epochs=doc_vectorizer.iter)
end = time()
print("During Time: {}".format(end-start))

In [None]:
doc_vectorizer.docvecs.most_similar('1', topn=20)

In [None]:
train.head()

In [None]:
# Only some books have tag information, so flag the ones that do.
# After the left merge, `type` is '1' for training rows whose book_id appears
# in `agg` and missing otherwise; the merge also adds `agg`'s `unique` column
# (the tag names) to `train`.
agg['type'] = '1'
train = pd.merge(train, agg, how='left', on='book_id')

In [None]:
## Build Doc2Vec (tag-based) recommendations for every user in the training set.
total_rec_list = {}

read_list1 = train.groupby(['user_id'])['book_id'].agg({'unique'}).reset_index()
read_list2 = train[train['type'] == '1'].groupby(['user_id'])['book_id'].agg({'unique'}).reset_index()

# Per-user lookups built once, so the loop does not re-filter the frames (or
# recompute `.unique()`) for every single user.
read_books_by_user = dict(zip(read_list1['user_id'].values, read_list1['unique'].values))
tagged_books_by_user = dict(zip(read_list2['user_id'].values, read_list2['unique'].values))

# The neighbours of a book do not depend on the user, so query each book once.
neighbour_cache = {}

for user in tqdm(train['user_id'].unique()):
    # Book ids the user already has in the training set; never recommend these
    # (this matches what the Word2Vec loop and the popularity fallback do).
    read_ids = set(read_books_by_user[user])

    # Users with at least one tagged book get tag-based recommendations.
    if user in tagged_books_by_user:
        # Candidate book -> sum of its similarity to every tagged book read.
        doc2vec_dict = {}
        for book in tagged_books_by_user[user]:
            if book not in neighbour_cache:
                neighbour_cache[book] = doc_vectorizer.docvecs.most_similar(positive=[book], topn=300)
            for candidate, score in neighbour_cache[book]:
                if candidate in read_ids:
                    continue
                doc2vec_dict[candidate] = doc2vec_dict.get(candidate, 0.0) + score

        # Highest accumulated similarity first.
        ranked = sorted(doc2vec_dict.items(), key=lambda pair: pair[1], reverse=True)
        rec_list = [candidate for candidate, _ in ranked]

    # Everyone else falls back to the popularity baseline.
    else:
        rec_list = [rec for rec in popular_rec_model[0:300] if rec not in read_ids]

    total_rec_list[user] = rec_list[0:200]

In [None]:
# Score the current `total_rec_list` against the test-set ground truth `gt`
# and print MAP / NDCG / entropy diversity.
evaluate_func = evaluate(recs=total_rec_list, gt = gt, topn=200)
evaluate_func._evaluate()