In [2]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.1.2-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.0 MB)
[K     |████████████████████████████████| 24.0 MB 91 kB/s eta 0:00:011     |██████████████▏                 | 10.6 MB 2.4 MB/s eta 0:00:06
Collecting smart-open>=1.8.1
  Downloading smart_open-5.2.1-py3-none-any.whl (58 kB)
[K     |████████████████████████████████| 58 kB 434 kB/s eta 0:00:01
[?25hInstalling collected packages: smart-open, gensim
Successfully installed gensim-4.1.2 smart-open-5.2.1


In [3]:
import urllib.request
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import re
from PIL import Image
from io import BytesIO
from nltk.tokenize import RegexpTokenizer
import nltk
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/ukairia777/tensorflow-nlp-tutorial/main/09.%20Word%20Embedding/dataset/data.csv", filename="data.csv")
df = pd.read_csv("data.csv")
print('전체 문서의 수 :',len(df))

전체 문서의 수 : 2382


### data preprocess

In [5]:
def _removeNonAscii(s):
    return "".join(i for i in s if  ord(i)<128)

df['cleaned'] = df['Desc'].apply(_removeNonAscii)

In [6]:
def make_lower_case(text):
    return text.lower()
df['cleaned'] = df.cleaned.apply(make_lower_case)
df['cleaned'][:5]

0    we know that power is shifting: from west to e...
1    following the success of the accidental billio...
2    how to tap the power of social software and ne...
3    william j. bernstein is an american financial ...
4    amazing book. and i joined steve jobs and many...
Name: cleaned, dtype: object

In [7]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [9]:
def remove_stop_words(text):
    text = text.split()
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops]
    text = " ".join(text)
    return text
df['cleaned'] = df.cleaned.apply(remove_stop_words)
df['cleaned'][:5]

0    know power shifting: west east north south, pr...
1    following success accidental billionaires mone...
2    tap power social software networks build busin...
3    william j. bernstein american financial theori...
4    amazing book. joined steve jobs many akio mori...
Name: cleaned, dtype: object

In [10]:
def remove_punctuation(text):
    tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
    text = tokenizer.tokenize(text)
    text = " ".join(text)
    return text
df['cleaned'] = df.cleaned.apply(remove_punctuation)
df['cleaned'][:5]

0    know power shifting west east north south pres...
1    following success accidental billionaires mone...
2    tap power social software networks build busin...
3    william j bernstein american financial theoris...
4    amazing book joined steve jobs many akio morit...
Name: cleaned, dtype: object

In [11]:
def remove_html(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)
df['cleaned'] = df.cleaned.apply(remove_html)

In [12]:
df['cleaned'][:5]

0    know power shifting west east north south pres...
1    following success accidental billionaires mone...
2    tap power social software networks build busin...
3    william j bernstein american financial theoris...
4    amazing book joined steve jobs many akio morit...
Name: cleaned, dtype: object

In [13]:
df['cleaned'].replace('', np.nan, inplace=True)
df = df[df['cleaned'].notna()]
print('전체 문서의 수 :',len(df))

전체 문서의 수 : 2381


토큰화 --> W2V 훈련

In [14]:
corpus = []
for words in df['cleaned']:
    corpus.append(words.split())

In [22]:
print(len(corpus), len(corpus[0]), len(corpus[1]))

2381 170 160


### 사전훈련된 워드 임베딩 사용
사전 훈련된 워드 임베딩을 단어 벡터 초기값으로 사용하여 성능 높힌다.

In [25]:
# urllib.request.urlretrieve("https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz", \
#                            filename="GoogleNews-vectors-negative300.bin.gz")

word2vec_model = Word2Vec(sentences=corpus)
word2vec_model.build_vocab(corpus)
# word2vec_model.intersect_word2vec_format('GoogleNews-vectors-negative300.bin.gz', lockf=1.0, binary=True)
word2vec_model.train(corpus, total_examples = word2vec_model.corpus_count, epochs = 15)

(3373612, 3981030)

### 단어 벡터 평균 구하기
각 문서에 존재하는 단어들의 벡터값 평균을 문서의 벡터값으로 사용한다.

In [38]:
def get_document_vectors(document_list):
    document_embedding_list = []

    # 각 문서에 대해서
    for line in document_list:
        doc2vec = None
        count = 0
        for word in line.split():
            if word in word2vec_model.wv:
                count += 1
                # 해당 문서에 있는 모든 단어들의 벡터값을 더한다.
                if doc2vec is None:
                    doc2vec = word2vec_model.wv.key_to_index[word]
                else:
                    doc2vec = doc2vec + word2vec_model.wv.key_to_index[word]

        if doc2vec is not None:
            # 단어 벡터를 모두 더한 벡터의 값을 문서 길이로 나눠준다.
            doc2vec = doc2vec / count
            document_embedding_list.append(doc2vec)

    # 각 문서에 대한 문서 벡터 리스트를 리턴
    return document_embedding_list

In [39]:
document_embedding_list = get_document_vectors(df['cleaned'])
print('문서 벡터의 수 :',len(document_embedding_list))

문서 벡터의 수 : 2381


### 추천 시스템

In [41]:
cosine_similarities = cosine_similarity(document_embedding_list, document_embedding_list)
print('코사인 유사도 매트릭스의 크기 :',cosine_similarities.shape)

ValueError: Expected 2D array, got 1D array instead:
array=[1579.30612245 1652.19285714  754.02564103 ... 1625.42708333 1567.64893617
 1830.        ].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
def recommendations(title):
    books = df[['title', 'image_link']]

    # 책의 제목을 입력하면 해당 제목의 인덱스를 리턴받아 idx에 저장.
    indices = pd.Series(df.index, index = df['title']).drop_duplicates()    
    idx = indices[title]

    # 입력된 책과 줄거리(document embedding)가 유사한 책 5개 선정.
    sim_scores = list(enumerate(cosine_similarities[idx]))
    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)
    sim_scores = sim_scores[1:6]

    # 가장 유사한 책 5권의 인덱스
    book_indices = [i[0] for i in sim_scores]

    # 전체 데이터프레임에서 해당 인덱스의 행만 추출. 5개의 행을 가진다.
    recommend = books.iloc[book_indices].reset_index(drop=True)

    fig = plt.figure(figsize=(20, 30))

    # 데이터프레임으로부터 순차적으로 이미지를 출력
    for index, row in recommend.iterrows():
        response = requests.get(row['image_link'])
        img = Image.open(BytesIO(response.content))
        fig.add_subplot(1, 5, index + 1)
        plt.imshow(img)
        plt.title(row['title'])

In [None]:
recommendations("The Da Vinci Code")

In [None]:
recommendations("The Murder of Roger Ackroyd")