In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

### 코사인 유사도 분석

In [2]:
mirrorDF = pd.read_csv('./project/mirror.csv', index_col=0)
mirrorDF['image'] 

0       ./static/img/bed/mirror/mirror1.png
1       ./static/img/bed/mirror/mirror2.png
2       ./static/img/bed/mirror/mirror3.png
3       ./static/img/bed/mirror/mirror4.png
4       ./static/img/bed/mirror/mirror5.png
                      ...                  
93     ./static/img/bed/mirror/mirror97.png
94     ./static/img/bed/mirror/mirror98.png
95     ./static/img/bed/mirror/mirror99.png
96    ./static/img/bed/mirror/mirror100.png
97    ./static/img/bed/mirror/mirror101.png
Name: image, Length: 98, dtype: object

In [4]:
# 명사 추출
from konlpy.tag import Okt

okt = Okt()

box = []
for i in tqdm(mirrorDF['review']):
    a = okt.nouns(i)
    # 하나의 문장으로 합쳐지게 join 시킴
    box.append(' '.join(a))

mirrorDF['nouns'] = box

100%|██████████| 98/98 [00:16<00:00,  6.02it/s]


In [5]:
mirrorDF.head(1)

Unnamed: 0,brand,item,price,image,link,review,nouns
0,베스트리빙,애슐리 원목 전신거울 7colors,32700,./static/img/bed/mirror/mirror1.png,https://ohou.se/productions/22282/selling?affe...,['노랑색을 좋아해서 5000원정도? 더 비쌋지만 고민하다 노랑으로 샀는데 색을 보...,노랑 색 더 비쌋 고민 노랑 색 보고 선택 생각 드 색깔 제 색 생각 밑 바닥 부분...


In [6]:
# 중간 저장
mirrorDF.to_csv('mirrorDF.csv')

In [7]:
mirrorDF = pd.read_csv('./mirrorDF.csv', index_col=0)
mirrorDF.head(1)

Unnamed: 0,brand,item,price,image,link,review,nouns
0,베스트리빙,애슐리 원목 전신거울 7colors,32700,./static/img/bed/mirror/mirror1.png,https://ohou.se/productions/22282/selling?affe...,['노랑색을 좋아해서 5000원정도? 더 비쌋지만 고민하다 노랑으로 샀는데 색을 보...,노랑 색 더 비쌋 고민 노랑 색 보고 선택 생각 드 색깔 제 색 생각 밑 바닥 부분...


In [8]:
# 단어 카운트 기반의 벡터화
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
dtm = cv.fit_transform(mirrorDF['nouns']) # dtm : 텍스트를 단어의 등장 횟수로 표현된 문서-단어 행렬(document-term matrix)
dtm_mirror = pd.DataFrame(dtm.toarray(), columns=cv.get_feature_names_out()) # dtm.toarray() : 희소행렬인 dtm을 밀집행렬로 변환
dtm_mirror

Unnamed: 0,가갹,가게,가격,가공,가구,가기,가까이,가끔,가나,가넝,...,힌히,힐거,힐때,힐링,힐수,힘틀,힙니,힙스터,힙해,힙해욬
0,0,0,32,0,5,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,30,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,22,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,10,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,14,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,0,0,8,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
94,0,0,6,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
95,0,0,8,0,0,0,2,0,0,0,...,0,0,1,0,0,0,0,0,0,0
96,0,0,4,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# dtm 파일 저장
dtm_mirror.to_csv('dtm_mirror.csv')

---

In [11]:
import pandas as pd 
import numpy as np
from konlpy.tag import Okt
from konlpy.tag import Kkma

mirrorDF = pd.read_csv('./mirrorDF.csv', index_col=0)
dtm_mirror = pd.read_csv('./dtm_mirror.csv', index_col=0)

# 코사인 유사도 : 두 벡터 사이 각도의 코사인 값 (-1 ~ 1)
# a, b : 두 벡터
def make_cosine(a,b):
    return np.dot(a,b) / (np.linalg.norm(a) * np.linalg.norm(b))

kkma = Kkma()
text = '앉아도 꺼지지 않는 폭식한 소파'
strlist = kkma.nouns(text)
text = [' '.join(strlist)]

result = cv.transform(text) # 위에서 fit 시켰기 때문에 transform만 해주면 됨
input_text = result.toarray()

brand_list = np.array(mirrorDF['brand'])
item_list = np.array(mirrorDF['item'])
price_list = np.array(mirrorDF['price'])
image_list = np.array(mirrorDF['image'])

result_cosine = make_cosine(dtm_mirror, input_text.reshape(-1))
result_args = result_cosine.argsort()[::-1] # argsort() : 배열을 정렬했을 때의 인덱스를 반환하는 함수

# 내림차순 정렬 => [result_args][0] : 코사인 유사도가 가장 큼
print(brand_list[result_args][0], item_list[result_args][0], price_list[result_args][0], image_list[result_args][0])

postershopkr poster-mirror 포레스트 인테리어 거울 34,400 ./static/img/bed/mirror/mirror101.png


  return np.dot(a,b) / (np.linalg.norm(a) * np.linalg.norm(b))


In [12]:
def find_mirror(text):

    import pandas as pd 
    import numpy as np
    from konlpy.tag import Okt
    from konlpy.tag import Kkma

    mirrorDF = pd.read_csv('./mirrorDF.csv', index_col=0)
    dtm_mirror = pd.read_csv('./dtm_mirror.csv', index_col=0)

    # 코사인 유사도 : 두 벡터 사이 각도의 코사인 값 (-1 ~ 1)
    def make_cosine(a,b):
        return np.dot(a,b) / (np.linalg.norm(a) * np.linalg.norm(b))

    kkma = Kkma()
    search_text = [text]
    strlist = kkma.nouns(search_text[0])
    search_text = [(" ".join(strlist))]

    result = cv.transform(search_text)
    search_words = result.toarray()

    brand_list = np.array(mirrorDF['brand'])
    item_list = np.array(mirrorDF['item'])
    price_list = np.array(mirrorDF['price'])
    image_list = np.array(mirrorDF['image'])

    result_cosine = make_cosine(dtm_mirror, search_words.reshape(-1))

    result_args = result_cosine.argsort()[::-1]
    return brand_list[result_args][0], item_list[result_args][0], price_list[result_args][0], image_list[result_args][0]