In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

### 코사인 유사도 분석

In [2]:
ovenstDF = pd.read_csv('./project/oven_stand.csv', index_col=0)
ovenstDF['image'] 

0        ./static/img/kitchen/oven_stand/ovst1.png
1        ./static/img/kitchen/oven_stand/ovst2.png
2        ./static/img/kitchen/oven_stand/ovst3.png
3        ./static/img/kitchen/oven_stand/ovst4.png
4        ./static/img/kitchen/oven_stand/ovst5.png
                          ...                     
100    ./static/img/kitchen/oven_stand/ovst101.png
101    ./static/img/kitchen/oven_stand/ovst102.png
102    ./static/img/kitchen/oven_stand/ovst103.png
103    ./static/img/kitchen/oven_stand/ovst104.png
104    ./static/img/kitchen/oven_stand/ovst105.png
Name: image, Length: 105, dtype: object

In [4]:
# 명사 추출
from konlpy.tag import Okt

okt = Okt()

box = []
for i in tqdm(ovenstDF['review']):
    a = okt.nouns(i)
    # 하나의 문장으로 합쳐지게 join 시킴
    box.append(' '.join(a))

ovenstDF['nouns'] = box

100%|██████████| 105/105 [00:20<00:00,  5.00it/s]


In [5]:
ovenstDF.head(1)

Unnamed: 0,brand,item,price,image,link,review,nouns
0,가구레시피,국내생산 조립식 시그니처 진열장형 3단 전자렌지대 다용도수납장 밥솥다이,27100,./static/img/kitchen/oven_stand/ovst1.png,https://ohou.se/productions/258147/selling?aff...,['전자레인지 놓기에 딱이에요! 주방 공간이 협소해서 전자레인지 따로 놓을 거치대로...,전자레인지 주방 공간 협소해 전자레인지 거치 주문 사이즈 아래 공간 꽤 걸 둘 수 ...


In [6]:
# 중간 저장
ovenstDF.to_csv('ovenstDF.csv')

In [7]:
ovenstDF = pd.read_csv('./ovenstDF.csv', index_col=0)
ovenstDF.head(1)

Unnamed: 0,brand,item,price,image,link,review,nouns
0,가구레시피,국내생산 조립식 시그니처 진열장형 3단 전자렌지대 다용도수납장 밥솥다이,27100,./static/img/kitchen/oven_stand/ovst1.png,https://ohou.se/productions/258147/selling?aff...,['전자레인지 놓기에 딱이에요! 주방 공간이 협소해서 전자레인지 따로 놓을 거치대로...,전자레인지 주방 공간 협소해 전자레인지 거치 주문 사이즈 아래 공간 꽤 걸 둘 수 ...


In [8]:
# 단어 카운트 기반의 벡터화
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
dtm = cv.fit_transform(ovenstDF['nouns']) # dtm : 텍스트를 단어의 등장 횟수로 표현된 문서-단어 행렬(document-term matrix)
dtm_ovenst = pd.DataFrame(dtm.toarray(), columns=cv.get_feature_names_out()) # dtm.toarray() : 희소행렬인 dtm을 밀집행렬로 변환
dtm_ovenst

Unnamed: 0,가가,가게,가격,가관,가구,가구라,가구점,가기,가까이,가끔,...,히닜,히든,히딧밑,힌지,힐거,힐때,힐링,힘겨웟던,힘드셧을텐데,힘방
0,0,0,19,0,5,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,18,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,14,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,11,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,2,11,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100,0,0,14,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
101,0,0,2,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
102,0,0,10,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
103,0,0,17,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# dtm 파일 저장
dtm_ovenst.to_csv('dtm_ovenst.csv')

---

In [11]:
import pandas as pd 
import numpy as np
from konlpy.tag import Okt
from konlpy.tag import Kkma

ovenstDF = pd.read_csv('./ovenstDF.csv', index_col=0)
dtm_ovenst = pd.read_csv('./dtm_ovenst.csv', index_col=0)

# 코사인 유사도 : 두 벡터 사이 각도의 코사인 값 (-1 ~ 1)
# a, b : 두 벡터
def make_cosine(a,b):
    return np.dot(a,b) / (np.linalg.norm(a) * np.linalg.norm(b))

kkma = Kkma()
text = '앉아도 꺼지지 않는 폭식한 소파'
strlist = kkma.nouns(text)
text = [' '.join(strlist)]

result = cv.transform(text) # 위에서 fit 시켰기 때문에 transform만 해주면 됨
input_text = result.toarray()

brand_list = np.array(ovenstDF['brand'])
item_list = np.array(ovenstDF['item'])
price_list = np.array(ovenstDF['price'])
image_list = np.array(ovenstDF['image'])

result_cosine = make_cosine(dtm_ovenst, input_text.reshape(-1))
result_args = result_cosine.argsort()[::-1] # argsort() : 배열을 정렬했을 때의 인덱스를 반환하는 함수

# 내림차순 정렬 => [result_args][0] : 코사인 유사도가 가장 큼
print(brand_list[result_args][0], item_list[result_args][0], price_list[result_args][0], image_list[result_args][0])

필웰 RZ9454 데이아 광파오븐 렌지대600(서랍) DVX 99,800 ./static/img/kitchen/oven_stand/ovst14.png


In [12]:
# 위의 코드 함수화
def find_ovenst(text):

    import pandas as pd 
    import numpy as np
    from konlpy.tag import Okt
    from konlpy.tag import Kkma

    ovenstDF = pd.read_csv('./ovenstDF.csv', index_col=0)
    dtm_ovenst = pd.read_csv('./dtm_ovenst.csv', index_col=0)

    # 코사인 유사도 : 두 벡터 사이 각도의 코사인 값 (-1 ~ 1)
    def make_cosine(a,b):
        return np.dot(a,b) / (np.linalg.norm(a) * np.linalg.norm(b))

    kkma = Kkma()
    search_text = [text]
    strlist = kkma.nouns(search_text[0])
    search_text = [(" ".join(strlist))]

    result = cv.transform(search_text)
    search_words = result.toarray()

    brand_list = np.array(ovenstDF['brand'])
    item_list = np.array(ovenstDF['item'])
    price_list = np.array(ovenstDF['price'])
    image_list = np.array(ovenstDF['image'])

    result_cosine = make_cosine(dtm_ovenst, search_words.reshape(-1))

    result_args = result_cosine.argsort()[::-1]
    return brand_list[result_args][0], item_list[result_args][0], price_list[result_args][0], image_list[result_args][0]