In [2]:
import re
import pandas as pd
import numpy as np

In [27]:
doc_ls = ['바나나 사과 포도 포도',
'사과 포도',
'포도 바나나',
'짜장면 짬뽕 탕수욕',
'볶음밥 탕수욕',
'짜장면 짬뽕',
'라면 스시',
'스시',
'가츠동 스시 소바',
'된장찌개 김치찌개 김치',
'김치 된장',
'비빔밥 김치'
]

In [52]:
class LSA :
    def __init__(self, doc_ls, topic_num):
        self. doc_ls = doc_ls
        self.topic_num = topic_num
        self.bow  = list(set(re.sub("'|,|\[|\]", "", str(self.doc_ls)).split(' ')))
        self.U, self.S, self.Vt = self.SVD(self.TDM(self.doc_ls))
        self.u, self.s, self.vt = self.trunc(self.U, self.S, self.Vt)
    # tdm matrix 생성
    def TDM(self, doc_ls):
        
        TDM = pd.DataFrame(np.zeros(( len(self.bow),  len(self.doc_ls))), index= self.bow) 
        a = [doc.split() for doc in self.doc_ls]
        for idx in range(len(a)):
            for i in a[idx]:
                TDM[idx][i] +=1
        return TDM
    
    # tdm matrix 특이값 분해(SVD)
    # U, s, Vt로 분해
    def SVD(self, TDM):
        U, s, Vt = np.linalg.svd(TDM)
        return U, s, Vt
    def trunc(self, U,s,Vt) :
        U_trunc = U[:,:self.topic_num]
        s_trunc = s[:self.topic_num]*np.eye(self.topic_num)
        Vt_trunc = Vt[:,:self.topic_num]
        return U_trunc, s_trunc, Vt_trunc
    # 단어 벡터 행렬 생성 dot(U,s)
    def TopicModeling(self) :
        uDf = pd.DataFrame(self.u, index = self.bow)
        for i in range(topic_num):
            keywords = dict(uDf.sort_values(by = i, ascending=False))
            print('Topic{}-{}'.format(i,keywords))
            
    # 단어 벡터 행렬 생성 dot(U,s)
    def TermVectorMatrix(self, u, s):
        wordsVec  = np.dot(u,s)
        return wordsVec
    
    # 문서 벡터 행렬 생성 dot(s,Vt).T
    def DocVectorMatrix(self, s, vt):
        docsVec = np.dot(vt, s)
        return docVec
    
    # 키워드를 입력했을 때 단어 벡터 반환
    def GetTermVector(self, term):
#         u, s, vt = self.TopicModeling(self.SVD(self.TDM(self.doc_ls)))
        wordsVec = self.TermVectorMatrix(self.u, self.s)
        return wordsVec[self.bow.index(term)]
    # 문서를 입력했을 때 문서 벡터 반환
    def GetDocVector(self, doc):
#         u, s, vt = self.TopicModeling(self.SVD(self.TDM(self.doc_ls)))
        docsVec = self.DocVectorMatrix(self.s, self.vt)
        return docsVec[self.doc_ls.index(doc)]
    # 단어-문서 벡터 행렬 생성
    def TermDocVectorMatrix(self, u, s, vt):
        wordsVec = self.TermVectorMatrix(self.u, self.s)
        docsVec = self.DocVectorMatrix(self.s, self.vt)
        return np.dot(wordsVec,docsVec.T)
        
    # 단어 벡터 행렬에서 단어 간 코사인 유사도 측정하여 행렬형태로 반환
    def TermSimilarityMatrix(self, term_vec_matrix):
        term_vec_matrix = self.TermVectorMatrix(self.u, self.s)
        termSim = np.zeros(len(self.bow),len(self.bow))
        
        for vec1 in term_vec_matrix:
            for vec2 in term_vec_matrix:
                termSim[vec1][vec2] = np.dot(vec1, vec2) / (np.linalg.norm(vec1)*np.linalg.norm(vec2))
        return termSim
    
    # 두개 단어를 입력했을 때 코사인 유사도 반환
    def GetTermSimilarity(self, term1, term2):
        termSim = self.TermSimilarityMatrix(self.TermVectorMatrix(self.u, self.s))
        termSimDf = pd.DataFrame(termSim, columns=self.bow, index= self.bow)
        return termSimDf[term1][term2]
        
    # 문서 벡터 행렬에서 문서 간 코사인 유사도 측정하여 행렬형태로 반환
    def DocSimilarityMartrix(self, doc_vec_matrix):
        doc_vec_matrix = self.DocVectorMatrix(self.s, self.vt)
        docSim = np.zeros(len(self.doc_ls),len(self.doc_ls))
        
        for vec1 in doc_vec_matrix:
            for vec2 in doc_vec_matrix:
                docSim[vec1][vec2] = np.dot(vec1, vec2) / (np.linalg.norm(vec1)*np.linalg.norm(vec2))
        return docSim
        
    # 두개 문서를 입력했을 때 코사인 유사도 반환
    def GetDocSimilarity(self, doc1, doc2):
        docSim = self.DocSimilarityMatrix(self.DocVectorMatrix(self.s, self.vt))
        docSimDf = pd.DataFrame(docSim, columns=self.doc_ls, index= self.doc_ls)
        return docSimDf[doc1][doc2]

In [53]:
lsa = LSA(doc_ls, 4)
lsa.TopicModeling()
lsa.GetTermSimilarity('사과','바나나')
lsa.GetTermSimilarity('사과','짜장면')
lsa.GetDocSimilarity('사과 포도', '포도 바나나')
lsa.GetDocSimilarity('사과 포도', '라면 스시')
lsa.GetTermDocSimilarity('사과', '포도 바나나')
lsa.GetTermDocSimilarity('사과', '김치 된장')

Topic0-{0: 된장찌개    0.000000
김치찌개    0.000000
김치      0.000000
스시      0.000000
짬뽕      0.000000
라면      0.000000
된장      0.000000
탕수욕     0.000000
비빔밥     0.000000
소바      0.000000
가츠동     0.000000
짜장면     0.000000
볶음밥     0.000000
바나나    -0.408248
사과     -0.408248
포도     -0.816497
Name: 0, dtype: float64, 1: 된장찌개   -5.023430e-17
김치찌개   -5.415803e-17
김치      4.016462e-17
스시     -4.315491e-16
짬뽕      6.120248e-01
라면     -1.804112e-16
된장      1.799156e-17
탕수욕     4.842878e-01
비빔밥     3.013463e-17
소바     -1.821460e-16
가츠동    -1.821460e-16
짜장면     6.120248e-01
볶음밥     1.277370e-01
바나나    -2.002163e-16
사과      2.537639e-16
포도     -2.677378e-17
Name: 1, dtype: float64, 2: 된장찌개    3.365568e-01
김치찌개    3.365568e-01
김치      8.125199e-01
스시      0.000000e+00
짬뽕     -7.314862e-18
라면      0.000000e+00
된장      2.379816e-01
탕수욕     0.000000e+00
비빔밥     2.379816e-01
소바      0.000000e+00
가츠동     0.000000e+00
짜장면    -7.314862e-18
볶음밥     0.000000e+00
바나나     0.000000e+00
사과      2.775558e-17
포도      5.

TypeError: data type not understood

# TDM 생성

# 특이값 분해 (SVD)

# 토픽 모델링

# 단어 벡터 행렬 생성

In [13]:
np.dot(U_trunc,s_trunc).shape

(16, 3)

# 문서 벡터 행렬 생성

In [14]:
docsVec = np.dot(Vt_trunc, s_trunc)

# 키워드를 입력했을 때 단어 벡터 반환

# 문서를 입력했을 때 문서 벡터 반환

# 단어-문서 벡터 행렬 생성

In [3]:
doc_ls = ['바나나 사과 포도 포도',
'사과 포도',
'포도 바나나',
'짜장면 짬뽕 탕수욕',
'볶음밥 탕수욕',
'짜장면 짬뽕',
'라면 스시',
'스시',
'가츠동 스시 소바',
'된장찌개 김치찌개 김치',
'김치 된장',
'비빔밥 김치'
]

In [4]:
bow  = list(set(re.sub("'|,|\[|\]", "", str(doc_ls)).split(' ')))
TDM = pd.DataFrame(np.zeros(( len(bow),  len(doc_ls))), index= bow) 
a = [doc.split() for doc in doc_ls]
for idx in range(len(a)):
    for i in a[idx]:
        TDM[idx][i] +=1

In [6]:
U, s, Vt = np.linalg.svd(TDM)

In [11]:
topic_num = 3
U_trunc = U[:,:topic_num]
s_trunc = s[:topic_num]*np.eye(topic_num)
Vt_trunc = Vt[:,:topic_num]

In [15]:
term = '사과'

wordsVec[bow.index(term)]

array([-1.22474487e+00,  5.55464065e-16,  5.83145473e-17])

In [17]:
np.dot(wordsVec,docsVec.T).shape

(16, 12)

# 단어 벡터 행렬에서 단어 간 코사인 유사도 측정하여 행렬형태로 반환

In [5]:
TDM

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
된장찌개,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
김치찌개,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
김치,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
스시,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
사과,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
짬뽕,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
포도,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
라면,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
된장,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
탕수욕,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
doc = '볶음밥 탕수욕'

docsVec[doc_ls.index(doc)]

array([-7.25745775e-17, -2.23888314e-16,  2.91683973e-16])

In [21]:
def cosine_similarity(x,y):
    nominator = np.dot(x,y)
    denominator = np.linalg.norm(x)*np.linalg.norm(y)
    return nominator/ denominator

In [26]:
wordsVec


array([[ 0.00000000e+00, -1.09957912e-16,  7.07106781e-01],
       [ 0.00000000e+00, -1.18546580e-16,  7.07106781e-01],
       [ 0.00000000e+00,  8.79163811e-17,  1.70710678e+00],
       [ 0.00000000e+00, -9.44618288e-16,  0.00000000e+00],
       [-1.22474487e+00,  5.55464065e-16,  5.83145473e-17],
       [ 0.00000000e+00,  1.33966166e+00, -1.53685463e-17],
       [-2.44948974e+00, -5.86051641e-17,  1.16629095e-16],
       [ 0.00000000e+00, -3.94902358e-16,  0.00000000e+00],
       [ 0.00000000e+00,  3.93817463e-17,  5.00000000e-01],
       [ 0.00000000e+00,  1.06005799e+00,  0.00000000e+00],
       [ 0.00000000e+00,  6.59617127e-17,  5.00000000e-01],
       [ 0.00000000e+00, -3.98699496e-16,  0.00000000e+00],
       [ 0.00000000e+00, -3.98699496e-16,  0.00000000e+00],
       [-1.22474487e+00, -4.38253737e-16,  0.00000000e+00],
       [ 0.00000000e+00,  1.33966166e+00, -1.53685463e-17],
       [ 0.00000000e+00,  2.79603668e-01,  0.00000000e+00]])

# 두개 단어를 입력했을 때 코사인 유사도 반환

# 문서 벡터 행렬에서 문서 간 코사인 유사도 측정하여 행렬형태로 반환

# 두개 문서를 입력했을 때 코사인 유사도 반환

In [24]:
pd.Series(wordsVec).map( cosine_similarity(x, y))

Exception: Data must be 1-dimensional

In [None]:
dd

In [19]:
wordsvec

NameError: name 'wordsvec' is not defined