## 카운트 방식의 텍스트 유사도 계산

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pandas as pd

In [None]:
# Toy corpus of four short documents. The comment above each entry lists the
# words that remain once English stop words are removed.
text = [
    # 'sky', 'blue'
    'The sky is blue.',
    # 'sun', 'bright'
    'The sun is bright.',
    # 'sun', 'sky', 'bright'
    'The sun in the sky is bright',
    # 'shining', 'sun', 'bright'
    'We can see the shining sun, the bright sun.',
]

In [None]:
# Bag-of-words vectorizer: word-level tokens, with English stop words dropped.
countvectorizer = CountVectorizer(stop_words='english', analyzer='word')

In [None]:
# Learn the vocabulary from the corpus and build the document-term count matrix.
count_wm = countvectorizer.fit_transform(text)
# 4 rows and 5 columns: one row per document (4 documents) and one column per
# recognized word (5 words).
print(count_wm.shape)
# The matrix is stored in a sparse form to save memory because most entries are
# empty, which makes it hard to read directly. todense() expands it into a full
# matrix that is easy to inspect.
print(count_wm.todense())

(4, 5)
[[1 0 0 1 0]
 [0 1 0 0 1]
 [0 1 0 1 1]
 [0 1 1 0 2]]


In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so it
# is converted to a plain list to keep the printed output in list form.
count_tokens = countvectorizer.get_feature_names_out().tolist()
print(count_tokens)

# toarray() yields a dense array that pandas can wrap directly: rows are the
# documents and columns are the vocabulary terms.
df_countvect = pd.DataFrame(
    data=count_wm.toarray(),
    index=['Doc1', 'Doc2', 'Doc3', 'Doc4'],
    columns=count_tokens,
)
df_countvect.head()

['blue', 'bright', 'shining', 'sky', 'sun']




Unnamed: 0,blue,bright,shining,sky,sun
Doc1,1,0,0,1,0
Doc2,0,1,0,0,1
Doc3,0,1,0,1,1
Doc4,0,1,1,0,2


In [None]:
from sklearn.metrics.pairwise import cosine_similarity # import cosine similarity

# Pairwise cosine similarity between the document rows of the count matrix.
cosine_similarities = cosine_similarity(count_wm)
print(cosine_similarities)
# The diagonal is 1 (each document compared with itself).


[[1.         0.         0.40824829 0.        ]
 [0.         1.         0.81649658 0.8660254 ]
 [0.40824829 0.81649658 1.         0.70710678]
 [0.         0.8660254  0.70710678 1.        ]]


In [None]:
# Label rows and columns with the document names so the matrix is easy to read.
pd.DataFrame(
    cosine_similarities,
    columns=['Doc1', 'Doc2', 'Doc3', 'Doc4'],
    index=['Doc1', 'Doc2', 'Doc3', 'Doc4'],
)

Unnamed: 0,Doc1,Doc2,Doc3,Doc4
Doc1,1.0,0.0,0.408248,0.0
Doc2,0.0,1.0,0.816497,0.866025
Doc3,0.408248,0.816497,1.0,0.707107
Doc4,0.0,0.866025,0.707107,1.0


## TF-IDF 방식의 텍스트 유사도 계산

TF-IDF(t, d) = TF(t, d) * IDF(t)

*   TF(t, d): 단어 t가 문서 d에서 몇번 나왔나?
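*   IDF(t): 단어 t가 전체 문서 중 몇 개의 문서에 나왔는지를 반영하는 역문서 빈도. ln(N/DF)
    * N은 총 문서 수를 나타내고, DF는 단어 t가 나온 문서의 수를 말한다
    * 참고: scikit-learn의 TfidfVectorizer는 기본값으로 평활화된 식 ln((1+N)/(1+DF)) + 1 을 사용하고, 각 문서 벡터를 L2 정규화한다. 그래서 아래 출력값은 단순한 ln(N/DF) 계산 결과와 다르다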

기본적인 아이디어는, 전체 문서에서는 드물게 나오는 단어가 특정 문서에서 많이 나올수록 그 문서에서 해당 단어의 점수가 더 높게 계산된다는 것이다.



In [None]:
# TF-IDF vectorizer: word-level tokens, with English stop words dropped.
tfidfvectorizer = TfidfVectorizer(stop_words='english', analyzer='word')

In [None]:
# Learn the vocabulary and IDF weights from the corpus and build the
# document-term TF-IDF matrix.
tfidf_wm = tfidfvectorizer.fit_transform(text)
# Print the matrix dimensions.
print(tfidf_wm.shape)
# Expand the sparse result into a full matrix so the weights can be read.
print(tfidf_wm.todense())

(4, 5)
[[0.78528828 0.         0.         0.6191303  0.        ]
 [0.         0.70710678 0.         0.         0.70710678]
 [0.         0.53256952 0.         0.65782931 0.53256952]
 [0.         0.36626037 0.57381765 0.         0.73252075]]


In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so it
# is converted to a plain list to keep the printed output in list form.
tfidf_tokens = tfidfvectorizer.get_feature_names_out().tolist()
print(tfidf_tokens)

# toarray() yields a dense array that pandas can wrap directly: rows are the
# documents and columns are the vocabulary terms.
df_tfidfvect = pd.DataFrame(
    data=tfidf_wm.toarray(),
    index=['Doc1', 'Doc2', 'Doc3', 'Doc4'],
    columns=tfidf_tokens,
)
df_tfidfvect.head()

['blue', 'bright', 'shining', 'sky', 'sun']




Unnamed: 0,blue,bright,shining,sky,sun
Doc1,0.785288,0.0,0.0,0.61913,0.0
Doc2,0.0,0.707107,0.0,0.0,0.707107
Doc3,0.0,0.53257,0.0,0.657829,0.53257
Doc4,0.0,0.36626,0.573818,0.0,0.732521


In [None]:
# Imported again here so this cell can be run on its own.
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the document rows of the TF-IDF matrix.
cosine_similarities = cosine_similarity(tfidf_wm)
print(cosine_similarities) # compute and show the similarities
# The diagonal is 1 (each document compared with itself).

[[1.         0.         0.40728206 0.        ]
 [0.         1.         0.75316704 0.77695558]
 [0.40728206 0.75316704 1.         0.58517734]
 [0.         0.77695558 0.58517734 1.        ]]


In [None]:
# Label rows and columns with the document names so the matrix is easy to read.
pd.DataFrame(
    cosine_similarities,
    columns=['Doc1', 'Doc2', 'Doc3', 'Doc4'],
    index=['Doc1', 'Doc2', 'Doc3', 'Doc4'],
)

Unnamed: 0,Doc1,Doc2,Doc3,Doc4
Doc1,1.0,0.0,0.407282,0.0
Doc2,0.0,1.0,0.753167,0.776956
Doc3,0.407282,0.753167,1.0,0.585177
Doc4,0.0,0.776956,0.585177,1.0
