## 카운트 방식의 텍스트 유사도 계산

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pandas as pd

In [11]:
# Toy corpus of four sentences (labelled S1-S4 in the tables in this notebook).
# The trailing comments list the tokens that remain in the vocabulary when the
# corpus is vectorized with stop_words="english", per the feature matrix
# printed in this notebook.
# NOTE(review): "see" in S1 and S4 looks like a typo for "sea"; as written it
# does not appear in the extracted vocabulary -- confirm intent.
text = [
    "The see is blue.",  # 'blue'
    "The sky is bright.",  # 'bright', 'sky'
    "The sky in the sea is bright",  # 'bright', 'sea', 'sky'
    "We can see the shining see, the bright sky .",  # 'bright', 'shining', 'sky'
]

In [12]:
# Bag-of-words vectorizer: word-level analysis, with scikit-learn's built-in
# English stop-word list removed from the vocabulary.
countvectorizer = CountVectorizer(analyzer="word", stop_words="english")

In [13]:
# Learn the vocabulary from the corpus and build the sparse document-term
# count matrix (one row per sentence, one column per vocabulary term).
count_wm = countvectorizer.fit_transform(text)
print(count_wm.shape)
# toarray() yields a plain ndarray, matching how the matrix is converted when
# the DataFrame is built; todense() would return the legacy numpy.matrix type.
# The printed representation is the same either way.
print(count_wm.toarray())

(4, 5)
[[1 0 0 0 0]
 [0 1 0 0 1]
 [0 1 1 0 1]
 [0 1 0 1 1]]


In [16]:
# Vocabulary terms learned by the count vectorizer; used as column labels.
count_tokens = countvectorizer.get_feature_names_out()
print(count_tokens)

# Wrap the dense count matrix in a labelled table: rows S1..S4 are the
# sentences, columns are the vocabulary terms.
df_countvect = pd.DataFrame(
    count_wm.toarray(),
    index=[f"S{i}" for i in range(1, 5)],
    columns=count_tokens,
)
df_countvect.head()

['blue' 'bright' 'sea' 'shining' 'sky']


Unnamed: 0,blue,bright,sea,shining,sky
S1,1,0,0,0,0
S2,0,1,0,0,1
S3,0,1,1,0,1
S4,0,1,0,1,1


In [17]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the rows (sentences) of the count matrix;
# the result is a square sentence-by-sentence matrix.
cosine_similarities = cosine_similarity(count_wm)
print(cosine_similarities)

[[1.         0.         0.         0.        ]
 [0.         1.         0.81649658 0.81649658]
 [0.         0.81649658 1.         0.66666667]
 [0.         0.81649658 0.66666667 1.        ]]


In [19]:
# Show the count-based similarity matrix as a table labelled S1..S4 on both
# axes (left as the cell's final expression so the notebook renders it).
pd.DataFrame(
    cosine_similarities,
    index=[f"S{i}" for i in range(1, 5)],
    columns=[f"S{i}" for i in range(1, 5)],
)

Unnamed: 0,S1,S2,S3,S4
S1,1.0,0.0,0.0,0.0
S2,0.0,1.0,0.816497,0.816497
S3,0.0,0.816497,1.0,0.666667
S4,0.0,0.816497,0.666667,1.0


## TF-IDF 방식의 텍스트 유사도 계산

$\text{TF-IDF}(t, d) = TF(t, d) \times IDF(t)$

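* $TF(t, d)$: 단어 $t$가 문서 $d$에서 몇 번 나왔는가?
* $IDF(t)$: 단어 $t$가 전체 문서들 중 몇 개의 문서에서 나왔는지($DF$)에 반비례하는 값, $\ln(N/DF)$
    * $N$은 총 문서 수를 나타내고 $DF$는 단어 $t$가 나온 문서의 수
    * 참고: scikit-learn의 `TfidfVectorizer`는 기본값(`smooth_idf=True`)에서 $\ln\frac{1+N}{1+DF} + 1$을 사용하고, 각 문서 벡터를 L2 정규화하므로 아래 출력값은 위 식과 정확히 일치하지 않음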
    
기본적인 아이디어는 전체 문서에서 빈도수가 낮은 단어가 특정 문서에서 많이 나오는 경우 해당 문서에서 해당 단어의 점수가 더 높게 계산됨

In [20]:
# TF-IDF vectorizer configured like the count vectorizer: word-level analysis
# with scikit-learn's built-in English stop-word list removed.
tfidfvectorizer = TfidfVectorizer(analyzer="word", stop_words="english")

In [21]:
# Learn the vocabulary and IDF weights from the corpus and build the sparse
# document-term TF-IDF matrix (one row per sentence, one column per term).
tfidf_wm = tfidfvectorizer.fit_transform(text)
print(tfidf_wm.shape)
# toarray() yields a plain ndarray, matching how the matrix is converted when
# the DataFrame is built; todense() would return the legacy numpy.matrix type.
# The printed representation is the same either way.
print(tfidf_wm.toarray())

(4, 5)
[[1.         0.         0.         0.         0.        ]
 [0.         0.70710678 0.         0.         0.70710678]
 [0.         0.47380449 0.74230628 0.         0.47380449]
 [0.         0.47380449 0.         0.74230628 0.47380449]]


In [23]:
# Vocabulary terms learned by the TF-IDF vectorizer; used as column labels.
tfidf_tokens = tfidfvectorizer.get_feature_names_out()
print(tfidf_tokens)

# Wrap the dense TF-IDF matrix in a labelled table: rows S1..S4 are the
# sentences, columns are the vocabulary terms.
df_tfidfvect = pd.DataFrame(
    tfidf_wm.toarray(),
    index=[f"S{i}" for i in range(1, 5)],
    columns=tfidf_tokens,
)
df_tfidfvect.head()

['blue' 'bright' 'sea' 'shining' 'sky']


Unnamed: 0,blue,bright,sea,shining,sky
S1,1.0,0.0,0.0,0.0,0.0
S2,0.0,0.707107,0.0,0.0,0.707107
S3,0.0,0.473804,0.742306,0.0,0.473804
S4,0.0,0.473804,0.0,0.742306,0.473804


In [24]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the rows (sentences) of the TF-IDF matrix.
# NOTE: this rebinds cosine_similarities, replacing the count-based result.
cosine_similarities = cosine_similarity(tfidf_wm)
print(cosine_similarities)

[[1.         0.         0.         0.        ]
 [0.         1.         0.67006073 0.67006073]
 [0.         0.67006073 1.         0.44898138]
 [0.         0.67006073 0.44898138 1.        ]]


In [25]:
# Show the TF-IDF-based similarity matrix as a table labelled S1..S4 on both
# axes (left as the cell's final expression so the notebook renders it).
pd.DataFrame(
    cosine_similarities,
    index=[f"S{i}" for i in range(1, 5)],
    columns=[f"S{i}" for i in range(1, 5)],
)

Unnamed: 0,S1,S2,S3,S4
S1,1.0,0.0,0.0,0.0
S2,0.0,1.0,0.670061,0.670061
S3,0.0,0.670061,1.0,0.448981
S4,0.0,0.670061,0.448981,1.0
