In [1]:
data = [
    '자바 프로그래밍', 
    '실전 머신러닝', 
    '파이썬 알고리즘 인터뷰',
    '자바 정석',
    '파이썬 프로그래밍 프로그래밍',
    '파이썬 머신러닝 완벽가이드' 
]

## 직접 계산

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

In [3]:
countvect = vect.fit_transform(data)

In [4]:
import pandas as pd
countvect_df = pd.DataFrame(countvect.toarray(), columns = sorted(vect.vocabulary_))
countvect_df.index = ['문서' + str(i) for i in range(1,len(data)+1)]

# tf
countvect_df

Unnamed: 0,머신러닝,실전,알고리즘,완벽가이드,인터뷰,자바,정석,파이썬,프로그래밍
문서1,0,0,0,0,0,1,0,0,1
문서2,1,1,0,0,0,0,0,0,0
문서3,0,0,1,0,1,0,0,1,0
문서4,0,0,0,0,0,1,1,0,0
문서5,0,0,0,0,0,0,0,1,2
문서6,1,0,0,1,0,0,0,1,0


In [5]:
# df

DF = pd.DataFrame(countvect_df.sum(axis=0)).T
DF.index = ['sum']
DF

Unnamed: 0,머신러닝,실전,알고리즘,완벽가이드,인터뷰,자바,정석,파이썬,프로그래밍
sum,2,1,1,1,1,2,1,3,3


In [6]:
# idf
import numpy as np

IDF = np.log10(len(data)/(1+DF))
IDF = pd.DataFrame(IDF, columns = sorted(vect.vocabulary_))
IDF

Unnamed: 0,머신러닝,실전,알고리즘,완벽가이드,인터뷰,자바,정석,파이썬,프로그래밍
sum,0.30103,0.477121,0.477121,0.477121,0.477121,0.30103,0.477121,0.176091,0.176091


In [7]:
# tf * idf
tf_idf = pd.DataFrame(countvect_df.to_numpy() * IDF.to_numpy(), columns = sorted(vect.vocabulary_))
tf_idf.index = ['문서' + str(i) for i in range(1,len(data)+1)]
tf_idf

Unnamed: 0,머신러닝,실전,알고리즘,완벽가이드,인터뷰,자바,정석,파이썬,프로그래밍
문서1,0.0,0.0,0.0,0.0,0.0,0.30103,0.0,0.0,0.176091
문서2,0.30103,0.477121,0.0,0.0,0.0,0.0,0.0,0.0,0.0
문서3,0.0,0.0,0.477121,0.0,0.477121,0.0,0.0,0.176091,0.0
문서4,0.0,0.0,0.0,0.0,0.0,0.30103,0.477121,0.0,0.0
문서5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.176091,0.352183
문서6,0.30103,0.0,0.0,0.477121,0.0,0.0,0.0,0.176091,0.0


In [8]:
from sklearn.metrics.pairwise import cosine_similarity
print(cosine_similarity(tf_idf, tf_idf))

[[1.         0.         0.         0.46058588 0.45161406 0.        ]
 [0.         1.         0.         0.         0.         0.27179668]
 [0.         0.         1.         0.         0.11292804 0.07523896]
 [0.46058588 0.         0.         1.         0.         0.        ]
 [0.45161406 0.         0.11292804 0.         1.         0.13325116]
 [0.         0.27179668 0.07523896 0.         0.13325116 1.        ]]


## Module TF-IDF

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer()
tfvect = vect.fit_transform(data)

In [12]:
type(tfvect)

scipy.sparse.csr.csr_matrix

In [10]:
tfidv_df = pd.DataFrame(tfvect.toarray(), columns = sorted(vect.vocabulary_))
tfidv_df.index = ['문서' + str(i) for i in range(1,len(data)+1)]
tfidv_df

Unnamed: 0,머신러닝,실전,알고리즘,완벽가이드,인터뷰,자바,정석,파이썬,프로그래밍
문서1,0.0,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.707107
문서2,0.634086,0.773262,0.0,0.0,0.0,0.0,0.0,0.0,0.0
문서3,0.0,0.0,0.635091,0.0,0.635091,0.0,0.0,0.439681,0.0
문서4,0.0,0.0,0.0,0.0,0.0,0.634086,0.773262,0.0,0.0
문서5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.388903,0.921279
문서6,0.559022,0.0,0.0,0.681722,0.0,0.0,0.0,0.471964,0.0


### 유사도 계산

In [11]:
print(cosine_similarity(tfidv_df, tfidv_df))

[[1.         0.         0.         0.44836665 0.65144232 0.        ]
 [0.         1.         0.         0.         0.         0.35446786]
 [0.         0.         1.         0.         0.17099348 0.20751388]
 [0.44836665 0.         0.         1.         0.         0.        ]
 [0.65144232 0.         0.17099348 0.         1.         0.18354853]
 [0.         0.35446786 0.20751388 0.         0.18354853 1.        ]]


- (문서1, 문서4) = 0.448
    - 문서1: 자바 프로그래밍
    - 문서4: 자바 정석
- (문서1, 문서5) = 0.651
    - 문서1: 자바 프로그래밍
    - 문서5: 파이썬 프로그래밍 프로그래밍
- (문서2, 문서6) = 0.354
    - 문서2: 실전 머신러닝
    - 문서6: 파이썬 머신러닝 완벽가이드
- (문서3, 문서5) = 0.171
    - 문서3: 파이썬 알고리즘 인터뷰
    - 문서5: 파이썬 프로그래밍 프로그래밍
- (문서3, 문서6) = 0.208
    - 문서3: 파이썬 알고리즘 인터뷰
    - 문서6: 파이썬 머신러닝 완벽가이드
- (문서5, 문서6) = 0.184
    - 문서5: 파이썬 프로그래밍 프로그래밍
    - 문서6: 파이썬 머신러닝 완벽가이드