In [1]:
import pandas as pd
from math import log

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

# Print the library versions in use so the results can be reproduced
# (the recorded run of this notebook used pd 1.1.5 and sklearn 0.22.2.post1).
print(f'>>> pd : {pd.__version__}  |  sklearn : {sklearn.__version__}')

>>> pd : 1.1.5  |  sklearn : 0.22.2.post1


## TF-IDF(Term Frequency-Inverse Document Frequency)
- TF-IDF : 단어의 빈도와 역 문서 빈도(문서의 빈도에 특정 식을 취함)를 사용하여 DTM 내의 각 단어들마다 중요한 정도를 가중치로 주는 방법
- 주로 문서의 유사도를 구하는 작업, 검색 시스템에서 검색 결과의 중요도를 정하는 작업, 문서 내에서 특정 단어의 중요도를 구하는 작업 등에 쓰일 수 있음
- **TF-IDF = tf(d,t) * idf(d,t)** <br>
  (1) tf(d, t) : 특정 문서 d에서의 특정 단어 t의 등장 횟수 <br>
  (2) df(t) : 특정 단어 t가 등장한 문서의 수 <br>
  (3) idf(d, t) : df(t)에 반비례하는 수 $$idf(d,t) = \log\left(\frac{N}{1+df(t)}\right)\qquad \text{※ } N = \text{number of docs}$$ <br>

- TF-IDF 값이 낮으면 중요도가 낮은 것이며, TF-IDF 값이 높으면 중요도가 높은 것 <br>
  (ex. the나 a와 같은 불용어는 모든 문서에 자주 등장하므로 df(t)가 커져 idf가 작아지고, 그 결과 TF-IDF 값이 낮아짐)


## 1. TF-IDF with Korean

In [2]:
# functions to calculate TF-IDF
def tf(t, d):
  """Return the term frequency of term ``t`` in document ``d``.

  The document is split on whitespace and only whole tokens equal to ``t``
  are counted. This matches the tokenization used by ``idf`` (``doc.split()``)
  and avoids counting ``t`` when it merely occurs as a substring of a longer
  token (e.g. ``'a'`` inside ``'banana'``).

  Args:
    t: The term (a single whitespace-free token) to count.
    d: The document as a whitespace-separated string.

  Returns:
    The number of tokens in ``d`` that are exactly equal to ``t``.
  """
  # str.count(t) on the raw string would match substrings; count tokens instead.
  return d.split().count(t)

def idf(t, docs):
  """Return the inverse document frequency of term ``t`` over ``docs``.

  Computes ``log(N / (1 + df))`` with the natural logarithm, where ``N`` is
  the number of documents and ``df`` is the number of documents whose
  whitespace-separated tokens include ``t``.

  Args:
    t: The term to look up.
    docs: A sequence of documents, each a whitespace-separated string.

  Returns:
    The IDF value as a float. It is negative when ``t`` appears in every
    document, because then ``1 + df`` exceeds ``N``.
  """
  total_docs = len(docs)
  # Each document contributes at most once, however often the term repeats in it.
  doc_freq = sum(1 for text in docs if t in text.split())
  return log(total_docs / (doc_freq + 1))

def tfidf(t, d, docs):
  """Return the TF-IDF weight of term ``t`` for document ``d`` within ``docs``.

  Args:
    t: The term to score.
    d: The document being scored.
    docs: The corpus passed to ``idf``.

  Returns:
    The product ``tf(t, d) * idf(t, docs)``.
  """
  term_freq = tf(t, d)
  inverse_doc_freq = idf(t, docs)
  return term_freq * inverse_doc_freq


# Toy Korean corpus: each string is one document made of whitespace-separated tokens.
docs = [
  '먹고 싶은 사과',
  '먹고 싶은 바나나',
  '길고 노란 바나나 바나나',
  '저는 과일이 좋아요',
]

# Vocabulary: every distinct token in the corpus, in sorted order
# (this order becomes the column order of the document-term matrix).
vocab = sorted({token for text in docs for token in text.split()})
print(f'vocab : {vocab}\n')

# Build the document-term matrix: one row per document, one TF-IDF column per vocab term.
result = [[tfidf(term, text, docs) for term in vocab] for text in docs]

# Label the columns with the vocabulary terms and display the frame as the cell output.
dtm_tfidf = pd.DataFrame(result, columns=vocab)
dtm_tfidf

vocab : ['과일이', '길고', '노란', '먹고', '바나나', '사과', '싶은', '저는', '좋아요']



Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
0,0.0,0.0,0.0,0.287682,0.0,0.693147,0.287682,0.0,0.0
1,0.0,0.0,0.0,0.287682,0.287682,0.0,0.287682,0.0,0.0
2,0.0,0.693147,0.693147,0.0,0.575364,0.0,0.0,0.0,0.0
3,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,0.693147


## 2. TF-IDF with English
- We can use **`TfidfVectorizer`** in `scikit-learn` for English

In [3]:
# English toy corpus (one string per document).
docs = [
  'you know I want your love',
  'I like you',
  'what should I do ',
]

# Fit scikit-learn's TfidfVectorizer on the corpus and densify the resulting matrix.
# NOTE: the recorded output has no column for 'I'; TfidfVectorizer's default
# token pattern lowercases text and ignores one-character tokens.
vectorizer = TfidfVectorizer()
tfidf_arr = vectorizer.fit_transform(docs).toarray()

# vocabulary_ maps term -> column index; sort the terms by that index so the
# labels line up with the columns of tfidf_arr.
cols = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

pd.DataFrame(tfidf_arr, columns=cols)

Unnamed: 0,do,know,like,love,should,want,what,you,your
0,0.0,0.467351,0.0,0.467351,0.0,0.467351,0.0,0.355432,0.467351
1,0.0,0.0,0.795961,0.0,0.0,0.0,0.0,0.605349,0.0
2,0.57735,0.0,0.0,0.0,0.57735,0.0,0.57735,0.0,0.0
