## TF-IDF (Term Frequency - Inverse Document Frequency)
DTM을 기반으로 중요한 단어에 가중치를 주는 방식이에요! 결과적으로만 말하자면 TF-IDF 값을 기준으로 중요한 단어는 값이 올라가고, 중요하지 않은 단어는 값이 내려갑니다!

tf-idf의 정의에 대해 이야기 해보겠습니다.

* $tf(d, t)$ : 특정 문서 $d$에서 특정 단어 $t$가 등장한 횟수. 즉, DTM에서 각 단어가 갖는 값
* $df(t)$ : 단어 $t$가 등장한 문서의 수
* $idf(t)$ : $df(t)$에 반비례 하는 수.

참고로 idf는 다음과 같아요!

$$
idf(t) = \log\left(\frac{n}{1+df(t)}\right)
$$

위 식에서 $n$은 문서의 개수입니다!

In [13]:
from collections import Counter
import numpy as np

# Term frequency (tf): how often a term occurs inside a single document.
def term_frequency(term, document):
  """Return the number of times ``term`` occurs in ``document``.

  The count is delegated to ``document.count(term)``, so the matching rule
  depends on the type of ``document``:

  * ``str``  -- non-overlapping *substring* occurrences are counted
    (e.g. a one-character term also matches inside longer words).
  * ``list`` -- only elements equal to ``term`` are counted.

  Args:
    term: The word to look for.
    document: Any object exposing ``count`` (a raw string or a token list).

  Returns:
    The occurrence count as an ``int``.
  """
  occurrences = document.count(term)
  return occurrences

# Document frequency (df): in how many of the documents a term appears.
def document_frequency(term, documents):
  """Return the number of documents in ``documents`` that contain ``term``.

  Containment is checked with the ``in`` operator, so for ``str`` documents
  this is a substring test, while for token lists it is an exact-element test.

  Args:
    term: The word to look for.
    documents: An iterable of documents (strings or token lists).

  Returns:
    The number of documents containing ``term`` (0 for an empty iterable).
  """
  # Each membership test yields a bool; summing them counts the True results.
  return sum(term in doc for doc in documents)

def inverse_document_frquency(term, documents):
  """Return the inverse document frequency ``log(n / (1 + df))`` of ``term``.

  ``n`` is ``len(documents)`` and ``df`` is the number of documents that
  contain ``term``. The ``+ 1`` in the denominator avoids division by zero
  for terms that occur in no document. When a term occurs in every document
  the ratio is below 1, so the returned value is negative.

  The function name keeps its original spelling because other code in this
  file calls it by that name.

  Args:
    term: The word to score.
    documents: A sized collection of documents (strings or token lists).

  Returns:
    The natural-log idf value computed with ``np.log``.
  """
  smoothed_df = document_frequency(term, documents) + 1
  ratio = len(documents) / smoothed_df
  return np.log(ratio)

def tf_idf(term, documents, idx):
  """Return the TF-IDF weight of ``term`` for the document at ``idx``.

  The weight is the term's frequency inside ``documents[idx]`` multiplied by
  its inverse document frequency over the whole ``documents`` collection.

  Args:
    term: The word to score.
    documents: The full collection of documents.
    idx: Index of the document whose term frequency is used.

  Returns:
    ``tf * idf`` for the given term and document.
  """
  count_in_doc = term_frequency(term, documents[idx])
  return count_in_doc * inverse_document_frquency(term, documents)

In [None]:
!pip install konlpy

In [4]:
from konlpy.tag import Okt
# Okt tagger instance; its nouns() method is used later to build the vocabulary.
okt = Okt()

# Sample corpus of four documents. Every document contains the same refrain
# sentence ('무궁화 삼천리 화려 강산 ...'), so the words in that sentence occur
# in all four documents.
docs = [
  '동해 물과 백두산이 마르고 닳도록 하느님이 보우하사 우리나라 만세. 무궁화 삼천리 화려 강산 대한 사람, 대한으로 길이 보전하세. 동해 가고 싶다',
  '남산 위에 저 소나무, 철갑을 두른 듯 바람 서리 불변함은 우리 기상일세. 무궁화 삼천리 화려 강산 대한 사람, 대한으로 길이 보전하세. 소나무 이쁘다',
  '가을 하늘 공활한데 높고 구름 없이 밝은 달은 우리 가슴 일편단심일세. 무궁화 삼천리 화려 강산 대한 사람, 대한으로 길이 보전하세. 가을 하늘 보고 싶다.',
  '이 기상과 이 마음으로 충성을 다하여 괴로우나 즐거우나 나라 사랑하세. 무궁화 삼천리 화려 강산 대한 사람, 대한으로 길이 보전하세. 나라를 사랑하자'
] 

In [5]:
# Vocabulary: the distinct nouns Okt extracts from all documents.
vocab = list({noun for text in docs for noun in okt.nouns(text)})
vocab

['길이',
 '저',
 '마음',
 '마르고',
 '물',
 '가슴',
 '대한',
 '보전',
 '사랑',
 '달',
 '불변',
 '기상',
 '동해',
 '듯',
 '우리나라',
 '우리',
 '이',
 '소나무',
 '일편단심',
 '화려',
 '함',
 '서리',
 '가을',
 '사람',
 '나라',
 '삼천리',
 '충성',
 '구름',
 '보우',
 '보고',
 '데',
 '하늘',
 '위',
 '만세',
 '하사',
 '남산',
 '백두산',
 '무궁화',
 '하느님',
 '철갑',
 '바람',
 '강산',
 '활']

In [6]:
# Sort the vocabulary in place (Korean alphabetical order) so the column
# order of the tables built from it is stable.
vocab.sort()

In [10]:
import pandas as pd
# Show the tf results as a DataFrame:
# one row per document, one column per vocabulary word.
result = [[term_frequency(word, doc) for word in vocab] for doc in docs]

tf_ = pd.DataFrame(result, columns=vocab)
tf_

Unnamed: 0,가슴,가을,강산,구름,기상,길이,나라,남산,달,대한,데,동해,듯,마르고,마음,만세,무궁화,물,바람,백두산,보고,보우,보전,불변,사람,사랑,삼천리,서리,소나무,우리,우리나라,위,이,일편단심,저,철갑,충성,하느님,하늘,하사,함,화려,활
0,0,0,1,0,0,1,1,0,0,2,0,2,0,1,0,1,1,1,0,1,0,1,1,0,1,0,1,0,0,1,1,0,3,0,0,0,0,1,0,1,0,1,0
1,0,0,1,0,1,1,0,1,0,2,0,0,1,0,0,0,1,0,1,0,0,0,1,1,1,0,1,1,2,1,0,1,2,0,1,1,0,0,0,0,1,1,0
2,1,2,1,1,0,1,0,0,1,2,1,0,0,0,0,0,1,0,0,0,1,0,1,0,1,0,1,0,0,1,0,0,2,1,0,0,0,0,2,0,0,1,1
3,0,0,1,0,1,1,2,0,0,2,0,0,0,0,1,0,1,0,0,0,0,0,1,0,1,2,1,0,0,0,0,0,3,0,0,0,1,0,0,0,0,1,0


In [11]:
# Show the idf results as a DataFrame: one row per vocabulary word.
result = [inverse_document_frquency(word, docs) for word in vocab]

idf_ = pd.DataFrame(result, index=vocab)
idf_

Unnamed: 0,0
가슴,0.693147
가을,0.693147
강산,-0.223144
구름,0.693147
기상,0.287682
길이,-0.223144
나라,0.287682
남산,0.693147
달,0.693147
대한,-0.223144


In [14]:
# Compute tf-idf for every (document, word) pair:
# one row per document, one column per vocabulary word.
result = [
  [tf_idf(word, docs, doc_idx) for word in vocab]
  for doc_idx in range(len(docs))
]

tfidf_ = pd.DataFrame(result, columns=vocab)
tfidf_


Unnamed: 0,가슴,가을,강산,구름,기상,길이,나라,남산,달,대한,데,동해,듯,마르고,마음,만세,무궁화,물,바람,백두산,보고,보우,보전,불변,사람,사랑,삼천리,서리,소나무,우리,우리나라,위,이,일편단심,저,철갑,충성,하느님,하늘,하사,함,화려,활
0,0.0,0.0,-0.223144,0.0,0.0,-0.223144,0.287682,0.0,0.0,-0.446287,0.0,1.386294,0.0,0.693147,0.0,0.693147,-0.223144,0.693147,0.0,0.693147,0.0,0.693147,-0.223144,0.0,-0.223144,0.0,-0.223144,0.0,0.0,0.0,0.693147,0.0,-0.669431,0.0,0.0,0.0,0.0,0.693147,0.0,0.693147,0.0,-0.223144,0.0
1,0.0,0.0,-0.223144,0.0,0.287682,-0.223144,0.0,0.693147,0.0,-0.446287,0.0,0.0,0.693147,0.0,0.0,0.0,-0.223144,0.0,0.693147,0.0,0.0,0.0,-0.223144,0.693147,-0.223144,0.0,-0.223144,0.693147,1.386294,0.0,0.0,0.693147,-0.446287,0.0,0.693147,0.693147,0.0,0.0,0.0,0.0,0.693147,-0.223144,0.0
2,0.693147,1.386294,-0.223144,0.693147,0.0,-0.223144,0.0,0.0,0.693147,-0.446287,0.693147,0.0,0.0,0.0,0.0,0.0,-0.223144,0.0,0.0,0.0,0.693147,0.0,-0.223144,0.0,-0.223144,0.0,-0.223144,0.0,0.0,0.0,0.0,0.0,-0.446287,0.693147,0.0,0.0,0.0,0.0,1.386294,0.0,0.0,-0.223144,0.693147
3,0.0,0.0,-0.223144,0.0,0.287682,-0.223144,0.575364,0.0,0.0,-0.446287,0.0,0.0,0.0,0.0,0.693147,0.0,-0.223144,0.0,0.0,0.0,0.0,0.0,-0.223144,0.0,-0.223144,1.386294,-0.223144,0.0,0.0,0.0,0.0,0.0,-0.669431,0.0,0.0,0.0,0.693147,0.0,0.0,0.0,0.0,-0.223144,0.0


## 텐서플로우로 BOW(Bag Of Words) 구현하기

In [15]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [16]:
# Fit a Keras Tokenizer on the raw documents, then print the resulting
# word -> integer index mapping.
t = Tokenizer()
t.fit_on_texts(docs)
print(t.word_index)

{'무궁화': 1, '삼천리': 2, '화려': 3, '강산': 4, '대한': 5, '사람': 6, '대한으로': 7, '길이': 8, '보전하세': 9, '동해': 10, '싶다': 11, '소나무': 12, '우리': 13, '가을': 14, '하늘': 15, '이': 16, '물과': 17, '백두산이': 18, '마르고': 19, '닳도록': 20, '하느님이': 21, '보우하사': 22, '우리나라': 23, '만세': 24, '가고': 25, '남산': 26, '위에': 27, '저': 28, '철갑을': 29, '두른': 30, '듯': 31, '바람': 32, '서리': 33, '불변함은': 34, '기상일세': 35, '이쁘다': 36, '공활한데': 37, '높고': 38, '구름': 39, '없이': 40, '밝은': 41, '달은': 42, '가슴': 43, '일편단심일세': 44, '보고': 45, '기상과': 46, '마음으로': 47, '충성을': 48, '다하여': 49, '괴로우나': 50, '즐거우나': 51, '나라': 52, '사랑하세': 53, '나라를': 54, '사랑하자': 55}


In [17]:
# Build the DTM (document-term matrix) using 'count' mode.
print(t.texts_to_matrix(docs, mode='count'))

[[0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 2. 1. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1.
  1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 2. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 0. 1. 2. 2. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1.
  1. 1. 1. 1. 1. 1. 1. 1.]]


In [19]:
# Binary mode: only encodes whether a word is present (1 if present, 0 if not).
print(t.texts_to_matrix(docs, mode='binary'))

[[0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1.
  1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1.
  1. 1. 1. 1. 1. 1. 1. 1.]]


In [18]:
# TF-IDF mode.
print(t.texts_to_matrix(docs, mode='tfidf'))

[[0.         0.58778666 0.58778666 0.58778666 0.58778666 0.58778666
  0.58778666 0.58778666 0.58778666 0.58778666 1.8601123  0.84729786
  0.         0.         0.         0.         0.         1.09861229
  1.09861229 1.09861229 1.09861229 1.09861229 1.09861229 1.09861229
  1.09861229 1.09861229 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]
 [0.         0.58778666 0.58778666 0.58778666 0.58778666 0.58778666
  0.58778666 0.58778666 0.58778666 0.58778666 0.         0.
  1.8601123  0.84729786 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         1.09861229 1.09861229 1.09861229 1.09861229
  1.09861229 1.09861229 1.09861229 1.09861229 1.09861229 1.09861229
  1.09861229 0.    