In [253]:
import pandas as pd
# from math import log
import numpy as np

In [254]:
# Toy corpus: four short Korean documents whose tokens are separated by spaces.
corpus = [
    '배우고 싶은 자연어',
    '배우고 싶은 딥러닝',
    '딥러닝 머신러닝 배우고 싶은 머신러닝',
    '자연어 처리 좋아요'
]

In [255]:
# Vocabulary: every distinct whitespace-separated token in the corpus, sorted.
vocab = sorted({token for document in corpus for token in document.split()})

In [256]:
# Display the sorted vocabulary.
vocab

['딥러닝', '머신러닝', '배우고', '싶은', '자연어', '좋아요', '처리']

In [257]:
# N is the vocabulary size (number of distinct tokens), not the number of documents.
N = len(vocab)
N

7

In [258]:
# Number of documents in the corpus
dc = len(corpus)

# Term frequency
def tf(t, d):
    """Return how many times term ``t`` occurs as a whole token in document ``d``.

    Args:
        t: The term (a single whitespace-free token).
        d: The document, a string whose tokens are separated by whitespace.

    Returns:
        int: Number of tokens in ``d`` that are exactly equal to ``t``.
    """
    # Count over tokens, not over the raw string: str.count would also match
    # ``t`` inside longer words (e.g. '자연어' inside '자연어처리').
    return d.split().count(t)

# Inverse document frequency
def idf(t, docs=None):
    """Return the smoothed inverse document frequency of term ``t``.

    Computes ``ln((n_docs + 1) / (df + 1)) + 1``, where ``df`` is the number
    of documents that contain ``t`` as a whole whitespace-separated token.

    Args:
        t: The term (a single whitespace-free token).
        docs: Optional sequence of document strings to compute the statistic
            over. When omitted, the module-level ``corpus`` (with its document
            count ``dc``) is used, as before.

    Returns:
        The IDF weight as a float (a NumPy scalar, since ``np.log`` is used).
    """
    if docs is None:
        docs, n_docs = corpus, dc
    else:
        n_docs = len(docs)

    # Document frequency: test membership in the token list, not in the raw
    # string, so a term is not counted when it only appears inside a longer word.
    df = sum(t in doc.split() for doc in docs)

    # (total documents + 1) / (documents containing the term + 1), plus 1 so
    # that a term present in every document still gets a non-zero weight.
    return np.log((n_docs + 1) / (df + 1)) + 1

# TF-IDF weight = term frequency * inverse document frequency
def tfidf(t, d):
    """Return the TF-IDF weight of term ``t`` in document ``d``.

    The weight is the product of ``tf(t, d)`` and ``idf(t)``.
    """
    term_count = tf(t, d)
    inverse_doc_freq = idf(t)
    return term_count * inverse_doc_freq

In [259]:
# Term-frequency matrix: one row per document, one column per vocabulary term.
# Iterating the corpus and vocabulary directly avoids index bookkeeping and
# any reliance on `dc` still matching len(corpus).
result = [[tf(term, doc) for term in vocab] for doc in corpus]

tf_ = pd.DataFrame(result, columns=vocab)

In [260]:
# Display the term-frequency table.
tf_

Unnamed: 0,딥러닝,머신러닝,배우고,싶은,자연어,좋아요,처리
0,0,0,1,1,1,0,0
1,1,0,1,1,0,0,0
2,1,2,1,1,0,0,0
3,0,0,0,0,1,1,1


In [261]:
# IDF table: one row per vocabulary term, with a single 'IDF' column.
result = [idf(term) for term in vocab]

idf_ = pd.DataFrame(result, index=vocab, columns=['IDF'])
idf_

Unnamed: 0,IDF
딥러닝,1.510826
머신러닝,1.916291
배우고,1.223144
싶은,1.223144
자연어,1.510826
좋아요,1.916291
처리,1.916291


In [262]:
# TF-IDF matrix: one row per document, one column per vocabulary term.
# Iterating the corpus and vocabulary directly avoids index bookkeeping and
# any reliance on `dc` still matching len(corpus).
result = [[tfidf(term, doc) for term in vocab] for doc in corpus]
tfidf_ = pd.DataFrame(result, columns=vocab)
tfidf_

Unnamed: 0,딥러닝,머신러닝,배우고,싶은,자연어,좋아요,처리
0,0.0,0.0,1.223144,1.223144,1.510826,0.0,0.0
1,1.510826,0.0,1.223144,1.223144,0.0,0.0,0.0
2,1.510826,3.832581,1.223144,1.223144,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.510826,1.916291,1.916291


In [263]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [252]:
TfidfVectorizer?

Init signature:
TfidfVectorizer(
    *,
    input='content',
    encoding='utf-8',
    decode_error='strict',
    strip_accents=None,
    lowercase=True,
    preprocessor=None,
    tokenizer=None,
    analyzer='word',
    stop_words=None,
    token_pattern='(?u)\\b\\w\\w+\\b',
    ngram_range=(1, 1),

In [264]:
# Fit a TfidfVectorizer with norm=None so the raw (un-normalised) weights are returned
tfidfvect = TfidfVectorizer(norm=None)
tfidf_matrix = tfidfvect.fit_transform(corpus)

# Vocabulary terms, in the column order of tfidf_matrix
feature_names = tfidfvect.get_feature_names_out()

# Row labels shared by the two per-document tables below
doc_labels = [f'문서_{i+1}' for i in range(len(corpus))]

# 1. TF (term frequency)
# A second vectorizer with use_idf=False yields the term counts alone
tf_tfidfvect = TfidfVectorizer(use_idf=False, norm=None)
tf_matrix = tf_tfidfvect.fit_transform(corpus)
tf_values = pd.DataFrame(
    tf_matrix.toarray(),
    # Label the columns with the feature names of the vectorizer that produced
    # this matrix instead of assuming both vectorizers share a column order.
    columns=tf_tfidfvect.get_feature_names_out(),
    index=doc_labels
)

# 2. IDF values learned by the fitted vectorizer
idf_values = pd.Series(
    tfidfvect.idf_,
    index=feature_names
)

# 3. TF-IDF values (final result)
tfidf_values = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=feature_names,
    index=doc_labels
)

# Print the three tables
print("=== 1. TF (Term Frequency) 값 ===")
print(tf_values)
print("\n=== 2. IDF (Inverse Document Frequency) 값 ===")
print(idf_values)
print("\n=== 3. TF-IDF 값 (TF * IDF) ===")
print(tfidf_values)

# Verification
print("\n=== 계산 검증 ===")
# Check, for one word, that TF * IDF reproduces the vectorizer's TF-IDF value
word = "파이썬"
if word in feature_names:
    print(word)
    for doc_idx in range(len(corpus)):
        # The *_val names keep this loop from overwriting the tf()/idf()/tfidf()
        # helper functions defined earlier in the notebook.
        tf_val = tf_values.iloc[doc_idx][word]
        idf_val = idf_values[word]
        tfidf_val = tfidf_values.iloc[doc_idx][word]
        print(f"\n문서_{doc_idx+1}의 '{word}' 단어:")
        print(f"TF: {tf_val:.4f}")
        print(f"IDF: {idf_val:.4f}")
        print(f"계산된 TF-IDF (TF * IDF): {tf_val * idf_val:.4f}")
        print(f"실제 TF-IDF 값: {tfidf_val:.4f}")
else:
    # Report an out-of-vocabulary word explicitly instead of printing nothing.
    print(f"'{word}' 단어는 어휘 사전에 없습니다.")

=== 1. TF (Term Frequency) 값 ===
      딥러닝  머신러닝  배우고   싶은  자연어  좋아요   처리
문서_1  0.0   0.0  1.0  1.0  1.0  0.0  0.0
문서_2  1.0   0.0  1.0  1.0  0.0  0.0  0.0
문서_3  1.0   2.0  1.0  1.0  0.0  0.0  0.0
문서_4  0.0   0.0  0.0  0.0  1.0  1.0  1.0

=== 2. IDF (Inverse Document Frequency) 값 ===
딥러닝     1.510826
머신러닝    1.916291
배우고     1.223144
싶은      1.223144
자연어     1.510826
좋아요     1.916291
처리      1.916291
dtype: float64

=== 3. TF-IDF 값 (TF * IDF) ===
           딥러닝      머신러닝       배우고        싶은       자연어       좋아요        처리
문서_1  0.000000  0.000000  1.223144  1.223144  1.510826  0.000000  0.000000
문서_2  1.510826  0.000000  1.223144  1.223144  0.000000  0.000000  0.000000
문서_3  1.510826  3.832581  1.223144  1.223144  0.000000  0.000000  0.000000
문서_4  0.000000  0.000000  0.000000  0.000000  1.510826  1.916291  1.916291

=== 계산 검증 ===


In [246]:
# Verification
print("\n=== 계산 검증 ===")
# Check, for one word, that TF * IDF reproduces the vectorizer's TF-IDF value
word = "딥러닝"
if word in feature_names:
    print(word)
    for doc_idx in range(len(corpus)):
        # The *_val names keep this loop from overwriting the tf()/idf()/tfidf()
        # helper functions defined earlier in the notebook; rebinding them to
        # floats would break any later re-run of the cells that call them.
        tf_val = tf_values.iloc[doc_idx][word]
        idf_val = idf_values[word]
        tfidf_val = tfidf_values.iloc[doc_idx][word]
        print(f"\n문서_{doc_idx+1}의 '{word}' 단어:")
        print(f"TF: {tf_val:.4f}")
        print(f"IDF: {idf_val:.4f}")
        print(f"계산된 TF-IDF (TF * IDF): {tf_val * idf_val:.4f}")
        print(f"실제 TF-IDF 값: {tfidf_val:.4f}")
else:
    # Report an out-of-vocabulary word explicitly instead of printing nothing.
    print(f"'{word}' 단어는 어휘 사전에 없습니다.")


=== 계산 검증 ===
딥러닝

문서_1의 '딥러닝' 단어:
TF: 0.0000
IDF: 1.5108
계산된 TF-IDF (TF * IDF): 0.0000
실제 TF-IDF 값: 0.0000

문서_2의 '딥러닝' 단어:
TF: 1.0000
IDF: 1.5108
계산된 TF-IDF (TF * IDF): 1.5108
실제 TF-IDF 값: 1.5108

문서_3의 '딥러닝' 단어:
TF: 1.0000
IDF: 1.5108
계산된 TF-IDF (TF * IDF): 1.5108
실제 TF-IDF 값: 1.5108

문서_4의 '딥러닝' 단어:
TF: 0.0000
IDF: 1.5108
계산된 TF-IDF (TF * IDF): 0.0000
실제 TF-IDF 값: 0.0000
