# 내 풀이

In [3]:
import numpy as np

In [5]:
# vocabulary 생성
def build_dictionary(corpus):
    vocab_dict = {}
    for c in corpus:
        vocabs = c.strip().split() # 각 문장 내 unique 어휘 찾기
        for vocab in vocabs:
            vocab_key = vocab.lower() # 소문자 변환
            if vocab_key not in vocab_dict: # dictonary 내 색인 생성
                vocab_dict[vocab_key] = 0
    
    return sorted(vocab_dict)

# TF 계산
def calc_TF(corpus, vocab_dict):
    flag = len(vocab_dict)
    TF_matrix = [[v, 0]  for _ in range(len(corpus)) for v in vocab_dict]
    for i in range(len(corpus)):
        sent = corpus[i].strip().lower().split()
        vocab_cnt = 0
        for s in sent:
            for vocab in TF_matrix[i*flag:(i+1)*flag]:
                if s == vocab[0]:
                    vocab[1] += 1 # 빈도 추가
                    vocab_cnt += 1 # 총 등장 빈도 수
        # print(f"{sent}에서 총 등장한 어휘 빈도 수 : {vocab_cnt}")
        for j in range(flag):
            TF_matrix[i*flag:(i+1)*flag][j][1] /= vocab_cnt

    return TF_matrix

# IDF 계산
def calc_IDF(corpus, vocab_dict):
    # IDF 행렬 초기화
    IDF_matrix = [[vocab, 0] for vocab in vocab_dict]

    # IDF 계산
    for j in range(len(IDF_matrix)):
        for i in range(1, len(corpus)):
            sent = corpus[i].strip().lower().split()
            if IDF_matrix[j][0] in sent:
                IDF_matrix[j][0]
                IDF_matrix[j][1] += 1
    
    for i in range(len(IDF_matrix)):
        IDF_matrix[i][1] = np.log((len(corpus)-1)/IDF_matrix[i][1])

    return IDF_matrix * len(corpus)

# TF-IDF 계산
def calc_TFIDF(TF, IDF, corpus, vocab_dict):
    flag = len(vocab_dict)
    TFIDF_matrix = []

    TFIDF_matrix = []
    for i in range(len(corpus)):
        tf = np.array([tf[1] for tf in TF[i*flag:(i+1)*flag]], dtype=np.float32)
        idf = np.array([idf[1] for idf in IDF[i*flag:(i+1)*flag]], dtype=np.float32)
        TFIDF_matrix.append(tf*idf)
    return np.array(TFIDF_matrix).T

# 코사인 유사도 계산
def calc_cos_sim(A, B):
    # 분자
    bunja = A * B

    # 분모
    norm_A, norm_B = 0, 0
    for a in A:
        norm_A += a**2
    for b in B:
        norm_B += b**2
    bunmo = np.sqrt(norm_A) * np.sqrt(norm_B)

    return (bunja / bunmo).T.sum()

In [9]:
# 테스트
sentences = [
             'gold silver truck',
             'Shipment of gold damaged in a fire',
             'Delivery of silver arrived in a silver truck',
             'Shipment of gold arrived in a truck'
             ]

dictionary = build_dictionary(sentences)
print(f"vocabulary 사전: \n{dictionary}")
print("")

tf_matrix = calc_TF(sentences, dictionary)
print(f"Term Frequency: \n{tf_matrix}")
print("")

idf_matrix = calc_IDF(sentences, dictionary)
print(f"Inverse Document Frequency: \n{idf_matrix}")
print("")

tfidf_matrix = calc_TFIDF(tf_matrix, idf_matrix, sentences, dictionary)
print(f"TF-IDF: \n{tfidf_matrix}")
print("")

cos_sim_docs = {}
for i in range(1, len(sentences)):
    cos_sim_docs[sentences[i]] = calc_cos_sim(tfidf_matrix.T[0], tfidf_matrix.T[i])
print(f"문서 간 코사인유사도: \n{cos_sim_docs}")

vocabulary 사전: 
['a', 'arrived', 'damaged', 'delivery', 'fire', 'gold', 'in', 'of', 'shipment', 'silver', 'truck']

Term Frequency: 
[['a', 0.0], ['arrived', 0.0], ['damaged', 0.0], ['delivery', 0.0], ['fire', 0.0], ['gold', 0.3333333333333333], ['in', 0.0], ['of', 0.0], ['shipment', 0.0], ['silver', 0.3333333333333333], ['truck', 0.3333333333333333], ['a', 0.14285714285714285], ['arrived', 0.0], ['damaged', 0.14285714285714285], ['delivery', 0.0], ['fire', 0.14285714285714285], ['gold', 0.14285714285714285], ['in', 0.14285714285714285], ['of', 0.14285714285714285], ['shipment', 0.14285714285714285], ['silver', 0.0], ['truck', 0.0], ['a', 0.125], ['arrived', 0.125], ['damaged', 0.0], ['delivery', 0.125], ['fire', 0.0], ['gold', 0.0], ['in', 0.125], ['of', 0.125], ['shipment', 0.0], ['silver', 0.25], ['truck', 0.125], ['a', 0.14285714285714285], ['arrived', 0.14285714285714285], ['damaged', 0.0], ['delivery', 0.0], ['fire', 0.0], ['gold', 0.14285714285714285], ['in', 0.14285714285714285

# 다른 사람 풀이

## 1)

In [12]:
import numpy as np
from numpy import dot
from numpy.linalg import norm
Search = input("")

DOC1 = str.split('shipment of gold damaged in a fire')
DOC2 = str.split('delivery of silver arrived in a silver truck')
DOC3 = str.split('shipment of gold arrived in a truck')

Terms_raw = DOC1+DOC2+DOC3+str.split(Search)

Search2 = str.split(Search)

Terms = list(set(Terms_raw))
Terms.sort()
print(Terms)

DF = []
for Term in Terms:
     DF.append(list(set(DOC1)).count(Term)+
               list(set(DOC2)).count(Term)+
               list(set(DOC3)).count(Term))
    
IDF = []
for df in DF:
    IDF.append(np.log10(3/df))
print(IDF)

TF1 = []
TF2 = []
TF3 = []
TFsearch = []

for Term in Terms:
    TF1.append(DOC1.count(Term)/len(DOC1))
    TF2.append(DOC2.count(Term)/len(DOC2))
    TF3.append(DOC3.count(Term)/len(DOC3))
    TFsearch.append(Search2.count(Term)/len(Search2))
    
TF1_IDF = []
TF2_IDF = []
TF3_IDF = []
TFsearch_IDF = []
for i in range(len(Terms)):
     TF1_IDF.append(IDF[i]*TF1[i])
     TF2_IDF.append(IDF[i]*TF2[i])
     TF3_IDF.append(IDF[i]*TF3[i])
     TFsearch_IDF.append(IDF[i]*TFsearch[i])
print(TFsearch_IDF)


TF1_IDF= np.array(TF1_IDF)
TFsearch_IDF = np.array(TFsearch_IDF)
def cos_sim(A, B):
       return dot(A, B)/(norm(A)*norm(B))

print(cos_sim(TFsearch_IDF,TF1_IDF))
print(cos_sim(TFsearch_IDF,TF2_IDF))
print(cos_sim(TFsearch_IDF,TF3_IDF))

gold silver truck
['a', 'arrived', 'damaged', 'delivery', 'fire', 'gold', 'in', 'of', 'shipment', 'silver', 'truck']
[0.0, 0.17609125905568124, 0.47712125471966244, 0.47712125471966244, 0.47712125471966244, 0.17609125905568124, 0.0, 0.0, 0.17609125905568124, 0.47712125471966244, 0.17609125905568124]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.058697086351893746, 0.0, 0.0, 0.0, 0.15904041823988746, 0.058697086351893746]
0.08010451753994623
0.8247514231034944
0.32718457421366


## 2)

In [13]:
import numpy as np
import pandas as pd
from math import log

# 단어리스트 만들기
docs= ["gold silver truck",\
    "Shipment of gold damaged in a fire",\
        "Delivery of silver arrived in a silver truck",\
            "Shipment of gold arrived in a truck"]
wordset= list(set(w for doc in docs for w in doc.split()))
wordset.sort()

# 함수 정의하기
N= len(docs)

def tf(t,d):
    if len(t) !=1:
        return d.count(t)
    else:
        return int(t in d)

def idf(t):
    df = 0
    for doc in docs:
            df += t in doc
    return log(N/df)

def tfidf(t, d):
    return tf(t,d)* idf(t)

# DTM
result = []
for i in range(N):
    result.append([])
    d = docs[i]
    for j in range(len(wordset)):
        t = wordset[j]        
        result[-1].append(tf(t, d))

dtm_ = pd.DataFrame(result, columns = wordset)
dtm_

df_= dtm_.sum(axis= 1)

# IDF
result2 = []
for j in range(len(wordset)):
    t = wordset[j]
    result2.append(idf(t))

idf_ = pd.DataFrame(result2, index = wordset, columns = ["IDF"])
idf_

# TF-IDF
result3 = []
for i in range(N):
    result3.append([])
    d = docs[i]
    for j in range(len(wordset)):
        t = wordset[j]

        result3[-1].append(tfidf(t,d))

tfidf_ = pd.DataFrame(result3, columns = wordset)
tfidf_

print("DTM: \n",dtm_)
print("\n DF: \n",df_)
print("\n IDF: \n", idf_)
print("\n TFIDF: \n", tfidf_)

DTM: 
    Delivery  Shipment  a  arrived  damaged  fire  gold  in  of  silver  truck
0         0         0  0        0        0     0     1   0   0       1      1
1         0         1  1        0        1     1     1   1   1       0      0
2         1         0  1        1        0     0     0   1   1       2      1
3         0         1  1        1        0     0     1   1   1       0      1

 DF: 
 0    3
1    7
2    8
3    7
dtype: int64

 IDF: 
                IDF
Delivery  1.386294
Shipment  0.693147
a         0.287682
arrived   0.693147
damaged   1.386294
fire      1.386294
gold      0.287682
in        0.287682
of        0.287682
silver    0.693147
truck     0.287682

 TFIDF: 
    Delivery  Shipment         a  ...        of    silver     truck
0  0.000000  0.000000  0.000000  ...  0.000000  0.693147  0.287682
1  0.000000  0.693147  0.287682  ...  0.287682  0.000000  0.000000
2  1.386294  0.000000  0.287682  ...  0.287682  1.386294  0.287682
3  0.000000  0.693147  0.287682  ...  

## 3)

In [14]:
# 자연어처리 쿡북 : p.227, 텍스트 유사도 문제 (TFIDF)
import nltk
import math
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt')

docs = [
            'gold silver truck',
            'Shipment of gold damaged in a fire',
            'Delivery of silver arrived in a silver truck',
            'Shipment of gold arrived in a truck'
        ]

word_set = set()
for doc in docs:
    words = nltk.word_tokenize(doc.lower())
    word_set = word_set.union(set(words))
word_list = list(word_set)
word_list.sort()
word_list

def TF(my_word, doc):
    count = 0

    words = nltk.word_tokenize(doc.lower())
    for word in words:
        if my_word == word:
            count+=1
    return count/len(words)
TF('silver', docs[2])

def IDF(my_word, docs):
    count = 0
    for doc in docs:
        words = nltk.word_tokenize(doc.lower())
        for word in words:
            if my_word == word:
                count+=1
                break
    return len(docs) / count

word_idf_dict = {}
for word in word_list:
    word_idf_dict[word] = IDF(word, docs)
word_idf_dict

import math 

def TF_IDF(doc):
    doc_tf_idf_list = []
    for word in word_list:
        tf = TF(word, doc)
        word_tf_idf = tf*math.log(word_idf_dict[word])
        doc_tf_idf_list.append(word_tf_idf)
    return doc_tf_idf_list
TF_IDF(docs[1])

doc_TF_IDF_list = []
for doc in docs:
    doc_TF_IDF_list.append(TF_IDF(doc))

from sklearn.metrics.pairwise import cosine_similarity
print(docs)
cosine_similarity(doc_TF_IDF_list, doc_TF_IDF_list)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
['gold silver truck', 'Shipment of gold damaged in a fire', 'Delivery of silver arrived in a silver truck', 'Shipment of gold arrived in a truck']


array([[1.        , 0.0477257 , 0.60185049, 0.17564754],
       [0.0477257 , 1.        , 0.05333544, 0.32078621],
       [0.60185049, 0.05333544, 1.        , 0.32078621],
       [0.17564754, 0.32078621, 0.32078621, 1.        ]])

# 강사님 풀이

In [15]:
# TFIDF 연습
# 2020.07-21
# ----------
import nltk
import numpy as np

# 1. dictionary를 생성한다.
def makeVocab(sentences):
    words = [word for sentence in sentences for word in sentence.split()]
    words = list(set(words))
    words.sort()
    return {word: idx for idx, word in enumerate(words)}

# 2. TF를 생성한다.
def makeTF(sentences):
    vocab = makeVocab(sentences)
    tf = np.zeros((len(vocab), len(sentences)))
    for i, sentence in enumerate(sentences):
        freq = nltk.FreqDist(nltk.word_tokenize(sentence))
        for key in freq.keys():
            tf[vocab[key], i] = freq[key] / len(sentence)
    return tf

# 3. IDF를 생성한다.
def makeIDF(sentences, tf):
    df = tf.shape[1] - (tf == 0.0).sum(axis=1)
    return np.log(tf.shape[1] / (0+df))

# 4. TFIDF를 생성한다.
def makeTFIDF(sentences):
    tf = makeTF(sentences)
    idf = makeIDF(sentences, tf)
    return np.multiply(tf, idf.reshape(tf.shape[0], 1))

sentences = ['gold silver truck', 'shipment of gold damaged in a fire', 'delivery of silver arrived in a silver truck', 'shipment of gold arrived in a truck']
tfidf = makeTFIDF(sentences);
print(tfidf.round(4))

[[0.     0.0085 0.0065 0.0082]
 [0.     0.     0.0158 0.0198]
 [0.     0.0408 0.     0.    ]
 [0.     0.     0.0315 0.    ]
 [0.     0.0408 0.     0.    ]
 [0.0169 0.0085 0.     0.0082]
 [0.     0.0085 0.0065 0.0082]
 [0.     0.0085 0.0065 0.0082]
 [0.     0.0204 0.     0.0198]
 [0.0408 0.     0.0315 0.    ]
 [0.0169 0.     0.0065 0.0082]]
