# 문장의 표현 (Sentence Representation)

# 1 BoW (Bag of Words)

<img src="https://image.slidesharecdn.com/vector-space-models-170118145044/95/cs571-vector-space-models-3-638.jpg?cb=1485433004" />

https://en.wikipedia.org/wiki/Bag-of-words_model
https://www.slideshare.net/jchoi7s/cs571-vector-space-models

## 1.1 직접구현

In [1]:
docs = [
    '오늘 동물원에서 원숭이를 봤어',
    '오늘 동물원에서 코끼리를 봤어 봤어',
    '동물원에서 원숭이에게 바나나를 줬어 바나나를'
]

### 1) 띄어쓰기 단위로 토큰화

In [2]:
word_ls = [doc.split() for doc in docs]
word_ls

[['오늘', '동물원에서', '원숭이를', '봤어'],
 ['오늘', '동물원에서', '코끼리를', '봤어', '봤어'],
 ['동물원에서', '원숭이에게', '바나나를', '줬어', '바나나를']]

### 2) 각 고유 토큰에 인덱스(Index)를 지정

In [3]:
word2id = {}

for doc_tokens in word_ls:
    for token in doc_tokens:
        if token not in word2id.keys():
            word2id[token] = len(word2id)

word2id

{'오늘': 0,
 '동물원에서': 1,
 '원숭이를': 2,
 '봤어': 3,
 '코끼리를': 4,
 '원숭이에게': 5,
 '바나나를': 6,
 '줬어': 7}

### 3) BoW 생성

In [4]:
# 1. word2id 반복문 돌리는 방법
BoW_ls = []

for doc_tokens in word_ls:
    BoW_temp = [0] * len(word2id)

    for token in word2id:
        BoW_temp[word2id[token]] = doc_tokens.count(token)
    BoW_ls.append(BoW_temp)

BoW_ls

[[1, 1, 1, 1, 0, 0, 0, 0], [1, 1, 0, 2, 1, 0, 0, 0], [0, 1, 0, 0, 0, 1, 2, 1]]

In [5]:
# 2. doc_tokens 반복문 돌리는 방법
BoW_ls = []

for doc_tokens in word_ls:
    BoW_temp = [0] * len(word2id)

    for token in doc_tokens:
        BoW_temp[word2id[token]] += 1
    BoW_ls.append(BoW_temp)

BoW_ls

[[1, 1, 1, 1, 0, 0, 0, 0], [1, 1, 0, 2, 1, 0, 0, 0], [0, 1, 0, 0, 0, 1, 2, 1]]

In [6]:
sorted((value, key) for key, value in word2id.items())

[(0, '오늘'),
 (1, '동물원에서'),
 (2, '원숭이를'),
 (3, '봤어'),
 (4, '코끼리를'),
 (5, '원숭이에게'),
 (6, '바나나를'),
 (7, '줬어')]

In [7]:
from IPython.core import display as ICD
import pandas as pd

sorted_vocab = sorted((value, key) for key, value in word2id.items())
vocab = [v[1] for v in sorted_vocab]        # ['오늘', '동물원에서', ..., '줬어']

for i in range(len(docs)) :
    print("문서{} : {}".format(i, docs[i]))
    ICD.display(pd.DataFrame([BoW_ls[i]], columns=vocab))
    print("\n\n")

문서0 : 오늘 동물원에서 원숭이를 봤어


Unnamed: 0,오늘,동물원에서,원숭이를,봤어,코끼리를,원숭이에게,바나나를,줬어
0,1,1,1,1,0,0,0,0





문서1 : 오늘 동물원에서 코끼리를 봤어 봤어


Unnamed: 0,오늘,동물원에서,원숭이를,봤어,코끼리를,원숭이에게,바나나를,줬어
0,1,1,0,2,1,0,0,0





문서2 : 동물원에서 원숭이에게 바나나를 줬어 바나나를


Unnamed: 0,오늘,동물원에서,원숭이를,봤어,코끼리를,원숭이에게,바나나를,줬어
0,0,1,0,0,0,1,2,1









---





## 1.2 단어 순서를 고려하지 않은 BoW

In [8]:
docs = [
    '나는 양념 치킨을 좋아해 하지만 후라이드 치킨을 싫어해',
    '나는 후라이드 치킨을 좋아해 하지만 양념 치킨을 싫어해'
]

### 1) 띄어쓰기 단위로 토큰화

In [9]:
word_ls = [doc.split() for doc in docs]
word_ls

[['나는', '양념', '치킨을', '좋아해', '하지만', '후라이드', '치킨을', '싫어해'],
 ['나는', '후라이드', '치킨을', '좋아해', '하지만', '양념', '치킨을', '싫어해']]

### 2) 각 고유 토큰에 인덱스(Index)를 지정

In [10]:
word2id = {}

for doc_tokens in word_ls:
    for token in doc_tokens:
        if token not in word2id.keys():
            word2id[token] = len(word2id)

word2id

{'나는': 0, '양념': 1, '치킨을': 2, '좋아해': 3, '하지만': 4, '후라이드': 5, '싫어해': 6}

### 3) BoW 생성

In [14]:
from IPython.core import display as ICD

sorted_vocab = sorted((value, key) for key, value in word2id.items())
vocab = [v[1] for v in sorted_vocab]
for i in range(len(docs)) :
    print("문서{} : {}".format(i, docs[i]))
    ICD.display(pd.DataFrame([BoW_ls[i]], columns=vocab))
    print("\n\n")

문서0 : 나는 양념 치킨을 좋아해 하지만 후라이드 치킨을 싫어해


ValueError: 7 columns passed, passed data had 8 columns



---



https://en.wikipedia.org/wiki/Document-term_matrix

## 1.3 sklearn 활용

In [11]:
docs = [
    '오늘 동물원에서 원숭이를 봤어',
    '오늘 동물원에서 코끼리를 봤어 봤어',
    '동물원에서 원숭이에게 바나나를 줬어 바나나를'
]

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
BoW = count_vect.fit_transform(docs)

BoW.toarray()[0]

array([1, 0, 1, 1, 1, 0, 0, 0], dtype=int64)

In [13]:
count_vect.get_feature_names_out()

array(['동물원에서', '바나나를', '봤어', '오늘', '원숭이를', '원숭이에게', '줬어', '코끼리를'],
      dtype=object)

In [14]:
from IPython.core import display as ICD

vocab = count_vect.get_feature_names_out()
for i in range(len(docs)) :
    print("문서{} : {}".format(i, docs[i]))
    ICD.display(pd.DataFrame([BoW.toarray()[i]], columns=vocab))
    print("\n\n")

문서0 : 오늘 동물원에서 원숭이를 봤어


Unnamed: 0,동물원에서,바나나를,봤어,오늘,원숭이를,원숭이에게,줬어,코끼리를
0,1,0,1,1,1,0,0,0





문서1 : 오늘 동물원에서 코끼리를 봤어 봤어


Unnamed: 0,동물원에서,바나나를,봤어,오늘,원숭이를,원숭이에게,줬어,코끼리를
0,1,0,2,1,0,0,0,1





문서2 : 동물원에서 원숭이에게 바나나를 줬어 바나나를


Unnamed: 0,동물원에서,바나나를,봤어,오늘,원숭이를,원숭이에게,줬어,코끼리를
0,1,2,0,0,0,1,1,0









---


## 1.4 gensim 활용

In [15]:
docs = [
    '오늘 동물원에서 원숭이를 봤어',
    '오늘 동물원에서 코끼리를 봤어 봤어',
    '동물원에서 원숭이에게 바나나를 줬어 바나나를'
]

In [16]:
import gensim
import numpy as np
from gensim import corpora

doc_ls = [doc.split() for doc in docs]
id2word = corpora.Dictionary(doc_ls)
BoW = [id2word.doc2bow(doc) for doc in doc_ls]
BoW

[[(0, 1), (1, 1), (2, 1), (3, 1)],
 [(0, 1), (1, 2), (2, 1), (4, 1)],
 [(0, 1), (5, 2), (6, 1), (7, 1)]]

In [17]:
dict(id2word)

{0: '동물원에서',
 1: '봤어',
 2: '오늘',
 3: '원숭이를',
 4: '코끼리를',
 5: '바나나를',
 6: '원숭이에게',
 7: '줬어'}

In [18]:
from gensim.matutils import sparse2full
from IPython.core import display as ICD

vocab = [id2word[i] for i in id2word.keys()]
for i in range(len(docs)) :
    print("문서{} : {}".format(i, docs[i]))
    ICD.display(pd.DataFrame([sparse2full(BoW[i], len(vocab))], columns=vocab))
    print("\n\n")

문서0 : 오늘 동물원에서 원숭이를 봤어


Unnamed: 0,동물원에서,봤어,오늘,원숭이를,코끼리를,바나나를,원숭이에게,줬어
0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0





문서1 : 오늘 동물원에서 코끼리를 봤어 봤어


Unnamed: 0,동물원에서,봤어,오늘,원숭이를,코끼리를,바나나를,원숭이에게,줬어
0,1.0,2.0,1.0,0.0,1.0,0.0,0.0,0.0





문서2 : 동물원에서 원숭이에게 바나나를 줬어 바나나를


Unnamed: 0,동물원에서,봤어,오늘,원숭이를,코끼리를,바나나를,원숭이에게,줬어
0,1.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0









---



# 2 TDM(Term-Document Matrix)

## 2.1 직접구현

In [19]:
docs = [
    '오늘 동물원에서 원숭이를 봤어',
    '오늘 동물원에서 코끼리를 봤어 봤어',
    '동물원에서 원숭이에게 바나나를 줬어 바나나를'
]

### 1) 띄어쓰기 단위로 토큰화

In [24]:
word_ls = [doc.split() for doc in docs]
word_ls

[['오늘', '동물원에서', '원숭이를', '봤어'],
 ['오늘', '동물원에서', '코끼리를', '봤어', '봤어'],
 ['동물원에서', '원숭이에게', '바나나를', '줬어', '바나나를']]

### 2) 각 고유 토큰에 인덱스(Index)를 지정

In [20]:
word2id = {}

for doc_tokens in word_ls:
    for token in doc_tokens:
        if token not in word2id.keys():
            word2id[token] = len(word2id)

word2id

{'나는': 0, '양념': 1, '치킨을': 2, '좋아해': 3, '하지만': 4, '후라이드': 5, '싫어해': 6}

### 3) TDM 생성

## 2.2 sklearn 활용

In [21]:
docs = [
    '오늘 동물원에서 원숭이를 봤어',
    '오늘 동물원에서 코끼리를 봤어 봤어',
    '동물원에서 원숭이에게 바나나를 줬어 바나나를'
]

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
DTM = count_vect.fit_transform(docs)
DTM.toarray()

array([[1, 0, 1, 1, 1, 0, 0, 0],
       [1, 0, 2, 1, 0, 0, 0, 1],
       [1, 2, 0, 0, 0, 1, 1, 0]], dtype=int64)

In [23]:
import pandas as pd

doc_names = ['문서'+ str(i) for i in range(len(doc_ls))]
vocab = count_vect.get_feature_names_out()
df_TDM = pd.DataFrame(DTM.toarray().T, columns=doc_names)
df_TDM['단어'] = vocab
df_TDM.set_index('단어')

Unnamed: 0_level_0,문서0,문서1,문서2
단어,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
동물원에서,1,1,1
바나나를,0,0,2
봤어,1,2,0
오늘,1,1,0
원숭이를,1,0,0
원숭이에게,0,0,1
줬어,0,0,1
코끼리를,0,1,0




---


## 2.3 gensim 활용

In [24]:
docs = [
    '오늘 동물원에서 원숭이를 봤어',
    '오늘 동물원에서 코끼리를 봤어 봤어',
    '동물원에서 원숭이에게 바나나를 줬어 바나나를'
]

In [25]:
import gensim
from gensim import corpora

doc_ls = [doc.split() for doc in docs] #공백으로 토큰화
id2word = corpora.Dictionary(doc_ls)
TDM = [id2word.doc2bow(doc) for doc in doc_ls]
TDM

[[(0, 1), (1, 1), (2, 1), (3, 1)],
 [(0, 1), (1, 2), (2, 1), (4, 1)],
 [(0, 1), (5, 2), (6, 1), (7, 1)]]

In [26]:
import pandas as pd

doc_names = ['문서'+ str(i) for i in range(len(doc_ls))]
vocab = [id2word[i] for i in id2word.keys()]
DTM_matrix = [sparse2full(doc, len(vocab)).tolist() for doc in TDM]

df_TDM = pd.DataFrame(np.array(DTM_matrix, dtype=int).T, columns=doc_names)
df_TDM['단어'] = vocab
df_TDM.set_index('단어')

Unnamed: 0_level_0,문서0,문서1,문서2
단어,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
동물원에서,1,1,1
봤어,1,2,0
오늘,1,1,0
원숭이를,1,0,0
코끼리를,0,1,0
바나나를,0,0,2
원숭이에게,0,0,1
줬어,0,0,1


In [27]:
DTM_matrix

[[1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0],
 [1.0, 2.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0],
 [1.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.0, 1.0]]

---

# 3 TF-IDF (Term Frequency-Inverse Document Frequency)

<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/10109d0e60cc9d50a1ea2f189bac0ac29a030a00" />



*  TF(단어 빈도, Term Frequency) : 단어가 문서 내에 등장하는 빈도
*  IDF(역문서 빈도, Inverse Document Frequency) : 단어가 여러 문서에 공통적으로 등장하는 빈도
*  한 문서 내에 자주 등장하고 다른 문서에 자주 등장하지 않는 단어를 주요 단어로 판별할 수 있음


https://en.wikipedia.org/wiki/Tf%E2%80%93idf

## 3.1 직접계산하기 1

weighting schema|weight
--|--
tf (term frequency)|<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/91699003abf4fe8bdf861bbce08e73e71acf5fd4" />
idf(inverse document frequency) |<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/864fcfdc0c16344c11509f724f1aa7081cf9f657" />

In [28]:
docs = [
    '오늘 동물원에서 원숭이를 봤어',
    '오늘 동물원에서 코끼리를 봤어 봤어',
    '동물원에서 원숭이에게 바나나를 줬어 바나나를'
]

### 1) 띄어쓰기 단위로 토큰화

In [29]:
word_ls = [doc.split() for doc in docs]
word_ls

[['오늘', '동물원에서', '원숭이를', '봤어'],
 ['오늘', '동물원에서', '코끼리를', '봤어', '봤어'],
 ['동물원에서', '원숭이에게', '바나나를', '줬어', '바나나를']]

### 2) 각 고유 토큰에 인덱스(Index)를 지정

In [30]:
dict(enumerate(set(sum(word_ls, []))))

{0: '코끼리를',
 1: '원숭이에게',
 2: '동물원에서',
 3: '봤어',
 4: '바나나를',
 5: '줬어',
 6: '오늘',
 7: '원숭이를'}

In [31]:
id2word = dict(enumerate(set(sum(word_ls, []))))
word2id = {value: key for key, value in id2word.items()}
word2id

{'코끼리를': 0,
 '원숭이에게': 1,
 '동물원에서': 2,
 '봤어': 3,
 '바나나를': 4,
 '줬어': 5,
 '오늘': 6,
 '원숭이를': 7}

### 3) TDM 생성

In [32]:
dtm_ls = []

for tokens in word_ls:
    tdm_temp = [0] * len(word2id)
    for token in tokens:
        tdm_temp[word2id[token]] += 1
    dtm_ls.append(tdm_temp)

tdm_df = pd.DataFrame(dtm_ls, columns=word2id).T
tdm_df

Unnamed: 0,0,1,2
코끼리를,0,1,0
원숭이에게,0,0,1
동물원에서,1,1,1
봤어,1,2,0
바나나를,0,0,2
줬어,0,0,1
오늘,1,1,0
원숭이를,1,0,0


### 4) TF 계산

<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/91699003abf4fe8bdf861bbce08e73e71acf5fd4" />

TF = 문서내 토크빈도/ 문서내 전체토큰갯수

In [33]:
doc_columns = tdm_df.columns
doc_columns

RangeIndex(start=0, stop=3, step=1)

In [34]:
for doc_no in doc_columns:
    tdm_df[f'tf_{doc_no}'] = tdm_df[doc_no] / tdm_df[doc_no].sum()

tdm_df

Unnamed: 0,0,1,2,tf_0,tf_1,tf_2
코끼리를,0,1,0,0.0,0.2,0.0
원숭이에게,0,0,1,0.0,0.0,0.2
동물원에서,1,1,1,0.25,0.2,0.2
봤어,1,2,0,0.25,0.4,0.0
바나나를,0,0,2,0.0,0.0,0.4
줬어,0,0,1,0.0,0.0,0.2
오늘,1,1,0,0.25,0.2,0.0
원숭이를,1,0,0,0.25,0.0,0.0


### 5) IDF  계산

<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/864fcfdc0c16344c11509f724f1aa7081cf9f657" />

IDF = log(총문서수/토큰이등장한문서수)

In [35]:
# tdm_df['idf'] = -np.log((tdm_df[doc_columns] > 0).sum(axis=1) / len(doc_columns))
tdm_df['idf'] = np.log(len(doc_columns) / (tdm_df[doc_columns] > 0).sum(axis=1))

tdm_df

Unnamed: 0,0,1,2,tf_0,tf_1,tf_2,idf
코끼리를,0,1,0,0.0,0.2,0.0,1.098612
원숭이에게,0,0,1,0.0,0.0,0.2,1.098612
동물원에서,1,1,1,0.25,0.2,0.2,0.0
봤어,1,2,0,0.25,0.4,0.0,0.405465
바나나를,0,0,2,0.0,0.0,0.4,1.098612
줬어,0,0,1,0.0,0.0,0.2,1.098612
오늘,1,1,0,0.25,0.2,0.0,0.405465
원숭이를,1,0,0,0.25,0.0,0.0,1.098612


### 6) TF-IDF 계산

---



In [36]:
for doc_no in doc_columns:
    tdm_df[f'tfidf_{doc_no}'] = tdm_df[f'tf_{doc_no}'] * tdm_df['idf']
tdm_df

Unnamed: 0,0,1,2,tf_0,tf_1,tf_2,idf,tfidf_0,tfidf_1,tfidf_2
코끼리를,0,1,0,0.0,0.2,0.0,1.098612,0.0,0.219722,0.0
원숭이에게,0,0,1,0.0,0.0,0.2,1.098612,0.0,0.0,0.219722
동물원에서,1,1,1,0.25,0.2,0.2,0.0,0.0,0.0,0.0
봤어,1,2,0,0.25,0.4,0.0,0.405465,0.101366,0.162186,0.0
바나나를,0,0,2,0.0,0.0,0.4,1.098612,0.0,0.0,0.439445
줬어,0,0,1,0.0,0.0,0.2,1.098612,0.0,0.0,0.219722
오늘,1,1,0,0.25,0.2,0.0,0.405465,0.101366,0.081093,0.0
원숭이를,1,0,0,0.25,0.0,0.0,1.098612,0.274653,0.0,0.0


In [37]:
doc0_vec = tdm_df['tfidf_0'].values
doc0_vec

array([0.        , 0.        , 0.        , 0.10136628, 0.        ,
       0.        , 0.10136628, 0.27465307])

In [None]:
import pandas as pd


## 3.2 직접계산하기2

weighting schema|weight|설명
--|--|--
tf(double normalization 0.5)|<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/45badc1c70ec2caa00ed8c21ed75bd9f8d3e650c" />|=0.5 + 0.5(토큰빈도/문서내최빈토큰)
idf(inverse document frequency smooth)|<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/23e5ae785c1ddc6bd95d404ea3fac2477fff5eff" />|=log(문서갯수/(1+토큰빈도))

In [None]:
docs = [
    '오늘 동물원에서 원숭이를 봤어',
    '오늘 동물원에서 코끼리를 봤어 봤어',
    '동물원에서 원숭이에게 바나나를 줬어 바나나를'
]

## 3.3 sklearn 활용

In [80]:
docs = [
    '오늘 동물원에서 원숭이를 봤어',
    '오늘 동물원에서 코끼리를 봤어 봤어',
    '동물원에서 원숭이에게 바나나를 줬어 바나나를'
]

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer()
tfidf = tfidf_vect.fit_transform(docs)
tfidf.todense()

matrix([[0.37311881, 0.        , 0.4804584 , 0.4804584 , 0.63174505,
         0.        , 0.        , 0.        ],
        [0.28680065, 0.        , 0.73861611, 0.36930805, 0.        ,
         0.        , 0.        , 0.48559571],
        [0.2344005 , 0.79374908, 0.        , 0.        , 0.        ,
         0.39687454, 0.39687454, 0.        ]])

In [39]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
BoW = count_vect.fit_transform(docs)
tfidf_trans = TfidfTransformer()
tfidf = tfidf_trans.fit_transform(BoW)
tfidf.todense()

matrix([[0.37311881, 0.        , 0.4804584 , 0.4804584 , 0.63174505,
         0.        , 0.        , 0.        ],
        [0.28680065, 0.        , 0.73861611, 0.36930805, 0.        ,
         0.        , 0.        , 0.48559571],
        [0.2344005 , 0.79374908, 0.        , 0.        , 0.        ,
         0.39687454, 0.39687454, 0.        ]])

In [40]:
import pandas as pd

vocab = tfidf_vect.get_feature_names_out()
pd.DataFrame(tfidf.todense(), columns=vocab)

Unnamed: 0,동물원에서,바나나를,봤어,오늘,원숭이를,원숭이에게,줬어,코끼리를
0,0.373119,0.0,0.480458,0.480458,0.631745,0.0,0.0,0.0
1,0.286801,0.0,0.738616,0.369308,0.0,0.0,0.0,0.485596
2,0.2344,0.793749,0.0,0.0,0.0,0.396875,0.396875,0.0




---



## 3.4 gensim 활용

In [41]:
docs = [
    '오늘 동물원에서 원숭이를 봤어',
    '오늘 동물원에서 코끼리를 봤어 봤어',
    '동물원에서 원숭이에게 바나나를 줬어 바나나를'
]

In [42]:
import gensim
from gensim import corpora
from gensim.models import TfidfModel

doc_ls = [doc.split() for doc in docs] #공백으로 토큰화
id2word = corpora.Dictionary(doc_ls)
TDM = [id2word.doc2bow(doc) for doc in doc_ls]
model = TfidfModel(TDM)
tfidf = model[TDM]
tfidf[0]

[(1, 0.32718457421365993), (2, 0.32718457421365993), (3, 0.8865102981879297)]

In [43]:
from gensim.matutils import sparse2full

vocab = [id2word[i] for i in id2word.keys()]
TDM_matrix = [sparse2full(doc, len(vocab)).tolist() for doc in tfidf]
pd.DataFrame(TDM_matrix, columns=vocab)

Unnamed: 0,동물원에서,봤어,오늘,원숭이를,코끼리를,바나나를,원숭이에게,줬어
0,0.0,0.327185,0.327185,0.88651,0.0,0.0,0.0,0.0
1,0.0,0.569307,0.284654,0.0,0.771272,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.816497,0.408248,0.408248


---

# 4 LSA(Latent Semantic Analysis)를 활용한 문서 벡터

## 4.1 sklearn 활용

In [44]:
docs = [
    '오늘 동물원에서 원숭이를 봤어',
    '오늘 동물원에서 코끼리를 봤어 봤어',
    '동물원에서 원숭이에게 바나나를 줬어 바나나를'
]

In [45]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD

n_dim = 3

count_vect = CountVectorizer()
DTM = count_vect.fit_transform(docs)
svd = TruncatedSVD(n_components=n_dim, algorithm='randomized', n_iter=100)
u_sigma = svd.fit_transform(DTM)
u_sigma

array([[ 1.7199249 , -0.46187608,  0.91023558],
       [ 2.42183598, -0.8655643 , -0.62089363],
       [ 1.23596373,  2.33877975, -0.05002922]])

In [46]:
import numpy as np

# svd.fit_transform(X) => U * Sigma.
# svd.singular_values_ => Sigma .
# svd.components_ => VT
df_U = pd.DataFrame(u_sigma/svd.singular_values_, columns=range(len(docs)))
df_U['문서'] = docs
df_U.set_index('문서')

Unnamed: 0_level_0,0,1,2
문서,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
오늘 동물원에서 원숭이를 봤어,0.534586,-0.182112,0.825259
오늘 동물원에서 코끼리를 봤어 봤어,0.752754,-0.341281,-0.562929
동물원에서 원숭이에게 바나나를 줬어 바나나를,0.384161,0.922151,-0.045359


In [47]:
pd.DataFrame(svd.components_, columns=count_vect.get_feature_names_out())

Unnamed: 0,동물원에서,바나나를,봤어,오늘,원숭이를,원숭이에게,줬어,코끼리를
0,0.519535,0.23881,0.6341,0.40013,0.16616,0.119405,0.119405,0.23397
1,0.157225,0.727185,-0.34093,-0.206367,-0.071804,0.363592,0.363592,-0.134563
2,0.196716,-0.082248,-0.272536,0.23784,0.748216,-0.041124,-0.041124,-0.510376


## 4.2 gensim 활용

In [48]:
docs = [
    '오늘 동물원에서 원숭이를 봤어',
    '오늘 동물원에서 코끼리를 봤어 봤어',
    '동물원에서 원숭이에게 바나나를 줬어 바나나를'
]

In [49]:
doc_ls = [doc.split() for doc in docs]
doc_ls[0]

['오늘', '동물원에서', '원숭이를', '봤어']

In [50]:
from gensim import corpora
from gensim.models import LsiModel

n_dim = 3

id2word = corpora.Dictionary(doc_ls)
corpus_TDM = [id2word.doc2bow(text) for text in doc_ls]
model_LSA = LsiModel(corpus_TDM, id2word=id2word, num_topics=n_dim)
corpus_VT = model_LSA[corpus_TDM]
corpus_VT[0]

[(0, 1.7199249045928926), (1, 0.46187608261032775), (2, -0.910235577679573)]

In [51]:
from gensim.matutils import sparse2full
# LsiModel.projection.u => left singular vectors,
# LsiModel.projection.s => singular values,
# model[training_corpus] => right singular vectors.

n_topics = 3

VT = [sparse2full(doc_vector, n_topics).tolist() for doc_vector in corpus_VT]
df_VT = pd.DataFrame(VT, columns=range(len(docs)))
df_VT['문서'] = docs
df_VT = df_VT.set_index('문서')
df_VT

Unnamed: 0_level_0,0,1,2
문서,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
오늘 동물원에서 원숭이를 봤어,1.719925,0.461876,-0.910236
오늘 동물원에서 코끼리를 봤어 봤어,2.421836,0.865564,0.620894
동물원에서 원숭이에게 바나나를 줬어 바나나를,1.235964,-2.33878,0.050029


In [52]:
from numpy import dot
from numpy.linalg import norm

def cosine_similarity(a, b) :
    return dot(a, b)/(norm(a)*norm(b))

cosine_similarity(VT[0], VT[1]), cosine_similarity(VT[1], VT[2])

(0.7559289367732103, 0.1428571388938692)