# 단어의 표현 (Word Representation)


기계는 문자를 그대로 인식할 수 없기때문에 숫자로 변환



# 1 원-핫 인코딩 (One-Hot Encoding)

## 1.1 직접 구현해보기

### "원숭이, 바나나, 사과" 로 원-핫 인코딩을 한다면

In [85]:
# 인코딩 대상 단어들을 담은 리스트
word_ls = ['원숭이', '바나나', '사과']

In [None]:
# 원핫 인코딩 logic

# 1. 단어(토큰)의 순서 결정
word2idx = {}
{
    '원숭이': 0,
    '바나나': 1,
    '사과': 2
}
for idx, token in enumerate(word_ls):
    if token not in word2idx.keys():
        word2idx[token] = len(word2idx)
    else:
        pass

In [None]:
idx2word = dict(enumerate(set(word_ls)))
idx2word

{0: '사과', 1: '바나나', 2: '원숭이'}

In [None]:
# 2. 단어별 벡터 생성
vector_list = []

for token in word_ls:
    temp_vector = [0] * len(word2idx)   # [0, 0, 0] 생성
    target_idx = word2idx[token]
    temp_vector[target_idx] = 1

    vector_list.append(temp_vector)

import numpy as np
np.array(vector_list)

array([[1, 0, 0],
       [0, 1, 0],
       [0, 0, 1]])

In [83]:
# 원핫 인코딩 생성
def one_hot_encode(tokens):   # 토큰 리스트
    # 1. 단어-인덱스
    word2idx = {}
    for idx, token in enumerate(tokens):
        if token not in word2idx.keys():
            word2idx[token] = len(word2idx)
        else:
            pass
    # 2. 단어별 벡터 생성
    vector_list = []

    for token in tokens:
        temp_vector = [0] * len(word2idx)   # [0, 0, 0] 생성
        # temp_vector = np.zeros(len(word2idx))
        target_idx = word2idx[token]
        temp_vector[target_idx] = 1

        vector_list.append(temp_vector)

    return vector_list

In [None]:
def one_hot_encode_numpy(tokens):
    # np.zeros 활용
    return

In [86]:
one_hot_vectors = one_hot_encode(word_ls)
one_hot_vectors

[array([1., 0., 0.]), array([0., 1., 0.]), array([0., 0., 1.])]

### "코끼리"라는 단어가 추가된다면?

In [None]:
word_ls = ['원숭이','바나나','사과','코끼리']

In [None]:
one_hot_vectors = one_hot_encode(word_ls)
one_hot_vectors

[[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]

In [None]:
# 라벨인코딩 ?

In [None]:
# word2idx가 곧 라벨 인코딩이 된다.

## 1.3 sklearn 활용

- LabelEncoder() : 유니크한 범주형 값들을 1부터 증가하는 숫자로 인코딩
- OneHotEncoder() : 유니크한 범주형 값들을 n:1 행렬로 변환

함수명 | 설명
--|--
fit(X[, y])	| Fit OneHotEncoder to X.
fit_transform(X[, y])	| Fit OneHotEncoder to X, then transform X.
inverse_transform(X)	| Convert the back data to the original representation.
transform(X)	| Transform X using one-hot encoding.

In [None]:
# sklearn을 활용한 one-hot encoding
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# 예제 데이터 배열
values = array(word_ls)
print(values)

# 문자열에 숫자를 붙임
label_enc = LabelEncoder()
int_enc = label_enc.fit_transform(values)
print(int_enc)

# binary encode
onehot_enc = OneHotEncoder(sparse=False)
int_enc = int_enc.reshape(len(int_enc), 1) # n:1 matrix로 변환
print(int_enc)
onehot_enc = onehot_enc.fit_transform(int_enc)
print(onehot_enc)

# one-hot encoding 의 첫번째 배열을 값을 역으로 산출
inverted = label_enc.inverse_transform([argmax(onehot_enc[0, :])])
print(inverted)

['원숭이' '바나나' '사과' '코끼리']
[2 0 1 3]
[[2]
 [0]
 [1]
 [3]]
[[0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]]
['원숭이']






---



# 2 밀집 벡터 (Dense Vector)

## 2-1 유사도 계산

### 2.1.1 유클리디언 거리(Euclidean distance)
두 벡터사이의 직선 거리. 피타고라스 정리를 생각하면 이해하기 쉬움

<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/5/55/Euclidean_distance_2d.svg/220px-Euclidean_distance_2d.svg.png"  width="200"/>

<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/795b967db2917cdde7c2da2d1ee327eb673276c0" width="350"/>

https://en.wikipedia.org/wiki/Euclidean_distance

In [None]:
word_embedding_dic = {
    '사과' : [1.0, 0.5],
    '바나나' : [0.9, 1.2],
    '원숭이' : [0.5, 1.5]
}

In [None]:
import numpy as np
# 유클리디언 거리 계산
def euclidean_dist(a, b):
    a, b = np.array(a), np.array(b)
    return np.sqrt(np.sum((a - b) ** 2))

# 사과와 바나나의 유클리디언 거리
euclidean_dist(word_embedding_dic['사과'], word_embedding_dic['바나나'])

0.7071067811865475

### 2.1.2 자카드 유사도(Jaccard index)

<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/1/1f/Intersection_of_sets_A_and_B.svg/200px-Intersection_of_sets_A_and_B.svg.png" />

<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/eaef5aa86949f49e7dc6b9c8c3dd8b233332c9e7" />

https://en.wikipedia.org/wiki/Jaccard_index

In [None]:
s1 = '대부분 원숭이는 바나나를 좋아합니다.'
s2 = '코주부 원숭이는 바나나를 싫어합니다.'

# 토큰화를 수행합니다.
token_s1 = s1.split()
token_s2 = s2.split()

# 자카드 유사도 계산
set_s1, set_s2 = set(token_s1), set(token_s2)

len(set_s1.intersection(set_s2)) / len(set_s1.union(set_s2))    # {원숭이는, 바나나를} / {대부분, 코주부, 원숭이는, 바나나를, 좋아합니다, 싫어합니다}

0.3333333333333333

### 2.1.3 코사인 유사도(Cosine Similarity)

*  두 벡터간의 유사도를 측정하는 방법 중 하나
*  두 벡터 사이의 코사인을 측정
*  0도 = 1, 90도 = 0, 180도 = -1   ==> 1에 가까울수록 유사도가 높음




<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/1d94e5903f7936d3c131e040ef2c51b473dd071d" width='350'/>

https://en.wikipedia.org/wiki/Cosine_similarity

In [None]:
# 코사인 유사도 계산
def cosine_similarity(a, b):
    a, b = np.array(a), np.array(b)
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

In [None]:
# 사과와 바나나의 코사인 유사도
print(cosine_similarity(word_embedding_dic['사과'], word_embedding_dic['바나나']))
print(euclidean_dist(word_embedding_dic['사과'], word_embedding_dic['바나나']))

0.8944271909999159
0.7071067811865475


In [None]:
# 사과와 원숭이의 코사인 유사도
print(cosine_similarity(word_embedding_dic['사과'], word_embedding_dic['원숭이']))
print(euclidean_dist(word_embedding_dic['사과'], word_embedding_dic['원숭이']))

0.7071067811865475
1.118033988749895


In [None]:
# 바나나와 원숭이의 코사인 유사도
print(cosine_similarity(word_embedding_dic['바나나'], word_embedding_dic['원숭이']))
print(euclidean_dist(word_embedding_dic['바나나'], word_embedding_dic['원숭이']))

0.9486832980505138
0.5




---



# 3 TF-IDF를 활용한 단어 벡터

### 3.1 직접 구현하기

weighting schema|weight|설명
--|--|--
term frequency|<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/91699003abf4fe8bdf861bbce08e73e71acf5fd4" />|=토큰빈도/문서내토큰빈도
inverse document frequency|<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/864fcfdc0c16344c11509f724f1aa7081cf9f657" />|=log(총문서갯수/(토큰이 등장한 문서수))

In [None]:
d1 = "The cat sat on my face I hate a cat"
d2 = "The dog sat on my bed I love a dog"

In [None]:
corpus = [d1, d2]

In [None]:
token_d1 = d1.split()
token_d2 = d2.split()

In [None]:
token_d1, token_d2

(['The', 'cat', 'sat', 'on', 'my', 'face', 'I', 'hate', 'a', 'cat'],
 ['The', 'dog', 'sat', 'on', 'my', 'bed', 'I', 'love', 'a', 'dog'])

In [None]:
word2idx = {}
for token in token_d1 + token_d2:
    if token not in word2idx:
        word2idx[token] = len(word2idx)

word2idx

{'The': 0,
 'cat': 1,
 'sat': 2,
 'on': 3,
 'my': 4,
 'face': 5,
 'I': 6,
 'hate': 7,
 'a': 8,
 'dog': 9,
 'bed': 10,
 'love': 11}

In [None]:
tdm_list = []

for tokens in [token_d1, token_d2]:
    temp_tdm = [0] * len(word2idx)

    for counting_token in word2idx:
        temp_tdm[word2idx[counting_token]] = tokens.count(counting_token)

    tdm_list.append(temp_tdm)

tdm_list

[[1, 2, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], [1, 0, 1, 1, 1, 0, 1, 0, 1, 2, 1, 1]]

In [None]:
import pandas as pd

tfidf_df = pd.DataFrame(tdm_list, index=['d1', 'd2'], columns=word2idx).T
tfidf_df

Unnamed: 0,d1,d2
The,1,1
cat,2,0
sat,1,1
on,1,1
my,1,1
face,1,0
I,1,1
hate,1,0
a,1,1
dog,0,2


In [None]:
tfidf_df['d1_tf'] = tfidf_df['d1'] / tfidf_df['d1'].sum()
tfidf_df['d2_tf'] = tfidf_df['d2'] / tfidf_df['d2'].sum()

In [None]:
(tfidf_df[['d1', 'd2']] > 0).sum(axis=1)

The     2
cat     1
sat     2
on      2
my      2
face    1
I       2
hate    1
a       2
dog     1
bed     1
love    1
dtype: int64

In [None]:
tfidf_df['idf'] = np.log10(len(corpus) / (tfidf_df[['d1', 'd2']] > 0).sum(axis=1))

In [None]:
tfidf_df

Unnamed: 0,d1,d2,d1_tf,d2_tf,idf
The,1,1,0.1,0.1,0.0
cat,2,0,0.2,0.0,0.30103
sat,1,1,0.1,0.1,0.0
on,1,1,0.1,0.1,0.0
my,1,1,0.1,0.1,0.0
face,1,0,0.1,0.0,0.30103
I,1,1,0.1,0.1,0.0
hate,1,0,0.1,0.0,0.30103
a,1,1,0.1,0.1,0.0
dog,0,2,0.0,0.2,0.30103


In [None]:
tfidf_df['d1_tfidf'] = tfidf_df['d1_tf']*tfidf_df['idf']
tfidf_df['d2_tfidf'] = tfidf_df['d2_tf']*tfidf_df['idf']

In [None]:
tfidf_df

Unnamed: 0,d1,d2,d1_tf,d2_tf,idf,d1_tfidf,d2_tfidf
The,1,1,0.1,0.1,0.0,0.0,0.0
cat,2,0,0.2,0.0,0.30103,0.060206,0.0
sat,1,1,0.1,0.1,0.0,0.0,0.0
on,1,1,0.1,0.1,0.0,0.0,0.0
my,1,1,0.1,0.1,0.0,0.0,0.0
face,1,0,0.1,0.0,0.30103,0.030103,0.0
I,1,1,0.1,0.1,0.0,0.0,0.0
hate,1,0,0.1,0.0,0.30103,0.030103,0.0
a,1,1,0.1,0.1,0.0,0.0,0.0
dog,0,2,0.0,0.2,0.30103,0.0,0.060206


In [None]:
import numpy as np

# TF-IDF 계산 logic

# token index 딕셔너리 생성
word2idx = {}
for token in token_d1 + token_d2:
    if token not in word2idx:
        word2idx[token] = len(word2idx)

# document 내 token 등장비율 TF


# 단어 - 문서 등장 횟수 IDF

# 두 값의 곱


### 3.2 sklearn 활용

- feature_extraction.text.CountVectorizer() : 문장들에서 단어의 빈도수를 벡터화

In [None]:
from sklearn.feature_extraction.text import CountVectorizer


d1 = "The cat sat on my face I hate a cat"
d2 = "The dog sat on my bed I love a dog"
corpus = [d1, d2]
count_vect = CountVectorizer()
countv = count_vect.fit_transform(corpus)
print(countv.toarray()) # 코퍼스로부터 각 단어의 빈도 수를 기록한다.
print(count_vect.vocabulary_) # 각 단어의 인덱스가 어떻게 부여되었는지를 보여준다.

[[0 2 0 1 1 0 1 1 1 1]
 [1 0 2 0 0 1 1 1 1 1]]
{'the': 9, 'cat': 1, 'sat': 8, 'on': 7, 'my': 6, 'face': 3, 'hate': 4, 'dog': 2, 'bed': 0, 'love': 5}


- feature_extraction.text.TfidfVectorizer() : 문장들을 TF-IDF로 벡터화

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

d1 = "The cat sat on my face I hate a cat"
d2 = "The dog sat on my bed I love a dog"
corpus = [d1, d2]
tfidf_vect = TfidfVectorizer().fit(corpus)
tfidfv = tfidf_vect.transform(corpus)
print(tfidfv.toarray())
print(tfidf_vect.vocabulary_)

[[0.         0.70600557 0.         0.35300279 0.35300279 0.
  0.25116439 0.25116439 0.25116439 0.25116439]
 [0.35300279 0.         0.70600557 0.         0.         0.35300279
  0.25116439 0.25116439 0.25116439 0.25116439]]
{'the': 9, 'cat': 1, 'sat': 8, 'on': 7, 'my': 6, 'face': 3, 'hate': 4, 'dog': 2, 'bed': 0, 'love': 5}


## 3.3 gensim 활용

- corpora.Dictionary - 토큰화한 문서들 내에서 index-단어 사전을 생성
- models.TfidfModel() - TF-IDF 벡터화

In [None]:
import gensim.downloader as api
from gensim.models import TfidfModel
from gensim import corpora

d1 = "The cat sat on my face I hate a cat"
d2 = "The dog sat on my bed I love a dog"
corpus = [d1, d2]

doc_ls = [doc.split() for doc in corpus]
id2word = corpora.Dictionary(doc_ls)  # fit dictionary
corpus = [id2word.doc2bow(doc) for doc in doc_ls]  # convert corpus to BoW format

tfidf = TfidfModel(corpus)  # fit model
vector = tfidf[corpus[0]]  # apply model to the first corpus document

In [None]:
tfidf[corpus][0]

[(3, 0.8164965809277261), (4, 0.4082482904638631), (5, 0.4082482904638631)]



---



# 4 LSA(Latent Semantic Analysis)를 활용한 단어 벡터

- LSA: 잠재 의미 분석

## 4.1 sklearn 활용

- decomposition.TruncatedSVD() : 특이값분해

In [None]:
doc_ls = [
    '바나나 사과 포도 포도',
    '사과 포도',
    '포도 바나나',
    '짜장면 짬뽕 탕수육',
    '볶음밥 탕수육',
    '짜장면 짬뽕',
    '라면 스시',
    '스시',
    '가츠동 스시 소바',
    '된장찌개 김치찌개 김치',
    '김치 된장',
    '비빔밥 김치'
]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD

count_vect = CountVectorizer()
countv = count_vect.fit_transform(doc_ls)
print(countv)
svd = TruncatedSVD(n_components=2, algorithm='randomized', n_iter=100)
svd.fit(countv)

features = count_vect.get_feature_names_out() # 단어 집합. 1,000개의 단어가 저장됨.
for i in range(len(features)) :
    print("{} : {}".format(features[i], svd.components_[:,i]))

  (0, 6)	1
  (0, 9)	1
  (0, 15)	2
  (1, 9)	1
  (1, 15)	1
  (2, 6)	1
  (2, 15)	1
  (3, 12)	1
  (3, 13)	1
  (3, 14)	1
  (4, 14)	1
  (4, 7)	1
  (5, 12)	1
  (5, 13)	1
  (6, 5)	1
  (6, 11)	1
  (7, 11)	1
  (8, 11)	1
  (8, 0)	1
  (8, 10)	1
  (9, 4)	1
  (9, 2)	1
  (9, 1)	1
  (10, 1)	1
  (10, 3)	1
  (11, 1)	1
  (11, 8)	1
가츠동 : [2.16122745e-17 5.73900210e-16]
김치 : [ 5.04769929e-18 -3.14706672e-14]
김치찌개 : [ 8.79534109e-18 -1.25632991e-14]
된장 : [-9.49174477e-18 -9.31725769e-15]
된장찌개 : [-1.3811721e-17 -1.2629750e-14]
라면 : [-2.02507373e-17 -6.74708959e-18]
바나나 : [4.0824829e-01 1.1632686e-16]
볶음밥 : [-1.55051404e-25  1.27737006e-01]
비빔밥 : [-5.60710388e-19 -9.44494269e-15]
사과 : [ 4.08248290e-01 -5.90522109e-18]
소바 : [2.85510871e-17 5.94296098e-16]
스시 : [-2.24857946e-17  6.05092746e-16]
짜장면 : [-7.42884921e-25  6.12024764e-01]
짬뽕 : [-7.42884921e-25  6.12024764e-01]
탕수육 : [-5.87838718e-25  4.84287758e-01]
포도 : [ 8.16496581e-01 -5.52108191e-17]


## 4.2 gensim 활용

- models.LsiModel() : LSA 모델

In [None]:
doc_ls = [
    '바나나 사과 포도 포도',
    '사과 포도',
    '포도 바나나',
    '짜장면 짬뽕 탕수육',
    '볶음밥 탕수육',
    '짜장면 짬뽕',
    '라면 스시',
    '스시',
    '가츠동 스시 소바',
    '된장찌개 김치찌개 김치',
    '김치 된장',
    '비빔밥 김치'
]
doc_ls = [d.split() for d in doc_ls]

In [None]:
from gensim import corpora
from gensim.models import LsiModel

id2word = corpora.Dictionary(doc_ls) #사전 구축
corpus = [id2word.doc2bow(text) for text in doc_ls] # 코퍼스 생성
lsi = LsiModel(corpus, id2word=id2word, num_topics=2) #LSA 모델

for i in id2word.keys() :
    print("{} : {}".format(id2word[i], lsi.projection.u[i]))

바나나 : [ 4.08248290e-01 -1.32657032e-16]
사과 : [ 4.08248290e-01 -4.87601506e-17]
포도 : [8.16496581e-01 4.49584406e-18]
짜장면 : [3.42182486e-17 6.12024764e-01]
짬뽕 : [1.66380209e-17 6.12024764e-01]
탕수육 : [3.70944878e-17 4.84287758e-01]
볶음밥 : [3.41118807e-17 1.27737006e-01]
라면 : [ 1.66525861e-17 -1.54978375e-15]
스시 : [ 5.29326512e-17 -4.37412528e-15]
가츠동 : [-2.33060589e-17 -1.58851742e-15]
소바 : [ 1.81573915e-17 -1.78530682e-15]
김치 : [ 8.63956251e-17 -1.07987325e-14]
김치찌개 : [ 4.76279447e-17 -4.60721302e-15]
된장찌개 : [ 5.60200957e-17 -4.57386577e-15]
된장 : [ 3.01957768e-17 -3.29104242e-15]
비빔밥 : [ 1.96977131e-17 -2.93419101e-15]
