# Word2Vec Embedding model

- https://radimrehurek.com/gensim_3.8.3/models/keyedvectors.html

## Configuration

In [1]:
# 구글 코랩 환경에서 실행, 구글 드라이브 마운트
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [15]:
path = '/content/drive/MyDrive/Transformer/data/'

In [3]:
# 한자 변환
!pip install hanja

Collecting hanja
[?25l  Downloading https://files.pythonhosted.org/packages/56/97/ce51b5c771e7c9a673568232125e587cbc378ff1dd13057f237bedcd71e8/hanja-0.13.3.tar.gz (120kB)
[K     |████████████████████████████████| 122kB 8.1MB/s 
[?25hCollecting pyyaml==5.1.2
[?25l  Downloading https://files.pythonhosted.org/packages/e3/e8/b3212641ee2718d556df0f23f78de8303f068fe29cdaa7a91018849582fe/PyYAML-5.1.2.tar.gz (265kB)
[K     |████████████████████████████████| 266kB 26.3MB/s 
Collecting pytest-cov
  Downloading https://files.pythonhosted.org/packages/e3/1a/6affecd2344efee7f2487fac82242474cbac09f9e04929da5944907baf11/pytest_cov-2.11.1-py2.py3-none-any.whl
Collecting coverage>=5.2.1
[?25l  Downloading https://files.pythonhosted.org/packages/16/e0/fc9f7bd9b84e6b41d0aad1a113e36714aac0c0a9b307aca5f9af443bc50f/coverage-5.5-cp37-cp37m-manylinux2010_x86_64.whl (242kB)
[K     |████████████████████████████████| 245kB 34.7MB/s 
Building wheels for collected packages: hanja, pyyaml
  Building wheel fo

In [4]:
import pandas as pd
import gensim, logging
import multiprocessing
import os

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## Load dataset

In [14]:
os.listdir('/content/drive/MyDrive/Transformer/data/')

['5. 알고리즘', 'Day1', 'day2', 'data']

In [16]:
train_df = pd.read_csv(os.path.join(path,'news_train.csv'), sep='\t', index_col=0, encoding='utf-8')
test_df = pd.read_csv(os.path.join(path, 'news_test.csv'), sep='\t', index_col=0, encoding='utf-8')

print(train_df.shape)
print(test_df.shape)

(5000, 5)
(500, 5)


In [17]:
train_df.head()

Unnamed: 0,filename,date,NewsPaper,Topic,News
52,NLRW1900000022,20101209,경기일보,IT/과학,<p> 공공 SW사업 불공정 하도급 관행 개선되나 </p> <p> 공공 소프트웨어(...
3529,NWRW1900000016,20140416,동아일보사,미용/건강,"<p> 밀레, 고탄성 소재로 발이 편안… 엠리밋, 천연 방충성분 넣은 봄재킷 </p..."
6016,NLRW1900000021,20090116,경기일보,스포츠,<p> <경마코너> 새해 고객 환급률 73%로 </p> <p> 2009년 과천 서울...
8046,NLRW1900000021,20090826,경기일보,정치,<p> 도의회 예결특위 선정 ‘잡음’ </p> <p> 경기도의회가 7대 마지막 예산...
1109,NLRW1900000062,20180516,경인일보,경제,<p> 한국지엠·부품 협력업체 ‘상생’ 강화 </p> <p> ‘경영 현황 설명회’ ...


## Build list of sentences

In [18]:
from typing import List

import pandas as pd
import itertools
import re, hanja
 
def load_sentences(df: pd.DataFrame = None) -> List[str]:
    """
    개별 뉴스에서 </p>와 <p>로 구분되어 있는 여러 문장을 문장으로 쪼갬
    :param df: 전처리 대상 데이터 프레임
    :return: 문장이 분리된 데이터 프레임
    """
    all_sentences = []
    for news in df['News'].values:
        sentences_in_news = news.split(' </p> <p> ')
        sentences_in_news = [x.replace('</p>', '').replace('<p>','').strip() for x in sentences_in_news]
        all_sentences.append(sentences_in_news)

    merged = list(itertools.chain(*all_sentences))

    return merged

In [19]:
train_sentences = load_sentences(train_df)
test_sentences = load_sentences(test_df)

print(len(train_sentences), len(test_sentences))

50809 5139


## Pre-process data & Tokenize

In [20]:
# 형태소 기반 토크나이징 (Konlpy)
!python3 -m pip install konlpy
# mecab (ubuntu, mac 기준)
# 다른 os 설치 방법 및 자세한 내용은 다음 참고: https://konlpy.org/ko/latest/install/#id1
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)

Collecting konlpy
[?25l  Downloading https://files.pythonhosted.org/packages/85/0e/f385566fec837c0b83f216b2da65db9997b35dd675e107752005b7d392b1/konlpy-0.5.2-py2.py3-none-any.whl (19.4MB)
[K     |████████████████████████████████| 19.4MB 1.2MB/s 
Collecting JPype1>=0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/cd/a5/9781e2ef4ca92d09912c4794642c1653aea7607f473e156cf4d423a881a1/JPype1-1.2.1-cp37-cp37m-manylinux2010_x86_64.whl (457kB)
[K     |████████████████████████████████| 460kB 45.2MB/s 
[?25hCollecting beautifulsoup4==4.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/9e/d4/10f46e5cfac773e22707237bfcd51bbffeaf0a576b0a847ec7ab15bd7ace/beautifulsoup4-4.6.0-py3-none-any.whl (86kB)
[K     |████████████████████████████████| 92kB 14.4MB/s 
Collecting colorama
  Downloading https://files.pythonhosted.org/packages/44/98/5b86278fbbf250d239ae0ecb724f8572af1c91f4a11edf4d36a206189440/colorama-0.4.4-py2.py3-none-any.whl
Installing collected packages: JPype1, 

In [21]:
from konlpy.tag import Okt, Komoran, Mecab, Hannanum, Kkma

def get_tokenizer(tokenizer_name):
    if tokenizer_name == "komoran":
        tokenizer = Komoran()
    elif tokenizer_name == "okt":
        tokenizer = Okt()
    elif tokenizer_name == "mecab":
        tokenizer = Mecab()
    elif tokenizer_name == "hannanum":
        tokenizer = Hannanum()
    else:
        tokenizer = Kkma()
        
    return tokenizer

def tokenize(tokenizer_name, original_sent, pos=False):
    tokenizer = get_tokenizer(tokenizer_name)
    sentence = original_sent.replace('\n', '').strip()
    if pos:
        tokens = tokenizer.pos(sentence)
        tokens = [morph + "/" + tag for morph, tag in tokens]
    else:
      # tokenizer.nouns(sentence) -> 명사만 추출
        tokens = tokenizer.morphs(sentence)
        
    # tokenized_sent = ' '.join(post_process(tokens))
    tokenized_sent = ' '.join(tokens)
    
    return tokenized_sent

In [22]:
removal_list =  "‘, ’, ◇, ‘, ”,  ’, ', ·, \“, ·, △, ●,  , ■, (, ), \", >>, `, /, -,∼,=,ㆍ<,>, .,?, !,【,】, …, ◆,%"

EMAIL_PATTERN = re.compile(r'''(([a-zA-Z0-9._%+-]+)@([a-zA-Z0-9.-]+)(\.[a-zA-Z]{2,4}))''', re.VERBOSE)
URL_PATTERN = re.compile("(ftp|http|https)?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", re.VERBOSE)
MULTIPLE_SPACES = re.compile(' +', re.UNICODE)

In [23]:
def cleansing_other(sentence: str = None) -> str:
    """
    문장을 전처리 (이메일, URL, 공백 등 제거) 하는 함수
    :param sentence: 전처리 대상 문장
    :return: 전처리 완료된 문장
    """
    sentence = re.sub(EMAIL_PATTERN, ' ', sentence)
    sentence = re.sub(URL_PATTERN, ' ', sentence)
    sentence = re.sub(MULTIPLE_SPACES, ' ', sentence)
    sentence = sentence.replace(", )", "")
    
    return sentence

def cleansing_chinese(sentence: str = None) -> str:
    """
    한자를 변환하는 전처리를 하는 함수
    :param sentence: 전처리 대상 문장
    :return: 전처리 완료된 문장
    """
    # chinese character를 앞뒤로 괄호가 감싸고 있을 경우, 대부분 한글 번역임
    sentence = re.sub("\([\u2E80-\u2FD5\u3190-\u319f\u3400-\u4DBF\u4E00-\u9FCC\uF900-\uFAAD]+\)", "", sentence)
    # 다른 한자가 있다면 한글로 치환
    if re.search("[\u2E80-\u2FD5\u3190-\u319f\u3400-\u4DBF\u4E00-\u9FCC\uF900-\uFAAD]", sentence) is not None:
        sentence = hanja.translate(sentence, 'substitution')

    return sentence

def cleansing_special(sentence: str = None) -> str:
    """
    특수문자를 전처리를 하는 함수
    :param sentence: 전처리 대상 문장
    :return: 전처리 완료된 문장
    """
    sentence = re.sub("[.,\'\"’‘”“!?]", "", sentence)
    sentence = re.sub("[^가-힣0-9a-zA-Z\\s]", " ", sentence)
    sentence = re.sub("\s+", " ", sentence)
    
    sentence = sentence.translate(str.maketrans(removal_list, ' '*len(removal_list)))
    sentence = sentence.strip()
    
    return sentence

def cleansing_numbers(sentence: str = None) -> str:
    """
    숫자를 전처리(delexicalization) 하는 함수
    :param sentence: 전처리 대상 문장
    :return: 전처리 완료된 문장
    """
    
    sentence = re.sub('[0-9]+', 'NUM', sentence)
    sentence = re.sub('NUM\s+', "NUM", sentence)
    sentence = re.sub('[NUM]+', "NUM", sentence)
    
    return sentence

def preprocess_sent(sentence: str = None) -> str:
    """
    모든 전처리를 수행 하는 함수
    :param sentence: 전처리 대상 문장
    :return: 전처리 완료된 문장
    """
    sent_clean = sentence
    sent_clean = cleansing_other(sent_clean)
    sent_clean = cleansing_chinese(sent_clean)
    sent_clean = cleansing_special(sent_clean)
    sent_clean = cleansing_numbers(sent_clean)
    sent_clean = re.sub('\s+', ' ', sent_clean)
    sent_clean = tokenize('mecab', sent_clean)

    return sent_clean

In [24]:
# 전처리 예시
new_sents = []
original_sents = train_sentences[-50:]
for sent in original_sents:
    new_sent = preprocess_sent(sent)
    new_sents.append(new_sent)

for ori, new in zip(original_sents, new_sents):
    print("----------")
    print(ori)
    print(new)

----------
무상 의료
무상 의료
----------
◆ 허윤정 민주당 보건복지 전문위원 '건강권 보장은 국가의 기본 의무'
허윤정 민주당 보건 복지 전문 위원 건강 권 보장 은 국가 의 기본 의무
----------
보건복지부는 지난 11일 당뇨환자와 같은 만성질환자들이 대형병원을 이용할 때 지불하는 약값의 60%(현행 30%)로 높여 받아 대형병원에 환자가 몰리는 것을 막겠다는 '경증환자 집중 완화대책'을 발표했다. 환자들이 동네 의원보다 대형병원에 가는 것은 동네 의원 의료 서비스에 만족하지 못하기 때문이다. 그래서 비용이 더 많이 드는데도 더 멀리 있는 대형병원으로 찾아가는 것이다. 주치의 제도와 같은 정책을 병행하여 의료전달체계 개선을 추진해야 하는데 환자에게만 부담을 떠넘기는 것은 대형병원의 환자 집중도를 줄이지 못하면서 서민들의 부담만 늘어나게 할 위험이 있어 반대한다.
보건복지부 는 지난 NUM 일 당뇨 환자 와 같 은 만성 질 환자 들 이 대형 병원 을 이용 할 때 지불 하 는 약값 의 NUM 현행 NUM 로 높여 받 아 대형 병원 에 환자 가 몰리 는 것 을 막 겠 다는 경증 환자 집중 완화 대책 을 발표 했 다 환자 들 이 동네 의원 보다 대형 병원 에 가 는 것 은 동네 의원 의료 서비스 에 만족 하 지 못하 기 때문 이 다 그래서 비용 이 더 많이 드 는 데 도 더 멀리 있 는 대형 병원 으로 찾아가 는 것 이 다 주치 의 제도 와 같 은 정책 을 병행 하 여 의료 전달 체계 개선 을 추진 해야 하 는데 환자 에게 만 부담 을 떠넘기 는 것 은 대형 병원 의 환자 집 중도 를 줄이 지 못하 면서 서민 들 의 부담 만 늘어나 게 할 위험 이 있 어 반대 한다
----------
의사와 환자 간 신뢰를 기반으로 의료전달체계를 개편하기 위해서는 건강보험 가입자인 국민들의 동의와 합의가 필요하다. '돈 없어 질병 치료를 포기하는 일이 없도록 하고, 질병 때문에 가계 파탄은 막자'는 것이 일부 언론이 비판하는 포퓰리즘이라면 기꺼이 수용하겠

## Save to text file

In [25]:
from typing import List
from tqdm import tqdm

def write_to_txt(sentences: List[str] = None,
                 filename: str = None):
    """
    전처리 완료된 데이터를 txt파일로 저장하는 함수
    :param sentences: 전처리 완료된 문장 리스트
    :param filename: 저장 파일 경로
    """
    with open(filename, 'w', encoding='utf-8') as f:
        for sent in tqdm(sentences):
            f.write(preprocess_sent(sent)+'\n')
    f.close()
    print(f'Data saved at {filename}')

In [26]:
write_to_txt(train_sentences, os.path.join(path, 'news_sentence_train.txt'))
write_to_txt(test_sentences, os.path.join(path, 'news_sentence_test.txt'))

100%|██████████| 50809/50809 [01:06<00:00, 764.94it/s]
  1%|▏         | 76/5139 [00:00<00:06, 759.25it/s]

Data saved at /content/drive/MyDrive/Transformer/data/news_sentence_train.txt


100%|██████████| 5139/5139 [00:06<00:00, 772.92it/s]

Data saved at /content/drive/MyDrive/Transformer/data/news_sentence_test.txt





## Read all sentences

In [27]:
class SentenceReader:
    def __init__(self, filepath):
        self.filepath = filepath

    def __iter__(self):
        for line in open(self.filepath):
            yield line.split(' ')

In [28]:
sentences = SentenceReader(os.path.join(path, 'news_sentence_train.txt'))

## Check most frequent words

In [29]:
from collections import defaultdict

word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
len(word_freq)

65271

In [30]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['NUM', '이', '을', '는', '의', '에', '하', '를', '은', '다\n']

## Train word2vec model

In [31]:
import multiprocessing
from gensim.models import Word2Vec
# scikit learn -> svm logitstic regression

In [32]:
cores = multiprocessing.cpu_count() # Count the number of cores in a computer

### Parameters

* `min_count` <font color='purple'>=</font> <font color='green'>int</font> - Ignores all words with total absolute frequency lower than this - (2, 100)


* `window` <font color='purple'>=</font> <font color='green'>int</font> - The maximum distance between the current and predicted word within a sentence. E.g. `window` words on the left and `window` words on the left of our target - (2, 10)


* `size` <font color='purple'>=</font> <font color='green'>int</font> - Dimensionality of the feature vectors. - (50, 300)


* `sample` <font color='purple'>=</font> <font color='green'>float</font> - The threshold for configuring which higher-frequency words are randomly downsampled. Highly influencial.  - (0, 1e-5)


* `alpha` <font color='purple'>=</font> <font color='green'>float</font> - The initial learning rate - (0.01, 0.05)


* `min_alpha` <font color='purple'>=</font> <font color='green'>float</font> - Learning rate will linearly drop to `min_alpha` as training progresses. To set it: alpha - (min_alpha * epochs) ~ 0.00


* `negative` <font color='purple'>=</font> <font color='green'>int</font> - If > 0, negative sampling will be used, the int for negative specifies how many "noise words" should be drown. If set to 0, no negative sampling is used. - (5, 20)


* `workers` <font color='purple'>=</font> <font color='green'>int</font> - Use these many worker threads to train the model (=faster training with multicore machines)

In [33]:
w2v_model = Word2Vec(min_count=10,
                     window=5,
                     size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

### Build Vocab

In [34]:
# 안녕 하 세요 

In [35]:
from time import time
t = time()
w2v_model.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

2021-03-25 01:17:56,829 : INFO : collecting all words and their counts
2021-03-25 01:17:56,834 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-03-25 01:17:56,961 : INFO : PROGRESS: at sentence #10000, processed 502395 words, keeping 28940 word types
2021-03-25 01:17:57,082 : INFO : PROGRESS: at sentence #20000, processed 1002740 words, keeping 41603 word types
2021-03-25 01:17:57,207 : INFO : PROGRESS: at sentence #30000, processed 1515230 words, keeping 50887 word types
2021-03-25 01:17:57,331 : INFO : PROGRESS: at sentence #40000, processed 2016441 words, keeping 58363 word types
2021-03-25 01:17:57,452 : INFO : PROGRESS: at sentence #50000, processed 2514807 words, keeping 64773 word types
2021-03-25 01:17:57,464 : INFO : collected 65271 word types from a corpus of 2553823 raw words and 50809 sentences
2021-03-25 01:17:57,465 : INFO : Loading a fresh vocabulary
2021-03-25 01:17:57,505 : INFO : effective_min_count=10 retains 14046 unique words (21% of 

Time to build vocab: 0.05 mins


### Train model

* `total_examples` <font color='purple'>=</font> <font color='green'>int</font> - Count of sentences;
* `epochs` <font color='purple'>=</font> <font color='green'>int</font> - Number of iterations (epochs) over the corpus - [10, 20, 30]

In [36]:
t = time()
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=10, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

2021-03-25 01:18:03,068 : INFO : training model with 1 workers on 14046 vocabulary and 300 features, using sg=0 hs=0 sample=6e-05 negative=20 window=5
2021-03-25 01:18:04,101 : INFO : EPOCH 1 - PROGRESS: at 12.53% examples, 136079 words/s, in_qsize 1, out_qsize 0
2021-03-25 01:18:05,107 : INFO : EPOCH 1 - PROGRESS: at 25.28% examples, 139604 words/s, in_qsize 1, out_qsize 0
2021-03-25 01:18:06,126 : INFO : EPOCH 1 - PROGRESS: at 38.67% examples, 141494 words/s, in_qsize 1, out_qsize 0
2021-03-25 01:18:07,145 : INFO : EPOCH 1 - PROGRESS: at 51.45% examples, 141522 words/s, in_qsize 1, out_qsize 0
2021-03-25 01:18:08,156 : INFO : EPOCH 1 - PROGRESS: at 63.73% examples, 141051 words/s, in_qsize 1, out_qsize 0
2021-03-25 01:18:09,175 : INFO : EPOCH 1 - PROGRESS: at 76.66% examples, 141088 words/s, in_qsize 1, out_qsize 0
2021-03-25 01:18:10,190 : INFO : EPOCH 1 - PROGRESS: at 89.55% examples, 141214 words/s, in_qsize 1, out_qsize 0
2021-03-25 01:18:10,978 : INFO : worker thread finished; a

Time to train the model: 1.34 mins


In [37]:
# 모델을 더 이상 학습시키지 않는다면, init_sims로 모델을 memory-efficient하게 만들 수 있다
w2v_model.init_sims(replace=True)

2021-03-25 01:19:23,205 : INFO : precomputing L2-norms of word weight vectors


In [38]:
w2v_model.save(os.path.join(path, 'word2vec_model'))

2021-03-25 01:19:23,335 : INFO : saving Word2Vec object under /content/drive/MyDrive/Transformer/data/word2vec_model, separately None
2021-03-25 01:19:23,336 : INFO : not storing attribute vectors_norm
2021-03-25 01:19:23,337 : INFO : not storing attribute cum_table
2021-03-25 01:19:23,764 : INFO : saved /content/drive/MyDrive/Transformer/data/word2vec_model


## Explore word2vec model

In [39]:
from gensim.models import KeyedVectors
w2v_model = KeyedVectors.load(os.path.join(path, 'word2vec_model'), mmap='r')

2021-03-25 01:45:41,079 : INFO : loading Word2VecKeyedVectors object from /content/drive/MyDrive/Transformer/data/word2vec_model
2021-03-25 01:45:41,448 : INFO : loading wv recursively from /content/drive/MyDrive/Transformer/data/word2vec_model.wv.* with mmap=r
2021-03-25 01:45:41,449 : INFO : setting ignored attribute vectors_norm to None
2021-03-25 01:45:41,450 : INFO : loading vocabulary recursively from /content/drive/MyDrive/Transformer/data/word2vec_model.vocabulary.* with mmap=r
2021-03-25 01:45:41,454 : INFO : loading trainables recursively from /content/drive/MyDrive/Transformer/data/word2vec_model.trainables.* with mmap=r
2021-03-25 01:45:41,455 : INFO : setting ignored attribute cum_table to None
2021-03-25 01:45:41,457 : INFO : loaded /content/drive/MyDrive/Transformer/data/word2vec_model


### A is Most Similar to B

In [40]:
w2v_model.wv.most_similar(positive=["청년"])

2021-03-25 01:45:49,049 : INFO : precomputing L2-norms of word weight vectors


[('일자리', 0.7152606248855591),
 ('희망', 0.6957262754440308),
 ('기업인', 0.6854716539382935),
 ('창업', 0.6646683216094971),
 ('취업', 0.6300194263458252),
 ('구직자', 0.6131166219711304),
 ('공동체', 0.6078829169273376),
 ('장학', 0.6064194440841675),
 ('중소기업', 0.5992727875709534),
 ('창출', 0.5816690921783447)]

In [41]:
w2v_model.wv.most_similar(positive=["청소년"])

[('아동', 0.7997103333473206),
 ('수련', 0.7973304390907288),
 ('학부모', 0.7972500324249268),
 ('성인', 0.7760043144226074),
 ('소아', 0.7754462957382202),
 ('어린이', 0.7660644054412842),
 ('학생', 0.7627272605895996),
 ('노인', 0.7611141800880432),
 ('돌봄', 0.7559870481491089),
 ('교실', 0.7471286058425903)]

In [42]:
w2v_model.wv.most_similar(positive=["강원도"])

[('원주', 0.8089436292648315),
 ('체육회', 0.7999961376190186),
 ('울주군', 0.7968194484710693),
 ('북대', 0.7844271063804626),
 ('고양', 0.781438410282135),
 ('강릉', 0.7805231213569641),
 ('도생', 0.7804223895072937),
 ('군청', 0.7772335410118103),
 ('춘천', 0.7746467590332031),
 ('경', 0.7731277346611023)]

### Similarity between A and B

In [43]:
w2v_model.wv.similarity("청년", '청소년')

0.46824113

In [44]:
w2v_model.wv.similarity("강원도", '도내')

0.3854533

In [45]:
w2v_model.wv.similarity("강원도", '원주')

0.80894375

In [46]:
w2v_model.wv.similarity('서울', '춘천')

0.6873325

In [47]:
w2v_model.wv.similarity('서울', '부산')

0.7223158

### Analogy difference

In [None]:
w2v_model.wv.most_similar(positive=["취업", "청년"], negative=["게임"], topn=5)

In [None]:
w2v_model.wv.most_similar(positive=["춘천", "서울"], negative=["일본"], topn=5)