# 한국어 비슷한 단어 모델 학습

1. DataFrame을 만든다 -> crawling을 하던 (crawl_query) 또는 저장된 data를 불러와서 dataframe으로 만든다
2. model_train을 한다. -> 결과: tokenized 단어 list + word2vec학습된 모델
3. create_tensors를 하면 모델을 tensor로 바꾸고 저장한다.

In [1]:
import pandas as pd
import numpy as np
from konlpy.tag import Okt
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
import os
import sys
import logging
import argparse
import gensim
from gensim import utils


logger = logging.getLogger(__name__)



def word2vec2tensor(word2vec_model, tensor_filename, binary=False):
    """Convert file in Word2Vec format and writes two files 2D tensor TSV file.

    File "tensor_filename"_tensor.tsv contains word-vectors, "tensor_filename"_metadata.tsv contains words.

    Parameters
    ----------
    word2vec_model_path : str
        Path to file in Word2Vec format.
    tensor_filename : str
        Prefix for output files.
    binary : bool, optional
        True if input file in binary format.

    """
    model = word2vec_model
    outfiletsv = tensor_filename + '_tensor.tsv'
    outfiletsvmeta = tensor_filename + '_metadata.tsv'

    with utils.open(outfiletsv, 'wb') as file_vector, utils.open(outfiletsvmeta, 'wb') as file_metadata:
        for word in model.index2word:
            file_metadata.write(gensim.utils.to_utf8(word) + gensim.utils.to_utf8('\n'))
            vector_row = '\t'.join(str(x) for x in model[word])
            file_vector.write(gensim.utils.to_utf8(vector_row) + gensim.utils.to_utf8('\n'))

    logger.info("2D tensor file saved to %s", outfiletsv)
    logger.info("Tensor metadata file saved to %s", outfiletsvmeta)
    
def model_train(dataframe, content_col, size=100, window=5):
    okt=Okt()
    result = []
    print('loading ', end='')
    for index, row, in df.iterrows():
        if (index % 1000 == 0):
            print('. ', end='')
        try:
            tokenlist = okt.pos(row[content_col], stem=True, norm=True) # 단어 토큰화
        except:
            print(index, end='')
            continue
        temp=[]
        for word in tokenlist:
            if word[1] in ["Noun"]: # 명사일 때만
                temp.append((word[0])) # 해당 단어를 저장함

        if temp: # 만약 이번에 읽은 데이터에 명사가 존재할 경우에만
            result.append(temp) # 결과에 저장
    print('\n\nFinished!')
    
    model = Word2Vec(result, size=size, window=window, min_count=5, workers=4, sg=0)
    return result, model

def print_similar(model_, query):
    try:
        model_result = model_.wv.most_similar(query)
        print(model_result)
    except KeyError:
        print('{} not in model vocabulary. Please try another query.'.format(query))
    
def create_tensors(model, output):
    word2vec2tensor(model.wv, output)

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime


def crawler(maxpage,query,s_date,e_date):
    s_from = s_date.replace(".","")
    e_to = e_date.replace(".","")
    page = 1
    maxpage_t =(int(maxpage)-1)*10+1 # 11= 2페이지 21=3페이지 31=4페이지 ...81=9페이지 , 91=10페이지, 101=11페이지
    #f = open(RESULT_PATH + filename, 'w', encoding='utf-8-sig')
    df = pd.DataFrame(columns=['date', 'title', 'contents'])
    results_list = []

    while page < maxpage_t:
        print(page, 'loading', end='')
        url = "https://search.naver.com/search.naver?where=news&query=" + query + "&sort=0&ds=" + s_date + "&de=" + e_date + "&nso=so%3Ar%2Cp%3Afrom" + s_from + "to" + e_to + "%2Ca%3A&start=" + str(page)
        req = requests.get(url)
        # print(url)
        cont = req.content
        soup = BeautifulSoup(cont, 'html.parser')
        #print(soup)
        for urls in soup.select("._sp_each_url"):
            try :
                #print(urls["href"])
                if urls["href"].startswith("https://news.naver.com"):
                    #print(urls["href"])
                    news_detail = get_news(urls["href"])
                    # pdate, pcompany, title, btext
                    results_list.append((news_detail[1], news_detail[0], news_detail[2])) # date, title, contents
                    #f.write("{}\t{}\t{}\t{}\t{}\n".format(news_detail[1], news_detail[4], news_detail[0], news_detail[2],news_detail[3])) # new style
            except Exception as e:
                # print(e)
                continue
        page += 10
    df = pd.DataFrame(results_list, columns=['date', 'title', 'contents'])
    print('\nFinished!\n')
    return df

def get_news(n_url):
    print('.', end='')
    news_detail = []
    breq = requests.get(n_url)
    bsoup = BeautifulSoup(breq.content, 'html.parser')
    title = bsoup.select('h3#articleTitle')[0].text #대괄호는 h3#articleTitle 인 것중 첫번째 그룹만 가져오겠다.
    news_detail.append(title)
    pdate = bsoup.select('.t11')[0].get_text()[:11]
    news_detail.append(pdate)
    _text = bsoup.select('#articleBodyContents')[0].get_text().replace('\n', " ")
    btext = _text.replace("// flash 오류를 우회하기 위한 함수 추가 function _flash_removeCallback() {}", "")
    news_detail.append(btext.strip())
    news_detail.append(n_url)
    pcompany = bsoup.select('#footer address')[0].a.get_text()
    news_detail.append(pcompany)
    # news_detail [title, pdate, btext, n_url, pcompany]
    return news_detail

def crawl_query(max_pages, query,s_date='2020.06.01', e_date='2020.06.02'):
    maxpage = str(max_pages)
    return crawler(maxpage,query,s_date,e_date)

def crawl_n_save(max_pages, query,s_date='2020.06.01', e_date='2020.06.02'):
    maxpage = str(max_pages)
    df_temp = crawler(maxpage,query,s_date,e_date)
    df_temp.to_pickle("./data_{}.pkl".format(query))

## 예시 코드

In [3]:
df = crawl_query(2, '코로나')

1 loading............................
Finished!



In [4]:
result, model = model_train(df, 'contents')

loading . 

Finished!


In [13]:
print_similar(model, '감염')

[('코로나', 0.9972690939903259), ('백신', 0.9965233206748962), ('증가', 0.9962301850318909), ('물가', 0.996173620223999), ('중국', 0.9961512684822083), ('개발', 0.9961411952972412), ('이용', 0.9958362579345703), ('시간', 0.9957759380340576), ('진단', 0.9957061409950256), ('미국', 0.9956570267677307)]
