# 문장 단위 Word2Vec

- Ubuntu 18.04 LTS (Mecab 때문에 윈도우에서는 안돌아갑니다!)

In [1]:
import pandas as pd
import numpy as np
import re
import os
from konlpy.tag import Mecab, Kkma
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

## 데이터 전처리

**중복 제거**

In [2]:
data = pd.read_csv("./data/Raw_Data_전체.csv").dropna().drop_duplicates("Content") # 중복 제거

**한글과 띄어쓰기만 남김**

In [3]:
hangul = re.compile('[^ ㄱ-ㅣ가-힣]+') # 한글과 띄어쓰기를 제외한 모든 글자 삭제

**카테고리 정렬, 불필요한 컬럼 드랍**

In [4]:
data = data.assign(
    Content = data.Content.apply(lambda x: x.strip()[1:-1]).apply(lambda x: hangul.sub("", x)),
    Category = data.Category.apply(lambda x: x.strip()).replace({
        "외교/통일/국토":"외교/통일/국방", "안전/환제":"안전/환경", "보건복업지":"보건복지"
    })).drop(columns = ["Unnamed: 0", "Unnamed: 0.1"]).dropna() # 간단한 전처리

**유의미하게 분석 가능한 수준의 문서만 남김**

In [12]:
data = data[(data.Content.apply(len) >= 30) & (data.Content.apply(lambda x: len(x.split(" ")) >= 5))] # 30글자 5어절 이상만

In [13]:
data.to_csv("data/data.csv", index=False, encoding = "utf-8")

In [2]:
data = pd.read_csv("data/data.csv", encoding= 'utf-8')

In [5]:
def filterDuplicates(df):
    tfidf_matrix = TfidfVectorizer().fit_transform(df.Content.values)
    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
    result = pd.DataFrame(cosine_sim).round(3)
    df = df[~(result > 0.95).duplicated().values]
    return df

In [3]:
data.shape

(383169, 5)

## 1. 모델 훈련

- 문장 구분(Kkma)
- 명사 추출(Mecab)
- 모델 훈련

In [3]:
kkma, mecab = Kkma(), Mecab()

In [3]:
def modelTrain(cat, sg, path):
    if not os.path.exists(path):
        os.mkdir(path) # 모델 저장할 경로 생성
    df = data[data.Category == cat] # 카테고리 필터
    words = df.Content.apply(kkma.sentences).apply(lambda x: list(map(mecab.nouns, x))).sum() # 문장 구분, 명사 추출
    model = Word2Vec(words, window=3, min_count=3, size=100, sg=sg) # 모델 훈련
    model.init_sims(replace=True) # 불필요한 메모리 deload
    filename = cat.replace("/","")
    model.save(f"{path}/{filename}.model") # 모델 저장

In [5]:
pd.Series(data.Category.unique()).apply(modelTrain, sg=1, path="./models/skipgram")

java.lang.NullPointerException: java.lang.NullPointerException

In [6]:
modelTrain("농산어촌", 1, "./models/skipgram")

In [7]:
model = Word2Vec.load("./models/skipgram/농산어촌.model")

In [8]:
model.wv.most_similar("정의", topn = 50)

[('평등', 0.9774158000946045),
 ('촛불', 0.9771146178245544),
 ('응원', 0.9715062379837036),
 ('인권', 0.9708581566810608),
 ('주권', 0.9688318371772766),
 ('약자', 0.9670417308807373),
 ('권력', 0.9656468629837036),
 ('눈치', 0.9646018147468567),
 ('역사', 0.9641106128692627),
 ('정치', 0.9639952182769775),
 ('존중', 0.9637541174888611),
 ('민주', 0.9628485441207886),
 ('외교', 0.960398256778717),
 ('건지', 0.960180401802063),
 ('소통', 0.9589465856552124),
 ('현명', 0.9579609632492065),
 ('청산', 0.9579111337661743),
 ('탄생', 0.9574794769287109),
 ('사랑', 0.9562893509864807),
 ('통일', 0.9562503099441528),
 ('추구', 0.9530857801437378),
 ('의지', 0.9530041217803955),
 ('민국', 0.9526025056838989),
 ('외면', 0.9518930315971375),
 ('간첩', 0.9510756731033325),
 ('노고', 0.9509695768356323),
 ('시절', 0.9506173729896545),
 ('이거', 0.9502923488616943),
 ('기원', 0.9502061605453491),
 ('고충', 0.9496711492538452),
 ('행동', 0.9495805501937866),
 ('가슴', 0.949274480342865),
 ('조선', 0.9489424228668213),
 ('좌파', 0.9479025602340698),
 ('용기', 0.9474418