## csv 파일에서 뉴스 요약 부분 가져오기

In [1]:
import pandas as pd
# Load the raw news table and discard every row that has a missing value
# in any column.
df = pd.read_csv('news_raw_3.csv').dropna(how='any')

In [2]:
# Pull the 'summary' column as NumPy arrays, split by label:
# target == 0 -> economy, target == 1 -> science.
economy = df.loc[df['target'].eq(0), 'summary'].to_numpy()
science = df.loc[df['target'].eq(1), 'summary'].to_numpy()

## 한글만 추출. 숫자, 영문자, 특수기호 등 제거

In [3]:
def get_clear_text(news_numpy):
    """Keep only the Hangul syllables of each news item.

    Every item is converted with ``str()``, each run of characters in the
    range 가-힣 is extracted, and the runs are joined with single spaces.
    Digits, Latin letters, punctuation and other symbols are dropped.

    Args:
        news_numpy: Iterable of news texts (e.g. a NumPy array of summaries).

    Returns:
        A list holding one cleaned string per input item, in input order.
        An item without any Hangul yields an empty string.
    """
    import re

    # Compile once so the same pattern object is reused for every item.
    hangul_runs = re.compile("[가-힣]+")
    return [" ".join(hangul_runs.findall(str(item))) for item in news_numpy]

In [4]:
# Strip everything except Hangul from the summaries of both categories
# (economy first, then science).
economy_clear_text, science_clear_text = (
    get_clear_text(summaries) for summaries in (economy, science)
)

## 추출된 한글에서 2글자 이상의 명사만 추출


In [5]:
from konlpy.tag import Kkma
# Single shared Kkma tagger instance; get_nouns_by_sentence calls its
# nouns() method for every text.
kkma = Kkma()

In [6]:
def get_nouns_by_sentence(texts):
    """Reduce each text to its nouns of two or more characters.

    Each text is passed to the module-level ``kkma`` tagger's ``nouns()``
    method; nouns of length 1 are discarded and the rest are joined with
    single spaces.

    Args:
        texts: Iterable of strings to analyse.

    Returns:
        A list with one space-separated noun string per input text, in
        input order.
    """
    return [
        ' '.join(word for word in kkma.nouns(sentence) if len(word) > 1)
        for sentence in texts
    ]

In [7]:
# Extract the multi-character nouns for both categories
# (economy first, then science).
economy_nouns, science_nouns = (
    get_nouns_by_sentence(cleaned)
    for cleaned in (economy_clear_text, science_clear_text)
)

## 원핫 인코딩 후 피클로 저장

In [8]:
from tensorflow.keras import preprocessing




In [9]:
# num_words=350 limits the vocabulary that the tokenizer uses when texts are
# later converted to matrices.
tokenizer = preprocessing.text.Tokenizer(num_words=350)
# fit_on_texts expects one flat list of strings. The previous call passed
# [economy_nouns, science_nouns], i.e. a list of two lists; Keras treats every
# element of a list as an already-split token, so each whole summary string
# was counted as a single "word" and the individual nouns were never learned.
# Concatenating the two lists fits the vocabulary on the nouns themselves.
tokenizer.fit_on_texts(economy_nouns + science_nouns)

In [10]:
# Convert the science noun strings to a matrix with the fitted tokenizer,
# using binary mode.
science_onehot = tokenizer.texts_to_matrix(science_nouns, mode='binary')

In [11]:
# Convert the economy noun strings to a matrix with the fitted tokenizer,
# using binary mode.
economy_onehot = tokenizer.texts_to_matrix(economy_nouns, mode ='binary')

In [12]:
import numpy as np

In [13]:
# Stack the two matrices along axis 0: all economy rows come first,
# followed by all science rows.
onehot_result = np.concatenate([economy_onehot, science_onehot], axis=0)

In [14]:
# Show the combined matrix as the cell output.
onehot_result

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [15]:
import pickle
# Serialize the combined matrix to 'onehot_result.pickle' (binary write mode).
with open('onehot_result.pickle', 'wb') as pickle_file:
    pickle.dump(onehot_result, pickle_file)