### 코드 실행 시 주의사항

In [None]:
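# def making_df():
    # df = pd.read_csv('./news_temp.csv')  # <- set the path of the file to preprocess here
    # df = df.dropna()  # drops NaN/None values (Naver news is already handled during data cleansing, so delete this line for it; still undecided for the minutes / bond reports)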

### 실행 코드

In [6]:
import pandas as pd
from ekonlpy.tag import Mecab

class DataPreprocessing:
    """Turn a news CSV into per-date lists of lemmatized (word, POS) tokens.

    The CSV must provide ``date``, ``title`` and ``contents`` columns (the
    columns ``making_df`` reads). Tagging, synonym replacement and
    lemmatization are delegated to the ``Mecab`` tagger held in
    ``self.mecab``.
    """

    def __init__(self, csv_path):
        """Load the CSV and create the tagger.

        Args:
            csv_path: Location of the CSV, passed straight to
                ``pandas.read_csv``.
        """
        self.df = pd.read_csv(csv_path)
        self.mecab = Mecab()
        # POS tags whose tokens are discarded by ``rm_stopPos``.
        # NOTE(review): 'SSCSY' looks like 'SSC', 'SY' with a missing comma;
        # both are already listed separately -- confirm and drop if so.
        self.stopPos = ['NNP', 'NNB', 'NNBC', 'NR', 'NP',
                        'VX', 'VCP', 'MM', 'MAJ', 'IC', 'JKS',
                        'JKC', 'JKG', 'JKO', 'JKB', 'JKV', 'JKQ',
                        'JX', 'JC', 'EP', 'EF', 'EC', 'ETN', 'ETM',
                        'XPN', 'XSN', 'XSV', 'XSA', 'XR', 'SF', 'SE',
                        'SSO', 'SSCSY', 'SSC', 'SC', 'SY', 'SN']

    def making_df(self):
        """Merge all articles of each date into one '/'-separated string.

        Rows containing any missing value are dropped first, and the cleaned
        frame is stored back on ``self.df``.

        Returns:
            A DataFrame with columns ``date`` and ``title_contents``, one row
            per date in sorted date order. Each text is that date's titles
            joined by '/', then '/', then that date's contents joined by '/'.

        Raises:
            KeyError: If ``date``, ``title`` or ``contents`` is missing.
        """
        self.df = self.df.dropna()
        # Cast to str so a non-string cell (e.g. an all-digit title parsed as
        # an int) cannot make str.join raise TypeError.
        texts = self.df.astype({'title': str, 'contents': str})
        daily = texts.groupby('date').agg(
            {'title': '/'.join, 'contents': '/'.join}
        ).reset_index()
        # Titles first, then contents -- the same order the former
        # melt + second groupby produced, without the extra passes.
        return pd.DataFrame({
            'date': daily['date'],
            'title_contents': daily['title'] + '/' + daily['contents'],
        })

    def pos_tag(self, text):
        """Return the result of ``self.mecab.pos`` for the string ``text``."""
        return self.mecab.pos(text)

    def rm_stopPos(self, text):
        """Drop tokens whose POS tag is listed in ``self.stopPos``.

        Args:
            text: Iterable of tokens where index 1 of each token is its POS
                tag (e.g. ``(word, tag)`` tuples).

        Returns:
            A list of the tokens that were kept, in their original order.
        """
        # Built per call so later edits to self.stopPos are still honoured,
        # while each token lookup is O(1) instead of a list scan.
        stop_tags = set(self.stopPos)
        return [token for token in text if token[1] not in stop_tags]

    def synonyms(self, text):
        """Return ``self.mecab.replace_synonyms`` applied to the token list."""
        return self.mecab.replace_synonyms(text)

    def lemmas(self, text):
        """Return ``self.mecab.lemmatize`` applied to the token list."""
        return self.mecab.lemmatize(text)

    def preprocess_data(self):
        """Run the full pipeline over the loaded CSV.

        Steps per date: merge the texts (``making_df``), POS-tag, remove
        stop-POS tokens, replace synonyms, lemmatize.

        Returns:
            A DataFrame with columns ``date`` and ``lemmas``.
        """
        daily = self.making_df()
        lemmas = (
            daily['title_contents']
            .apply(self.pos_tag)
            .apply(self.rm_stopPos)
            .apply(self.synonyms)
            .apply(self.lemmas)
        )
        return pd.DataFrame({'date': daily['date'], 'lemmas': lemmas})

In [7]:
# Build the preprocessor from the news CSV in the working directory, then run
# the whole pipeline; the result holds a `date` column and a `lemmas` column.
data_processor = DataPreprocessing('./news_temp.csv')
preprocessed_data = data_processor.preprocess_data()

In [14]:
# Bare expression: shown as the cell output when run in a notebook.
preprocessed_data

Unnamed: 0,date,lemmas
0,2017.01.01.,"[(해전, NNG), (순신, NNG), (경제수장, NNG), (신년사, NNG)..."
1,2017.01.02.,"[(대출, NNG), (신상품, NNG), (출시, NNG), (최대, NNG), ..."
2,2017.01.03.,"[(지자체, NNG), (음식점, NNG), (인증제도, NNG), (올해, NNG..."
3,2017.01.04.,"[(불법, NNG), (노점, NNG), (사라지, VV), (휴식, NNG), (..."
4,2017.01.05.,"[(올해, NNG), (가구, NNG), (주거, NNG), (지원, NNG), (..."
5,2017.01.06.,"[(미국, NNG), (월간, NNG), (신규, NNG), (고용, NNG), (..."
6,2017.01.07.,"[(브라질, NNG), (산업, NNG), (생산, NNG), (증가, NNG), ..."
7,2017.01.08.,"[(인니, NNG), (투자은행, NNG), (시장, NNG), (보고서, NNG)..."
8,2017.01.09.,"[(불황, NNG), (물가, NNG), (껑충, MAG), (한국, NNG), (..."
9,2017.01.10.,"[(올해, NNG), (오피스, NNG), (시장, NNG), (침체, NNG), ..."


In [24]:
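# Extract only the text from the tagged (word, POS) tokens.
# NOTE(review): this draft reads `total_news`, which in the code above exists
# only inside `preprocess_data` (the notebook variable is `preprocessed_data`),
# and `[0][0]` appears to keep just the first character of each word.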
# total_news['result'] = ''

# for i in range(len(total_news)):
#     result = ''
#     for j in range(len(total_news['lemmas'][i])):
#         result += total_news['lemmas'][i][j][0][0] + ' '
#     total_news.at[i, 'result'] = result.strip()

# total_news