In [2]:
import pandas as pd
from ekonlpy.tag import Mecab

class DataPreprocessing:
    """Run a fixed text-preprocessing pipeline over news rows loaded from CSV.

    The pipeline POS-tags each row's ``text`` value, drops tokens whose tag is
    listed in ``stopPos``, then passes the remaining tokens through the
    tagger's synonym replacement and lemmatisation.
    """

    def __init__(self, csv_path):
        """Load the CSV (first column used as the index) and build the tagger.

        Args:
            csv_path: Path handed straight to ``pandas.read_csv``.
        """
        self.mecab = Mecab()
        self.df = pd.read_csv(csv_path, index_col=0)
        # POS tags whose tokens are discarded by rm_stopPos().
        # NOTE(review): 'SSCSY' looks like 'SSC' and 'SY' fused together; both
        # also appear on their own below, so it is kept as-is -- confirm.
        self.stopPos = [
            'NNP', 'NNB', 'NNBC', 'NR', 'NP',
            'VX', 'VCP',
            'MM', 'MAJ', 'IC',
            'JKS', 'JKC', 'JKG', 'JKO', 'JKB', 'JKV', 'JKQ', 'JX', 'JC',
            'EP', 'EF', 'EC', 'ETN', 'ETM',
            'XPN', 'XSN', 'XSV', 'XSA', 'XR',
            'SF', 'SE', 'SSO', 'SSCSY', 'SSC', 'SC', 'SY', 'SN',
        ]

    def making_df(self):
        """Return a copy of the loaded frame with an empty-string ``pos_tagging`` column."""
        return self.df.assign(pos_tagging="")

    def pos_tag(self, text):
        """Return the tagger's ``pos`` output for ``text``."""
        tagged = self.mecab.pos(text)
        return tagged

    def rm_stopPos(self, text):
        """Return the tokens of ``text`` whose tag (item ``[1]``) is not in ``stopPos``.

        Args:
            text: Iterable of indexable tokens, e.g. ``(word, tag)`` pairs.
        """
        kept = []
        for token in text:
            if token[1] in self.stopPos:
                continue
            kept.append(token)
        return kept

    def synonyms(self, text):
        """Return the tagger's ``replace_synonyms`` output for ``text``."""
        replaced = self.mecab.replace_synonyms(text)
        return replaced

    def lemmas(self, text):
        """Return the tagger's ``lemmatize`` output for ``text``."""
        lemmatized = self.mecab.lemmatize(text)
        return lemmatized

    def preprocess_data(self):
        """Apply every pipeline stage and return the ``result`` and ``up_down`` columns.

        Each stage reads the column written by the previous one, so the order
        of the stages below matters. The loaded frame must provide ``text``
        and ``up_down`` columns.
        """
        frame = self.making_df()
        # (column to write, column to read, per-row transform)
        stages = (
            ('pos_tagging', 'text', self.pos_tag),
            ('remove_stopPos', 'pos_tagging', self.rm_stopPos),
            ('synonyms', 'remove_stopPos', self.synonyms),
            ('result', 'synonyms', self.lemmas),
        )
        for target, source, transform in stages:
            frame[target] = frame[source].apply(transform)
        return frame[['result', 'up_down']]

    # NOTE(review): this statement is part of the class body, so the message is
    # printed once when the class is defined, not on each preprocess_data() call.
    print("전처리 진행중")
        

# Driver: load the news CSV, run the full preprocessing pipeline, save the
# resulting frame next to the input, and echo it to the cell output.
data_processor = DataPreprocessing('./news_1000.csv')
# preprocess_data() returns only the 'result' and 'up_down' columns.
preprocessed_data = data_processor.preprocess_data()

# Persist the processed frame, then print it followed by a completion message.
preprocessed_data.to_csv('./save_news.csv')
print(preprocessed_data)
print("전처리 완료")

전처리 진행중
                                                result up_down
0    [(미, NNG), (증시, NNG), (대공황, NNG), (이후, NNG), (...      하락
1    [(불황, NNG), (절세미인, NNG), (뜬다, VV), (불확실성, NNG)...      하락
2    [(대통령, NNG), (MB, SL), (정부, NNG), (새해, NNG), (...      하락
3    [(한은, NNG), (은행, NNG), (신경전, NNG), (새해, NNG), ...      하락
4    [(한은, NNG), (RP, NNG), (매각, NNG), (낙찰, NNG), (...      하락
..                                                 ...     ...
995  [(금리, NNG), (그리, MAG), (저등, NNG), (급자, NNG), (...      상승
996  [(채권, NNG), (발행, NNG), (믿, VV), (있, VA), (Week...      상승
997  [(농협, NNG), (농금채, NNG), (발행, NNG), (중, NNG), (...      상승
998  [(브라질, NNG), (증시, NNG), (상승, NNG), (올해, NNG), ...      상승
999  [(캐리, NNG), (꿈틀, MAG), (저금리, NNG), (국가, NNG), ...      상승

[1000 rows x 2 columns]
전처리 완료
