## 연습 문제
CSV 형식 뉴스 파일 형태소 분석

In [5]:
import re
import ujson
from konlpy.tag import Hannanum


def split_sentences(text):
    """Split the given text into a flat list of sentences.

    The text is processed line by line. Within a line, a sentence boundary
    is any run of whitespace that directly follows '.', '?' or '!'.

    Args:
        text: The text to split; may span several lines.

    Returns:
        A list of non-empty sentence strings in their original order, with
        no leading or trailing whitespace. Blank and whitespace-only lines
        contribute nothing.
    """

    all_sentences = []

    for line in text.splitlines():
        # Strip each line, not just the whole text: otherwise a trailing
        # space after the final punctuation mark yields an empty "sentence".
        line = line.strip()
        if not line:
            continue

        # The lookbehind keeps the punctuation attached to its sentence;
        # \s+ swallows the whole gap so repeated spaces leave no residue.
        all_sentences.extend(re.split(r"(?<=[.?!])\s+", line))

    return all_sentences


def get_morph_anal(hannanum, text):
    """Morphologically analyze the given text one sentence at a time.

    The text is split with split_sentences, and each sentence is passed to
    hannanum.pos with ntags=22.

    Args:
        hannanum: An analyzer object exposing pos(sentence, ntags=...).
        text: The text to analyze.

    Returns:
        A list with one entry per sentence, in sentence order; each entry
        is whatever hannanum.pos returned for that sentence.
    """

    return [
        hannanum.pos(piece, ntags=22)
        for piece in split_sentences(text)
    ]


def _parse_news_line(line):
    """Parse one data line of the news CSV into its five fields.

    The layout implied by the separators is
    time,"Category","Type","TitleKorea","Contents": the last three fields
    are separated by '","' and the first two by ',"'.

    Args:
        line: One raw line read from the input file.

    Returns:
        A tuple (timestamp, category, news_type, title, contents).

    Raises:
        ValueError: If the line does not split into exactly these fields.
    """
    time_category, news_type, title, contents = line.strip().split('","')
    timestamp, category = time_category.split(',"')

    # Splitting on '","' consumes every quote except the one closing the
    # last field; drop it so it is neither analyzed nor written out.
    if contents.endswith('"'):
        contents = contents[:-1]

    return timestamp, category, news_type, title, contents


def main():
    """Morphologically analyze a CSV news file and write JSON lines.

    Reads ../data/user/news_data.csv, skips its first line, and for every
    remaining line that matches the expected layout writes one JSON object
    per line to ../data/user/news_data.ma.txt. Each object holds the five
    parsed fields plus the sentence-level analyses of the title and the
    contents. Lines that do not match the layout are skipped.
    """

    input_file_name = "../data/user/news_data.csv"
    output_file_name = "../data/user/news_data.ma.txt"
    keys = ["time", "Category", "Type", "TitleKorea",
            "Contents", "TitleKorea_ma", "Contents_ma"]

    hannanum = Hannanum()

    with open(input_file_name, "r", encoding="utf-8") as input_file, \
            open(output_file_name, "w", encoding="utf-8") as output_file:
        # Skip the first line (presumably the column header).
        next(input_file, None)

        for line in input_file:
            try:
                timestamp, category, news_type, title, contents = \
                    _parse_news_line(line)
            except ValueError:
                # Best effort: a line with the wrong number of fields is
                # dropped. Only ValueError is caught so that real failures
                # (and Ctrl-C) are no longer silently swallowed.
                continue

            title_ma = get_morph_anal(hannanum, title)
            contents_ma = get_morph_anal(hannanum, contents)

            values = [timestamp, category, news_type, title, contents,
                      title_ma, contents_ma]
            output = dict(zip(keys, values))

            output_str = ujson.dumps(output, ensure_ascii=False)
            print(output_str, file=output_file)
            
            
# Run the pipeline only when executed as a script or notebook cell, so that
# importing this module does not trigger file I/O.
if __name__ == "__main__":
    main()