In [1]:
import re
from kiwipiepy import Kiwi
import os
import json

PATH1 = './data/sentence_tokenize'
PATH2 = './data/pos_tokenize'
PATH3 = './data/extract_aux'
KIWI = Kiwi(num_workers=2, load_default_dict=True, model_type='knlm')

# 괄호 안의 내용과 바로 앞의 내용이 같을 경우 괄호 부분 삭제 e.g., 이념(이념)
def remove_ss(sentence) :
    p = re.compile(r'[[|{|()}].+?[]|}|)]')

    result = ''
    m = re.search(p, sentence)
    while m is not None : 
        in_ss = sentence[m.start()+1:m.end()-1]
        front_ss = sentence[m.start()-len(in_ss):m.start()]
        if in_ss == front_ss :
            result = result + sentence[:m.start()]
        sentence = sentence[m.end():]
        m = re.search(p, sentence)
    result = result + sentence

    return result

# path의 있는 json 파일들을 불러와 사전형태로 변환하고 리스트 형태로 반환
def load_writings(path) :
    result = []
    for file_name in os.listdir(path) :
        print('load file:\t' + file_name)
        file = open(path + '/' + file_name, 'r', encoding='utf-8')
        result.append(json.load(file))
        file.close()
    return result

# 파일 저장을 위해 토큰들을 사전의 리스트로 변환
def tokens_to_list_of_dict(tokens) :
    result = []
    for token in tokens : 
        result.append({
            'form':token[0], 
            'tag':token[1], 
            'start':token[2], 
            'len':token[3]
        })
    return result

In [2]:
# load pos tokenize 
writings = load_writings(PATH1)

load file:	SOUR001103&&& 독립신문 창간호 논설.json
load file:	SOUR001276&&& 옛날 꿈은창백하더이다.json
load file:	SOUR001379&&& 백팔번뇌 (시조집).json
load file:	SOUR001427&&& 날개.json
load file:	SOUR001439&&& 국경의 밤.json
load file:	SOUR001450&&& 해(해)에게서 소년에게.json
load file:	SOUR001456&&& 무정(무정).json
load file:	SOUR001477&&& 을지문덕 (을지문덕).json
load file:	SOUR001478&&& 이순신전(李순신전).json
load file:	SOUR001479&&& 최도통전(최도통전).json
load file:	SOUR001520&&& 감자.json
load file:	SOUR001521&&& 배따라기.json
load file:	SOUR001522&&& 상록수 (상록수).json
load file:	SOUR001547&&& 백치 아다다.json
load file:	SOUR001548&&& 운수 좋은 날.json
load file:	SOUR001549&&& 물레방아.json
load file:	SOUR001550&&& 표본실의 청개구리.json
load file:	SOUR001561&&& 진달래꽃 (시집).json
load file:	SOUR001614&&& 빈처 (빈처).json
load file:	SOUR001615&&& 술 권하는 사회.json
load file:	SOUR001616&&& B사감과 러브레터.json
load file:	SOUR001641&&& 메밀꽃 필 무렵.json
load file:	SOUR001642&&& 동백꽃.json
load file:	SOUR001647&&& 따라지.json
load file:	SOUR001648&&& 운현궁의 봄.json
load file:	SOUR001649&&& 탁류(탁류).json
load f

In [None]:
'''
    pattern
    1   -   처럼
    2   -   같은

    참고 : https://encykorea.aks.ac.kr/Article/E0054525
'''

### 패턴 1 '처럼' 에 대해서 보조관념 추출

In [3]:
# choose sentences by pattern1
pattern1 = {}
for writing in writings : 
    for sentence in writing['sentences'] :
        if len(sentence) <= 0 : break
        if len(sentence) >= 1000 : break

        sentence = remove_ss(sentence)

        tokens = []
        if '처럼' in sentence : 
            tokens = KIWI.analyze(sentence)[0][0]
        
        if len(tokens) != 0 :
            pattern1[sentence] = {'tokens': tokens_to_list_of_dict(tokens)}

  p = re.compile(r'[[|{|()}].+?[]|}|)]')


In [4]:
# save pattern1 sentences
with open(PATH2+'/pattern1.json', 'w', encoding='utf-8') as file :
    json.dump(pattern1, file, indent='\t', ensure_ascii=False)

In [5]:
# load pattern1 sentences
pattern1 = None
with open(PATH2+'/pattern1.json', 'r', encoding='utf-8') as file : 
    pattern1 = json.load(file)

In [6]:
# extract auxiliary ideas from pattern1 sentences
auxiliary_ideas = {}

for key, value in pattern1.items() :
    sentence = key
    tokens = value['tokens']
    
    # find pos token of pattern1
    pattern_indexes = []
    for i in range(len(tokens)) :
        token = tokens[i]
        if token['form'] == '처럼' and token['tag'] == 'JKB' :
            pattern_indexes.append(i)
    
    # find auxiliary idea pos token of pattern1
    for index in pattern_indexes :
        auxiliary_idea = []

        auxiliary_idea.insert(0, tokens[index])

        index = index-1
        current = tokens[index]
        if current['tag'] not in ['NNG', 'NNP', 'NR', 'NP', 'NNB'] : 
            current = None

        while current != None : 
            auxiliary_idea.insert(0, current)
            
            if index == 0 : 
                break

            current_tag = current['tag']
            next_tag = tokens[index-1]['tag']
            
            # S1
            if current_tag in ['NNB'] :
                if next_tag in ['JKG', 'MM', 'ETM'] : 
                    index = index-1
                    current = tokens[index]
                else :
                    del auxiliary_idea[0]
                    current = None
            # S2
            if current_tag in ['XSN'] :
                nnext_tag = tokens[index-2]['tag']
                if next_tag in ['XR'] :
                    if next_tag in ['NNB', 'XSN', 'NNG', 'NNP', 'NR', 'NP', 'ETN', 'JKG', 'MM', 'ETM'] :
                        auxiliary_idea.insert(0, tokens[index-1])
                        index = index-2
                        current = tokens[index]
                    else :
                        current = None
                else :
                    del auxiliary_idea[0]
                    current = None
            # S3 - Can Move Termination
            if current_tag in ['NNG', 'NNP', 'NR', 'NP'] :
                if next_tag in ['NNB', 'XSN', 'NNG', 'NNP', 'NR', 'NP', 'XPN', 'ETN', 'JKG', 'MM', 'ETM'] :
                    index = index-1
                    current = tokens[index]
                else :
                    current = None
            # S4
            if current_tag in ['ETN'] :
                if next_tag in ['VV', 'VV-I', 'VV-R', 'VA', 'VA-I', 'VA-R', 'VX', 'VX-I', 'VX-R'] : 
                    index = index-1
                    current = tokens[index]
                else :
                    del auxiliary_idea[0]
                    current = None
            # S5 - Can Move Termination
            if current_tag in ['XPN'] :
                if next_tag in ['NNB', 'XSN', 'NNG', 'NNP', 'NR', 'NP', 'ETN', 'JKG', 'MM', 'ETM'] : 
                    index = index-1
                    current = tokens[index]
                else :
                    current = None
            # S6
            if current_tag in ['VX', 'VX-I', 'VX-R'] :
                if next_tag in ['EC'] :
                    index = index-1
                    current = tokens[index]
                else :
                    del auxiliary_idea[0]
                    del auxiliary_idea[0]
                    current = None
            # S7 
            if current_tag in ['EC'] :
                if next_tag in ['VV', 'VV-I', 'VV-R', 'VA', 'VA-I', 'VA-R'] : 
                    index = index-1
                    current = tokens[index]
                else :
                    del auxiliary_idea[0]
                    del auxiliary_idea[0]
                    del auxiliary_idea[0]
                    current = None
            # S8 - Can Move Termination
            if current_tag in ['VV', 'VV-I', 'VV-R', 'VA', 'VA-I', 'VA-R'] :
                previous_tag = tokens[index+1]['tag']
                if previous_tag in ['ETN'] : 
                    if next_tag in ['NNB', 'XSN', 'NNG', 'NNP', 'NR', 'NP', 'ETN', 'JKG', 'MM', 'ETM', 'JKB', 'MAG', 'EC'] :
                        index = index-1
                        current = tokens[index]
                    else :
                        current = None
                elif previous_tag in ['ETM'] :
                    if next_tag in ['NNB', 'XSN', 'NNG', 'NNP', 'NR', 'NP', 'ETN', 'MAG', 'EC'] :
                        index = index-1
                        current = tokens[index]
                    else :
                        current = None
                else :
                    root_index = index+1
                    root_tag = tokens[root_index]['tag']
                    while root_tag not in ['ETN', 'ETM'] : 
                        root_index = root_index+1
                        root_tag = tokens[root_index]['tag']
                    if root_tag in ['ETN'] : 
                        if next_tag in ['NNB', 'XSN', 'NNG', 'NNP', 'NR', 'NP', 'ETN', 'JKG', 'MM', 'ETM', 'JKB', 'MAG', 'EC'] :
                            index = index-1
                            current = tokens[index]
                        else :
                            current = None
                    elif root_tag in ['ETM'] :
                        if next_tag in ['NNB', 'XSN', 'NNG', 'NNP', 'NR', 'NP', 'ETN', 'MAG', 'EC'] :
                            index = index-1
                            current = tokens[index]
                        else :
                            current = None
            # S9
            if current_tag in ['JKB'] :
                if next_tag in ['NNB', 'XSN', 'NNG', 'NNP', 'NR', 'NP', 'ETN'] : 
                    index = index-1
                    current = tokens[index]
                else :
                    del auxiliary_idea[0]
                    current = None
            # S10 - Can Move Termination
            if current_tag in ['MAG'] :
                current = None
            # S11
            if current_tag in ['JKG'] :
                if next_tag in ['NNB', 'XSN', 'NNG', 'NNP', 'NR', 'NP', 'ETN'] : 
                    index = index-1
                    current = tokens[index]
                else :
                    del auxiliary_idea[0]
                    current = None
            # S12 - Can Move Termination
            if current_tag in ['MM'] :
                if next_tag in ['NNB', 'XSN', 'NNG', 'NNP', 'NR', 'NP', 'ETN'] : 
                    index = index-1
                    current = tokens[index]
                else :
                    current = None
            # S13
            if current_tag in ['ETM'] :
                if next_tag in ['XSV', 'XSA', 'XSA-I', 'XSA-R', 'VV', 'VV-I', 'VV-R', 'VA', 'VA-I', 'VA-R', 'VX', 'VX-I', 'VX-R'] : 
                    index = index-1
                    current = tokens[index]
                else :
                    del auxiliary_idea[0]
                    current = None
            # S14
            if current_tag in ['XSV', 'XSA', 'XSA-I', 'XSA-R'] :
                nnext_tag = tokens[index-2]['tag']
                if next_tag in ['XR'] :
                    if next_tag in ['NNB', 'XSN', 'NNG', 'NNP', 'NR', 'NP', 'ETN'] :
                        auxiliary_idea.insert(0, tokens[index-1])
                        index = index-2
                        current = tokens[index]
                    else :
                        current = None
                else :
                    del auxiliary_idea[0]
                    del auxiliary_idea[0]
                    current = None
            
        if len(auxiliary_idea) != 0 :
            if sentence in auxiliary_ideas : 
                auxiliary_ideas[sentence]['auxiliary_idea'].append(auxiliary_idea)
            else :
                auxiliary_ideas[sentence] = {
                    'auxiliary_idea': [auxiliary_idea],
                    'tokens': tokens
                }
                
# find auxiliary idea string from pattern1 sentence
result1 = []
for key, value in auxiliary_ideas.items() : 
    sentence = key
    auxiliary_idea = value['auxiliary_idea']

    tmp = []
    for ai in auxiliary_idea : 
        start = ai[0]['start']
        end = ai[len(ai)-1]['start'] + ai[len(ai)-1]['len']

        tmp.append({
            'auxiliary_idea': sentence[start:end],
            'start': start,
            'end': end
        })

    auxiliary_idea = tmp

    result1.append([
        sentence, auxiliary_idea
    ])

In [7]:
# save auxiliary idea string
with open(PATH3+'/pattern1.json', 'w', encoding='utf-8') as file :
    json.dump(result1, file, indent='\t', ensure_ascii=False)

### 패턴 2 '같은' 에 대해서 보조관념 추출

In [8]:
# choose sentences by pattern1
pattern2 = {}
for writing in writings : 
    for sentence in writing['sentences'] :
        if len(sentence) <= 0 : break
        if len(sentence) >= 1000 : break

        sentence = remove_ss(sentence)

        tokens = []
        if '같은' in sentence : 
            tokens = KIWI.analyze(sentence)[0][0]
        
        if len(tokens) != 0 :
            pattern2[sentence] = {'tokens': tokens_to_list_of_dict(tokens)}

In [9]:
# save pattern2 sentences
with open(PATH2+'/pattern2.json', 'w', encoding='utf-8') as file :
    json.dump(pattern2, file, indent='\t', ensure_ascii=False)

In [10]:
# load pattern1 sentences
pattern2 = None
with open(PATH2+'/pattern2.json', 'r', encoding='utf-8') as file : 
    pattern2 = json.load(file)

In [16]:
# extract auxiliary ideas from pattern1 sentences
auxiliary_ideas = {}

for key, value in pattern1.items() :
    sentence = key
    tokens = value['tokens']
    
    # find pos token of pattern2
    pattern_indexes = []
    for i in range(len(tokens)-1) :
        token1 = tokens[i]
        token2 = tokens[i+1]
        
        cond1 = token1['form'] == '같' and token1['tag'] == 'VA'
        cond2 = token2['form'] == '은' and token2['tag'] == 'ETM'

        if cond1 and cond2 :
            pattern_indexes.append(i)

    # find auxiliary idea pos token of pattern2
    for index in pattern_indexes :
        auxiliary_idea = []

        auxiliary_idea.insert(0, tokens[index+1])
        auxiliary_idea.insert(0, tokens[index])

        index = index-1
        current = tokens[index]
        if current['tag'] not in ['NNG', 'NNP', 'NR', 'NP', 'NNB'] : 
            current = None

        while current != None : 
            auxiliary_idea.insert(0, current)

            if index == 0 : 
                break

            current_tag = current['tag']
            next_tag = tokens[index-1]['tag']
            
            # S1
            if current_tag in ['NNB'] :
                if next_tag in ['JKG', 'MM', 'ETM'] : 
                    index = index-1
                    current = tokens[index]
                else :
                    del auxiliary_idea[0]
                    current = None
            # S2
            if current_tag in ['XSN'] :
                nnext_tag = tokens[index-2]['tag']
                if next_tag in ['XR'] :
                    if next_tag in ['NNB', 'XSN', 'NNG', 'NNP', 'NR', 'NP', 'ETN', 'JKG', 'MM', 'ETM'] :
                        auxiliary_idea.insert(0, tokens[index-1])
                        index = index-2
                        current = tokens[index]
                    else :
                        current = None
                else :
                    del auxiliary_idea[0]
                    current = None
            # S3 - Can Move Termination
            if current_tag in ['NNG', 'NNP', 'NR', 'NP'] :
                if next_tag in ['NNB', 'XSN', 'NNG', 'NNP', 'NR', 'NP', 'XPN', 'ETN', 'JKG', 'MM', 'ETM'] :
                    index = index-1
                    current = tokens[index]
                else :
                    current = None
            # S4
            if current_tag in ['ETN'] :
                if next_tag in ['VV', 'VV-I', 'VV-R', 'VA', 'VA-I', 'VA-R', 'VX', 'VX-I', 'VX-R'] : 
                    index = index-1
                    current = tokens[index]
                else :
                    del auxiliary_idea[0]
                    current = None
            # S5 - Can Move Termination
            if current_tag in ['XPN'] :
                if next_tag in ['NNB', 'XSN', 'NNG', 'NNP', 'NR', 'NP', 'ETN', 'JKG', 'MM', 'ETM'] : 
                    index = index-1
                    current = tokens[index]
                else :
                    current = None
            # S6
            if current_tag in ['VX', 'VX-I', 'VX-R'] :
                if next_tag in ['EC'] :
                    index = index-1
                    current = tokens[index]
                else :
                    del auxiliary_idea[0]
                    del auxiliary_idea[0]
                    current = None
            # S7 
            if current_tag in ['EC'] :
                if next_tag in ['VV', 'VV-I', 'VV-R', 'VA', 'VA-I', 'VA-R'] : 
                    index = index-1
                    current = tokens[index]
                else :
                    del auxiliary_idea[0]
                    del auxiliary_idea[0]
                    del auxiliary_idea[0]
                    current = None
            # S8 - Can Move Termination
            if current_tag in ['VV', 'VV-I', 'VV-R', 'VA', 'VA-I', 'VA-R'] :
                previous_tag = tokens[index+1]['tag']
                if previous_tag in ['ETN'] : 
                    if next_tag in ['NNB', 'XSN', 'NNG', 'NNP', 'NR', 'NP', 'ETN', 'JKG', 'MM', 'ETM', 'JKB', 'MAG', 'EC'] :
                        index = index-1
                        current = tokens[index]
                    else :
                        current = None
                elif previous_tag in ['ETM'] :
                    if next_tag in ['NNB', 'XSN', 'NNG', 'NNP', 'NR', 'NP', 'ETN', 'MAG', 'EC'] :
                        index = index-1
                        current = tokens[index]
                    else :
                        current = None
                else :
                    root_index = index+1
                    root_tag = tokens[root_index]['tag']
                    while root_tag not in ['ETN', 'ETM'] : 
                        root_index = root_index+1
                        root_tag = tokens[root_index]['tag']
                    if root_tag in ['ETN'] : 
                        if next_tag in ['NNB', 'XSN', 'NNG', 'NNP', 'NR', 'NP', 'ETN', 'JKG', 'MM', 'ETM', 'JKB', 'MAG', 'EC'] :
                            index = index-1
                            current = tokens[index]
                        else :
                            current = None
                    elif root_tag in ['ETM'] :
                        if next_tag in ['NNB', 'XSN', 'NNG', 'NNP', 'NR', 'NP', 'ETN', 'MAG', 'EC'] :
                            index = index-1
                            current = tokens[index]
                        else :
                            current = None
            # S9
            if current_tag in ['JKB'] :
                if next_tag in ['NNB', 'XSN', 'NNG', 'NNP', 'NR', 'NP', 'ETN'] : 
                    index = index-1
                    current = tokens[index]
                else :
                    del auxiliary_idea[0]
                    current = None
            # S10 - Can Move Termination
            if current_tag in ['MAG'] :
                current = None
            # S11
            if current_tag in ['JKG'] :
                if next_tag in ['NNB', 'XSN', 'NNG', 'NNP', 'NR', 'NP', 'ETN'] : 
                    index = index-1
                    current = tokens[index]
                else :
                    del auxiliary_idea[0]
                    current = None
            # S12 - Can Move Termination
            if current_tag in ['MM'] :
                if next_tag in ['NNB', 'XSN', 'NNG', 'NNP', 'NR', 'NP', 'ETN'] : 
                    index = index-1
                    current = tokens[index]
                else :
                    current = None
            # S13
            if current_tag in ['ETM'] :
                if next_tag in ['XSV', 'XSA', 'XSA-I', 'XSA-R', 'VV', 'VV-I', 'VV-R', 'VA', 'VA-I', 'VA-R', 'VX', 'VX-I', 'VX-R'] : 
                    index = index-1
                    current = tokens[index]
                else :
                    del auxiliary_idea[0]
                    current = None
            # S14
            if current_tag in ['XSV', 'XSA', 'XSA-I', 'XSA-R'] :
                nnext_tag = tokens[index-2]['tag']
                if next_tag in ['XR'] :
                    if next_tag in ['NNB', 'XSN', 'NNG', 'NNP', 'NR', 'NP', 'ETN'] :
                        auxiliary_idea.insert(0, tokens[index-1])
                        index = index-2
                        current = tokens[index]
                    else :
                        current = None
                else :
                    del auxiliary_idea[0]
                    del auxiliary_idea[0]
                    current = None
        
        if len(auxiliary_idea) != 0 :
            # print(sentence) # for log
            # print(auxiliary_idea) # for log
            # for t in tokens : 
            #     print(t)
            if sentence in auxiliary_ideas : 
                auxiliary_ideas[sentence]['auxiliary_idea'].append(auxiliary_idea)
            else :
                auxiliary_ideas[sentence] = {
                    'auxiliary_idea': [auxiliary_idea],
                    'tokens': tokens
                }        

# find auxiliary idea string from pattern2 sentence
result2 = []
for key, value in auxiliary_ideas.items() : 
    sentence = key
    auxiliary_idea = value['auxiliary_idea']

    tmp = []
    for ai in auxiliary_idea : 
        start = ai[0]['start']
        end = ai[len(ai)-1]['start'] + ai[len(ai)-1]['len']

        tmp.append({
            'auxiliary_idea': sentence[start:end],
            'start': start,
            'end': end
        })

    auxiliary_idea = tmp

    result2.append([
        sentence, auxiliary_idea
    ])

In [17]:
# save auxiliary idea string
with open(PATH3+'/pattern2.json', 'w', encoding='utf-8') as file :
    json.dump(result2, file, indent='\t', ensure_ascii=False)

### 결과 통합

In [13]:
import os, json

result = []

for file_name in os.listdir('./data/extract_aux/') : 
    with open('./data/extract_aux/' + file_name, 'r', encoding='utf-8') as file :
        result.extend(json.load(file))

with open('./data/extract_aux/result.json', 'w', encoding='utf-8') as file :
    json.dump(result, file, indent='\t', ensure_ascii=False)