In [1]:
import os
import json
import csv
import shutil
import pandas as pd
from collections import Counter, defaultdict

In [2]:
def is_contains_chinese(strs):
    """Return True if any character of `strs` lies in '\\u4e00'..'\\u9fa5'.

    Args:
        strs: The text to scan, iterated character by character.

    Returns:
        True as soon as one character falls inside the inclusive range
        U+4E00..U+9FA5, otherwise False (including for an empty string).
    """
    return any('\u4e00' <= ch <= '\u9fa5' for ch in strs)

In [3]:
# Known annotator ids (presumably the values that `name_id` may take; it
# selects the ./text_and_tagging/tags/<name_id>/ folder read below):
#   student id = 108102021 葉丞恩
#   student id = 108101019 董沂函
#   student id = 108101528 梁恩維
#   student id = 108101510 陳姿吟
#   student id = wen 文紫柔
#   student id = YI-YING LI 李羿瑩
#   student id = Mavis 許舒婷
#   teacher id = jahui
#   teacher id = fangxiang
#   teacher id = Siaw-Fong

name_id = 'wen'    # annotator whose tag files are analysed
dataset = 'train'  # dataset sub-folder under ./text_and_tagging/Fairytale/

# Story texts and attribute files are both taken from the same dataset
# sub-folder, so changing `dataset` keeps the two directories in step.
text_dir = f'./text_and_tagging/Fairytale/{dataset}/content'
attr_dir = f'./text_and_tagging/Fairytale/{dataset}/attribute'

# Per-annotator tag folders (one JSON file per story is read from each).
event_dir = f'./text_and_tagging/tags/{name_id}/Fairytale/Event'
relation_dir = f'./text_and_tagging/tags/{name_id}/Fairytale/Relation'
entity_dir = f'./text_and_tagging/tags/{name_id}/Fairytale/Entity'

## Attribute

## Event Count

In [4]:
# Directory entries that must never be treated as stories.
black_list = ['.ipynb_checkpoints']
file_names = os.listdir(text_dir)

# English stories: every entry outside the black list whose file name has no
# character in the Chinese range; the '.txt' text is removed from the name.
eng_story = [
    name.replace('.txt', '')
    for name in file_names
    if name not in black_list and not is_contains_chinese(name)
]

# Event tag files whose name (with '.json' removed) matches an English story.
labeled_event_fn = os.listdir(event_dir)
labeled_event_story = [
    json_name
    for json_name in labeled_event_fn
    if json_name.replace('.json', '') in eng_story
]

print('全部英文故事：', len(eng_story))
print('(Event)已標記英文故事：', len(labeled_event_story))

全部英文故事： 4038
(Event)已標記英文故事： 41


In [5]:
# Every English story starts with zero events so unlabeled stories still
# appear in the per-story tally.
story_event_counter = Counter(
    dict.fromkeys((story.replace('.txt', '') for story in eng_story), 0)
)
event_type_counter = Counter()             # Event_type -> number of events
role_type_counter = Counter()              # Arg_type -> number of non-trigger arguments
attribute_f = set()                        # attribute file paths referenced by events
event_attribute_type = defaultdict(list)   # first line of attribute file -> [Event_type, ...]

for json_name in labeled_event_story:
    story_name = json_name.replace('.json', '')
    with open(os.path.join(event_dir, json_name), 'r', encoding='utf-8') as fh:
        events = json.load(fh)

    for ev in events:
        ev_type = ev['Event_type']
        event_type_counter[ev_type] += 1
        story_event_counter[story_name] += 1

        # The first line of the file named by 'which_QA' is used as the
        # attribute type of this event.
        qa_attr_path = os.path.join(attr_dir, ev['which_QA'])
        with open(qa_attr_path, 'r', encoding='utf-8') as attr_fh:
            first_line = attr_fh.readline()
        attribute_f.add(qa_attr_path)
        event_attribute_type[first_line].append(ev_type)

        # Count every argument except those whose Arg_type contains 'Trigger'.
        for arg in ev['Arguments']:
            arg_type = arg['Arg_type']
            if 'Trigger' in arg_type:
                continue
            role_type_counter[arg_type] += 1

total_events = sum(event_type_counter.values())
total_roles = sum(role_type_counter.values())

print('共標記 {:} 個事件'.format(total_events))
print('共標記 {:} 個事件參數 (role)'.format(total_roles))

print('=== 事件類別統計 ===')
for type_name, n_events in event_type_counter.most_common():
    print(type_name, ':', n_events)

# Optional export of the per-story event counts (left disabled):
# df = pd.DataFrame.from_dict(story_event_counter, orient='index').reset_index()
# df = df.rename(columns={'index':'story name', 0:'count'})
# df.to_excel('./result.xlsx')

共標記 86 個事件
共標記 188 個事件參數 (role)
=== 事件類別統計 ===
Action : 66
State : 18
Life : 2


## Event Attribute 統計

In [6]:
# Group the attribute files referenced by the event annotations by their
# attribute type (the first line of each file). Despite the name, this maps
# attribute type -> list of file names with the '.txt' text removed.
attribute_type_counter = defaultdict(list)

for path in attribute_f:
    with open(path, 'r', encoding='utf-8') as f:
        attribute = f.readline()
    # The paths were built with os.path.join, so take the file name with
    # os.path.basename instead of splitting on a hard-coded '/'.
    qa_name = os.path.basename(path).replace('.txt', '')
    attribute_type_counter[attribute].append(qa_name)

print(attribute_type_counter)
print("")
print(event_attribute_type)

# For each attribute type: number of tagged events divided by the number of
# distinct attribute files of that type.
print('=== Event Attribute 統計 ===')
for key in attribute_type_counter.keys():
    print(key, len(event_attribute_type[key]) / len(attribute_type_counter[key]))

defaultdict(<class 'list'>, {'action': ['adventures-of-kintaro-golden-boy12-1', 'adventures-of-kintaro-golden-boy07-3', 'a-legend-of-knockmany17-2', 'a-legend-of-knockmany06-4', 'adventures-of-kintaro-golden-boy03-1', 'a-legend-of-knockmany13-1', 'adventures-of-kintaro-golden-boy01-1', 'a-legend-of-knockmany11-1', 'a-legend-of-knockmany02-1', 'a-legend-of-knockmany07-4', 'adventures-of-kintaro-golden-boy06-2', 'a-legend-of-knockmany06-5', 'a-legend-of-knockmany10-1', 'adventures-of-kintaro-golden-boy10-4', 'a-legend-of-knockmany07-2', 'a-legend-of-knockmany05-1', 'a-legend-of-knockmany16-3', 'adventures-of-kintaro-golden-boy02-2', 'adventures-of-kintaro-golden-boy01-10', 'a-legend-of-knockmany10-2', 'a-legend-of-knockmany04-1', 'adventures-of-kintaro-golden-boy09-1', 'a-legend-of-knockmany08-1', 'a-legend-of-knockmany08-2', 'a-legend-of-knockmany12-1', 'a-legend-of-knockmany14-1', 'adventures-of-kintaro-golden-boy12-2', 'a-legend-of-knockmany06-6', 'a-legend-of-knockmany18-1', 'a-legen

## Event bipartite 輸出

In [7]:
# Flatten {attribute type: [event type, ...]} into CSV rows of the form
# [attribute type, event type, number of occurrences].
register = [
    [key, e_type, occurrences]
    for key, e_types in event_attribute_type.items()
    for e_type, occurrences in Counter(e_types).items()
]

# makedirs also creates the parent 'bipartite_output_format' folder when it
# is missing and does not fail if the annotator folder already exists.
os.makedirs(f'bipartite_output_format/{name_id}', exist_ok=True)

# newline='' is what the csv module expects of the file object it writes to
# (otherwise extra blank lines can appear on Windows); the encoding is pinned
# so the output does not depend on the platform default.
with open(f'bipartite_output_format/{name_id}/Event.csv', 'w',
          newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    writer.writerows(register)

## Relation Count

In [8]:
# Relation tag files whose name (with '.json' removed) matches an English story.
labeled_relation_fn = os.listdir(relation_dir)
labeled_relation_story = [
    json_name
    for json_name in labeled_relation_fn
    if json_name.replace('.json', '') in eng_story
]

print('全部英文故事：', len(eng_story))
print('(Relation)已標記英文故事：', len(labeled_relation_story))

全部英文故事： 4038
(Relation)已標記英文故事： 41


In [9]:
# Every English story starts with zero relations so unlabeled stories still
# appear in the per-story tally.
story_relation_counter = Counter(
    dict.fromkeys((story.replace('.txt', '') for story in eng_story), 0)
)
relation_type_counter = Counter()             # Relation_type -> number of relations
role_type_counter = Counter()                 # Relation_subtype -> number of relations
relation_attribute_type = defaultdict(list)   # first line of attribute file -> [Relation_type, ...]
attribute_f = set()                           # attribute file paths referenced by relations

for json_name in labeled_relation_story:
    story_name = json_name.replace('.json', '')
    with open(os.path.join(relation_dir, json_name), 'r', encoding='utf-8') as fh:
        relations = json.load(fh)

    for rel in relations:
        rel_type = rel['Relation_type']
        relation_type_counter[rel_type] += 1
        story_relation_counter[story_name] += 1
        role_type_counter[rel['Relation_subtype']] += 1

        # The first line of the file named by 'which_QA' is used as the
        # attribute type of this relation.
        qa_attr_path = os.path.join(attr_dir, rel['which_QA'])
        with open(qa_attr_path, 'r', encoding='utf-8') as attr_fh:
            first_line = attr_fh.readline()
        attribute_f.add(qa_attr_path)
        relation_attribute_type[first_line].append(rel_type)

total_relations = sum(relation_type_counter.values())
total_roles = sum(role_type_counter.values())

print('共標記 {:} 個Relation'.format(total_relations))
print('共標記 {:} 個Relation_subtype參數 '.format(total_roles))

print('=== Relation subtype類別統計 ===')
for subtype_name, n_relations in role_type_counter.most_common():
    print(subtype_name, ':', n_relations)

共標記 38 個Relation
共標記 38 個Relation_subtype參數 
=== Relation subtype類別統計 ===
Effect on X : 13
X reaction : 7
X intent : 6
Effect on other : 4
X attribute : 3
X want : 2
Other reaction : 1
Coref : 1
Other : 1


## Relation Attribute 統計

In [10]:
# Group the attribute files referenced by the relation annotations by their
# attribute type (the first line of each file). Despite the name, this maps
# attribute type -> list of file names with the '.txt' text removed.
attribute_type_counter = defaultdict(list)

for path in attribute_f:
    with open(path, 'r', encoding='utf-8') as f:
        attribute = f.readline()
    # The paths were built with os.path.join, so take the file name with
    # os.path.basename instead of splitting on a hard-coded '/'.
    qa_name = os.path.basename(path).replace('.txt', '')
    attribute_type_counter[attribute].append(qa_name)

print(attribute_type_counter)
print("")
print(relation_attribute_type)

# For each attribute type: number of tagged relations divided by the number
# of distinct attribute files of that type.
print('=== Relation Attribute 統計 ===')
for key in attribute_type_counter.keys():
    print(key, len(relation_attribute_type[key]) / len(attribute_type_counter[key]))

defaultdict(<class 'list'>, {'causal relationship': ['a-legend-of-knockmany06-1', 'a-legend-of-knockmany14-2', 'adventures-of-kintaro-golden-boy01-7', 'a-legend-of-knockmany21-1', 'a-legend-of-knockmany07-1', 'a-legend-of-knockmany18-3', 'a-legend-of-knockmany01-1', 'a-legend-of-knockmany18-7', 'adventures-of-kintaro-golden-boy10-2', 'adventures-of-kintaro-golden-boy02-1', 'a-legend-of-knockmany01-6', 'a-legend-of-knockmany20-1', 'a-legend-of-knockmany19-1', 'adventures-of-kintaro-golden-boy10-7', 'adventures-of-kintaro-golden-boy01-6', 'a-legend-of-knockmany03-1', 'a-legend-of-knockmany25-1', 'a-legend-of-knockmany02-2', 'a-legend-of-knockmany15-1', 'adventures-of-kintaro-golden-boy04-1', 'a-legend-of-knockmany30-1', 'adventures-of-kintaro-golden-boy02-3', 'adventures-of-kintaro-golden-boy10-1', 'adventures-of-kintaro-golden-boy05-1'], 'outcome resolution': ['a-legend-of-knockmany10-4', 'a-legend-of-knockmany17-1', 'adventures-of-kintaro-golden-boy01-4'], 'action': ['a-legend-of-knock

## Relation bipartite 輸出

In [11]:
# Flatten {attribute type: [relation type, ...]} into CSV rows of the form
# [attribute type, relation type, number of occurrences].
register = [
    [key, r_type, occurrences]
    for key, r_types in relation_attribute_type.items()
    for r_type, occurrences in Counter(r_types).items()
]

# makedirs also creates the parent 'bipartite_output_format' folder when it
# is missing and does not fail if the annotator folder already exists.
os.makedirs(f'bipartite_output_format/{name_id}', exist_ok=True)

# newline='' is what the csv module expects of the file object it writes to
# (otherwise extra blank lines can appear on Windows); the encoding is pinned
# so the output does not depend on the platform default.
with open(f'bipartite_output_format/{name_id}/Relation.csv', 'w',
          newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    writer.writerows(register)

## Entity Count

In [12]:
# Entity tag files whose name (with '.json' removed) matches an English story.
labeled_entity_fn = os.listdir(entity_dir)
labeled_entity_story = [
    json_name
    for json_name in labeled_entity_fn
    if json_name.replace('.json', '') in eng_story
]

print('全部英文故事：', len(eng_story))
print('(Entity)已標記英文故事：', len(labeled_entity_story))

全部英文故事： 4038
(Entity)已標記英文故事： 61


In [13]:
# Every English story starts with zero entities so unlabeled stories still
# appear in the per-story tally.
story_entity_counter = Counter(
    dict.fromkeys((story.replace('.txt', '') for story in eng_story), 0)
)
entity_type_counter = Counter()              # Entity_type -> number of entities
entity_attribute_type_counter = Counter()    # first line of attribute file -> number of entities
entity_attribute_type = defaultdict(list)    # first line of attribute file -> [Entity_type, ...]

# NOTE(review): the recorded run found labeled entity files but counted zero
# entities, so the loaded records were presumably empty -- confirm the data.
for json_name in labeled_entity_story:
    story_name = json_name.replace('.json', '')
    with open(os.path.join(entity_dir, json_name), 'r', encoding='utf-8') as fh:
        entities = json.load(fh)

    for ent in entities:
        ent_type = ent['Entity_type']
        entity_type_counter[ent_type] += 1
        story_entity_counter[story_name] += 1

        # The first line of the file named by 'which_QA' is used as the
        # attribute type of this entity.
        qa_attr_path = os.path.join(attr_dir, ent['which_QA'])
        with open(qa_attr_path, 'r', encoding='utf-8') as attr_fh:
            first_line = attr_fh.readline()
        entity_attribute_type_counter[first_line] += 1
        entity_attribute_type[first_line].append(ent_type)

total_entity = sum(entity_type_counter.values())

print('共標記 {:} 個Entity'.format(total_entity))

print('=== Entity類別統計 ===')
for type_name, n_entities in entity_type_counter.most_common():
    print(type_name, ':', n_entities)

共標記 0 個Entity
=== Entity類別統計 ===


In [14]:
# Show the per-attribute entity tally, then the entity types grouped by attribute.
for summary in (entity_attribute_type_counter, entity_attribute_type):
    print(summary)

Counter()
defaultdict(<class 'list'>, {})
