In [1]:
import os
import numpy as np
import torch
import pandas as pd

In [2]:
class InputExample:
    """A single training/test example for token classification.

    Attributes:
        guid: Unique id for the example.
        words: list. The words of the sequence.
        labels: list or None. The labels for each word, or None when none were given.
    """

    def __init__(self, guid, words, labels=None):
        """Constructs an InputExample.

        Args:
            guid: Unique id for the example.
            words: list. The words of the sequence.
            labels: (Optional) list. The labels for each word of the sequence. This should be
                specified for train and dev examples, but may be omitted (stored as ``None``)
                for test examples.
        """
        self.guid = guid
        self.words = words
        # Defaults to None so the argument really is optional, as documented.
        self.labels = labels

In [3]:
def read_examples_from_file(data_dir, mode):
    """Read ``<data_dir>/<mode>.txt`` (UTF-8, one token per line) into InputExample objects.

    On each token line the word is the first tab-separated field and the label is the last
    one; a line with a single field gets the label ``"O"``. A blank line, or a line starting
    with ``-DOCSTART-``, ends the current sequence.

    Args:
        data_dir: Directory that contains the data file.
        mode: File stem (e.g. ``"train"``); also used as the prefix of every guid.

    Returns:
        list of InputExample in file order, with guids ``"<mode>-1"``, ``"<mode>-2"``, ...
    """
    file_path = os.path.join(data_dir, "{}.txt".format(mode))
    examples = []
    words = []
    labels = []

    def flush():
        # Emit the sequence collected so far (if any) and start a fresh one.
        nonlocal words, labels
        if words:
            # One guid format for every example. The trailing example used to get the
            # literal string "%s-%d" because str.format() ignores %-style placeholders.
            guid = "{}-{}".format(mode, len(examples) + 1)
            examples.append(InputExample(guid=guid, words=words, labels=labels))
            words, labels = [], []

    with open(file_path, encoding="utf-8") as f:
        for line in f:
            if line.startswith("-DOCSTART-") or not line.strip():
                # Document-start marker or blank line: sequence boundary.
                flush()
                continue
            # Strip the line terminator before splitting so it can never end up inside a
            # word (a row without a label has a single field that would otherwise keep it).
            splits = line.rstrip("\n").split("\t")
            if not splits[0].strip():
                continue  # row whose word field is empty
            words.append(splits[0])
            # Examples could have no label for mode = "test"
            labels.append(splits[-1] if len(splits) > 1 else "O")

    # The file may end without a trailing blank line; emit the last sequence too.
    flush()
    return examples

In [4]:
def get_labels(path):
    """Return the list of labels used for tagging.

    Args:
        path: Path to a text file with one label per line. When falsy, the built-in
            default label list is returned instead.

    Returns:
        list of str. Labels in file order; ``"O"`` is prepended when the file lacks it.
    """
    if not path:
        return ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]

    # Read as UTF-8, like the data files, so the result does not depend on the
    # platform's default encoding.
    with open(path, "r", encoding="utf-8") as f:
        labels = f.read().splitlines()
    if "O" not in labels:
        labels = ["O"] + labels
    return labels

In [5]:
# Load the label vocabulary, then index every label by its position in that list.
labels = get_labels("labels.txt")
label_map = dict(zip(labels, range(len(labels))))

In [6]:
# Parse supervised/train.txt into a list of InputExample objects.
examples = read_examples_from_file(data_dir="supervised", mode="train")

In [7]:
# Number of examples (blank-line-delimited sequences) read from the training file.
len(examples)

131767

In [8]:
def split_samples(examples, split_rate, seed=None):
    """Randomly partition ``examples`` into a fine subset and a coarse subset.

    ``int(len(examples) * split_rate)`` examples are drawn without replacement for the fine
    subset; all remaining examples form the coarse subset. The chosen indices are printed.

    Args:
        examples: list of examples to split.
        split_rate: Fraction of the examples assigned to the fine subset
            (e.g. 0.2 -> coarse:fine = 8:2).
        seed: (Optional) int. When given, the draw uses a private
            ``np.random.RandomState(seed)`` and is reproducible. When None, NumPy's global
            RNG is used, exactly as before.

    Returns:
        (fine_examples, coarse_examples): two lists. Fine examples are in draw order;
        coarse examples keep their original relative order.
    """
    n_total = len(examples)
    rng = np.random if seed is None else np.random.RandomState(seed)
    fine_id = rng.choice(n_total, int(n_total * split_rate), replace=False)
    coarse_id = np.setdiff1d(np.arange(n_total), fine_id)

    print(f"{len(fine_id)} fine_id : {fine_id}")
    print(f"{len(coarse_id)} coarse_id : {coarse_id}")

    # Index the Python list directly rather than via np.array(examples): NumPy may try to
    # interpret sequence-like elements as extra array dimensions.
    fine_examples = [examples[i] for i in fine_id]
    coarse_examples = [examples[i] for i in coarse_id]

    return fine_examples, coarse_examples

In [10]:
# Seed NumPy's global RNG so the random fine/coarse split below is identical on every
# Restart & Run All; change the value to draw a different split.
np.random.seed(42)
fine_examples, coarse_examples = split_samples(examples, 0.01)

1317 fine_id : [ 62262  59474 131106 ... 129993 113249  95114]
130450 coarse_id : [     0      1      2 ... 131764 131765 131766]


In [11]:
def count_features(examples, label_list):
    """Count how many tokens carry each label.

    Args:
        examples: iterable of objects exposing a ``labels`` list.
        label_list: labels to report; each one starts at a count of 0.

    Returns:
        dict mapping every label in ``label_list`` (in that order) to its token count.

    Raises:
        KeyError: if an example contains a label that is not in ``label_list``.
    """
    counts = dict.fromkeys(label_list, 0)
    for example in examples:
        for tag in example.labels:
            counts[tag] += 1
    return counts

In [12]:
# Per-label token counts for each half of the split (both counted on the original labels).
fine_features = count_features(examples=fine_examples, label_list=labels)
coarse_features = count_features(examples=coarse_examples, label_list=labels)

In [13]:
# Label -> token count within the fine split.
fine_features

{'art-broadcastprogram': 138,
 'art-film': 86,
 'art-music': 95,
 'art-other': 63,
 'art-painting': 4,
 'art-writtenart': 87,
 'building-airport': 13,
 'building-hospital': 33,
 'building-hotel': 22,
 'building-library': 40,
 'building-other': 258,
 'building-restaurant': 54,
 'building-sportsfacility': 70,
 'building-theater': 92,
 'event-attack/battle/war/militaryconflict': 120,
 'event-disaster': 28,
 'event-election': 21,
 'event-other': 75,
 'event-protest': 16,
 'event-sportsevent': 142,
 'location-bodiesofwater': 76,
 'location-GPE': 856,
 'location-island': 47,
 'location-mountain': 24,
 'location-other': 163,
 'location-park': 71,
 'location-road/railway/highway/transit': 103,
 'O': 26410,
 'organization-company': 351,
 'organization-education': 297,
 'organization-government/governmentagency': 147,
 'organization-media/newspaper': 92,
 'organization-other': 497,
 'organization-politicalparty': 57,
 'organization-religion': 38,
 'organization-showorganization': 52,
 'organizat

In [14]:
# Label -> token count within the coarse split (original labels, counted before make_coarse).
coarse_features

{'art-broadcastprogram': 5963,
 'art-film': 8345,
 'art-music': 10812,
 'art-other': 6133,
 'art-painting': 841,
 'art-writtenart': 11222,
 'building-airport': 3629,
 'building-hospital': 4083,
 'building-hotel': 2647,
 'building-library': 3771,
 'building-other': 24576,
 'building-restaurant': 1770,
 'building-sportsfacility': 3636,
 'building-theater': 4512,
 'event-attack/battle/war/militaryconflict': 10599,
 'event-disaster': 1804,
 'event-election': 1985,
 'event-other': 9876,
 'event-protest': 1325,
 'event-sportsevent': 15548,
 'location-bodiesofwater': 8224,
 'location-GPE': 90340,
 'location-island': 4051,
 'location-mountain': 4476,
 'location-other': 16271,
 'location-park': 4242,
 'location-road/railway/highway/transit': 14501,
 'O': 2520610,
 'organization-company': 28661,
 'organization-education': 23546,
 'organization-government/governmentagency': 14837,
 'organization-media/newspaper': 8150,
 'organization-other': 42895,
 'organization-politicalparty': 7826,
 'organiza

In [15]:
# Map every fine-grained label to its coarse type: the text before the first "-"
# (e.g. "art-film" -> "art"). A label without "-", such as "O", maps to itself.
# partition() never raises, whereas unpacking split("-") into exactly two names fails
# for a label with zero or several hyphens.
mapper = {label: label.partition("-")[0] for label in labels}  # fine : coarse

In [16]:
# mapper

In [17]:
def make_coarse(examples, mapper):
    """Build new examples whose labels are translated through ``mapper``.

    Args:
        examples: iterable of InputExample.
        mapper: dict mapping each original label to its replacement label.

    Returns:
        list of new InputExample objects with the same guid, the same ``words`` list
        object, and the mapped labels. The number of examples is printed before returning.

    Raises:
        KeyError: if a label is missing from ``mapper``.
    """
    def relabel(example):
        # zip() pairs labels with words, so any labels beyond the word count are dropped.
        mapped = [mapper[tag] for _, tag in zip(example.words, example.labels)]
        return InputExample(guid=example.guid, words=example.words, labels=mapped)

    converted = [relabel(example) for example in examples]
    print(len(converted))
    return converted

In [18]:
# Translate the coarse split's labels through the fine -> coarse mapper.
new_examples = make_coarse(examples=coarse_examples, mapper=mapper)

130450


In [19]:
def make_file(examples, title, data_dir="./supervised"):
    """Write examples to ``<data_dir>/<title>.txt`` as tab-separated token lines.

    Each token is written as ``word<TAB>label`` on its own line, and every example is
    followed by one blank line.

    Args:
        examples: iterable of objects exposing ``words`` and ``labels`` lists.
        title: Output file stem.
        data_dir: (Optional) output directory. Defaults to "./supervised", the location
            that used to be hard-coded.
    """
    out_path = os.path.join(data_dir, f"{title}.txt")
    # Write UTF-8 explicitly: read_examples_from_file reads these files as UTF-8, and the
    # platform's default encoding is not guaranteed to match.
    with open(out_path, "w", encoding="utf-8") as file:
        for example in examples:
            for word, label in zip(example.words, example.labels):
                file.write(f"{word}\t{label}\n")
            file.write("\n")

In [20]:
# Write the coarse-labelled examples to ./supervised/coarse.txt.
make_file(examples=new_examples, title="coarse")

In [21]:
# Write the fine split, with its original labels, to ./supervised/fine.txt.
make_file(examples=fine_examples, title="fine")

# Save

In [22]:
# Dump both label-count dicts as plain text next to the generated data files.
with open("./supervised/num_of_features.txt", "w") as file:
    file.writelines((
        "coarse_features \n",
        f"{coarse_features} \n",
        "fine_features \n",
        f"{fine_features}",
    ))