In [1]:
import os
import random
import pandas as pd

In [2]:
# Path of the first corpus; the file holds one "token<TAB>tag" row per token
# and a blank line between sentences.
ui_path = "../interim_1/nergithub.txt"

with open(ui_path,'r') as reader:
    # One list element per sentence (still the raw multi-row string).
    ui_data = reader.read().strip().split('\n\n')

# NOTE: "../interim_1/nerugm.txt" is deliberately NOT read here. That file is
# generated further down in this notebook from "../raw/nerugm.txt", so reading
# it at this point fails on a clean checkout (Restart & Run All), and the value
# was overwritten before ever being used anyway.

In [3]:
# Inspect the raw entries at positions 5 and 6 of ui_data.
ui_data[5:7]

['Menurut\tO\nLaks\tB-PERSON\n,\tO\nstatus\tO\nmulti\tO\n-\tO\nlisting\tO\nbagi\tO\nTelkom\tB-ORGANIZATION\npenting\tO\ndalam\tO\nrangka\tO\nmenjaga\tO\ncitra\tO\nperseroan\tO\n,\tO\nwalaupun\tO\nsebetulnya\tO\nsaham\tO\nTelkom\tB-ORGANIZATION\nbisa\tO\ndiserap\tO\ninvestor\tO\nlokal\tO\n.\tO',
 'PT\tB-ORGANIZATION\nTelekomunikasi\tI-ORGANIZATION\nIndonesia\tI-ORGANIZATION\nTbk\tI-ORGANIZATION\n(\tO\nTelkom\tB-ORGANIZATION\n)\tO\nakan\tO\nmelakukan\tO\npembiayaan\tO\nkembali\tO\nutang\tO\n(\tO\ndebt\tO\nrefinancing\tO\n)\tO\ndalam\tO\nvaluta\tO\nasing\tO\n(\tO\nvalas\tO\n)\tO\n,\tO\nguna\tO\nmengurangi\tO\nrugi\tO\nvalas\tO\nakibat\tO\nfluktuasi\tO\nnilai\tO\ntukar\tO\n.\tO']

In [4]:
def _store_entity(named_entity, label, tokens):
    """Append the entity spelled by `tokens` to the list kept under `label`."""
    named_entity.setdefault(label, []).append(' '.join(tokens))


def preprocess_line(line):
    """Convert one BIO-tagged sentence into its plain text and its entities.

    Parameters
    ----------
    line : str
        One sentence with one ``token<TAB>tag`` row per token, rows separated
        by ``\\n``. A tag is ``O``, ``B-<LABEL>`` or ``I-<LABEL>``. Blank rows
        are skipped.

    Returns
    -------
    tuple
        ``(text, named_entity)`` where ``text`` is the tokens joined by single
        spaces and ``named_entity`` maps each label to the list of entity
        strings (tokens joined by single spaces) in order of appearance.

    Raises
    ------
    ValueError
        * an ``I-`` tag starts the sentence or follows ``O``;
        * an ``I-`` tag follows a ``B-``/``I-`` tag with a different label;
        * a tag is none of ``O``, ``B-...``, ``I-...``;
        * a non-blank row does not consist of exactly two tab-separated fields.
    """
    text = []
    named_entity = {}
    prev_bio_tag = None
    # Tokens of the entity currently being read. It is non-empty exactly when
    # the previous tag was B-/I-, so its label is always prev_bio_tag[2:].
    current_named_entity = []

    for bio_tag_token in line.split('\n'):
        if not bio_tag_token.strip():
            # Stray blank row (e.g. from an odd number of newlines): ignore it.
            continue
        token, bio_tag = bio_tag_token.split('\t')
        text.append(token)

        if bio_tag.startswith("B-"):
            # A B- tag always opens a new entity; close the running one first
            # (this is what separates two adjacent entities).
            if current_named_entity:
                _store_entity(named_entity, prev_bio_tag[2:], current_named_entity)
            current_named_entity = [token]
        elif bio_tag.startswith("I-"):
            if prev_bio_tag is None or prev_bio_tag == 'O':
                raise ValueError(f"I token cannot begin a named entity phrase | line : {line}")
            if bio_tag[2:] != prev_bio_tag[2:]:
                # prev_bio_tag is "B-..." or "I-..." here, so [0] is 'B' or 'I'.
                prev_kind = prev_bio_tag[0]
                raise ValueError(f"I token cannot align with {prev_kind} token with different label | line : {line}")
            current_named_entity.append(token)
        elif bio_tag == 'O':
            if current_named_entity:
                _store_entity(named_entity, prev_bio_tag[2:], current_named_entity)
                current_named_entity = []
        else:
            # Previously such a tag was skipped silently yet still recorded as
            # the previous tag, which dropped the next B- token or produced
            # empty / merged entities. Fail loudly instead.
            raise ValueError(f"Unrecognized BIO tag {bio_tag!r} | line : {line}")
        prev_bio_tag = bio_tag

    # The sentence may end while an entity is still open.
    if current_named_entity:
        _store_entity(named_entity, prev_bio_tag[2:], current_named_entity)

    return ' '.join(text), named_entity

In [5]:
# Replace every raw sentence by the (text, entities) pair from preprocess_line.
ui_data = list(map(preprocess_line, ui_data))

Resolve errors in the UGM data: some entities begin with an `I-` tag instead of a `B-` tag, which `preprocess_line` rejects.

In [6]:
ugm_path = "../raw/nerugm.txt"


def repair_bio_tags(sentence):
    """Rewrite every entity-initial ``I-`` tag of one sentence as ``B-``.

    Parameters
    ----------
    sentence : str
        ``token<TAB>tag`` rows separated by ``\\n``.

    Returns
    -------
    str
        The same rows with each ``I-<LABEL>`` tag changed to ``B-<LABEL>``
        when it starts the sentence, follows ``O``, or follows a tag whose
        label differs from ``<LABEL>``. All other rows are unchanged.

    Raises
    ------
    ValueError
        If a row does not consist of exactly two tab-separated fields.
    """
    repaired_rows = []
    prev_tag = 'O'  # a sentence start behaves like "outside any entity"
    for row in sentence.split('\n'):
        token, tag = row.split('\t')
        # An I- tag is only valid as the continuation of an entity with the
        # same label; anywhere else it actually opens a new entity.
        if tag.startswith("I-") and (prev_tag == 'O' or prev_tag[2:] != tag[2:]):
            tag = "B-" + tag[2:]
        # Track the repaired tag so the following I- rows of the same entity
        # are recognised as continuations.
        prev_tag = tag
        repaired_rows.append(token + '\t' + tag)
    return '\n'.join(repaired_rows)


with open(ugm_path,'r') as reader:
    # One list element per sentence (blank-line separated in the file).
    ugm_data = reader.read().strip().split('\n\n')

res_ugm_data = [repair_bio_tags(sentence) for sentence in ugm_data]

In [7]:
# Persist the repaired sentences, each one followed by a blank line.
with open("../interim_1/nerugm.txt",'w') as writer:
    writer.writelines(sentence + '\n\n' for sentence in res_ugm_data)

In [8]:
# Turn every repaired sentence into the (text, entities) pair from preprocess_line.
ugm_data = list(map(preprocess_line, res_ugm_data))

In [9]:
# Show the first element of ui_data.
ui_data[0]

('Sementara itu Pengamat Pasar Modal Dandossi Matram mengatakan , sulit bagi sebuah kantor akuntan publik ( KAP ) untuk dapat menyelesaikan audit perusahaan sebesar Telkom dalam waktu 3 bulan .',
 {'PERSON': ['Dandossi Matram'],
  'ORGANIZATION': ['kantor akuntan publik', 'KAP', 'Telkom']})

In [10]:
# Single list holding the ui_data entries followed by the ugm_data entries.
data = [*ui_data, *ugm_data]

In [11]:
# Count, per entity label, how many entity strings occur over all of `data`.
value_counts = {}
for _, entities_by_label in data:
    for label, mentions in entities_by_label.items():
        value_counts[label] = value_counts.get(label, 0) + len(mentions)

In [12]:
# Display the per-label entity totals.
value_counts

{'PERSON': 3366,
 'ORGANIZATION': 2772,
 'LOCATION': 2256,
 'TIME': 433,
 'QUANTITY': 481}

In [14]:
# Show the first element of the combined list.
data[0]

('Sementara itu Pengamat Pasar Modal Dandossi Matram mengatakan , sulit bagi sebuah kantor akuntan publik ( KAP ) untuk dapat menyelesaikan audit perusahaan sebesar Telkom dalam waktu 3 bulan .',
 {'PERSON': ['Dandossi Matram'],
  'ORGANIZATION': ['kantor akuntan publik', 'KAP', 'Telkom']})

In [16]:
# Template of a sentinel token; the X is replaced by the sentinel number.
mask = "<extra_id_X>"


def _sentinel(index):
    """Return `mask` with its X replaced by `index`."""
    return mask.replace('X', str(index))


def _build_target(named_entity):
    """Serialise a {label: [entity, ...]} dict into the target string.

    Every entity becomes "<sentinel> entity <sentinel> label" with the label
    lower-cased; the pieces are joined by " ; ". Sentinel numbers advance by
    two per entity and wrap around at 100.
    """
    pieces = []
    cnt = 0
    for entity_type, entities in named_entity.items():
        label = entity_type.lower()
        for entity in entities:
            cnt %= 100
            pieces.append(' '.join([_sentinel(cnt), entity, _sentinel(cnt + 1), label]))
            cnt += 2
    return ' ; '.join(pieces).strip()


# One row per sentence: the prompt followed by the text, and the serialised entities.
data_frame = pd.DataFrame([
    {
        "input" : f"Ekstrak NER dengan format >> entity : <extra_id_0>, entity_type : <extra_id_1> | {text}",
        "output" : _build_target(named_entity)
    }
    for text, named_entity in data
])

In [19]:
# Show the "output" value of the row labelled 0.
data_frame.loc[0,"output"]

'<extra_id_0> Dandossi Matram <extra_id_1> person ; <extra_id_2> kantor akuntan publik <extra_id_3> organization ; <extra_id_4> KAP <extra_id_5> organization ; <extra_id_6> Telkom <extra_id_7> organization'

In [21]:
# Write the frame to CSV; index=False leaves the row index out of the file.
data_frame.to_csv("../interim_2/data.csv",index=False)