In [1]:
#!pip3 install tabulate
import os
import pandas as pd
import numpy as np
from tabulate import tabulate
from IPython.display import display, HTML
import multiprocessing as mp
from multiprocessing import Pool
import pickle
pickle.HIGHEST_PROTOCOL = 4

path      = 'BIO_Lemma/'
fileinput = ['training.bio', 'testing.bio', 'developing.bio']
columnas  = ["word", "ner", "lema", "postag", "tokpos"]

In [2]:
def chunks(l, n):
    for i in range(0, len(l), n):
        yield l[i:i + n]

In [3]:
def open_file(filename):
    file = open(filename, encoding="utf-8")
    return file

In [4]:
def parse_chunk(data):
    results = []
    flat_eos = 0
    for p, line in enumerate(data):
        try:
            parsed = dict()
            if len(line) == 0:
                flat_eos = 0
            elif len(line) == 4:
                for k, h in enumerate(columnas):
                    if k < 4:
                        parsed[h] = line[k]
                    elif k == 4 and flat_eos == 0:
                        parsed["tokpos"] = 'SOS'
                        flat_eos = 1
                    else:
                        parsed["tokpos"] = 'WOS'
                        flat_eos = 1
            results.append(parsed)
        except Exception as e:
            print(e, p)
    return results

In [5]:
def run_process(file):
    file = open_file(file).readlines()
    listify = [line.split() for line in file]

    data = chunks(listify, int(len(listify) / (mp.cpu_count())))
    p = Pool(processes=mp.cpu_count())
    results = [p.apply_async(parse_chunk, args=(list(x),)) for x in data]

    # wait for results
    results = [item.get() for item in results]
    results = sum(results, [])
    
    return results

In [6]:
k1 = run_process(path + fileinput[0])

df1 = pd.DataFrame(k1, columns=columnas)

df1 = df1[df1['word'].notna()]

df1.reset_index(drop=True, inplace=True)

df1.to_hdf(fileinput[0] + '.h5', key='df1', mode='w')

In [7]:
k2 = run_process(path + fileinput[1])

df2 = pd.DataFrame(k2, columns=columnas)

df2 = df2[df2['word'].notna()]

df2.reset_index(drop=True, inplace=True)

df2.to_hdf(fileinput[1] + '.h5', key='df2', mode='w')

In [8]:
k3 = run_process(path + fileinput[2])

df3 = pd.DataFrame(k3, columns=columnas)

df3 = df3[df3['word'].notna()]

df3.reset_index(drop=True, inplace=True)

df3.to_hdf(fileinput[2] + '.h5', key='df3', mode='w')

In [9]:
df1 = pd.DataFrame(k1, columns=columnas)
display(HTML(df1[:40].to_html()))

Unnamed: 0,word,ner,lema,postag,tokpos
0,Motivo,O,motivo,NOUN,SOS
1,de,O,de,ADP,WOS
2,consulta,O,consulta,NOUN,WOS
3,:,O,:,PUNCT,WOS
4,leer,O,leer,VERB,WOS
5,nota,O,nota,NOUN,WOS
6,de,O,de,ADP,WOS
7,Oncologia,O,oncologia,PROPN,WOS
8,mas,O,mas,PROPN,WOS
9,abajo,O,abajo,ADV,WOS


In [10]:
display(df1.ner.unique())

array(['O', nan, 'B_FAMILY', 'B_OCURRENCE_EVENT', 'I_OCURRENCE_EVENT',
       'B_CANCER_CONCEPT', 'I_CANCER_CONCEPT', 'B_CHEMOTHERAPY', 'B_TNM',
       'B_RADIOTHERAPY', 'B_CHEMOTHERAPY_DRUG', 'I_CHEMOTHERAPY_DRUG',
       'B_DATE', 'I_DATE', 'I_TNM', 'B_STADIO', 'I_STADIO', 'B_SURGERY',
       'I_SURGERY', 'I_FAMILY'], dtype=object)