## Pre-processing and characterization of a corpus

In [None]:
#!pip3 install -U future 
import os
import sys
import numpy as np
import pandas as pd
import pickle
#import pickle5 as pickle
from math import nan
from future.utils import iteritems
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

#current_path = os.path.abspath(os.path.dirname(__file__))
#previus_path = os.path.join(current_path, '../')
# Directory that holds the BIO-annotated HDF5 files.  The trailing slash is
# required because file names are appended to it directly.
previus_path = './BIO_data/'

# One HDF5 file per data split, in the order train / test / dev.
train_file, test_file, dev_file = (
    f'{previus_path}{file_name}'
    for file_name in ('train.bio.mod.h5', 'test.bio.mod.h5', 'dev.bio.mod.h5')
)

## Data loading function

In [None]:
def get_dataf(archiv, key, descp=False):
    """Load one object from an HDF5 store using ``pandas.read_hdf``.

    Args:
        archiv: Path of the HDF5 file to read.
        key: Identifier of the object inside the store.
        descp: When true, print the first rows of the loaded object.

    Returns:
        The object returned by ``pd.read_hdf(archiv, key)``.
    """
    _dta = pd.read_hdf(archiv, key)

    if descp:
        # Preview the object that was just loaded, not a module-level frame.
        print(_dta.head())

    return _dta

## Description of the training file

In [None]:
# Load the training split and keep only the columns used by later cells.
dta_tr = get_dataf(train_file, 'df1')
df_tr = dta_tr[['word', 'ner', '[tokpos]']]
print(df_tr['ner'].value_counts())

# Distinct words found in the training split.
words_tr = list(set(dta_tr["word"].values))
n_words_tr = len(words_tr)

# Distinct tags; any float entry (NaN included) is listed as 'unk'.
tags_tr = [
    'unk' if isinstance(tag, float) else tag
    for tag in set(dta_tr["ner"].values)
]
n_tags_tr = len(tags_tr)

print('No. Palabras Unicas: ', n_words_tr)
print('    No. Tags Unicas: ', n_tags_tr)

## Description of the testing file

In [None]:
# Load the testing split and keep only the columns used by later cells.
dta_ts = get_dataf(test_file, 'df2')
df_ts = dta_ts[['word', 'ner', '[tokpos]']]
print(df_ts['ner'].value_counts())

# Distinct words found in the testing split.
words_ts = list(set(dta_ts["word"].values))
n_words_ts = len(words_ts)

# Distinct tags; any float entry (NaN included) is listed as 'unk'.
tags_ts = [
    'unk' if isinstance(tag, float) else tag
    for tag in set(dta_ts["ner"].values)
]
n_tags_ts = len(tags_ts)

print('No. Palabras Unicas: ', n_words_ts)
print('    No. Tags Unicas: ', n_tags_ts)

## Description of the evaluation file

In [None]:
# Load the evaluation split and keep only the columns used by later cells.
dta_dev = get_dataf(dev_file, 'df3')
df_dev = dta_dev[['word', 'ner', '[tokpos]']]
print(df_dev['ner'].value_counts())

# Distinct words found in the evaluation split.
words_dev = list(set(dta_dev["word"].values))
n_words_dev = len(words_dev)

# Distinct tags; any float entry (NaN included) is listed as 'unk'.
tags_dev = [
    'unk' if isinstance(tag, float) else tag
    for tag in set(dta_dev["ner"].values)
]
n_tags_dev = len(tags_dev)

print('No. Palabras Unicas: ', n_words_dev)
print('    No. Tags Unicas: ', n_tags_dev)

## Joint description of all data sets

In [None]:
# Stack the three splits so that the statistics below cover the whole corpus.
frames = [df_tr, df_ts, df_dev]
df = pd.concat(frames)

print(df['ner'].value_counts())

# Corpus-wide vocabulary.
words = list(set(df["word"].values))
n_words = len(words)

# Corpus-wide tag inventory; any float entry (NaN included) is listed as 'unk'.
tags = [
    'unk' if isinstance(tag, float) else tag
    for tag in set(df["ner"].values)
]
n_tags = len(tags)

print('No. Palabras Unicas: ', n_words)
print('    No. Tags Unicas: ', n_tags)

## Sentence extraction function

In [None]:
def extract_data(dtas):
    """Group token rows into sentences of ``(word, ner)`` pairs.

    Rows are consumed in order.  A row whose ``'[tokpos]'`` value equals
    ``'EOS'`` is added to the current sentence and then closes it.

    Args:
        dtas: DataFrame providing the ``'word'``, ``'ner'`` and
            ``'[tokpos]'`` columns.

    Returns:
        A list of sentences, each a list of ``(word, ner)`` tuples.  Tokens
        that follow the last ``'EOS'`` row are returned as a final sentence
        instead of being discarded.
    """
    data = []
    subdata = []
    # Iterating the three columns in lockstep avoids building a Series per
    # row, which is what ``iterrows`` does.
    for word, ner, tokpos in zip(dtas['word'], dtas['ner'], dtas['[tokpos]']):
        subdata.append((word, ner))
        if tokpos == 'EOS':
            data.append(subdata)
            subdata = []

    # Keep a trailing sentence that has no closing 'EOS' marker.
    if subdata:
        data.append(subdata)

    return data

## Sentence Extraction Training DataSet

In [None]:
# Group the training rows into sentences and preview the 2nd and 3rd ones.
sentences_train = extract_data(df_tr)
print(sentences_train[1:3])

# Serialize the sentence list with pickle; the file is opened in binary mode.
with open("../vectors/sentences_train.txt", "wb") as pickle_out:
    pickle.dump(sentences_train, pickle_out)

## Sentence Extraction Test DataSet

In [None]:
# Group the testing rows into sentences and preview the 2nd and 3rd ones.
sentences_test = extract_data(df_ts)
print(sentences_test[1:3])

# Serialize the sentence list with pickle; the file is opened in binary mode.
with open("../vectors/sentences_test.txt", "wb") as pickle_out:
    pickle.dump(sentences_test, pickle_out)

## Sentence Extraction Eval DataSet

In [None]:
# Group the evaluation rows into sentences and preview the 2nd and 3rd ones.
sentences_dev = extract_data(df_dev)
print(sentences_dev[1:3])

# Serialize the sentence list with pickle; the file is opened in binary mode.
with open("../vectors/sentences_dev.txt", "wb") as pickle_out:
    pickle.dump(sentences_dev, pickle_out)

## Creation of dictionaries of words and tags

In [None]:
# Word -> index table.  Real words are numbered from 2 upwards so that
# indices 0 and 1 stay free for the two special entries below.
word2idx = {word: index for index, word in enumerate(words, start=2)}
word2idx['-PAD-'] = 0  # The special value used for padding
word2idx['-OOV-'] = 1  # The special value used for OOVs

# Tag -> index table, built with the same numbering scheme.
tag2idx = {tag: index for index, tag in enumerate(tags, start=2)}
tag2idx['-PAD-'] = 0  # The special value used for padding
tag2idx['-OOV-'] = 1  # The special value used for OOVs

# Reverse lookup: index -> tag.
idx2tag = {index: tag for tag, index in tag2idx.items()}

#np.save('../vectors/word2index.npy', word2idx)
#np.save('../vectors/tag2index.npy', tag2idx)
#np.save('../vectors/index2tag.npy', idx2tag)

# Show the leading entries of each table, stopping at the entry whose
# index is 10.
print('**** Diccionario de palabras: ****\n')
for key, value in word2idx.items():
    if value == 10:
        break
    print(key, ' : ', value)

print('\n**** Diccionario de tags: ****\n')
for key, value in tag2idx.items():
    if value == 10:
        break
    print(key, ' : ', value)

print('\n**** array de tags: ****\n')
print(idx2tag)

## Calculation of the maximum length of sentences

In [None]:
# Longest training sentence, measured in tokens.
maxlen_train = max(map(len, sentences_train))
print('longitud oraciones entraneminto: ',  maxlen_train)

# Histogram of training sentence lengths.
import matplotlib.pyplot as plt
plt.style.use("ggplot")
plt.hist(list(map(len, sentences_train)), bins=50)
plt.title('Numero de palabras por oracion')
plt.xlabel('Longitud Oracion')
plt.ylabel('# oraciones')
plt.show()

In [None]:
# Longest testing sentence, measured in tokens.
maxlen_test = max(map(len, sentences_test))
print('\nlongitud oraciones pruebas: ',  maxlen_test)

# Histogram of testing sentence lengths.
import matplotlib.pyplot as plt
plt.style.use("ggplot")
plt.hist(list(map(len, sentences_test)), bins=50)
plt.title('Numero de tokens por oracion')
plt.xlabel('Longitud Oracion')
plt.ylabel('# oraciones')
plt.show()

In [None]:
# Longest evaluation sentence, measured in tokens.
maxlen_dev = max(map(len, sentences_dev))
print('\nlongitud oraciones Evaluación: ',  maxlen_dev)

# Histogram of evaluation sentence lengths.
import matplotlib.pyplot as plt
plt.style.use("ggplot")
plt.hist(list(map(len, sentences_dev)), bins=50)
plt.title('Numero de tokens por oracion')
plt.xlabel('Longitud Oracion')
plt.ylabel('# oraciones')
plt.show()

In [None]:
# Padding length shared by all splits: the longest sentence of any of them.
maxlen = max(maxlen_train, maxlen_test, maxlen_dev)

print('\nlongitud oraciones: ', maxlen)

## Encoding the training dataset as class indices and one-hot categories

In [None]:
# Encode each training sentence as a list of word indices.  A token that is
# missing from the vocabulary maps to the reserved '-OOV-' index instead of
# raising KeyError.
X_train = [
    [word2idx.get(w[0], word2idx['-OOV-']) for w in s]
    for s in sentences_train
]
# Right-pad every sentence to the common length with the '-PAD-' index.
X_train = pad_sequences(maxlen=maxlen, sequences=X_train, padding="post",value=word2idx["-PAD-"])

print(X_train[0])

#np.save('../vectors/X_train.npy', X_train)

#X_train_cat = [to_categorical(i, num_classes=n_words+2) for i in X_train]

In [None]:
# Encode each training sentence as a list of tag indices.  The tag inventory
# stores float (NaN) tags under 'unk', so the same mapping is applied here
# before the lookup; any other unknown tag maps to the reserved '-OOV-' index
# instead of raising KeyError.
y_train = [
    [
        tag2idx.get('unk' if isinstance(w[1], float) else w[1], tag2idx['-OOV-'])
        for w in s
    ]
    for s in sentences_train
]
# Right-pad every sentence to the common length with the '-PAD-' index.
y_train = pad_sequences(maxlen=maxlen, sequences=y_train, padding="post", value=tag2idx["-PAD-"])
# One-hot encode each padded sentence; the +2 covers '-PAD-' and '-OOV-'.
y_train = [to_categorical(i, num_classes=n_tags+2) for i in y_train]

print(y_train[0][0])
print(y_train[0])

#np.save('../vectors/y_train.npy', y_train)

## Encoding the testing dataset as class indices and one-hot categories

In [None]:
# Encode each testing sentence as a list of word indices.  A token that is
# missing from the vocabulary maps to the reserved '-OOV-' index instead of
# raising KeyError.
X_test = [
    [word2idx.get(w[0], word2idx['-OOV-']) for w in s]
    for s in sentences_test
]
# Right-pad every sentence to the common length with the '-PAD-' index.
X_test = pad_sequences(maxlen=maxlen, sequences=X_test, padding="post",value=word2idx["-PAD-"])

print(X_test[0])

#np.save('../vectors/X_test.npy', X_test)

#X_test_cat = [to_categorical(i, num_classes=n_words+2) for i in X_test]

In [None]:
# Encode each testing sentence as a list of tag indices.  The tag inventory
# stores float (NaN) tags under 'unk', so the same mapping is applied here
# before the lookup; any other unknown tag maps to the reserved '-OOV-' index
# instead of raising KeyError.
y_test = [
    [
        tag2idx.get('unk' if isinstance(w[1], float) else w[1], tag2idx['-OOV-'])
        for w in s
    ]
    for s in sentences_test
]
# Right-pad every sentence to the common length with the '-PAD-' index.
y_test = pad_sequences(maxlen=maxlen, sequences=y_test, padding="post", value=tag2idx["-PAD-"])
# One-hot encode each padded sentence; the +2 covers '-PAD-' and '-OOV-'.
y_test = [to_categorical(i, num_classes=n_tags+2) for i in y_test]

print(y_test[0][0])
print(y_test[0])

#np.save('../vectors/y_test.npy', y_test)

## Encoding the evaluation dataset as class indices and one-hot categories

In [None]:
# Encode each evaluation sentence as a list of word indices.  A token that is
# missing from the vocabulary maps to the reserved '-OOV-' index instead of
# raising KeyError.
X_dev = [
    [word2idx.get(w[0], word2idx['-OOV-']) for w in s]
    for s in sentences_dev
]
# Right-pad every sentence to the common length with the '-PAD-' index.
X_dev = pad_sequences(maxlen=maxlen, sequences=X_dev, padding="post",value=word2idx["-PAD-"])

print(X_dev[0])

#np.save('../vectors/X_dev.npy', X_dev)

#X_dev_cat = [to_categorical(i, num_classes=n_words+2) for i in X_dev]

In [None]:
# Encode each evaluation sentence as a list of tag indices.  The tag inventory
# stores float (NaN) tags under 'unk', so the same mapping is applied here
# before the lookup; any other unknown tag maps to the reserved '-OOV-' index
# instead of raising KeyError.
y_dev = [
    [
        tag2idx.get('unk' if isinstance(w[1], float) else w[1], tag2idx['-OOV-'])
        for w in s
    ]
    for s in sentences_dev
]

# Inspect the container type and the first sentence length before padding.
print(type(y_dev))
print(len(y_dev[0]))

# Right-pad every sentence to the common length with the '-PAD-' index.
y_dev = pad_sequences(maxlen=maxlen, sequences=y_dev, padding="post", value=tag2idx["-PAD-"])

print(type(y_dev))
#print(y_dev)

# One-hot encode each padded sentence; the +2 covers '-PAD-' and '-OOV-'.
y_dev = [to_categorical(i, num_classes=n_tags+2) for i in y_dev]

print(type(y_dev))
#print(y_dev)

print(y_dev[0][0])
print(y_dev[0])

#np.save('../vectors/y_dev.npy', y_dev)