In [1]:
from pathlib import Path
from tqdm import tqdm
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split

import os
import pandas as pd

from split_datalawyer import split_utils, SentenceSplit
from split_datalawyer.modules import ForceDropDuplicatedModule, ReplaceModule, ReplaceLongWordsModule, ReplaceConcatenatedDotModule
from split_long_sentences import split_long_sentences

from transformers import BertTokenizerFast

In [2]:
# Shared tokenizer: greater_than_limit() uses it to count the tokens of a
# sentence. Loaded from the 'neuralmind/bert-base-portuguese-cased' checkpoint.
bert_tokenizer = BertTokenizerFast.from_pretrained('neuralmind/bert-base-portuguese-cased')
# Sentence splitter configured with the four split_datalawyer modules listed
# below; it is used further down to try splitting an over-long sentence.
sentence_split = SentenceSplit(
        modules=[ForceDropDuplicatedModule(), ReplaceModule(), ReplaceLongWordsModule(),
                 ReplaceConcatenatedDotModule()])

In [3]:
# Every dataset file used by this notebook is resolved relative to base_path.
# Earlier locations, kept for reference only:
# train_csv = Path('train.csv')
# test_csv = Path('test.csv')
# base_path = Path('/media/discoD/repositorios/entidades/dataset/lener')
base_path = Path('.')
# Raw annotated corpus; loaded below and divided with train_test_split.
train_raw_conll = base_path / 'train_raw.conll'
# Train/dev halves of that division, written by write_conll().
train_partial_conll = base_path / 'train_partial.conll'
dev_partial_conll = base_path / 'dev_partial.conll'
# NOTE(review): full_partial_conll and the three *_split_conll paths below are
# not referenced anywhere else in the visible notebook -- confirm before removing.
full_partial_conll = base_path / 'full_partial.conll'

train_split_conll = base_path / 'train_split.conll'
dev_split_conll = base_path / 'dev_split.conll'
full_split_conll = base_path / 'full_split.conll'

# train/dev/full are the outputs of split_sentences(); test.conll is only read.
train_conll = base_path / 'train.conll'
dev_conll = base_path / 'dev.conll'
test_conll = base_path / 'test.conll'
full_conll = base_path / 'full.conll'
# Targets of convert_conll_to_jsonl(), which writes one JSON record per line
# (to_json(..., lines=True)) even though the file names end in '.json'.
train_jsonl = base_path / 'train.json'
dev_jsonl = base_path / 'dev.json'
full_jsonl = base_path / 'full.json'

In [4]:
def load_sentences(path, separator=' ', test=False):
    """
    Load sentences from a CoNLL-style file.

    Every non-empty line is split on ``separator`` into its columns (the word
    comes first); empty lines separate sentences. A sentence whose first word
    contains 'DOCSTART' is discarded.

    :param path: ``pathlib.Path`` of the file to read (opened as UTF-8).
    :param separator: column separator used on every line.
    :param test: when False, every line must hold at least a word and its tag;
        when True that check is skipped.
    :return: list of sentences, each one a list of column lists.
    :raises AssertionError: if ``test`` is False and a line has fewer than
        two columns.
    """
    sentences = []
    sentence = []
    # The context manager closes the handle even if a malformed line raises.
    with path.open(mode='r', encoding='utf8') as in_file:
        for line in tqdm(in_file):
            line = line.rstrip()
            if line:
                word = line.split(sep=separator)
                if not test:
                    assert len(word) >= 2, 'Line without a tag column: %r' % line
                sentence.append(word)
            elif sentence:
                # Blank line: close the current sentence, dropping DOCSTART markers.
                if 'DOCSTART' not in sentence[0][0]:
                    sentences.append(sentence)
                sentence = []
    # The file may end without a trailing blank line.
    if sentence and 'DOCSTART' not in sentence[0][0]:
        sentences.append(sentence)
    print('Finished loading %d sentences from %s' % (len(sentences), path))
    return sentences

def greater_than_limit(text: str, max_seq_len: int = 512) -> bool:
    """
    Tell whether ``text`` produces more tokens than fit in ``max_seq_len``.

    The text is tokenized with the module-level ``bert_tokenizer`` and the
    token count is compared against ``max_seq_len - 2`` (the two positions are
    presumably reserved for the special start/end tokens -- confirm).
    """
    pieces = bert_tokenizer.tokenize(text)
    token_budget = max_seq_len - 2
    return len(pieces) > token_budget

# def load_sentences(dataframe: pd.DataFrame):
#     sentences = []
#     sentence = None
#     last_sentence_id = None
#     for idx, row in tqdm(dataframe.iterrows()):
#         sentence_id = row['Sentence_Id']
#         if sentence_id != last_sentence_id:
#             if sentence is not None:
#                 sentences.append(sentence)
#             sentence = []
#         sentence.append((row['Word'], row['Tag']))
#         last_sentence_id = sentence_id
#     print('Finished loading %d sentences' % len(sentences))
#     return sentences

def count_tags(sentences):
    """
    Count how often each tag occurs across ``sentences``.

    The tag is read from the second column (index 1) of every token row.
    Returns a DataFrame with the columns 'Tag' and 'counts', one row per tag.
    """
    tags = []
    for token_rows in sentences:
        for row in token_rows:
            tags.append(row[1])
    tag_frame = pd.DataFrame({'Tag': tags})
    per_tag = tag_frame.groupby('Tag').size()
    return per_tag.reset_index(name='counts')

def write_conll(sentences, output_path: Path, test: bool = False):
    """
    Write ``sentences`` to ``output_path`` as four space-separated columns.

    Each token becomes the line ``<token> O O <tag>`` and every sentence is
    followed by a blank line.

    :param sentences: iterable of sentences; each token row provides the token
        at index 0 and, unless ``test`` is set, the tag at index 1.
    :param output_path: ``pathlib.Path`` to (over)write, encoded as UTF-8.
    :param test: when True the tag column is always written as 'O' and the
        rows need no tag.
    """
    # The with-block closes the file; no explicit close() is needed afterwards.
    with output_path.open(mode='w', encoding='utf-8') as out_file:
        for sentence in tqdm(sentences):
            for _tuple in sentence:
                tag = 'O' if test else _tuple[1]
                out_file.write(' '.join([_tuple[0], 'O', 'O', tag]) + '\n')
            out_file.write('\n')
    
def check_long_sentences(sentences):
    """
    Report which sentences stay over the token limit after splitting.

    Each sentence is rebuilt as a space-joined string of its tokens. If
    ``greater_than_limit`` flags it, it is split with ``split_utils.split``
    (semicolon splitting disabled) and every resulting piece is checked again.
    Pieces that are still too long are printed and collected.

    Returns the list of pieces that remain over the limit.
    """
    still_too_long = []
    pieces_total = 0
    for token_rows in tqdm(sentences, 'Checking sentences size...'):
        text = ' '.join([row[0] for row in token_rows])
        if not greater_than_limit(text):
            # Short enough: counts as a single, unsplit piece.
            pieces_total += 1
            continue
        for piece in split_utils.split(text, usar_ponto_virgula=False):
            piece_tokens = piece.split()
            if greater_than_limit(piece):
                # The size shown is the whitespace token count of the piece.
                print('Size in tokens: %d' % len(piece_tokens))
                print(piece)
                still_too_long.append(piece)
            pieces_total += 1
    print('%d sentences can be split into %d smaller ones' % (len(sentences), pieces_total))
    return still_too_long

def split_sentences(input_path: Path, output_path: Path):
    """
    Run the imported ``split_long_sentences`` helper on a CoNLL file.

    ``input_path`` is passed as the helper's ``in_path`` and ``output_path`` as
    its ``out_path``; splitting by semicolon is always enabled.
    """
    helper_arguments = {
        'in_path': input_path,
        'out_path': output_path,
        'split_by_semicolon': True,
    }
    split_long_sentences(**helper_arguments)
    
def convert_conll_to_jsonl(conll_path, output_path, test=False):
    """
    Convert a CoNLL file into a JSON-lines file of tokens and NER tags.

    Sentences are read with ``load_sentences``. Each one becomes a record with
    the keys 'tokens' (first column of every row) and 'ner_tags' (last column
    of every row, or 'O' for every token when ``test`` is True). The records
    are written to ``output_path``, one JSON object per line.
    """
    loaded = load_sentences(conll_path, test=test)
    records = []
    for rows in tqdm(loaded, 'Saving sentences to json...'):
        tokens = [row[0] for row in rows]
        if test:
            ner_tags = ['O' for _ in rows]
        else:
            ner_tags = [row[-1] for row in rows]
        records.append({'tokens': tokens, 'ner_tags': ner_tags})
    frame = pd.DataFrame(records)
    frame.to_json(output_path, orient='records', lines=True)
    print('Finished converting %d sentences to json' % len(frame))
    
def write_submission(conll_predictions_path: str,
                     submissions_path: str = None):
    """
    Turn a CoNLL predictions file into an ``Id,Tag`` submission file.

    The predicted tag is taken from the last column of every row. A leading
    'U-' is rewritten to 'B-' and a leading 'L-' to 'I-'; any other tag is
    written unchanged. Ids start at 1 and advance for every token *and* for
    the blank line written after each sentence.

    :param conll_predictions_path: path of the predictions file, read with
        ``load_sentences``.
    :param submissions_path: path of the CSV to (over)write. Required, even
        though the signature keeps the ``None`` default.
    :raises TypeError: if ``submissions_path`` is None.
    """
    if submissions_path is None:
        # Fail before loading anything instead of deep inside Path(None).
        raise TypeError('submissions_path must be provided')
    sentences = load_sentences(Path(conll_predictions_path))
    # Only the tag prefix is rewritten, so a tag name that merely contains
    # 'U-' or 'L-' is no longer altered.
    prefix_map = {'U-': 'B-', 'L-': 'I-'}
    count = 0
    with Path(submissions_path).open(mode='w', encoding='utf8') as out_file:
        out_file.write('Id,Tag\n')
        for sentence in tqdm(sentences, 'Writing submission...'):
            for token_label in sentence:
                count += 1
                prediction = token_label[-1]
                prefix = prediction[:2]
                fixed_prediction = prefix_map.get(prefix, prefix) + prediction[2:]
                out_file.write(','.join([str(count), fixed_prediction]) + '\n')
            # The sentence separator takes an Id of its own.
            out_file.write('\n')
            count += 1

def convert_transformers_predictions_to_conll(transformers_predictions_path: str,
                                              conll_input_path: str,
                                              conll_output_path: str,
                                              submissions_path: str = None,
                                              test: bool = False):
    """
    Merge per-sentence predictions with their tokens into a CoNLL file.

    The predictions file holds one sentence per line, with whitespace-separated
    tags. Output rows are ``<token> <prediction>`` when ``test`` is True and
    ``<token> <last input column> <prediction>`` otherwise; a blank line
    follows every sentence. Afterwards ``./conlleval.perl`` is run through the
    shell with the written file as input, and its output is stored in a scores
    file whose name is derived from ``conll_output_path``.

    :param transformers_predictions_path: predictions file, one sentence per line.
    :param conll_input_path: CoNLL file providing the tokens, read with
        ``load_sentences``.
    :param conll_output_path: CoNLL file to (over)write; str or Path.
    :param submissions_path: when given, ``write_submission`` is also called
        on the written file.
    :param test: forwarded to ``load_sentences`` and selects the row layout.
    :raises AssertionError: if the number of sentences and the number of
        prediction lines differ.
    """
    import shlex  # local import: only needed to quote the shell command below

    sentences = load_sentences(Path(conll_input_path), test=test)
    with Path(transformers_predictions_path).open(mode='r', encoding='utf-8') as predictions_file:
        predictions = [line.strip().split() for line in predictions_file]
    assert len(sentences) == len(predictions), \
        'Got %d sentences but %d prediction lines' % (len(sentences), len(predictions))
    # Path.replace() renames files, so work on the string form from here on.
    conll_output_path = str(conll_output_path)
    with Path(conll_output_path).open(mode='w', encoding='utf8') as out_file:
        for sentence, sentence_predictions in tqdm(zip(sentences, predictions), 'Writing predictions...'):
            # NOTE(review): zip stops at the shorter sequence, so tokens without
            # a matching prediction are silently left out of the output.
            for token_label, prediction in zip(sentence, sentence_predictions):
                if test:
                    out_file.write(' '.join([token_label[0], prediction]) + '\n')
                else:
                    out_file.write(' '.join([token_label[0], token_label[-1], prediction]) + '\n')
            out_file.write('\n')
    scores_path = conll_output_path.replace('predictions_', 'scores_').replace('.conll', '.txt')
    if scores_path == conll_output_path:
        # Neither pattern matched: redirecting onto the input would truncate
        # the predictions before the script could read them.
        scores_path = conll_output_path + '.scores.txt'
    # NOTE(review): in test mode the file has no gold column, so the scores
    # written here are presumably not meaningful -- confirm.
    os.system("./%s < %s > %s" % ('conlleval.perl', shlex.quote(conll_output_path), shlex.quote(scores_path)))
    if submissions_path is not None:
        write_submission(conll_predictions_path=conll_output_path,
                         submissions_path=submissions_path)

In [8]:
# Load the raw training corpus; `sentences` feeds the tag counts and the
# train/dev split in the cells below.
sentences = load_sentences(train_raw_conll)

314386it [00:00, 636424.32it/s]

Finished loading 7552 sentences from train_raw.conll





In [6]:
# _ = load_sentences(Path('/media/discoD/repositorios/entidades/dataset/lener/train_all.conll'))

In [9]:
# all_train_sentences = load_sentences(train_conll)

In [10]:
# all_dev_sentences = load_sentences(dev_conll)

In [11]:
# all_test_sentences = load_sentences(test_conll)

In [65]:
# Tag distribution of test_oficial.conll (path is relative to the working
# directory, not built from base_path).
count_tags(load_sentences(Path('test_oficial.conll')))

14079it [00:00, 1123102.49it/s]

Finished loading 2840 sentences from test_oficial.conll





Unnamed: 0,Tag,counts
0,B-JURISPRUDENCIA,118
1,B-LEGISLACAO,194
2,B-LOCAL,53
3,B-ORGANIZACAO,113
4,B-PESSOA,113
5,B-TEMPO,92
6,I-JURISPRUDENCIA,343
7,I-LEGISLACAO,1082
8,I-LOCAL,23
9,I-ORGANIZACAO,155


In [13]:
# Tag distribution of train_all.conll.
# NOTE(review): absolute, machine-specific path -- this cell only runs where
# that directory exists.
count_tags(load_sentences(Path('/media/discoD/repositorios/entidades/dataset/lener/train_all.conll')))

328464it [00:00, 747378.88it/s]


Finished loading 10392 sentences from /media/discoD/repositorios/entidades/dataset/lener/train_all.conll


Unnamed: 0,Tag,counts
0,B-JURISPRUDENCIA,1496
1,B-LEGISLACAO,2695
2,B-LOCAL,767
3,B-ORGANIZACAO,3462
4,B-PESSOA,2068
5,B-TEMPO,1760
6,I-JURISPRUDENCIA,3874
7,I-LEGISLACAO,15622
8,I-LOCAL,1026
9,I-ORGANIZACAO,6184


In [12]:
# Tag distribution of the raw training sentences loaded from train_raw_conll.
count_tags(sentences)

Unnamed: 0,Tag,counts
0,B-JURISPRUDENCIA,1378
1,B-LEGISLACAO,2501
2,B-LOCAL,714
3,B-ORGANIZACAO,3349
4,B-PESSOA,1955
5,B-TEMPO,1668
6,I-JURISPRUDENCIA,3531
7,I-LEGISLACAO,14540
8,I-LOCAL,1003
9,I-ORGANIZACAO,6029


In [14]:
# Hold out 20% of the sentences; the fixed random_state keeps the split
# reproducible. `test_split` is what later gets written as the dev file.
train_split, test_split = train_test_split(sentences, test_size=0.2, random_state=42)
print(len(train_split), len(test_split))

6041 1511


In [15]:
# Tag distribution of the training part of the split.
count_tags(train_split)

Unnamed: 0,Tag,counts
0,B-JURISPRUDENCIA,1077
1,B-LEGISLACAO,2010
2,B-LOCAL,533
3,B-ORGANIZACAO,2620
4,B-PESSOA,1525
5,B-TEMPO,1348
6,I-JURISPRUDENCIA,2770
7,I-LEGISLACAO,11669
8,I-LOCAL,742
9,I-ORGANIZACAO,4737


In [16]:
# Tag distribution of the held-out part of the split.
count_tags(test_split)

Unnamed: 0,Tag,counts
0,B-JURISPRUDENCIA,301
1,B-LEGISLACAO,491
2,B-LOCAL,181
3,B-ORGANIZACAO,729
4,B-PESSOA,430
5,B-TEMPO,320
6,I-JURISPRUDENCIA,761
7,I-LEGISLACAO,2871
8,I-LOCAL,261
9,I-ORGANIZACAO,1292


In [17]:
# Persist both parts of the split in the four-column layout of write_conll();
# the held-out part becomes dev_partial.conll.
write_conll(train_split, train_partial_conll)
write_conll(test_split, dev_partial_conll)

100%|██████████| 6041/6041 [00:00<00:00, 53233.00it/s]
100%|██████████| 1511/1511 [00:00<00:00, 49910.17it/s]


In [79]:
# Rewrite test.conll in the four-column layout; with test=True every tag
# column is written as 'O'.
write_conll(load_sentences(test_conll, test=True), Path('test_conll2003.conll'), test=True)

14079it [00:00, 1394601.37it/s]
100%|██████████| 2840/2840 [00:00<00:00, 455519.06it/s]

Finished loading 2840 sentences from test.conll





In [19]:
# Run the long-sentence splitter over the full corpus and over both partial
# files, producing full.conll, train.conll and dev.conll.
split_sentences(train_raw_conll, full_conll)
split_sentences(train_partial_conll, train_conll)
split_sentences(dev_partial_conll, dev_conll)


Loading sentences from train_raw.conll

Rewrote 7552 sentences after splitting with length greater than 200

Loading sentences from train_partial.conll

Rewrote 6041 sentences after splitting with length greater than 200

Loading sentences from dev_partial.conll

Rewrote 1511 sentences after splitting with length greater than 200


In [54]:
# Export the three split corpora as JSON-lines files (keys: tokens, ner_tags).
convert_conll_to_jsonl(full_conll, full_jsonl)
convert_conll_to_jsonl(train_conll, train_jsonl)
convert_conll_to_jsonl(dev_conll, dev_jsonl)

313703it [00:00, 643053.42it/s] 
Saving sentences to json...: 100%|██████████| 7841/7841 [00:00<00:00, 115810.57it/s]
131361it [00:00, 1313604.99it/s]

Finished loading 7841 sentences from full.conll
Finished converting 7841 sentences to json


251788it [00:00, 1219517.60it/s]
Saving sentences to json...: 100%|██████████| 6294/6294 [00:00<00:00, 107715.64it/s]
61915it [00:00, 1195022.47it/s]
Saving sentences to json...: 100%|██████████| 1547/1547 [00:00<00:00, 84029.48it/s]

Finished loading 6294 sentences from train.conll
Finished converting 6294 sentences to json
Finished loading 1547 sentences from dev.conll
Finished converting 1547 sentences to json





In [62]:
# Export the test file; with test=True every ner_tag is written as 'O'.
# The output path is a plain string here, unlike the Path objects used in the
# other convert_conll_to_jsonl calls.
convert_conll_to_jsonl(test_conll, 'test.json', test=True)

14079it [00:00, 1227888.34it/s]
Saving sentences to json...: 100%|██████████| 2840/2840 [00:00<00:00, 663241.84it/s]

Finished loading 2840 sentences from test.conll
Finished converting 2840 sentences to json





In [64]:
# Export test_oficial.conll keeping the tags from its last column (test is
# left at its default of False).
convert_conll_to_jsonl(Path('test_oficial.conll'), Path('test_oficial.json'))

14079it [00:00, 1313250.14it/s]
Saving sentences to json...: 100%|██████████| 2840/2840 [00:00<00:00, 466362.20it/s]

Finished loading 2840 sentences from test_oficial.conll
Finished converting 2840 sentences to json





In [29]:
# Reload the split training file to check the sentence lengths afterwards.
train_sentences = load_sentences(train_conll)

251788it [00:00, 667113.96it/s] 

Finished loading 6294 sentences from train.conll





In [33]:
# List training sentences that still exceed the token limit; the returned
# list is only displayed, not stored.
check_long_sentences(train_sentences)

Checking sentences size...: 100%|██████████| 6294/6294 [00:01<00:00, 5039.10it/s]

6294 sentences can be split into 6294 smaller ones





In [45]:
# Same check for the dev file; the over-long pieces are kept in
# `long_sentences` for the splitting experiments in the next cells.
long_sentences = check_long_sentences(load_sentences(dev_conll))

61915it [00:00, 1135422.08it/s]
Checking sentences size...:  63%|██████▎   | 972/1547 [00:00<00:00, 4893.06it/s]

Finished loading 1547 sentences from dev.conll
Size in tokens: 239
MARCELO PEREIRA CRUVINEL AUTUAÇÃO AGRAVANTE : TENILAS ROCHA DIAS ADVOGADO : MANOEL CUNHA LACERDA E OUTRO ( S ) AGRAVANTE : PAULO SALINET DIAS ADVOGADA : ALEXANDRA BERTON SCHIAVINATO E OUTRO ( S ) AGRAVANTE : HAMSSI TAHA ADVOGADO : MILTON FERNANDO TALZI E OUTRO ( S ) AGRAVANTE : WAGNER MEIRA ALVES ADVOGADOS : ALEXANDRE DE SÁ DOMINGUES E OUTRO ( S ) RICARDO FANTI IACONO AGRAVADO : MINISTÉRIO PÚBLICO FEDERAL CORRÉU : JOÃO MARCOS LOURENÇÃO DA SILVA CORRÉU : MANOEL CUNHA LACERDA CORRÉU : JOSEPH NOUR EDDINE NASRALLAH ADVOGADA : ALEXANDRA BERTON SCHIAVINATO CORRÉU : CLÉBER LUIS QUINHÕES CORRÉU : ATEF YOUSSEF NEHME HARB CORRÉU : MAFAWAD METANIS TOUMA CORRÉU : DIMITRIOS BOURLIS CORRÉU : GEORGE BOUNICOLAS CORRÉU : ANTÔNIO LUIZ RIBEIRO DA SILVA ASSUNTO : DIREITO PENAL - Crimes Previstos na Legislação Extravagante - Crimes de Tráfico Ilícito e Uso Indevido de Drogas AGRAVO REGIMENTAL AGRAVANTE : HAMSSI TAHA ADVOGADO : JOÃO ANGELILD

Checking sentences size...: 100%|██████████| 1547/1547 [00:00<00:00, 4938.91it/s]

1547 sentences can be split into 1550 smaller ones





In [49]:
# Try the SentenceSplit pipeline (semicolon splitting on) on the first
# over-long dev sentence and display how many pieces it returns.
splits = sentence_split.get_sentences(long_sentences[0], split_by_semicolon=True)
len(splits)

1

In [50]:
# Same experiment with split_utils.split, this time with semicolon splitting
# enabled; overwrites `splits` from the previous cell.
splits = split_utils.split(long_sentences[0], usar_ponto_virgula=True)
len(splits)

1

In [98]:
# Merge the transformers predictions with the tokens of test.conll and also
# write the submission CSV. test=True: the output rows hold token and
# prediction only.
# NOTE(review): the predictions path is absolute and machine-specific.
convert_transformers_predictions_to_conll(transformers_predictions_path='/media/discoD/models/ner/competicao-ner-transformers/predictions.txt', 
                                          conll_input_path=test_conll, 
                                          conll_output_path='predictions_test_transformers.conll', 
                                          submissions_path='submission_transformers.csv', 
                                          test=True)

14079it [00:00, 1017446.99it/s]
Writing predictions...: 2840it [00:00, 172760.31it/s]


Finished loading 2840 sentences from test.conll


14079it [00:00, 1119016.24it/s]
Writing submission...: 100%|██████████| 2840/2840 [00:00<00:00, 121917.46it/s]

Finished loading 2840 sentences from predictions_test_transformers.conll





In [102]:
# Same predictions merged with test_oficial_conll2003.conll; test defaults to
# False, so the last input column is written next to each prediction. No
# submission file is requested.
# NOTE(review): the predictions path is absolute and machine-specific.
convert_transformers_predictions_to_conll(transformers_predictions_path='/media/discoD/models/ner/competicao-ner-transformers/predictions.txt', 
                                          conll_input_path='test_oficial_conll2003.conll', 
                                          conll_output_path='predictions_test_oficial_transformers.conll')

14079it [00:00, 1219444.63it/s]
Writing predictions...: 2840it [00:00, 396175.99it/s]

Finished loading 2840 sentences from test_oficial_conll2003.conll





In [92]:
# Build the Id,Tag submission from predictions_test_elmo.conll.
write_submission(conll_predictions_path='predictions_test_elmo.conll', 
                 submissions_path='submission_elmo.csv')

14079it [00:00, 1216630.74it/s]
Writing submission...: 100%|██████████| 2840/2840 [00:00<00:00, 221937.39it/s]

Finished loading 2840 sentences from predictions_test_elmo.conll





In [94]:
# Build the Id,Tag submission from predictions_test_allennlp.txt.
write_submission(conll_predictions_path='predictions_test_allennlp.txt', 
                 submissions_path='submission_allennlp_bert_base.csv')

14079it [00:00, 1013100.57it/s]
Writing submission...: 100%|██████████| 2840/2840 [00:00<00:00, 128008.42it/s]

Finished loading 2840 sentences from predictions_test_allennlp.txt





In [95]:
# Build the Id,Tag submission from predictions_test_allennlp_bert_large.txt.
write_submission(conll_predictions_path='predictions_test_allennlp_bert_large.txt', 
                 submissions_path='submission_allennlp_bert_large.csv')

14079it [00:00, 1284484.50it/s]
Writing submission...: 100%|██████████| 2840/2840 [00:00<00:00, 117667.39it/s]

Finished loading 2840 sentences from predictions_test_allennlp_bert_large.txt





In [96]:
# Build the Id,Tag submission from predictions_test_elmo_brwac_cnn_word2vec-jur.txt.
write_submission(conll_predictions_path='predictions_test_elmo_brwac_cnn_word2vec-jur.txt', 
                 submissions_path='submission_elmo_brwac_cnn_word2vec-jur.csv')

14079it [00:00, 865694.31it/s]
Writing submission...: 100%|██████████| 2840/2840 [00:00<00:00, 156695.34it/s]

Finished loading 2840 sentences from predictions_test_elmo_brwac_cnn_word2vec-jur.txt





In [97]:
# Build the Id,Tag submission from predictions_test_elmo_brwac_word2vec-jur.txt.
write_submission(conll_predictions_path='predictions_test_elmo_brwac_word2vec-jur.txt', 
                 submissions_path='submission_elmo_brwac_word2vec-jur.csv')

14079it [00:00, 1085088.59it/s]
Writing submission...: 100%|██████████| 2840/2840 [00:00<00:00, 110361.08it/s]

Finished loading 2840 sentences from predictions_test_elmo_brwac_word2vec-jur.txt





In [103]:
# Build the Id,Tag submission from predictions_oficial_bert-base-full.txt.
write_submission(conll_predictions_path='predictions_oficial_bert-base-full.txt', 
                 submissions_path='submission_oficial_bert-base-full.csv')

14079it [00:00, 1246182.54it/s]
Writing submission...: 100%|██████████| 2840/2840 [00:00<00:00, 266352.65it/s]

Finished loading 2840 sentences from predictions_oficial_bert-base-full.txt





In [104]:
# Build the Id,Tag submission from predictions_oficial_elmo-pt-brwac_word2vec-jur-full.txt.
write_submission(conll_predictions_path='predictions_oficial_elmo-pt-brwac_word2vec-jur-full.txt', 
                 submissions_path='submission_oficial_elmo-pt-brwac_word2vec-jur-full.csv')

14079it [00:00, 1317969.11it/s]
Writing submission...: 100%|██████████| 2840/2840 [00:00<00:00, 177148.56it/s]

Finished loading 2840 sentences from predictions_oficial_elmo-pt-brwac_word2vec-jur-full.txt





In [105]:
# Build the Id,Tag submission from predictions_oficial_allennlp_models.txt.
write_submission(conll_predictions_path='predictions_oficial_allennlp_models.txt', 
                 submissions_path='submission_oficial_allennlp_models.csv')

14079it [00:00, 524931.16it/s]
Writing submission...: 100%|██████████| 2840/2840 [00:00<00:00, 189458.49it/s]

Finished loading 2840 sentences from predictions_oficial_allennlp_models.txt





In [7]:
# Build the Id,Tag submission from predictions_oficial_allennlp_models_lstm.txt.
write_submission(conll_predictions_path='predictions_oficial_allennlp_models_lstm.txt', 
                 submissions_path='submission_oficial_allennlp_models_lstm.csv')

14079it [00:00, 209496.46it/s]
Writing submission...: 100%|██████████| 2840/2840 [00:00<00:00, 273000.33it/s]

Finished loading 2840 sentences from predictions_oficial_allennlp_models_lstm.txt





In [5]:
# Build the Id,Tag submission from predictions_oficial_allennlp_optuna.txt.
write_submission(conll_predictions_path='predictions_oficial_allennlp_optuna.txt', 
                 submissions_path='submission_oficial_allennlp_optuna.csv')

14079it [00:00, 181458.28it/s]
Writing submission...: 100%|██████████| 2840/2840 [00:00<00:00, 160307.69it/s]

Finished loading 2840 sentences from predictions_oficial_allennlp_optuna.txt





In [5]:
# Build the Id,Tag submission from predictions_oficial_allennlp_optuna_best.txt.
write_submission(conll_predictions_path='predictions_oficial_allennlp_optuna_best.txt', 
                 submissions_path='submission_oficial_allennlp_optuna_best.csv')

14079it [00:00, 1256363.69it/s]
Writing submission...: 100%|██████████| 2840/2840 [00:00<00:00, 256797.81it/s]

Finished loading 2840 sentences from predictions_oficial_allennlp_optuna_best.txt



