In [1]:
from pathlib import Path
from tqdm import tqdm
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd

from split_long_sentences import split_long_sentences

In [2]:
train_csv = Path('train.csv')
test_csv = Path('test.csv')
train_partial_conll = Path('train_partial.conll')
dev_partial_conll = Path('dev_partial.conll')
full_partial_conll = Path('full_partial.conll')
train_conll = Path('train.conll')
dev_conll = Path('dev.conll')
full_conll = Path('full.conll')

In [3]:
train_pd = pd.read_csv(train_csv)
train_pd = train_pd.fillna('NA')
train_pd

Unnamed: 0,Sentence,Word,Tag
0,Sentence: 1,EMENTA,O
1,Sentence: 1,:,O
2,Sentence: 1,APELAÇÃO,O
3,Sentence: 1,CÍVEL,O
4,Sentence: 1,-,O
...,...,...,...
257753,Sentence: 8312,PROC,O
257754,Sentence: 8312,.,O
257755,Sentence: 8313,Nº,O
257756,Sentence: 8313,TST-RR-578.030/99.5,B-JURISPRUDENCIA


In [4]:
train_pd['Sentence_Id'] = train_pd.apply(lambda row: row['Sentence'].split(':')[-1], axis=1)

In [5]:
train_pd

Unnamed: 0,Sentence,Word,Tag,Sentence_Id
0,Sentence: 1,EMENTA,O,1
1,Sentence: 1,:,O,1
2,Sentence: 1,APELAÇÃO,O,1
3,Sentence: 1,CÍVEL,O,1
4,Sentence: 1,-,O,1
...,...,...,...,...
257753,Sentence: 8312,PROC,O,8312
257754,Sentence: 8312,.,O,8312
257755,Sentence: 8313,Nº,O,8313
257756,Sentence: 8313,TST-RR-578.030/99.5,B-JURISPRUDENCIA,8313


In [6]:
def load_sentences(dataframe: pd.DataFrame):
    sentences = []
    sentence = None
    last_sentence_id = None
    for idx, row in tqdm(dataframe.iterrows()):
        sentence_id = row['Sentence_Id']
        if sentence_id != last_sentence_id:
            if sentence is not None:
                sentences.append(sentence)
            sentence = []
        sentence.append((row['Word'], row['Tag']))
        last_sentence_id = sentence_id
    print('Finished loading %d sentences' % len(sentences))
    return sentences

def count_tags(sentences):
    df = pd.DataFrame({'Tag': [_tuple[1] for sentence in sentences for _tuple in sentence]})
    return df.groupby('Tag').size().reset_index(name='counts')

def write_conll(sentences, output_path: Path):
    with output_path.open(mode='w', encoding='utf-8') as out_file:
        for sentence in tqdm(sentences):
            for _tuple in sentence:
                out_file.write(' '.join([_tuple[0], 'O', 'O', _tuple[1]]) + '\n')
            out_file.write('\n')
    out_file.close()
    
def split_sentences(input_path: Path, output_path: Path):
    split_long_sentences(input_path, output_path, limit=200,
                         split_by_semicolon=True, join_punct=True)

In [7]:
sentences = load_sentences(train_pd)

257758it [00:22, 11313.48it/s]

Finished loading 8312 sentences





In [8]:
count_tags(sentences)

Unnamed: 0,Tag,counts
0,B-JURISPRUDENCIA,1216
1,B-LEGISLACAO,2171
2,B-LOCAL,612
3,B-ORGANIZACAO,2837
4,B-PESSOA,1678
5,B-TEMPO,1416
6,I-JURISPRUDENCIA,3146
7,I-LEGISLACAO,12628
8,I-LOCAL,806
9,I-ORGANIZACAO,5094


In [9]:
train_split, test_split = train_test_split(sentences, test_size=0.2, random_state=42)
print(len(train_split), len(test_split))

6649 1663


In [10]:
count_tags(train_split)

Unnamed: 0,Tag,counts
0,B-JURISPRUDENCIA,953
1,B-LEGISLACAO,1748
2,B-LOCAL,539
3,B-ORGANIZACAO,2247
4,B-PESSOA,1343
5,B-TEMPO,1132
6,I-JURISPRUDENCIA,2451
7,I-LEGISLACAO,10135
8,I-LOCAL,734
9,I-ORGANIZACAO,4099


In [11]:
count_tags(test_split)

Unnamed: 0,Tag,counts
0,B-JURISPRUDENCIA,263
1,B-LEGISLACAO,423
2,B-LOCAL,73
3,B-ORGANIZACAO,590
4,B-PESSOA,335
5,B-TEMPO,284
6,I-JURISPRUDENCIA,695
7,I-LEGISLACAO,2493
8,I-LOCAL,72
9,I-ORGANIZACAO,995


In [12]:
write_conll(sentences, full_partial_conll)
write_conll(train_split, train_partial_conll)
write_conll(test_split, dev_partial_conll)

100%|██████████| 8312/8312 [00:00<00:00, 68522.58it/s]
100%|██████████| 6649/6649 [00:00<00:00, 67421.42it/s]
100%|██████████| 1663/1663 [00:00<00:00, 64758.40it/s]


In [13]:
split_sentences(full_partial_conll, full_conll)
split_sentences(train_partial_conll, train_conll)
split_sentences(dev_partial_conll, dev_conll)


Loading sentences from full_partial.conll

Rewrote 8312 sentences after splitting with length greater than 200

Loading sentences from train_partial.conll

Rewrote 6649 sentences after splitting with length greater than 200

Loading sentences from dev_partial.conll

Rewrote 1663 sentences after splitting with length greater than 200
