In [1]:
import os
import random
import re
import sys
from random import shuffle

from tqdm import tqdm
# Put ../rusenteval on the module search path.
# NOTE(review): no import from that directory is visible in this part of the
# notebook — confirm the entry is still needed.
sys.path.append('../rusenteval')

In [2]:
import stanza
# Russian pipeline restricted to the tokenize and pos processors; later cells
# read `upos` and `feats` from the words it returns.
ppln = stanza.Pipeline('ru', processors='tokenize,pos')

2023-11-19 15:51:12 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2023-11-19 15:51:12 INFO: Loading these models for language: ru (Russian):
| Processor | Package          |
--------------------------------
| tokenize  | syntagrus        |
| pos       | syntagrus_charlm |

2023-11-19 15:51:12 INFO: Using device: cpu
2023-11-19 15:51:12 INFO: Loading: tokenize
2023-11-19 15:51:12 INFO: Loading: pos
2023-11-19 15:51:13 INFO: Done loading processors!


In [3]:
# Sample sentence used by the next cells to inspect what the pipeline returns.
txt = 'Об орясину готовящая муниципальные мощное топорище точит.'
doc = ppln(txt)

In [4]:
# upos tag of the second word of the sample sentence.
doc.sentences[0].words[1].upos

'NOUN'

In [5]:
# Turn the '|'-separated "Name=Value" feature string of the fourth word of the
# sample sentence into a {name: value} dictionary and display it.
feats = doc.sentences[0].words[3].feats.split('|')
data_dict = {}
for pair in feats:
    fields = pair.split('=')
    data_dict[fields[0]] = fields[1]
data_dict

{'Animacy': 'Inan', 'Case': 'Acc', 'Degree': 'Pos', 'Number': 'Plur'}

In [6]:
cases = []  # every Case value assigned to a noun by check_sents, across all calls


def check_sents(text, pipeline=None):
    """Strip punctuation from a sentence and label each word with its noun case.

    Args:
        text: Raw sentence.
        pipeline: Optional callable used instead of the module-level ``ppln``.
            Its result must expose ``sentences``, each with ``words`` that
            carry ``upos`` and ``feats`` attributes.

    Returns:
        A ``(clean_text, labels)`` tuple. ``clean_text`` is ``text`` without
        punctuation and without surrounding whitespace. ``labels`` has one
        entry per word returned by the pipeline: the value of the ``Case``
        feature for words tagged ``NOUN`` that carry more than two features,
        and the string ``'None'`` for every other word.

    Side effects:
        Each case value placed in ``labels`` is also appended to the
        module-level ``cases`` list.
    """
    nlp = ppln if pipeline is None else pipeline
    # Remove punctuation, then trim surrounding whitespace. Cutting the last
    # character unconditionally would drop a letter from the final word
    # whenever the sentence does not end in a space-separated punctuation mark.
    text = re.sub(r'[^\w\s]', '', text).strip()
    labels = []
    if not text:
        # Nothing left to tag, so the pipeline is not called at all.
        return text, labels
    doc = nlp(text)
    # Walk every sentence: if the tokenizer splits the input, reading only the
    # first sentence would leave the remaining words without labels.
    for sentence in doc.sentences:
        for word in sentence.words:
            case = None
            # feats may be None when the tagger assigns no features to a word.
            if word.upos == 'NOUN' and word.feats:
                data_dict = dict(item.split('=', 1) for item in word.feats.split('|'))
                # Nouns carrying two or fewer features are left unlabelled.
                if len(data_dict) > 2:
                    case = data_dict.get('Case')
            if case is None:
                labels.append('None')
            else:
                labels.append(case)
                cases.append(case)
    return text, labels

In [7]:
# Seed the global RNG so that this shuffle and every later one (and therefore
# every slice-based subsample below) is reproducible on Restart & Run All.
random.seed(42)

# The file is read as tab-separated rows; the sentence text is taken from the
# third column. The encoding is given explicitly so that reading Cyrillic text
# does not depend on the platform default.
with open('../rusenteval/obj_gender.txt', encoding='utf-8') as f:
    sents = [line.strip('\n').split('\t') for line in f]
shuffle(sents)
# Keep sentences of 5 to 20 whitespace-separated tokens. Rows with fewer than
# three columns are skipped instead of raising IndexError.
sents = [d[2] for d in sents if len(d) > 2 and 4 < len(d[2].split()) <= 20]

In [8]:
# Tag every sentence; each element is the (text, labels) pair returned by
# check_sents. The recorded run of this cell took more than six hours.
new_sents = [check_sents(sentence) for sentence in tqdm(sents)]

100%|█████████████████████████████████| 117807/117807 [6:18:05<00:00,  5.19it/s]


In [9]:
# Keep a result only if it exists, has at least one label other than 'None',
# and has exactly one label per whitespace-separated token of its text.
new_sents = [
    d
    for d in new_sents
    if d is not None
    and any(label != 'None' for label in d[1])
    and len(d[1]) == len(d[0].split())
]

In [10]:
# Spot-check one (text, labels) pair.
new_sents[119]

('Лидеры организации отвергают эти обвинения',
 ['Nom', 'Gen', 'None', 'None', 'Acc'])

In [11]:
# Number of sentences that survived the filtering.
len(new_sents)

117498

In [12]:
# Per-token pool: sentences with at least two labels other than 'None' and one
# label per whitespace-separated token.
per_token = [
    d
    for d in new_sents
    if sum(label != 'None' for label in d[1]) >= 2
    and len(d[1]) == len(d[0].split())
]

In [13]:
# Per-sentence pool: sentences with exactly one label other than 'None' and
# one label per whitespace-separated token.
per_sent = [
    d
    for d in new_sents
    if sum(label != 'None' for label in d[1]) == 1
    and len(d[1]) == len(d[0].split())
]

In [14]:
# Size of the per-token pool (sentences with two or more labelled words).
len(per_token)

100791

In [15]:
# Size of the per-sentence pool (sentences with exactly one labelled word).
len(per_sent)

16707

In [16]:
# Distinct case values collected by check_sents while tagging.
set(cases)

{'Acc', 'Dat', 'Gen', 'Ins', 'Loc', 'Nom', 'Par'}

## per_token

In [300]:
# Shuffle before bucketing: the buckets are later cut by slicing, so the order
# here decides which sentences are kept.
shuffle(per_token)

In [301]:
# One bucket per case. The first three names do not describe their content:
# `fem` collects Acc sentences, `masc` Dat sentences and `neut` Gen sentences.
fem, masc, neut = [], [], []
Ins, Loc, Nom = [], [], []

# A sentence goes into the bucket of the first case, in this order, that
# occurs among its labels; sentences containing 'Par' are left out entirely.
priority = (
    ('Acc', fem),
    ('Dat', masc),
    ('Loc', Loc),
    ('Gen', neut),
    ('Ins', Ins),
    ('Nom', Nom),
)
for text in per_token:
    if 'Par' in text[1]:
        continue
    for tag, bucket in priority:
        if tag in text[1]:
            bucket.append(text)
            break

In [302]:
# Size of the Acc bucket (`fem`) before it is cut down.
len(fem)

82895

In [303]:
# Size of the Dat bucket (`masc`) before it is cut down.
len(masc)

1288

In [304]:
# Cap four of the buckets by slicing; `neut` (Gen) and `Nom` keep their full
# size in this cell.
fem = fem[:900]    # Acc
masc = masc[:900]  # Dat
Ins = Ins[:500]
Loc = Loc[:700]

In [305]:
# Bucket sizes in the order Acc, Dat, Gen, Ins, Loc, Nom.
print(len(fem), len(masc), len(neut), len(Ins), len(Loc), len(Nom))

900 900 12699 500 700 484


In [306]:
# From here on `fem` is the combined sample: the Acc bucket plus the Dat bucket.
# Not idempotent — re-running this cell appends the Dat bucket again.
fem.extend(masc)

In [307]:
# Combined sample size after adding the Dat bucket.
len(fem)

1800

In [308]:
# Disabled: the Gen bucket (`neut`) is not added to the per-token sample.
#fem.extend(neut)

In [309]:
# Size is unchanged here because the Gen extend above is commented out.
len(fem)

1800

In [310]:
# Add the Ins bucket to the combined sample (appends again if re-run).
fem.extend(Ins)

In [311]:
# Add the Loc bucket to the combined sample (appends again if re-run).
fem.extend(Loc)

In [312]:
# Disabled: the Nom bucket is not added to the per-token sample.
#fem.extend(Nom)

In [313]:
# Final size of the combined per-token sample.
len(fem)

3000

In [314]:
# Mix the concatenated buckets; the next cell writes `fem` in list order.
shuffle(fem)

In [315]:
# Save the per-token sample: each sentence on one line, followed by a line with
# its labels as comma-separated, single-quoted strings.
# NOTE(review): the path is absolute (rooted at /) — confirm this is intended
# rather than a path relative to the notebook.
filename_out = '/per_token/ru_rusenteval_NOUN_Case.txt'
# Create the target directory first; open() does not create missing directories.
os.makedirs(os.path.dirname(filename_out), exist_ok=True)
with open(filename_out, 'w', encoding='UTF-8') as output:
    for sentence, labels in fem:
        output.write(sentence)
        output.write('\n')
        # Produces the same text as str(labels)[1:-1], built explicitly instead
        # of slicing the brackets off the list repr.
        output.write(', '.join(repr(label) for label in labels))
        output.write('\n')

In [316]:
# Total number of labels of each case over the combined sample, printed in the
# order Acc, Dat, Gen, Ins, Loc, Nom (the *_count names follow the bucket
# names: fem = Acc, masc = Dat, neut = Gen).
fem_count = sum(item[1].count('Acc') for item in fem)
masc_count = sum(item[1].count('Dat') for item in fem)
neut_count = sum(item[1].count('Gen') for item in fem)
ins = sum(item[1].count('Ins') for item in fem)
loc = sum(item[1].count('Loc') for item in fem)
nom = sum(item[1].count('Nom') for item in fem)
print(fem_count, masc_count, neut_count, ins, loc, nom)

1069 1089 3829 1412 1004 2386


## per_sent

In [326]:
# Size of the per-sentence pool before bucketing.
len(per_sent)

16707

In [327]:
# Shuffle before bucketing: the buckets are later cut by slicing, so the order
# here decides which sentences are kept.
shuffle(per_sent)

In [355]:
# One bucket per case. The first three names do not describe their content:
# `fem` collects Acc sentences, `masc` Dat sentences and `neut` Gen sentences.
fem, masc, neut = [], [], []
Ins, Loc, Nom = [], [], []

# A sentence goes into the bucket of the first case, in this order, that
# occurs among its labels; sentences containing 'Par' are left out entirely.
priority = (
    ('Acc', fem),
    ('Dat', masc),
    ('Gen', neut),
    ('Ins', Ins),
    ('Loc', Loc),
    ('Nom', Nom),
)
for text in per_sent:
    if 'Par' in text[1]:
        continue
    for tag, bucket in priority:
        if tag in text[1]:
            bucket.append(text)
            break

In [356]:
# Scratch arithmetic for the per-bucket quotas used in the next cell.
# NOTE(review): presumably 3000 is the target sample size, 2000 = 4 x 500 a base
# quota for four buckets, and 17 / 11 the sizes of the Dat and Loc buckets —
# confirm.
(3000-2000-17-11)/4

243.0

In [357]:
# Per-bucket caps for the per-sentence sample.
# NOTE(review): the sizes look hand-tuned so that the buckets add up to 3000 —
# 743 = 500 + 243 (result of the previous cell) and 773 = 743 + 30, presumably
# to make up for the Nom bucket holding only 713 sentences; confirm.
fem = fem[:773]    # Acc
masc = masc[:500]  # Dat; the recorded output shows only 17 available
neut = neut[:743]  # Gen
Ins = Ins[:743]
Loc = Loc[:500]    # the recorded output shows only 11 available
Nom = Nom[:743]    # the recorded output shows only 713 available

In [358]:
# Bucket sizes in the order Acc, Dat, Gen, Ins, Loc, Nom.
print(len(fem), len(masc), len(neut), len(Ins), len(Loc), len(Nom))

773 17 743 743 11 713


In [359]:
# From here on `fem` is the combined sample: the Acc bucket plus the Dat bucket.
# Not idempotent — re-running this cell appends the Dat bucket again.
fem.extend(masc)

In [360]:
# Add the Gen bucket (`neut`) to the combined sample (appends again if re-run).
fem.extend(neut)

In [361]:
# Add the Ins bucket to the combined sample (appends again if re-run).
fem.extend(Ins)

In [362]:
# Add the Loc bucket to the combined sample (appends again if re-run).
fem.extend(Loc)

In [363]:
# Add the Nom bucket to the combined sample (appends again if re-run).
fem.extend(Nom)

In [364]:
# Mix the concatenated buckets; a later cell writes `fem` in list order.
shuffle(fem)

In [365]:
# Final size of the combined per-sentence sample.
len(fem)

3000

In [366]:
# Save the per-sentence sample: each sentence on one line, followed by a line
# with its single case label.
# NOTE(review): the path is absolute (rooted at /) — confirm this is intended
# rather than a path relative to the notebook.
filename_out = '/per_sent/ru_rusenteval_NOUN_Case.txt'
# Create the target directory first; open() does not create missing directories.
os.makedirs(os.path.dirname(filename_out), exist_ok=True)
with open(filename_out, 'w', encoding='UTF-8') as output:
    for line in fem:
        label = [label for label in line[1] if label != 'None']
        # Validate before writing anything for this record, so a failing check
        # cannot leave a sentence line in the file without its label line.
        assert len(label) == 1, f'expected exactly one labelled word, got {label!r}'
        output.write(line[0])
        output.write('\n')
        output.write(label[0])
        output.write('\n')

In [367]:
# Total number of labels of each case over the combined sample, printed in the
# order Acc, Dat, Gen, Ins, Loc, Nom (the *_count names follow the bucket
# names: fem = Acc, masc = Dat, neut = Gen).
fem_count = sum(item[1].count('Acc') for item in fem)
masc_count = sum(item[1].count('Dat') for item in fem)
neut_count = sum(item[1].count('Gen') for item in fem)
ins = sum(item[1].count('Ins') for item in fem)
loc = sum(item[1].count('Loc') for item in fem)
nom = sum(item[1].count('Nom') for item in fem)
print(fem_count, masc_count, neut_count, ins, loc, nom)

773 17 743 743 11 713
