In [1]:
from random import shuffle
from tqdm import tqdm
import re
import os
import sys
sys.path.append('../rusenteval')

In [2]:
import stanza
ppln = stanza.Pipeline('ru', processors='tokenize,pos')

2023-11-19 01:50:27 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2023-11-19 01:50:28 INFO: Loading these models for language: ru (Russian):
| Processor | Package          |
--------------------------------
| tokenize  | syntagrus        |
| pos       | syntagrus_charlm |

2023-11-19 01:50:28 INFO: Using device: cpu
2023-11-19 01:50:28 INFO: Loading: tokenize
2023-11-19 01:50:28 INFO: Loading: pos
2023-11-19 01:50:28 INFO: Done loading processors!


In [61]:
txt = 'Героем орясину готовящая ослы мощное топорище точит.'
doc = ppln(txt)

In [62]:
doc.sentences[0].words[1].upos

'NOUN'

In [63]:
feats = doc.sentences[0].words[0].feats.split('|')
data_dict = {item.split('=')[0]: item.split('=')[1] for item in feats}
data_dict

{'Animacy': 'Anim', 'Case': 'Ins', 'Gender': 'Masc', 'Number': 'Sing'}

In [6]:
genders = []
def check_sents(text):
    text = re.sub(r'[^\w\s]','',text)[:-1]
    labels = []
    doc = ppln(text)
    for i in range(len(doc.sentences[0].words)):
        if doc.sentences[0].words[i].upos == 'NOUN':
            feats = doc.sentences[0].words[i].feats.split('|')
            data_dict = {item.split('=')[0]: item.split('=')[1] for item in feats}
            if len(data_dict) > 2:
                try:
                    labels.append(data_dict['Gender'])
                    genders.append(data_dict['Gender'])
                except KeyError:
                    labels.append('None') 
            else:
                labels.append('None') 
        else:
            labels.append('None')
    return text, labels         

In [7]:
with open('../rusenteval/subj_gender.txt') as f:
    sents = [line.strip('\n').split('\t') for line in f]
shuffle(sents)
sents = [d[2] for d in sents if 4 < len(d[2].split()) <= 20][:40000]

In [8]:
new_sents = []
for sent in tqdm(sents):
    new_sents.append(check_sents(sent))

100%|███████████████████████████████████| 40000/40000 [5:44:24<00:00,  1.94it/s]


In [9]:
set(genders)

{'Fem', 'Masc', 'Neut'}

In [10]:
new_sents = [d for d in new_sents if d is not None]
new_sents = [d for d in new_sents if d[1].count('None') != len(d[1]) and len(d[0].split()) == len(d[1])]

In [11]:
new_sents[119]

('Сохранение этой и других артелей районные власти считали своей первоочередной задачей',
 ['Neut',
  'None',
  'None',
  'None',
  'Masc',
  'None',
  'Fem',
  'None',
  'None',
  'None',
  'Fem'])

In [12]:
len(new_sents)

39710

In [13]:
per_token = [d for d in new_sents if d[1].count('None') <= len(d[1]) - 2 and len(d[0].split()) == len(d[1])]

In [64]:
per_sent = [d for d in new_sents if d[1].count('None') + 1 == len(d[1]) and len(d[0].split()) == len(d[1])]

In [65]:
len(per_token)

36467

In [66]:
len(per_sent)

3243

## per_token

In [145]:
shuffle(per_token)
fem = []
masc = []
neut = []
for text in per_token:
    if text[1].count('Neut') > 0:
        neut.append(text)
    elif text[1].count('Fem') > 0:
        fem.append(text)
    elif text[1].count('Masc') > 0:
        masc.append(text)

In [146]:
shuffle(fem)
shuffle(masc)
fem = fem[:550]
masc = masc[:100]
neut = neut[:2350]

In [147]:
print(len(fem), len(masc), len(neut))

550 100 2350


In [148]:
fem.extend(masc)

In [149]:
len(fem)

650

In [150]:
fem.extend(neut)

In [151]:
len(fem)

3000

In [152]:
shuffle(fem)

In [153]:
# сохраним в файл
filename_out = '/per_token/ru_rusenteval_NOUN_Gender.txt'
with open(filename_out, 'w', encoding='UTF-8') as output:
    for line in fem:
        output.write(line[0])
        output.write('\n')
        output.write(str(line[1])[1:-1])
        output.write('\n')

In [154]:
fem_count = 0
masc_count = 0
neut_count = 0
for text in fem:
    fem_count += text[1].count('Fem')
    masc_count += text[1].count('Masc')
    neut_count += text[1].count('Neut')
print(fem_count, masc_count, neut_count)

3669 4138 3653


## per_sent

In [67]:
fem = []
masc = []
neut = []
for text in per_sent:
    if text[1].count('Fem') > 0:
        fem.append(text)
    elif text[1].count('Masc') > 0:
        masc.append(text)
    elif text[1].count('Neut') > 0:
        neut.append(text)

In [68]:
shuffle(fem)
shuffle(masc)
fem = fem[:1000]
masc = masc[:1000]
neut = neut[:1000]

In [69]:
print(len(fem), len(masc), len(neut))

1000 1000 1000


In [70]:
fem.extend(masc)

In [71]:
fem.extend(neut)

In [72]:
shuffle(fem)

In [73]:
# сохраним в файл
filename_out = '/per_sent/ru_rusenteval_NOUN_Gender.txt'
with open(filename_out, 'w', encoding='UTF-8') as output:
    for line in fem:
        output.write(line[0])
        output.write('\n')
        label = [label for label in line[1] if label != 'None']
        assert len(label) == 1
        output.write(label[0])
        output.write('\n')

In [74]:
fem_count = 0
masc_count = 0
neut_count = 0
for text in fem:
    fem_count += text[1].count('Fem')
    masc_count += text[1].count('Masc')
    neut_count += text[1].count('Neut')
print(fem_count, masc_count, neut_count)

1000 1000 1000
