In [1]:
from random import shuffle
from tqdm import tqdm
import re
import os
import sys
sys.path.append('../rusenteval')

In [2]:
import stanza
ppln = stanza.Pipeline('ru', processors='tokenize,pos')

2023-11-19 01:50:42 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2023-11-19 01:50:43 INFO: Loading these models for language: ru (Russian):
| Processor | Package          |
--------------------------------
| tokenize  | syntagrus        |
| pos       | syntagrus_charlm |

2023-11-19 01:50:43 INFO: Using device: cpu
2023-11-19 01:50:43 INFO: Loading: tokenize
2023-11-19 01:50:43 INFO: Loading: pos
2023-11-19 01:50:43 INFO: Done loading processors!


In [3]:
txt = 'Философский Об орясину готовящая осёл мощное топорище точит.'
doc = ppln(txt)

In [4]:
len(doc.sentences[0].words)

9

In [5]:
feats = doc.sentences[0].words[0].feats.split('|')
data_dict = {item.split('=')[0]: item.split('=')[1] for item in feats}
data_dict

{'Case': 'Nom', 'Degree': 'Pos', 'Gender': 'Masc', 'Number': 'Sing'}

In [6]:
genders = []
def check_sents(text):
    text = re.sub(r'[^\w\s]','',text)[:-1]
    labels = []
    doc = ppln(text)
    for i in range(len(doc.sentences[0].words)):
        if doc.sentences[0].words[i].upos == 'ADJ':
            feats = doc.sentences[0].words[i].feats.split('|')
            data_dict = {item.split('=')[0]: item.split('=')[1] for item in feats}
            if len(data_dict) > 2:
                try:
                    labels.append(data_dict['Gender'])
                    genders.append(data_dict['Gender'])
                except KeyError:
                    labels.append('None')          
            else:
                labels.append('None') 
        else:
            labels.append('None')
    return text, labels          

In [7]:
with open('../rusenteval/obj_gender.txt') as f:
    sents = [line.strip('\n').split('\t') for line in f]
shuffle(sents)
sents = [d[2] for d in sents if 4 < len(d[2].split()) <= 20][:40000]

In [8]:
new_sents = []
for sent in tqdm(sents):
    new_sents.append(check_sents(sent))

100%|███████████████████████████████████| 40000/40000 [5:46:29<00:00,  1.92it/s]


In [9]:
set(genders)

{'Fem', 'Masc', 'Neut'}

In [10]:
new_sents = [d for d in new_sents if d is not None]
new_sents = [d for d in new_sents if d[1].count('None') != len(d[1]) and len(d[0].split()) == len(d[1])]

In [11]:
per_token = [d for d in new_sents if d[1].count('None') <= len(d[1]) - 2 and len(d[0].split()) == len(d[1])]

In [12]:
per_sent = [d for d in new_sents if d[1].count('None') + 1 == len(d[1]) and len(d[0].split()) == len(d[1])]

In [13]:
len(per_token)

6990

In [14]:
len(per_sent)

12571

## per_token

In [201]:
shuffle(per_token)
fem = []
masc = []
neut = []
for text in per_token:
    if text[1].count('Neut') > 0:
        neut.append(text)
    elif text[1].count('Fem') > 0:
        fem.append(text)
    elif text[1].count('Masc') > 0:
        masc.append(text)

In [202]:
from random import shuffle
shuffle(fem)
shuffle(masc)
fem = fem[:900]
masc = masc[:500]
neut = neut[:1600]

In [203]:
print(len(fem), len(masc), len(neut))

900 500 1600


In [204]:
fem.extend(masc)

In [205]:
len(fem)

1400

In [206]:
fem.extend(neut)

In [207]:
len(fem)

3000

In [208]:
shuffle(fem)

In [209]:
# сохраним в файл
filename_out = '/per_token/ru_rusenteval_ADJ_Gender.txt'
with open(filename_out, 'w', encoding='UTF-8') as output:
    for line in fem:
        output.write(line[0])
        output.write('\n')
        output.write(str(line[1])[1:-1])
        output.write('\n')

In [210]:
fem_count = 0
masc_count = 0
neut_count = 0
for text in fem:
    fem_count += text[1].count('Fem')
    masc_count += text[1].count('Masc')
    neut_count += text[1].count('Neut')
print(fem_count, masc_count, neut_count)

2426 2509 2304


## per_sent

In [43]:
fem = []
masc = []
neut = []
for text in per_sent:
    if text[1].count('Fem') > 0:
        fem.append(text)
    elif text[1].count('Masc') > 0:
        masc.append(text)
    elif text[1].count('Neut') > 0:
        neut.append(text)

In [44]:
from random import shuffle
shuffle(fem)
shuffle(masc)
fem = fem[:1000]
masc = masc[:1000]
neut = neut[:1000]

In [45]:
print(len(fem), len(masc), len(neut))

1000 1000 1000


In [46]:
fem.extend(masc)

In [47]:
fem.extend(neut)

In [48]:
len(fem)

3000

In [51]:
shuffle(fem)

In [52]:
# сохраним в файл
filename_out = '/per_sent/ru_rusenteval_ADJ_Gender.txt'
with open(filename_out, 'w', encoding='UTF-8') as output:
    for line in fem:
        output.write(line[0])
        output.write('\n')
        label = [label for label in line[1] if label != 'None']
        assert len(label) == 1
        output.write(label[0])
        output.write('\n')

In [53]:
fem_count = 0
masc_count = 0
neut_count = 0
for text in fem:
    fem_count += text[1].count('Fem')
    masc_count += text[1].count('Masc')
    neut_count += text[1].count('Neut')
print(fem_count, masc_count, neut_count)

1000 1000 1000
