In [39]:
from random import shuffle
from tqdm import tqdm
import re
import os
import sys
sys.path.append('../rusenteval')

In [40]:
import stanza
ppln = stanza.Pipeline('ru', processors='tokenize,pos')

2023-11-19 15:38:14 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2023-11-19 15:38:15 INFO: Loading these models for language: ru (Russian):
| Processor | Package          |
--------------------------------
| tokenize  | syntagrus        |
| pos       | syntagrus_charlm |

2023-11-19 15:38:15 INFO: Using device: cpu
2023-11-19 15:38:15 INFO: Loading: tokenize
2023-11-19 15:38:15 INFO: Loading: pos
2023-11-19 15:38:15 INFO: Done loading processors!


In [41]:
txt = 'Будет Прочитал'
doc = ppln(txt)
doc.sentences[0].words[1].upos

'VERB'

In [42]:
feats = doc.sentences[0].words[1].feats.split('|')
data_dict = {item.split('=')[0]: item.split('=')[1] for item in feats}
data_dict

{'Aspect': 'Perf',
 'Gender': 'Masc',
 'Mood': 'Ind',
 'Number': 'Sing',
 'Tense': 'Past',
 'VerbForm': 'Fin',
 'Voice': 'Act'}

In [47]:
tenses = []
def check_sents(text):
    text = re.sub(r'[^\w\s]','',text)[:-1]
    labels = []
    doc = ppln(text)
    for i in range(len(doc.sentences[0].words)):
        if doc.sentences[0].words[i].upos == 'VERB':
            try:
                feats = doc.sentences[0].words[i].feats.split('|')
                data_dict = {item.split('=')[0]: item.split('=')[1] for item in feats}
                if len(data_dict) > 2:
                    if data_dict['VerbForm'] == 'Fin':
                        try:
                            labels.append(data_dict['Tense'])
                            tenses.append(data_dict['Tense'])
                        except KeyError:
                            labels.append('None')
                    else:
                        labels.append('None')   
                else:
                    labels.append('None') 
            except AttributeError:
                labels.append('None') 
        else:
            labels.append('None')
    return text, labels  

In [48]:
with open('../rusenteval/predicate_tense.txt') as f:
    sents = [line.strip('\n').split('\t') for line in f]
shuffle(sents)
sents = [d[2] for d in sents if 4 < len(d[2].split()) <= 20]

In [49]:
sents[3459]

'Внёс важный вклад в рекреационную географию .'

In [50]:
new_sents = []
for sent in tqdm(sents):
    new_sents.append(check_sents(sent))

100%|█████████████████████████████████| 114405/114405 [6:06:37<00:00,  5.20it/s]


In [51]:
set(tenses)

{'Fut', 'Past', 'Pres'}

In [52]:
new_sents = [d for d in new_sents if d is not None]
new_sents = [d for d in new_sents if d[1].count('None') != len(d[1]) and len(d[0].split()) == len(d[1])]

In [60]:
per_token = [d for d in new_sents if d[1].count('None') <= len(d[1]) - 1 and len(d[0].split()) == len(d[1])]

In [61]:
per_sent = [d for d in new_sents if d[1].count('None') + 1 == len(d[1]) and len(d[0].split()) == len(d[1])]

In [62]:
len(per_token)

113910

In [63]:
len(per_sent)

113834

## per_token

In [84]:
fem = []
masc = []
neut = []
for text in per_token:
    if text[1].count('Fut') > 0:
        continue
    elif text[1].count('Pres') > 0:
        fem.append(text)
    elif text[1].count('Past') > 0:
        masc.append(text)

In [85]:
shuffle(fem)
shuffle(masc)
fem = fem[:1500]
masc = masc[:1500]

In [86]:
print(len(fem), len(masc))

1500 1500


In [87]:
fem.extend(masc)

In [88]:
len(fem)

3000

In [89]:
shuffle(fem)

In [90]:
# сохраним в файл
filename_out = '/per_token/ru_rusenteval_VERB_Tense.txt'
with open(filename_out, 'w', encoding='UTF-8') as output:
    for line in fem:
        output.write(line[0])
        output.write('\n')
        output.write(str(line[1])[1:-1])
        output.write('\n')

In [91]:
fem_count = 0
masc_count = 0
neut_count = 0
for text in fem:
    fem_count += text[1].count('Pres')
    masc_count += text[1].count('Past')
    neut_count += text[1].count('Fut')
print(fem_count, masc_count, neut_count)

1500 1500 0


## per_sent

In [101]:
fem = []
masc = []
neut = []
for text in per_sent:
    if text[1].count('Fut') > 0:
        continue
    elif text[1].count('Pres') > 0:
        fem.append(text)
    elif text[1].count('Past') > 0:
        masc.append(text)

In [102]:
shuffle(fem)
shuffle(masc)
fem = fem[:1500]
masc = masc[:1500]

In [103]:
print(len(fem), len(masc))

1500 1500


In [104]:
fem.extend(masc)

In [105]:
len(fem)

3000

In [106]:
shuffle(fem)

In [109]:
# сохраним в файл
filename_out = '/per_sent/ru_rusenteval_VERB_Tense.txt'
with open(filename_out, 'w', encoding='UTF-8') as output:
    for line in fem:
        output.write(line[0])
        output.write('\n')
        label = [label for label in line[1] if label != 'None']
        assert len(label) == 1
        output.write(label[0])
        output.write('\n')

In [110]:
fem_count = 0
masc_count = 0
neut_count = 0
for text in fem:
    fem_count += text[1].count('Pres')
    masc_count += text[1].count('Past')
    neut_count += text[1].count('Fut')
print(fem_count, masc_count, neut_count)

1500 1500 0
