In [1]:
from random import shuffle
from tqdm import tqdm
import re
import os
import sys
sys.path.append('../rusenteval')

In [2]:
import stanza
ppln = stanza.Pipeline('ru', processors='tokenize,pos')

2023-11-19 02:19:00 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2023-11-19 02:19:01 INFO: Loading these models for language: ru (Russian):
| Processor | Package          |
--------------------------------
| tokenize  | syntagrus        |
| pos       | syntagrus_charlm |

2023-11-19 02:19:01 INFO: Using device: cpu
2023-11-19 02:19:01 INFO: Loading: tokenize
2023-11-19 02:19:01 INFO: Loading: pos
2023-11-19 02:19:02 INFO: Done loading processors!


In [3]:
txt = 'читает Прочитал'
doc = ppln(txt)
doc.sentences[0].words[1].upos

'VERB'

In [4]:
feats = doc.sentences[0].words[0].feats.split('|')
data_dict = {item.split('=')[0]: item.split('=')[1] for item in feats}
data_dict

{'Aspect': 'Imp',
 'Mood': 'Ind',
 'Number': 'Sing',
 'Person': '3',
 'Tense': 'Pres',
 'VerbForm': 'Fin',
 'Voice': 'Act'}

In [5]:
aspects = []
def check_sents(text):
    text = re.sub(r'[^\w\s]','',text)[:-1]
    labels = []
    doc = ppln(text)
    for i in range(len(doc.sentences[0].words)):
        if doc.sentences[0].words[i].upos == 'VERB':
            feats = doc.sentences[0].words[i].feats.split('|')
            data_dict = {item.split('=')[0]: item.split('=')[1] for item in feats}
            if len(data_dict) > 2:
                if data_dict['VerbForm'] == 'Fin':
                    try:
                        labels.append(data_dict['Aspect'])
                        aspects.append(data_dict['Aspect'])
                    except KeyError:
                        labels.append('None')
                else:
                    labels.append('None')   
            else:
                    labels.append('None') 
        else:
            labels.append('None')
    return text, labels  

In [6]:
with open('../rusenteval/predicate_aspect.txt') as f:
    sents = [line.strip('\n').split('\t') for line in f]
shuffle(sents)
sents = [d[2] for d in sents if 4 < len(d[2].split()) <= 20][:40000]

In [30]:
set(aspects)

{'Imp', 'Perf'}

In [8]:
new_sents = []
for sent in tqdm(sents):
    new_sents.append(check_sents(sent))

100%|███████████████████████████████████| 40000/40000 [5:25:12<00:00,  2.05it/s]


In [9]:
new_sents = [d for d in new_sents if d is not None]
new_sents = [d for d in new_sents if d[1].count('None') != len(d[1]) and len(d[0].split()) == len(d[1])]

In [35]:
per_token = [d for d in new_sents if d[1].count('None') < len(d[1]) - 1 and len(d[0].split()) == len(d[1])]

In [36]:
per_sent = [d for d in new_sents if d[1].count('None') + 1 == len(d[1]) and len(d[0].split()) == len(d[1])]

In [37]:
len(per_token)

30

In [34]:
len(per_sent)

39824

## per_token

In [46]:
sg = []
pl = []
for text in per_sent:
    if text[1].count('Imp') > 0:
        sg.append(text)
        continue
    elif text[1].count('Perf') > 0:
        pl.append(text)

In [47]:
shuffle(sg)
shuffle(pl)
sg = sg[:1500]
pl = pl[:1500]

In [48]:
len(sg)

1500

In [49]:
sg.extend(pl)

In [50]:
len(sg)

3000

In [51]:
shuffle(sg)

In [52]:
# сохраним в файл
filename_out = '/per_token/ru_rusenteval_VERB_Aspect.txt'
with open(filename_out, 'w', encoding='UTF-8') as output:
    for line in sg:
        output.write(line[0])
        output.write('\n')
        output.write(str(line[1])[1:-1])
        output.write('\n')

In [53]:
sg_count = 0
pl_count = 0
for text in sg:
    sg_count += text[1].count('Imp')
    pl_count += text[1].count('Perf')
print(sg_count, pl_count)

1500 1500


## per_sent

In [38]:
sg = []
pl = []
for text in per_sent:
    if text[1].count('Imp') > 0:
        sg.append(text)
        continue
    elif text[1].count('Perf') > 0:
        pl.append(text)

In [39]:
shuffle(sg)
shuffle(pl)
sg = sg[:1500]
pl = pl[:1500]

In [40]:
len(sg)

1500

In [41]:
sg.extend(pl)

In [42]:
len(sg)

3000

In [43]:
shuffle(sg)

In [44]:
# сохраним в файл
filename_out = '/per_sent/ru_rusenteval_VERB_Aspect.txt'
with open(filename_out, 'w', encoding='UTF-8') as output:
    for line in sg:
        output.write(line[0])
        output.write('\n')
        label = [label for label in line[1] if label != 'None']
        assert len(label) == 1
        output.write(label[0])
        output.write('\n')

In [45]:
sg_count = 0
pl_count = 0
for text in sg:
    sg_count += text[1].count('Imp')
    pl_count += text[1].count('Perf')
print(sg_count, pl_count)

1500 1500
