#### Загрузка данных

In [1]:
from pathlib import Path
import requests

In [3]:
# Local cache for the UD Serbian-SET training split: the file is downloaded once
# and reused on later runs.
data_dir = Path.cwd() / "data"
serb_train = data_dir / "sr_set-ud-train.conllu"

# exist_ok=True keeps this cell re-runnable and avoids a check-then-create race.
data_dir.mkdir(parents=True, exist_ok=True)

if not serb_train.exists():
    url = r"https://raw.githubusercontent.com/UniversalDependencies/UD_Serbian-SET/refs/heads/master/sr_set-ud-train.conllu"

    try:
        response = requests.get(url, timeout=15)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Best-effort download: report the failure instead of raising.
        # No file is written in this case.
        print(f"Download failed: {e}")
    else:
        # Store the payload bytes exactly as served. response.text would first be
        # decoded with an encoding that requests infers from the HTTP headers,
        # which is not guaranteed to be UTF-8.
        serb_train.write_bytes(response.content)


### Подготовка данных

In [4]:
from nltk.parse import DependencyGraph
import warnings
# Install a blanket "ignore" filter: no warnings are shown after this point.
# NOTE(review): presumably meant to silence warnings raised while NLTK builds the
# dependency graphs — confirm, since this also hides unrelated warnings.
warnings.filterwarnings('ignore')

In [5]:
# Read the whole treebank file into memory as one UTF-8 decoded string.
data = serb_train.read_text(encoding="utf-8")

In [6]:
# Cut the text at blank lines and keep only the chunks that contain something
# other than whitespace.
sents = list(filter(str.strip, data.split("\n\n")))

In [7]:
# Number of sentence chunks currently held in `sents`.
len(sents)

3328

In [9]:
# Drop comment lines (first non-blank character is "#") from every sentence,
# leaving only the non-comment lines joined back with "\n".
# Blank or whitespace-only lines are skipped too: indexing the first character
# of such a line would raise IndexError.
sents = [
    "\n".join(
        line
        for line in sent.split("\n")
        if line.strip() and not line.lstrip().startswith("#")
    )
    for sent in sents
]


In [11]:
# Parse every sentence into an NLTK dependency graph.
# The treebank labels the root relation "root" (lowercase), while DependencyGraph
# looks for "ROOT" by default. Passing the label explicitly lets NLTK find each
# graph's root node instead of warning that no node depends on the root element.
trees = [
    DependencyGraph(sent, top_relation_label="root")
    for sent in sents
]

In [13]:
class Node:
    """Lightweight reference made of a tree identifier and a node identifier."""

    def __init__(self, tree_id: int, node_id: int) -> None:
        """Store both identifiers unchanged on the instance."""
        self.tree_id, self.node_id = tree_id, node_id

    def __repr__(self) -> str:
        """Return ``<ClassName>(tree=<tree_id>, node=<node_id>)``."""
        cls_name = type(self).__name__
        return "{}(tree={}, node={})".format(cls_name, self.tree_id, self.node_id)


In [14]:
# Index of every token tagged VERB:
#   lemma -> surface form -> list of Node references (tree index, node key).
verbs: dict[str, dict[str, list[Node]]] = {}

# Every distinct surface form seen on a VERB token, across all lemmas.
distinct_verbs: set[str] = set()

for tree_id, tree in enumerate(trees):
    for node_id, node in tree.nodes.items():
        # NOTE(review): "ctag" presumably holds the UPOS column of the CoNLL-U
        # row — confirm against NLTK's DependencyGraph.
        if node.get("ctag") != "VERB":
            continue

        lemma = node["lemma"]
        word = node["word"]

        # setdefault + append grows each list in place rather than rebuilding it
        # on every occurrence.
        forms_dict = verbs.setdefault(lemma, {})
        forms_dict.setdefault(word, []).append(Node(tree_id, node_id))

        distinct_verbs.add(word)


In [15]:
from collections import Counter

In [16]:
# Occurrences recorded per lemma, summed over all of its surface forms.
verbs_counts = Counter(
    {
        lemma: sum(map(len, forms.values()))
        for lemma, forms in verbs.items()
    }
)

In [19]:
# Summary: number of lemmas, number of distinct surface forms, and the total
# number of occurrences summed over all lemmas.
print(f"""
    Кол-во лемм: {len(verbs)}
    Кол-во уникальных форм: {len(distinct_verbs)}
    Всего глаголов: {sum(verbs_counts.values())}
    """
)


    Кол-во лемм: 1151
    Кол-во уникальных форм: 2624
    Всего глаголов: 6406
    


In [28]:
def get_verb(trees: "list[DependencyGraph]", verb: "Node") -> dict:
    """Look up the graph node that a ``Node`` reference points to.

    Annotations are quoted so that defining this function does not depend on
    which cells have already been executed.

    Args:
        trees: Parsed sentences; ``verb.tree_id`` indexes into this list.
        verb: Reference carrying ``tree_id`` and ``node_id``.

    Returns:
        The entry stored under ``verb.node_id`` in the ``nodes`` mapping of the
        selected tree.

    Raises:
        IndexError: If ``verb.tree_id`` is out of range for ``trees``.
    """
    return trees[verb.tree_id].nodes[verb.node_id]

In [None]:
## TODO:

#
#  + 1. собрать 150 глаголов сов в и 150 несов в ГОТОВО (verbs_data.csv)
#  - 2. категоризовать актанты
#  - 3. тип субъекта и объекта
#  - 4. obl nsubj iobj
#  - 5. наличие отрицания
#  - 6. наречия
#  - 7. все глагольные теги
#  - 8. окна (по 4 слева-справа) или предложения целиком
#  
