## Загрузка данных

In [26]:
import pathlib
import requests

In [27]:
# Directory (under the current working directory) where all data files live.
datadir = pathlib.Path.cwd() / "data"

In [28]:
# Create the data directory if needed; exist_ok avoids the race between a
# separate existence check and the mkdir call.
datadir.mkdir(exist_ok=True)

In [29]:
# Base URL of the raw files; a file name is appended to it to form the
# download URL.
ghb_url = (
    "https://github.com/UniversalDependencies/UD_Russian-SynTagRus/raw/refs/heads/master/"
)

In [30]:
# CoNLL-U files to download and process: three training parts and the test set.
filenames = [
    "ru_syntagrus-ud-train-a.conllu",
    "ru_syntagrus-ud-train-b.conllu",
    "ru_syntagrus-ud-train-c.conllu",
    "ru_syntagrus-ud-test.conllu",
]

In [31]:
# Download every listed file that is not already present in the data directory.
for filename in filenames:
    path = datadir / filename
    if path.exists():
        continue

    url = ghb_url + filename

    try:
        response = requests.get(url, timeout=15)
        response.raise_for_status()

    except requests.exceptions.RequestException as e:
        # Best effort: report the failure and move on to the next file.
        print(f"Download failed: {e}")

    else:
        # Store the payload byte-for-byte instead of round-tripping through
        # response.text, whose decoding depends on the charset requests infers.
        # Writing to a temporary name and renaming afterwards means an
        # interrupted write never leaves a truncated file that the existence
        # check above would treat as a finished download.
        tmp_path = path.with_name(path.name + ".part")
        tmp_path.write_bytes(response.content)
        tmp_path.replace(path)

## Подготовка датасета

Функциональные (синтаксические) зависимости нам здесь не нужны — нужна только грамматика: часть речи и морфологические признаки.

In [47]:
def prep_entry(text: str) -> list[list[str]]:
    """Split one sentence block into rows of tab-separated fields.

    Lines starting with ``#`` are dropped. Blank lines are dropped as well,
    so a block with a trailing newline does not yield a one-element ``[""]``
    row that would break column indexing in callers.

    Args:
        text: The lines of one sentence, separated by ``"\\n"``.

    Returns:
        One list of fields per remaining line, in the original order.
    """
    return [
        line.split("\t")
        for line in text.split("\n")
        if line.strip() and not line.startswith("#")
    ]

In [48]:
def get_all_grammar(entry: list[str]) -> str:
    """Build a single grammar tag string from one token row.

    Args:
        entry: Tab-split columns of a token line. Column 3 is read as the
            part of speech and column 5 as the ``|``-separated features.

    Returns:
        ``"POS=<pos>"`` followed by the features in sorted order, all joined
        with ``"|"``. A features column equal to ``"_"`` contributes nothing.

    Raises:
        IndexError: If ``entry`` has fewer than six columns.
    """
    pos = entry[3]
    feats = entry[5]

    result = [f"POS={pos}"]

    if feats != "_":
        # Sorting makes the tag independent of the feature order in the source.
        result += sorted(feats.split("|"))

    return "|".join(result)

In [49]:
# NOTE(review): not referenced in the visible cells; presumably the destination
# of a merged training file — confirm against later cells.
train_path = datadir.joinpath("train.txt")

In [52]:
# Convert each source file into "processed_<name>": one "<column 1>\t<grammar>"
# line per token row and an empty line after every sentence block.
for name in filenames:
    source_path = datadir / name
    target_path = datadir / ("processed_" + name)

    if target_path.exists():
        continue

    # Skip a missing source instead of crashing after an empty target has
    # already been created (which would then be skipped on every rerun).
    if not source_path.exists():
        print(f"Source file is missing, skipping: {source_path}")
        continue

    with open(source_path, "r", encoding="utf-8") as source_file:
        data = source_file.read().split("\n\n")

    # Write under a temporary name and rename at the end, so an interrupted
    # run never leaves a truncated file that looks finished.
    tmp_path = target_path.with_name(target_path.name + ".part")
    with open(tmp_path, "w", encoding="utf-8") as target_file:
        for text in data:

            if not text.strip():
                continue

            for line in prep_entry(text):
                # Rows with fewer than six columns (e.g. a stray blank line)
                # cannot provide the columns read below.
                if len(line) < 6:
                    continue
                target_file.write(f"{line[1]}\t{get_all_grammar(line)}\n")
            target_file.write("\n")

    tmp_path.replace(target_path)