In [9]:
import os

LANGS = [x.strip() for x in os.getenv("LANGS", "en,ro").split(",") if x.strip()]
OUT_DIR = os.getenv("OUT_DIR", "data/dump").strip()

print("params:", LANGS, OUT_DIR)
os.makedirs(OUT_DIR, exist_ok=True)

params: ['en', 'ro'] data/dump


## Get spacy models

In [10]:
# for Turkish, we'd have to run a separate pipeline with a different spacy version
#! pip install "tr_core_news_lg @ https://huggingface.co/turkish-nlp-suite/tr_core_news_lg/resolve/main/tr_core_news_lg-1.0-py3-none-any.whl"

In [11]:
!python -m spacy download xx_sent_ud_sm

language_models = {
    "en": "en_core_web_lg",
    "ro": "ro_core_news_lg",
    "de": "de_core_news_lg",
    "fr": "fr_core_news_lg",
    "it": "it_core_news_lg",
    "es": "es_core_news_lg",
    "nl": "nl_core_news_lg",
    "pt": "pt_core_news_lg",
    "sv": "sv_core_news_lg",
    "pl": "pl_core_news_lg",
    "ru": "ru_core_news_lg",
    "uk": "uk_core_news_lg",
    "el": "el_core_news_lg",
    "hr": "hr_core_news_lg",
    "lt": "lt_core_news_lg",
    "mk": "mk_core_news_lg",
    "sl": "sl_core_news_lg",
    "ca": "ca_core_news_lg",
    "zh": "zh_core_web_lg",
}

for lm in LANGS:
    if lm in language_models:
        print(f"downloading {lm}: {language_models[lm]}")
        !python -m spacy download {language_models[lm]}
    else:
        print(f"use blank")

Collecting xx-sent-ud-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/xx_sent_ud_sm-3.8.0/xx_sent_ud_sm-3.8.0-py3-none-any.whl (4.3 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('xx_sent_ud_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
downloading en: en_core_web_lg
Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.8.0
[38;5;2m✔ Download and installation succ

## Get the MultiplEYE json data

In [12]:
! rm -rf languages*
! wget https://github.com/senisioi/repository/releases/download/eyelanguages0/languages_json_all.zip
! unzip languages_json_all.zip

--2025-09-30 03:42:36--  https://github.com/senisioi/repository/releases/download/eyelanguages0/languages_json_all.zip
Resolving github.com (github.com)... 140.82.116.3
Connecting to github.com (github.com)|140.82.116.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://release-assets.githubusercontent.com/github-production-release-asset/930203766/ec269415-b6bd-4aaa-b90d-9ab5aded3f27?sp=r&sv=2018-11-09&sr=b&spr=https&se=2025-09-30T04%3A38%3A20Z&rscd=attachment%3B+filename%3Dlanguages_json_all.zip&rsct=application%2Foctet-stream&skoid=96c2d410-5711-43a1-aedd-ab1947aa7ab0&sktid=398a6654-997b-47e9-b12b-9515b896b4de&skt=2025-09-30T03%3A37%3A47Z&ske=2025-09-30T04%3A38%3A20Z&sks=b&skv=2018-11-09&sig=tmdiSKEkjlxqyA%2FgoxBD6rdchupOA%2FOXl045D%2FHC8%2F0%3D&jwt=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpc3MiOiJnaXRodWIuY29tIiwiYXVkIjoicmVsZWFzZS1hc3NldHMuZ2l0aHVidXNlcmNvbnRlbnQuY29tIiwia2V5Ijoia2V5MSIsImV4cCI6MTc1OTIwNDA1NiwibmJmIjoxNzU5MjAzNzU2LCJwYXRoIjoicmVsZ

In [13]:
SPACY_LANGUAGES = ["ca", "de", "el", "en", "es", "fr", "hr", "it", "lt", "mk", "nl", "pl", "pt", "ro", "ru", "sl", "sv", "uk", "zh"]

CODE2LANG = {
    "ar": "Arabic",
    "ca": "Catalan",
    "cs": "Czech",
    "de": "German",
    "gsw": "Swiss German",
    "el": "Greek",
    "en": "English",
    #"es": "Spanish",
    "et": "Estonian",
    "eu": "Basque",
    #"fr": "French",
    #"he": "Hebrew",
    "hi": "Hindi",
    "hr": "Croatian",
    "it": "Italian",
    "kl": "Kalaallisut",
    "lt": "Lithuanian",
    "lv": "Latvian",
    "mk": "Macedonian",
    "nl": "Dutch",
    "pl": "Polish",
    "pt": "Portuguese",
    "rm": "Romansh",
    "ro": "Romanian",
    "ru": "Russian",
    "sl": "Slovenian",
    "sq": "Albanian",
    "sv": "Swedish",
    "tr": "Turkish",
    "uk": "Ukrainian",
    #"yue": "Cantonese",
    "zh": "Chinese"
}

LANGUAGES = list(CODE2LANG.keys())

## Load data

In [14]:
import os
import json
import spacy

def load_all_json(lang_folder):
    all_data = {}
    for file in os.listdir(lang_folder):
        if file.endswith('.json'):
            lang_code = file.replace('.json', '').replace('multipleye_stimuli_experiment_', '')
            if lang_code == 'zd':
                lang_code = 'gsw'
            if (lang_code not in LANGUAGES) or (lang_code not in LANGS):
                continue
            with open(os.path.join(lang_folder, file), 'r', encoding='utf-8') as f:
                all_data[lang_code] = json.load(f)
    return all_data

In [15]:
all_data = load_all_json('languages_json')
for k,v in all_data.items():
  print(k, v[0])

ro {'stimulus_id': 1, 'stimulus_name': 'PopSci_MultiplEYE', 'stimulus_type': 'experiment', 'pages': ['Proiectul MultiplEYE\n\nNumele „MultiplEYE” este un joc de cuvinte care combină „multilingvism” sau „limbi multiple” cu „eye” (ochi) din „eye-tracking” (urmărire oculară). MultiplEYE este o Acțiune COST finanțată de Uniunea Europeană. Acțiunile COST sunt rețele de cercetare sprijinite de Cooperarea Europeană în Știință și Tehnologie, pe scurt COST. Ca organizație de finanțare, COST susține rețeaua noastră în creștere de cercetători din Europa și din afara ei, oferind sprijin financiar pentru desfășurarea diverselor activități de networking.', 'Aceste activități includ întâlniri ale grupurilor de lucru, școli de formare pentru a împărtăși abilități cu cercetătorii mai tineri și vizite științifice de cercetare. Titlul proiectului Acțiunii COST MultiplEYE este: Facilitarea colectării datelor de urmărire oculară în mai multe limbi pentru cercetarea procesării limbajului de către om și a pr

## Prepare spaCy code to generate template csv files

In [16]:

LANG_FOLDER = "languages_json"
NLP_MODEL = None
CURRENT_LANG = ''
IN_DIR = 'languages_json/'

from spacy.util import get_lang_class


def exists_spacy_blank(lang_code):
    try:
        get_lang_class(lang_code)
        return True
    except:
        return False

def load_spacy_model(lang_code, small=True):
    model = None
    if lang_code in SPACY_LANGUAGES:
        genre = 'news'
        if lang_code in {'zh', 'en'}:
            genre = 'web'
        if lang_code == 'rm':
            return ''
        model_name = f'{lang_code}_core_{genre}_{"sm" if small else "lg"}'
        print(f"Loading model {model_name} for {lang_code}")
        model = spacy.load(model_name)
        model.add_pipe("sentencizer")
    elif lang_code == "rm":
        model = spacy.load("it_core_news_lg")
        # keep 'morphologizer' ?
        model.disable_pipes('tok2vec', 'tagger', 'parser', 'lemmatizer', 'attribute_ruler', 'ner')
    elif lang_code == 'gsw':
        model = spacy.load('de_core_news_lg')
    elif exists_spacy_blank(lang_code):
        print(f"Loading model blank model for {lang_code}")
        model = spacy.blank(lang_code)
        model.add_pipe("sentencizer")
    else:
        model_name = f'xx_sent_ud_sm'
        print(f"Loading model {model_name} for {lang_code}")
        model = spacy.load(model_name)
        model.add_pipe("sentencizer")
    return model


def get_nlp(lang_code, small=False):
    """To avoid loading all models at the same time
    """
    global NLP_MODEL, CURRENT_LANG
    if lang_code != CURRENT_LANG:
        try:
            print(f"Deleting model for {CURRENT_LANG}")
            del NLP_MODEL
        except:
            print("No model to delete")
        print(f"Loading model for {lang_code}")
        NLP_MODEL = load_spacy_model(lang_code, small=small)
        CURRENT_LANG = lang_code
    return NLP_MODEL


In [17]:
def feats_str(token):
    if not token.morph:
        return "_"
    md = token.morph.to_dict()
    if not md:
        return "_"
    bits = []
    for k in sorted(md):
        v = md[k]
        if isinstance(v, (list, tuple)):
            bits.append(f"{k}={','.join(v)}")
        else:
            bits.append(f"{k}={v}")
    return "|".join(bits) if bits else "_"


def get_head(token, sent):
    if token.head == token or token.dep_ == "ROOT":
        head = 0
        deprel = "root"
    else:
        head = (token.head.i - sent.start) + 1  # 1-based in sentence
        deprel = token.dep_.lower() if token.dep_ else "_"
    return head, deprel


def get_misc(token, include_ner=True):
    misc_parts = []
    if not token.whitespace_:
        misc_parts.append("SpaceAfter=No")
    if include_ner and token.ent_iob_ != "O":
        misc_parts.append(f"NER={token.ent_iob_}-{token.ent_type_}")
    misc = "|".join(misc_parts) if misc_parts else "_"
    return misc


def iter_pages(stimuli, nlp):
    for stim in stimuli:
        sid, sname = stim["stimulus_id"], stim["stimulus_name"]
        for pnum, page_text in enumerate(stim["pages"], start=1):
            yield sid, sname, pnum, nlp(page_text)

def stimuli2csv(stimuli, lang_code, level="page", small=False):
    rows = []
    nlp = get_nlp(lang_code, small=small)
    for sid, sname, page, doc in iter_pages(stimuli, nlp):
        ptext = doc.text
        document = nlp(ptext)
        for sent_idx, sentence in enumerate(document.sents):
            eos = {
              "language": CODE2LANG[lang_code],
              "language_code": lang_code,
              "stimulus_name": sname,
              "page": page,
              #"sent_idx": sent_idx+1,
              "token": "<eos>",
              "is_alpha": False,
              "is_stop": False,
              "is_punct": False,
              "lemma": "",
              "upos": "",
              "xpos": "",
              "feats": "",
              "head": "",
              "deprel": "",
              "deps": "",
              "misc": ""
              }
            for token in sentence:
                head, deprel = get_head(token, sentence)
                rows.append(
                    {
                        #"stimulus_id": sid,
                        "language": CODE2LANG[lang_code],
                        "language_code": lang_code,
                        "stimulus_name": sname,
                        "page": page,
                        #"sent_idx": sent_idx+1,
                        "token": token.text,
                        "is_alpha": token.is_alpha,
                        "is_stop": token.is_stop,
                        "is_punct": token.is_punct,
                        "lemma": token.lemma_,
                        "upos": token.pos_,
                        "xpos": token.tag_,
                        "feats": feats_str(token),
                        "head": head,
                        "deprel": deprel,
                        "deps": "_",
                        "misc": get_misc(token, include_ner=True)
                    }
                )
            rows.append(eos)


    df = pd.DataFrame(rows).sort_values(by=["stimulus_name", "page"])
    df = pd.DataFrame(rows)
    return df

## Generate csv templates

In [18]:
from tqdm import tqdm
import pandas as pd
from collections import defaultdict

preproc = defaultdict(dict)
for lang_code, data in tqdm(all_data.items()):
    if lang_code not in LANGS:
        continue
    preproc[lang_code] = stimuli2csv(data, lang_code, small=False)

  0%|          | 0/2 [00:00<?, ?it/s]

Deleting model for 
Loading model for ro
Loading model ro_core_news_lg for ro


 50%|█████     | 1/2 [00:08<00:08,  8.90s/it]

Deleting model for ro
Loading model for en
Loading model en_core_web_lg for en


100%|██████████| 2/2 [00:16<00:00,  8.43s/it]


## Save

In [19]:
import os
from tqdm import tqdm

for lang_code, df in tqdm(preproc.items()):
    lang_out = 'gsw' if lang_code == 'zd' else lang_code
    out_dir = os.path.join(OUT_DIR, lang_out)
    os.makedirs(out_dir, exist_ok=True)

    for stim_name, group in df.groupby('stimulus_name'):
        out_fis = os.path.join(out_dir, f"{stim_name}.csv")
        g = group.copy()
        g['language_code'] = lang_out
        g.to_csv(out_fis, index=False)
        print(out_fis)

100%|██████████| 2/2 [00:00<00:00, 13.38it/s]

data/dump/ro/Arg_PISACowsMilk.csv
data/dump/ro/Arg_PISARapaNui.csv
data/dump/ro/Enc_WikiMoon.csv
data/dump/ro/Ins_HumanRights.csv
data/dump/ro/Ins_LearningMobility.csv
data/dump/ro/Lit_Alchemist.csv
data/dump/ro/Lit_BrokenApril.csv
data/dump/ro/Lit_MagicMountain.csv
data/dump/ro/Lit_NorthWind.csv
data/dump/ro/Lit_Solaris.csv
data/dump/ro/PopSci_Caveman.csv
data/dump/ro/PopSci_MultiplEYE.csv
data/dump/en/Arg_PISACowsMilk.csv
data/dump/en/Arg_PISARapaNui.csv
data/dump/en/Enc_WikiMoon.csv
data/dump/en/Ins_HumanRights.csv
data/dump/en/Ins_LearningMobility.csv
data/dump/en/Lit_Alchemist.csv
data/dump/en/Lit_BrokenApril.csv
data/dump/en/Lit_MagicMountain.csv
data/dump/en/Lit_NorthWind.csv
data/dump/en/Lit_Solaris.csv
data/dump/en/PopSci_Caveman.csv
data/dump/en/PopSci_MultiplEYE.csv



