In [3]:
import pandas as pd
import settings

In [4]:
wictionary_file_path = "resources/kaikki.org-dictionary-English.jsonl"

# Peek at the first few raw lines of the input file.
# The previous version read each line into a variable and then discarded it,
# so the cell produced no output at all. Each line is truncated before
# printing so that a long record does not flood the cell output.
PREVIEW_LINES = 5
PREVIEW_CHARS = 300

with open(wictionary_file_path, "r", encoding="utf-8") as f:
    for i in range(PREVIEW_LINES):
        raw = f.readline()
        if not raw:
            # readline() returns "" only at end of file (a blank line is "\n").
            break
        line = raw.strip()
        print(f"{i + 1}: {line[:PREVIEW_CHARS]}")

In [5]:
import json

# Define file paths
wictionary_file_path = "resources/kaikki.org-dictionary-English.jsonl"
output_file = "resources/english_translations.jsonl"

# Stream the input line by line and write one JSON record per sense that has
# both "raw_glosses" and at least one translation with a non-empty language
# and word. Each output record holds the lower-cased word, part of speech,
# translations, and the glosses joined with settings.SEPERATOR.
with open(wictionary_file_path, "r", encoding="utf-8") as f_in, open(output_file, "w", encoding="utf-8") as f_out:
    for line in f_in:
        # Only the parse itself can raise JSONDecodeError, so keep the try
        # block limited to it; any other error still propagates as before.
        try:
            entry = json.loads(line.strip())
        except json.JSONDecodeError as e:
            print(f"Skipping invalid JSON line: {e}")
            continue

        english_word = entry.get("word", "").lower()
        pos = entry.get("pos", "NOPOS").lower()

        # Entries without a "senses" key contribute nothing.
        for sense in entry.get("senses", []):
            if "raw_glosses" not in sense or "translations" not in sense:
                continue

            # Keep only translations that carry both a language and a word.
            translations = []
            for translation in sense["translations"]:
                lang = translation.get("lang", "").lower()
                word = translation.get("word", "").lower()
                if lang and word:
                    translations.append({"lang": lang, "word": word})

            if not translations:
                continue

            # The joined gloss string depends only on the sense, so build it
            # once here instead of once per translation inside the loop above.
            raw_glosses = settings.SEPERATOR.join(sense["raw_glosses"]).lower()

            json_line = json.dumps(
                {
                    "word": english_word,
                    "pos": pos,
                    "translations": translations,
                    "raw_glosses": raw_glosses
                },
                ensure_ascii=False)
            f_out.write(json_line + "\n")

print(f"Success! Saved all words in {output_file}.")

Success! Saved all words in resources/english_translations.jsonl.


In [7]:
# Read the JSONL export back into memory: one parsed object per line.
english_translations = "resources/english_translations.jsonl"

with open(english_translations, "r", encoding="utf-8") as f:
    data = [json.loads(raw_line.strip()) for raw_line in f]

print(f"Loaded {len(data)} entries.")

Loaded 81315 entries.


In [8]:
# Show the first five loaded entries, numbered starting at 1.
for position, record in enumerate(data[:5], start=1):
    print(f"Entry {position}: {record}")

Entry 1: {'word': 'free', 'pos': 'adj', 'translations': [{'lang': 'afrikaans', 'word': 'vrye'}, {'lang': 'albanian', 'word': 'lirë (i/e)'}, {'lang': 'amharic', 'word': 'ነፃ'}, {'lang': 'arabic', 'word': 'حُرّ'}, {'lang': 'arabic', 'word': 'حر'}, {'lang': 'arabic', 'word': 'حر'}, {'lang': 'armenian', 'word': 'արձակ'}, {'lang': 'armenian', 'word': 'ազատ'}, {'lang': 'aromanian', 'word': 'lefter'}, {'lang': 'assamese', 'word': 'মুকলি'}, {'lang': 'asturian', 'word': 'llibre'}, {'lang': 'azerbaijani', 'word': 'azad'}, {'lang': 'bambara', 'word': 'hɔrɔn'}, {'lang': 'bashkir', 'word': 'ирекле'}, {'lang': 'bashkir', 'word': 'азат'}, {'lang': 'bashkir', 'word': 'бәйһеҙ'}, {'lang': 'bashkir', 'word': 'хөр'}, {'lang': 'belarusian', 'word': 'свабо́дны'}, {'lang': 'belarusian', 'word': 'во́льны'}, {'lang': 'bengali', 'word': 'মুক্ত'}, {'lang': 'bengali', 'word': 'আজাদ'}, {'lang': 'bikol central', 'word': 'talingkas'}, {'lang': 'bulgarian', 'word': 'свобо́ден'}, {'lang': 'catalan', 'word': 'lliure'}, 

In [9]:
# Flatten the nested records: one row per (entry, translation) pair, with the
# entry-level fields repeated on every row.
rows = [
    {
        "english_word": entry["word"],
        "pos": entry["pos"],
        "language": translation["lang"],
        "translation": translation["word"],
        "raw_glosses": entry["raw_glosses"],
    }
    for entry in data
    for translation in entry["translations"]
]


In [13]:
# Build the flat table: one DataFrame row per dict in `rows`, with the dict
# keys becoming the columns.
df = pd.DataFrame(rows)

In [14]:
# Remove the column-width limit so long cell values are shown in full.
# This is a global pandas option and stays in effect for later cells.
pd.set_option('display.max_colwidth', None)
# Display the first 10 rows.
df.head(10)

Unnamed: 0,english_word,pos,language,translation,raw_glosses
0,free,adj,afrikaans,vrye,(social) unconstrained.;;not imprisoned or enslaved.
1,free,adj,albanian,lirë (i/e),(social) unconstrained.;;not imprisoned or enslaved.
2,free,adj,amharic,ነፃ,(social) unconstrained.;;not imprisoned or enslaved.
3,free,adj,arabic,حُرّ,(social) unconstrained.;;not imprisoned or enslaved.
4,free,adj,arabic,حر,(social) unconstrained.;;not imprisoned or enslaved.
5,free,adj,arabic,حر,(social) unconstrained.;;not imprisoned or enslaved.
6,free,adj,armenian,արձակ,(social) unconstrained.;;not imprisoned or enslaved.
7,free,adj,armenian,ազատ,(social) unconstrained.;;not imprisoned or enslaved.
8,free,adj,aromanian,lefter,(social) unconstrained.;;not imprisoned or enslaved.
9,free,adj,assamese,মুকলি,(social) unconstrained.;;not imprisoned or enslaved.


In [15]:
# Number of rows per language, then number of rows per English word.
for column in ("language", "english_word"):
    print(df[column].value_counts())

language
finnish    70145
german     61648
russian    53986
spanish    51580
french     49296
           ...  
domari         1
dobu           1
djimini        1
dizin          1
magyar         1
Name: count, Length: 3996, dtype: int64
english_word
water             4017
dog               1061
rain               971
order              927
fish               896
                  ... 
almost surely        1
alacrimia            1
dut                  1
apartheid wall       1
hypertoxic           1
Name: count, Length: 57004, dtype: int64


In [16]:
# Collect every distinct language name into a one-column frame.
unique_languages = df['language'].unique()
lang_long = pd.DataFrame({'language': unique_languages})
lang_long

Unnamed: 0,language
0,afrikaans
1,albanian
2,amharic
3,arabic
4,armenian
...,...
3991,church slavonic
3992,shaozhou tuhua
3993,finish
3994,ulster scots


In [17]:
# Keep only the language names that contain a comma or a digit.
has_comma = lang_long['language'].str.contains(',')
has_digit = lang_long['language'].str.contains(r'\d')
lang_long = lang_long[has_comma | has_digit]

# Print each remaining name on its own line.
for lang in lang_long["language"]:
    print(lang)

In [18]:
# Flag the rows whose language is one of the names kept in `lang_long`.
# Series.isin performs the membership test in a single vectorised pass; the
# previous lambda rebuilt a Python list from `lang_long` for every row.
df['is_long'] = df["language"].isin(lang_long["language"])

# Keep the unflagged rows. The explicit copy makes `cleaned` an independent
# frame, so adding columns to it later does not raise SettingWithCopyWarning.
cleaned = df[~df["is_long"]].copy()

# Row counts before and after filtering.
print(df.shape[0], cleaned.shape[0])


1195237 1195237


In [19]:
# Display the first 100 rows of the filtered frame.
cleaned.head(100)

Unnamed: 0,english_word,pos,language,translation,raw_glosses,is_long
0,free,adj,afrikaans,vrye,(social) unconstrained.;;not imprisoned or enslaved.,False
1,free,adj,albanian,lirë (i/e),(social) unconstrained.;;not imprisoned or enslaved.,False
2,free,adj,amharic,ነፃ,(social) unconstrained.;;not imprisoned or enslaved.,False
3,free,adj,arabic,حُرّ,(social) unconstrained.;;not imprisoned or enslaved.,False
4,free,adj,arabic,حر,(social) unconstrained.;;not imprisoned or enslaved.,False
...,...,...,...,...,...,...
95,free,adj,polish,swobodny,(social) unconstrained.;;not imprisoned or enslaved.,False
96,free,adj,portuguese,livre,(social) unconstrained.;;not imprisoned or enslaved.,False
97,free,adj,quechua,qispi,(social) unconstrained.;;not imprisoned or enslaved.,False
98,free,adj,romagnol,lèbar,(social) unconstrained.;;not imprisoned or enslaved.,False


In [20]:
# Build the two composite string columns, joining the parts with
# settings.SEPERATOR:
#   word_and_gloss           = english_word + raw_glosses
#   language_and_translation = language + translation + pos
# assign() returns a new frame instead of writing columns into `cleaned` in
# place. `cleaned` comes from a boolean-mask selection of `df`, and in-place
# column assignment on such a selection can raise SettingWithCopyWarning.
cleaned = cleaned.assign(
    word_and_gloss=cleaned["english_word"] + settings.SEPERATOR + cleaned["raw_glosses"],
    language_and_translation=(
        cleaned["language"]
        + settings.SEPERATOR
        + cleaned["translation"]
        + settings.SEPERATOR
        + cleaned["pos"]
    ),
)
cleaned.head()

Unnamed: 0,english_word,pos,language,translation,raw_glosses,is_long,word_and_gloss,language_and_translation
0,free,adj,afrikaans,vrye,(social) unconstrained.;;not imprisoned or enslaved.,False,free;;(social) unconstrained.;;not imprisoned or enslaved.,afrikaans;;vrye;;adj
1,free,adj,albanian,lirë (i/e),(social) unconstrained.;;not imprisoned or enslaved.,False,free;;(social) unconstrained.;;not imprisoned or enslaved.,albanian;;lirë (i/e);;adj
2,free,adj,amharic,ነፃ,(social) unconstrained.;;not imprisoned or enslaved.,False,free;;(social) unconstrained.;;not imprisoned or enslaved.,amharic;;ነፃ;;adj
3,free,adj,arabic,حُرّ,(social) unconstrained.;;not imprisoned or enslaved.,False,free;;(social) unconstrained.;;not imprisoned or enslaved.,arabic;;حُرّ;;adj
4,free,adj,arabic,حر,(social) unconstrained.;;not imprisoned or enslaved.,False,free;;(social) unconstrained.;;not imprisoned or enslaved.,arabic;;حر;;adj


In [21]:
# Keep only the two composite columns, remove duplicate rows, and renumber
# the index from 0.
source_columns = ["english_word", "translation", "is_long", "raw_glosses", "language", "pos"]
cleaned = (
    cleaned
    .drop(columns=source_columns)
    .drop_duplicates()
    .reset_index(drop=True)
)
cleaned

Unnamed: 0,word_and_gloss,language_and_translation
0,free;;(social) unconstrained.;;not imprisoned or enslaved.,afrikaans;;vrye;;adj
1,free;;(social) unconstrained.;;not imprisoned or enslaved.,albanian;;lirë (i/e);;adj
2,free;;(social) unconstrained.;;not imprisoned or enslaved.,amharic;;ነፃ;;adj
3,free;;(social) unconstrained.;;not imprisoned or enslaved.,arabic;;حُرّ;;adj
4,free;;(social) unconstrained.;;not imprisoned or enslaved.,arabic;;حر;;adj
...,...,...
1188486,fawn response;;(psychology) an overadaptation in response to a traumatic event entailing needs and wants being succumbed to those of the threat actor.,finnish;;miellyttämisreaktio;;noun
1188487,fawn response;;(psychology) an overadaptation in response to a traumatic event entailing needs and wants being succumbed to those of the threat actor.,german;;bambi-reflex;;noun
1188488,fawn response;;(psychology) an overadaptation in response to a traumatic event entailing needs and wants being succumbed to those of the threat actor.,german;;unterwerfungsreaktion;;noun
1188489,"one heart, one soul;;(philippines, catholicism) a saying that is said in every prayer, before the sign of the cross.",latin;;cor ūnum et anima ūna;;phrase


In [23]:
# Persist the deduplicated frame as a Parquet file, written with pyarrow.
cleaned.to_parquet("resources/cleaned_dataframe.parquet", engine="pyarrow")

In [24]:
# Count the distinct word/gloss keys. dropna=False counts a missing value as
# its own key, matching len(Series.unique()).
cleaned['word_and_gloss'].nunique(dropna=False)

81312