In [1]:
import csv
import json
import os
import subprocess
from string import punctuation

import pandas as pd
import spacy
import spacy_transformers
from wordfreq import zipf_frequency


In [2]:
def count_csv_elements_in_file(filepath):
    """Count every comma-separated element in a file.

    Args:
        filepath: Path of the text file to read. It is decoded as UTF-8 and
            parsed with ``csv.reader``.

    Returns:
        The number of fields summed over all rows (0 for an empty file).
    """
    # newline="" leaves line-ending handling to the csv module, which is what
    # the csv documentation asks for when a file object is given to csv.reader.
    with open(filepath, "r", encoding="utf-8", newline="") as file:
        return sum(len(row) for row in csv.reader(file))

# Collect, for every file under the raw word-list directory, the language name
# (file name up to the first ".") and the number of elements it contains.
language = []
total_words = []

for root, _dirs, file_names in os.walk("../raw-word-list"):
    for file_name in file_names:
        language.append(file_name.split(".")[0])
        total_words.append(
            count_csv_elements_in_file(os.path.join(root, file_name))
        )


In [3]:
# Show the raw element count per language as a table.
pd.DataFrame(dict(language=language, total_words=total_words))

Unnamed: 0,language,total_words
0,Catalan,3585
1,Chinese,406588
2,Croatian,3766
3,Danish,1800765
4,Dutch,173556
5,English,466434
6,Finnish,91672
7,French,336528
8,German,1707903
9,Greek,35279


In [15]:
# Maps each language name to the spaCy model package used for it. The values
# are the package names passed to "spacy download" and spacy.load elsewhere in
# this notebook, and add_word_frequencies uses the text before the first "_"
# as the language argument for zipf_frequency.
spacy_models = {
    "Catalan": "ca_core_news_sm",
    "Chinese": "zh_core_web_sm",
    "Croatian": "hr_core_news_sm",
    "Danish": "da_core_news_sm",
    "Dutch": "nl_core_news_sm",
    "English": "en_core_web_sm",
    "Finnish": "fi_core_news_sm",
    "French": "fr_core_news_sm",
    "German": "de_core_news_sm",
    "Greek": "el_core_news_sm",
    "Italian": "it_core_news_sm",
    "Japanese": "ja_core_news_sm",
    "Korean": "ko_core_news_sm",
    "Lithuanian": "lt_core_news_sm",
    "Macedonian": "mk_core_news_sm",
    "Norwegian": "nb_core_news_sm",
    "Polish": "pl_core_news_sm",
    "Portuguese": "pt_core_news_sm",
    "Romanian": "ro_core_news_sm",
    "Russian": "ru_core_news_sm",
    "Slovenian": "sl_core_news_sm",
    "Spanish": "es_core_news_sm",
    "Swedish": "sv_core_news_sm",
    "Ukrainian": "uk_core_news_sm",
}


In [14]:
# Download every spaCy model listed in spacy_models by running
# "spacy download" through uv, one subprocess per model.
failed_downloads = []

for model in spacy_models.values():
    result = subprocess.run(
        ["uv", "run", "python", "-m", "spacy", "download", model],
        capture_output=True,
        text=True,
    )
    print(result.stdout)
    print(result.stderr)

    # subprocess.run does not raise on a non-zero exit status unless
    # check=True, so failures are recorded explicitly instead of being left
    # buried in the captured output.
    if result.returncode != 0:
        failed_downloads.append(model)

if failed_downloads:
    print(f"Download failed for: {', '.join(failed_downloads)}")

Collecting ca-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ca_core_news_sm-3.8.0/ca_core_news_sm-3.8.0-py3-none-any.whl (19.6 MB)
     ---------------------------------------- 0.0/19.6 MB ? eta -:--:--
     ---------------------------------------- 0.0/19.6 MB ? eta -:--:--
      --------------------------------------- 0.3/19.6 MB ? eta -:--:--
     - -------------------------------------- 0.5/19.6 MB 1.0 MB/s eta 0:00:19
     - ------------------------------------- 0.8/19.6 MB 987.0 kB/s eta 0:00:20
     - ------------------------------------- 0.8/19.6 MB 987.0 kB/s eta 0:00:20
     -- ------------------------------------ 1.0/19.6 MB 849.0 kB/s eta 0:00:22
     -- ------------------------------------ 1.3/19.6 MB 933.8 kB/s eta 0:00:20
     -- ------------------------------------ 1.3/19.6 MB 933.8 kB/s eta 0:00:20
     --- ----------------------------------- 1.6/19.6 MB 914.1 kB/s eta 0:00:20
     --- ----------------------------------- 1

In [18]:
# Create one output directory per language under ../data.
# The loop variable is deliberately not called `language`: that name already
# holds the list of raw word-list names built earlier in the notebook, and
# rebinding it here would break a re-run of the cell that displays that list.
for language_name in spacy_models.keys():
    try:
        # makedirs also creates ../data itself when it does not exist yet,
        # and still raises FileExistsError when the language directory does.
        os.makedirs(f"../data/{language_name}")
        print(f"Directory {language_name} created")
    except FileExistsError:
        print(f"Directory {language_name} already exists")

Directory Catalan already exists
Directory Chinese already exists
Directory Croatian already exists
Directory Danish already exists
Directory Dutch already exists
Directory English already exists
Directory Finnish already exists
Directory French already exists
Directory German already exists
Directory Greek already exists
Directory Italian already exists
Directory Japanese already exists
Directory Korean already exists
Directory Lithuanian already exists
Directory Macedonian already exists
Directory Norwegian already exists
Directory Polish already exists
Directory Portuguese already exists
Directory Romanian already exists
Directory Russian already exists
Directory Slovenian already exists
Directory Spanish already exists
Directory Swedish already exists
Directory Ukrainian already exists


In [21]:
def load_and_clean_word_list(language: str) -> pd.DataFrame:
    """Load the raw comma-separated word list of one language.

    Reads ``../raw-word-list/{language}/{language}.txt``, splits its content on
    commas, trims surrounding whitespace and punctuation from every entry and
    drops the entries that end up empty.

    Args:
        language: Name of the language; used for both the directory and the
            file name of the raw word list.

    Returns:
        A DataFrame with a single ``word`` column and a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: If the raw word list for ``language`` is missing.
    """
    with open(f"../raw-word-list/{language}/{language}.txt", "r", encoding="utf-8") as f:
        word_list = f.read().split(",")

    word_df = pd.DataFrame({
        "word": word_list
    })

    # Whitespace is stripped together with punctuation: an entry padded with a
    # space or a line break would otherwise keep the padding, and the padding
    # would also shield the punctuation behind it from being stripped.
    strip_chars = punctuation + " \t\r\n\f\v"
    word_df["word"] = word_df["word"].str.strip(strip_chars)

    # Entries that are empty after stripping (e.g. from a trailing comma)
    # carry no word and would produce empty documents in the lemmatiser.
    word_df = word_df[word_df["word"] != ""].reset_index(drop=True)

    return word_df

In [22]:
# Display the cleaned English word list as a quick sanity check.
load_and_clean_word_list("English")

Unnamed: 0,word
0,aardvark
1,aardvarks
2,aardwolf
3,aardwolves
4,Aaren
...,...
466429,Zwolle
466430,Zworykin
466431,ZZ
466432,zZt


In [23]:
# Load the English model named in spacy_models with the "parser", "ner" and
# "textcat" components disabled; this pipeline is used by the example cells.
nlp = spacy.load(spacy_models["English"], disable=["parser", "ner", "textcat"])

In [24]:
def add_lemma(df: pd.DataFrame,
              nlp,
              batch_size: int = 1000) -> pd.DataFrame:
    """Return a copy of ``df`` with a ``lemma`` column.

    Every value of the ``word`` column is run through ``nlp.pipe`` and the
    lemma of the first token of the resulting document is kept.

    Args:
        df: Frame with a ``word`` column of strings. It is not modified.
        nlp: Object exposing ``pipe(texts, batch_size=...)`` that yields one
            document per text; in this notebook, a pipeline from ``spacy.load``.
        batch_size: Batch size forwarded to ``nlp.pipe``.

    Returns:
        A new DataFrame with the same index and columns as ``df`` plus
        ``lemma``. Words whose document has no tokens get an empty lemma.
    """
    docs = nlp.pipe(df["word"].tolist(), batch_size=batch_size)

    # A text without any token (such as an empty string) produces an empty
    # document, for which doc[0] would raise IndexError.
    lemmas = [doc[0].lemma_ if len(doc) > 0 else "" for doc in docs]

    # assign() builds a new frame, so passing a slice of another frame neither
    # alters that frame nor triggers pandas' SettingWithCopyWarning.
    return df.assign(lemma=lemmas)

In [25]:
# Try the lemmatiser on the first 100 rows of the English word list.
add_lemma(
    load_and_clean_word_list("English")[:100], nlp
)

Unnamed: 0,word,lemma
0,aardvark,aardvark
1,aardvarks,aardvark
2,aardwolf,aardwolf
3,aardwolves,aardwolf
4,Aaren,Aaren
...,...,...
95,Abama,Abama
96,abamp,abamp
97,abampere,abampere
98,abamperes,abampere


In [26]:
def add_word_frequencies(df: pd.DataFrame,
                         language: str) -> pd.DataFrame:
    """Return a copy of ``df`` with a ``zipf_freq_lemma`` column.

    Args:
        df: Frame with a ``lemma`` column. It is not modified.
        language: Key of ``spacy_models``. The part of the model name before
            the first ``_`` is passed to ``zipf_frequency`` as the language.

    Returns:
        A new DataFrame with the same rows as ``df`` plus ``zipf_freq_lemma``,
        the value of ``zipf_frequency`` for each lemma.

    Raises:
        KeyError: If ``language`` is not a key of ``spacy_models``.
    """
    language_group = spacy_models[language].split("_")[0]
    frequencies = [zipf_frequency(lemma, language_group) for lemma in df["lemma"]]

    # assign() builds a new frame instead of writing into the caller's one,
    # which may be a slice of a larger frame.
    return df.assign(zipf_freq_lemma=frequencies)

In [27]:
# Try the frequency lookup on the first 100 lemmatised English words.
add_word_frequencies(
    add_lemma(
        load_and_clean_word_list("English")[:100], nlp
), "English")

Unnamed: 0,word,lemma,zipf_freq_lemma
0,aardvark,aardvark,2.39
1,aardvarks,aardvark,2.39
2,aardwolf,aardwolf,1.11
3,aardwolves,aardwolf,1.11
4,Aaren,Aaren,0.00
...,...,...,...
95,Abama,Abama,0.00
96,abamp,abamp,0.00
97,abampere,abampere,0.00
98,abamperes,abampere,0.00


In [28]:
def clean_up_and_export(df: pd.DataFrame, language: str) -> None:
    """Deduplicate lemmas, label their difficulty and export them as JSON.

    Steps:
      1. keep, for every lemma, the row with the highest ``zipf_freq_lemma``;
      2. drop rows whose frequency is not above 0;
      3. label each remaining lemma by frequency: up to 2.0 is "advanced",
         above 2.0 up to 4.0 is "intermediate", above 4.0 is "beginner";
      4. write the lemmas (renamed to ``word``) and their ``word_difficulty``
         to ``../data/{language}/word-list-cleaned.json`` with
         ``orient="index"``.

    Args:
        df: Frame with ``word``, ``lemma`` and ``zipf_freq_lemma`` columns.
            It is not modified.
        language: Name of the output sub-directory under ``../data``.
    """
    best_rows = df.groupby("lemma", sort=False)["zipf_freq_lemma"].idxmax()
    deduplicated = df.loc[best_rows]

    # The index is renumbered only after the zero-frequency rows are gone, so
    # the keys of the exported JSON object run 0..n-1 without gaps.
    known = deduplicated[deduplicated["zipf_freq_lemma"] > 0].reset_index(drop=True)

    difficulty = pd.cut(
        known["zipf_freq_lemma"],
        bins=[-float("inf"), 2.0, 4.0, float("inf")],
        labels=["advanced", "intermediate", "beginner"],
        include_lowest=True,
        right=True,
    )

    # assign() creates the new column on a fresh frame rather than writing
    # into a filtered slice.
    export_df = (
        known.assign(word_difficulty=difficulty)
        .drop(columns=["word", "zipf_freq_lemma"])
        .rename(columns={"lemma": "word"})
    )

    # Make sure the target directory exists so the export does not depend on
    # another cell having created it.
    output_dir = f"../data/{language}"
    os.makedirs(output_dir, exist_ok=True)
    export_df.to_json(f"{output_dir}/word-list-cleaned.json", orient="index")

In [29]:
# Run the export step on the first 100 English words; this writes
# ../data/English/word-list-cleaned.json.
clean_up_and_export(
    add_word_frequencies(
        add_lemma(
            load_and_clean_word_list("English")[:100], nlp),
        "English"),
    "English")

In [30]:
def create_clean_word_list(language: str) -> None:
    """Run the whole pipeline for one language.

    Loads the spaCy model registered for ``language`` in ``spacy_models``,
    then loads the raw word list, lemmatises it, adds the lemma frequencies
    and exports the cleaned result, printing a progress line before each step.

    Args:
        language: Key of ``spacy_models``; also names the raw word list to
            read and the output directory to write to.
    """
    pipeline = spacy.load(
        spacy_models[language],
        disable=["parser", "ner", "textcat"],
    )

    print("Load in dataset")
    words = load_and_clean_word_list(language)

    print("Lemmatise words")
    with_lemmas = add_lemma(words, pipeline)

    print("Add the word frequencies")
    with_frequencies = add_word_frequencies(with_lemmas, language)

    print("Do the final clean ups and export to file")
    clean_up_and_export(with_frequencies, language)

In [31]:
# Run the full pipeline for Spanish.
create_clean_word_list("Spanish")

Load in dataset
Lemmatise words
Add the word frequencies
Do the final clean ups and export to file


In [35]:
# Count the elements of every raw word list again, this time labelled "Raw",
# so the counts can be put next to those of the cleaned lists.
# (json is imported with the other imports at the top of the notebook.)
language_raw = []
total_words_raw = []

for path, subdirs, files in os.walk("../raw-word-list"):
    for name in files:
        filename = os.path.join(path, name)
        language_raw.append(name.split(".")[0])
        total_words_raw.append(count_csv_elements_in_file(filename))

raw_data = pd.DataFrame({
    "language": language_raw,
    "type": ["Raw"] * len(language_raw),
    "total_words_raw": total_words_raw,
})

In [37]:
# Count the entries of every exported JSON word list under ../data and show
# them together with the raw counts.
language_clean = []
total_words_clean = []

for path, subdirs, files in os.walk("../data"):
    for name in files:
        # Anything that is not a JSON file cannot be parsed by json.load.
        if not name.endswith(".json"):
            continue

        filename = os.path.join(path, name)

        # The language is the name of the directory that holds the file.
        # basename works with either path separator, whereas taking element 1
        # of path.split("/") yields "data" for "../data/<language>".
        language_clean.append(os.path.basename(path))

        with open(filename, "r", encoding="utf-8") as f:
            total_words_clean.append(len(json.load(f)))

clean_data = pd.DataFrame({
    "language": language_clean,
    "type": ["Clean"] * len(language_clean),
    "total_words_raw": total_words_clean,
})

pd.concat([raw_data, clean_data])

Unnamed: 0,language,type,total_words_raw
0,Catalan,Raw,3585
1,Chinese,Raw,406588
2,Croatian,Raw,3766
3,Danish,Raw,1800765
4,Dutch,Raw,173556
5,English,Raw,466434
6,Finnish,Raw,91672
7,French,Raw,336528
8,German,Raw,1707903
9,Greek,Raw,35279
