In [1]:
import polars as pl
import spacy

In [2]:
# Directory (relative path) holding the data files this notebook reads.
DATA_PATH = "../../data"

In [3]:
# spaCy pipeline used to tokenize and lemmatize the essays.
nlp = spacy.load("en_core_web_sm")

# Reference vocabulary: one lowercased entry per non-blank line of the word list.
# The encoding is pinned so the result does not depend on the platform locale
# (NOTE(review): assumes the word list is UTF-8 -- confirm), and blank lines are
# dropped so the empty string never ends up in the vocabulary.
with open(f"{DATA_PATH}/english-word-hx/words.txt", "r", encoding="utf-8") as file:
    english_vocab = {
        word for word in (line.strip().lower() for line in file) if word
    }

In [4]:
# Load the training set; later cells read its `full_text` and `essay_id` columns.
train = pl.read_csv(f"{DATA_PATH}/train.csv")

In [5]:
def _count_spelling_errors(text):
    """Count the word tokens of ``text`` whose lemma is missing from ``english_vocab``.

    The text is processed with the module-level spaCy pipeline ``nlp``; each
    token's lemma is lowercased and looked up in the module-level
    ``english_vocab`` set.

    Whitespace, punctuation and number-like tokens are skipped: they are not
    words, so treating them as misspellings would inflate the count with the
    number of commas, line breaks and digits in the text.

    Args:
        text: The text to analyse.

    Returns:
        The number of word tokens whose lowercased lemma is not in
        ``english_vocab``.
    """
    doc = nlp(text)
    return sum(
        1
        for token in doc
        # Only real word tokens can be spelling errors.
        if not (token.is_space or token.is_punct or token.like_num)
        and token.lemma_.lower() not in english_vocab
    )


def count_spelling_errors() -> pl.Expr:
    """Build the Polars expression that yields the spelling-error count per row.

    Returns:
        An expression that applies ``_count_spelling_errors`` to each value of
        the ``full_text`` column, typed as ``Int64`` and named
        ``spelling_errors_cnt``.
    """
    essay_text = pl.col("full_text")
    # Row-wise Python callback; the function is passed directly, no wrapper needed.
    per_row_counts = essay_text.map_elements(
        _count_spelling_errors, return_dtype=pl.Int64
    )
    return per_row_counts.alias("spelling_errors_cnt")


# Attach the per-essay spelling-error count as the `spelling_errors_cnt` column.
train = train.with_columns(count_spelling_errors())

In [6]:
import json

# Persist the essay_id -> spelling_errors_cnt mapping as a JSON object.
# The path is built from DATA_PATH (instead of repeating the literal directory)
# so reads and writes stay under the same configured data root.
with open(
    f"{DATA_PATH}/essay_id_spelling_errors_cnt.json", "w", encoding="utf-8"
) as f:
    json.dump(
        dict(
            zip(
                train["essay_id"].to_list(),
                train["spelling_errors_cnt"].to_list(),
            )
        ),
        f,
    )