In [8]:
import polars as pl
import spacy

In [9]:
# Base directory (a relative path) used to build the data file paths in this notebook.
DATA_PATH = "../../data"

In [10]:
# spaCy pipeline used further down to tokenize and lemmatize essay text.
nlp = spacy.load("en_core_web_sm")

# Reference word list: every line of the file becomes one stripped,
# lower-cased entry of the set.
# The encoding is passed explicitly so the parsed vocabulary does not depend
# on the platform's default locale encoding.
# NOTE(review): english_vocab is not referenced in the cells visible here.
with open(f"{DATA_PATH}/english-word-hx/words.txt", encoding="utf-8") as file:
    english_vocab = {word.strip().lower() for word in file}

In [11]:
# Load the training set from train.csv under DATA_PATH into a polars DataFrame.
train = pl.read_csv(source=f"{DATA_PATH}/train.csv")

In [12]:
from spellchecker import SpellChecker
import re

# Pre-compiled pattern for tokens made up solely of digits, commas, periods
# and slashes (for example "1,000", "3.5" or "1/2").
# NOTE(review): the name is spelled "PARTTERN"; it is kept as-is because
# _count_spelling_errors refers to it under this exact name.
NUM_SYMBOL_PARTTERN = re.compile(r"^[0-9,./]+$")

# Spell checker constructed with its default arguments; created once here and
# reused by every call to _count_spelling_errors.
spellchecker = SpellChecker()


def _count_spelling_errors(text):
    """Count the distinct misspelled word forms found in one piece of text.

    The text is run through the module-level spaCy pipeline ``nlp``; each
    token's lemma is lower-cased and the resulting list is handed to
    ``spellchecker.unknown``. pyspellchecker documents ``unknown`` as
    returning a set, so a word form that occurs several times in the text
    is counted only once.

    Args:
        text: The raw text to check.

    Returns:
        The number of unknown lemmas left after discarding whitespace-only
        tokens, the double-newline token, the possessive clitics 's and ’s,
        and tokens that consist only of digits, commas, periods and slashes.
    """
    lemmas = [tok.lemma_.lower() for tok in nlp(text)]

    # Tokens the spell checker may flag that are not real spelling errors.
    ignored_tokens = ("\n\n", "'s", "’s")

    error_count = 0
    for candidate in spellchecker.unknown(lemmas):
        if candidate in ignored_tokens or candidate.isspace():
            continue
        # Numbers, dates, fractions and bare punctuation such as "," or "./".
        if NUM_SYMBOL_PARTTERN.match(candidate):
            continue
        error_count += 1

    return error_count


def count_spelling_errors() -> pl.Expr:
    """Build the polars expression for the per-essay spelling-error count.

    The expression applies ``_count_spelling_errors`` to each value of the
    "full_text" column.

    Returns:
        A polars expression producing an Int64 column named
        "spelling_errors_cnt".
    """
    essay_text = pl.col("full_text")
    error_counts = essay_text.map_elements(
        _count_spelling_errors, return_dtype=pl.Int64
    )
    return error_counts.alias("spelling_errors_cnt")


# Add the spelling_errors_cnt column computed from each essay's full_text.
train = train.with_columns(
    count_spelling_errors(),
)

In [13]:
# Display the frame, which now includes the spelling_errors_cnt column.
train

essay_id,full_text,score,spelling_errors_cnt
str,str,i64,i64
"""000d118""","""Many people have car where the…",3,22
"""000fe60""","""I am a scientist at NASA that …",3,5
"""001ab80""","""People always wish they had th…",4,7
"""001bdc0""","""We all heard about Venus, the …",4,6
"""002ba53""","""Dear, State Senator This is a…",3,10
…,…,…,…
"""ffd378d""","""the story "" The Challenge of E…",2,12
"""ffddf1f""","""Technology has changed a lot o…",4,19
"""fff016d""","""If you don't like sitting arou…",2,3
"""fffb49b""","""In ""The Challenge of Exporing …",1,6


In [14]:
import json

# Persist an essay_id -> spelling_errors_cnt mapping as a JSON object.
# The path is built from DATA_PATH, like the other data paths in this
# notebook, so that moving the data directory needs a single change.
# The encoding is explicit so the written file does not depend on the
# platform's default locale encoding.
with open(
    f"{DATA_PATH}/essay_id_spelling_errors_cnt_pyspell.json", "w", encoding="utf-8"
) as f:
    json.dump(dict(zip(train["essay_id"], train["spelling_errors_cnt"])), f)