In [1]:
!pip install pyspellchecker



In [2]:
import polars as pl
import spacy

In [3]:
DATA_PATH = "../data"

In [4]:
nlp = spacy.load("en_core_web_sm")

In [5]:
test = pl.read_csv(f"{DATA_PATH}/test_with_index.csv").with_columns(
    (pl.concat_str(["Title", "Review Text"], separator=" ")).alias("full_text")
)

In [6]:
from spellchecker import SpellChecker
import re

spellchecker = SpellChecker()
NUM_SYMBOL_PARTTERN = re.compile(r"^[0-9,./]+$")


def _count_spelling_errors(text):
    doc = nlp(text)
    lemmatized_tokens = [token.lemma_.lower() for token in doc]
    misspelled = spellchecker.unknown(lemmatized_tokens)

    misspelled = [
        text
        for text in misspelled
        if (text not in ["\n\n", "'s", "’s"])
        and (not text.isspace())
        and not NUM_SYMBOL_PARTTERN.match(text)
    ]

    return len(misspelled)


def count_spelling_errors() -> pl.Expr:
    return (
        pl.col("full_text")
        .map_elements(lambda x: _count_spelling_errors(x), return_dtype=pl.Int64)
        .alias("spelling_errors_cnt")
    )


test = test.with_columns(count_spelling_errors())

In [7]:
test = test.with_columns(pl.col("spelling_errors_cnt").fill_null(0))

In [8]:
test

index,Clothing ID,Age,Title,Review Text,Positive Feedback Count,full_text,spelling_errors_cnt
i64,i64,i64,str,str,i64,str,i64
0,0,32,"""So happy i bought this skirt!""","""I love this skirt. it does run…",0,"""So happy i bought this skirt! …",0
1,0,34,"""Runs small""","""Beautiful patterns and colors,…",0,"""Runs small Beautiful patterns …",0
2,0,37,"""Love the comfort of thi skirt""","""It is easily paired with a nic…",0,"""Love the comfort of thi skirt …",1
3,0,39,"""Way too small""","""This is a beautiful skirt, but…",10,"""Way too small This is a beauti…",0
4,0,39,,"""I usually wear a size 8/10 and…",0,,0
…,…,…,…,…,…,…,…
11150,232,53,"""More beautiful in reality""","""I purchased this dress on a wh…",0,"""More beautiful in reality I pu…",0
11151,232,58,"""Perfect dress!""","""This dress is very flattering …",0,"""Perfect dress! This dress is v…",1
11152,232,60,"""Perfect dress""","""This is the most perfect dress…",0,"""Perfect dress This is the most…",1
11153,232,62,"""Really great!""","""This is a beautiful dress! not…",5,"""Really great! This is a beauti…",0


In [9]:
import json

with open("../data/test_index_spelling_errors_cnt_pyspell.json", "w") as f:
    json.dump(dict(zip(test["index"], test["spelling_errors_cnt"])), f)