In [1]:
!pip install pyspellchecker



In [2]:
import polars as pl
import spacy

In [3]:
DATA_PATH = "../data"

In [4]:
nlp = spacy.load("en_core_web_sm")

In [5]:
train = pl.read_csv(f"{DATA_PATH}/train_with_index.csv").with_columns(
    (pl.concat_str(["Title", "Review Text"], separator=" ")).alias("full_text")
)

In [6]:
from spellchecker import SpellChecker
import re

spellchecker = SpellChecker()
NUM_SYMBOL_PARTTERN = re.compile(r"^[0-9,./]+$")


def _count_spelling_errors(text):
    doc = nlp(text)
    lemmatized_tokens = [token.lemma_.lower() for token in doc]
    misspelled = spellchecker.unknown(lemmatized_tokens)

    misspelled = [
        text
        for text in misspelled
        if (text not in ["\n\n", "'s", "’s"])
        and (not text.isspace())
        and not NUM_SYMBOL_PARTTERN.match(text)
    ]

    return len(misspelled)


def count_spelling_errors() -> pl.Expr:
    return (
        pl.col("full_text")
        .map_elements(lambda x: _count_spelling_errors(x), return_dtype=pl.Int64)
        .alias("spelling_errors_cnt")
    )


train = train.with_columns(count_spelling_errors())

In [7]:
train = train.with_columns(pl.col("spelling_errors_cnt").fill_null(0))

In [8]:
train

index,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,full_text,spelling_errors_cnt
i64,i64,i64,str,str,i64,i64,i64,str,i64
0,0,25,"""3-season skirt!""","""Adorable, well-made skirt! lin…",5,1,4,"""3-season skirt! Adorable, well…",0
1,0,39,"""Very cute""","""Love the asymmetrical hem. wai…",5,1,0,"""Very cute Love the asymmetrica…",2
2,0,42,"""Beautiful! fruns small for typ…","""I love this skirt! i wasn't su…",5,1,5,"""Beautiful! fruns small for typ…",2
3,0,45,,"""I was really pleased with this…",5,1,9,,0
4,0,57,"""Unique, pretty asymmetric skir…","""I saw this skirt in retailer s…",5,1,1,"""Unique, pretty asymmetric skir…",3
…,…,…,…,…,…,…,…,…,…
9995,232,57,"""Runs big on top""",,3,1,5,,0
9996,232,58,,"""I loved the dress, but just no…",1,1,5,,0
9997,232,60,"""I was really disappointed""","""I was really hoping this dress…",2,0,7,"""I was really disappointed I wa…",0
9998,232,62,"""Too heavy""","""The design is beautiful but it…",2,0,0,"""Too heavy The design is beauti…",0


In [9]:
import json

with open("../data/train_index_spelling_errors_cnt_pyspell.json", "w") as f:
    json.dump(dict(zip(train["index"], train["spelling_errors_cnt"])), f)