In [1]:
import pandas as pd
# The Snowball stemmer was chosen over the Porter stemmer, which is a bit more aggressive and tends to remove too much from a word
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
 
# unidecode is the library needed for ASCII folding
from unidecode import unidecode
import string
# langdetect is used for language detection (note: it is a port of the language-detection library, not Compact Language Detector v3)
from langdetect import detect

In [None]:
# Run the sibling notebook function_tools.ipynb.
# NOTE(review): that notebook is not visible here; presumably it defines shared helper functions -- confirm.
%run ./function_tools.ipynb

In [None]:
# Download the NLTK components and define the language-specific stop-words and
# stemmers for the regular version of the function.
nltk.download("punkt")
# Recent NLTK releases load the tokenizer data from the "punkt_tab" resource
# instead of "punkt"; fetch both so word_tokenize works with either version.
# (nltk.download reports a failure and returns False rather than raising.)
nltk.download("punkt_tab")
nltk.download("stopwords")
STOPWORDS_EN = stopwords.words("english")
STOPWORDS_FR = stopwords.words("french")
STEMMER_EN = SnowballStemmer(language='english')
STEMMER_FR = SnowballStemmer(language='french')

In [None]:
def text_prep(s: str) -> str:
  """Turn a string into a space-separated sequence of stemmed, ASCII-folded tokens.

  The input is lower-cased and then goes through the following steps:
    1. Language detection (French gets the French resources, anything else
       falls back to the English ones)
    2. Remove ASCII punctuation and special characters
    3. Tokenization
    4. Stop-word removal
    5. Stemming
    6. ASCII folding

  Arguments:
    s {str} -- The input string to be treated.

  Returns:
    str -- The treated version of the string. An empty string is returned
      when the input is None, empty or consists only of whitespace.
  """

  # return an empty string if no meaningful input is given
  # (whitespace-only input would otherwise make the language detector fail)
  if s is None or not s.strip():
    return ""

  # convert to lowercase, just to be sure :)
  s = s.lower()

  # imported locally, after the early return, so the function is self-contained
  from langdetect.lang_detect_exception import LangDetectException

  # detect() raises LangDetectException when the text offers no usable
  # features (e.g. only digits or punctuation); treat that as "unknown".
  # NOTE: langdetect is non-deterministic by default; set DetectorFactory.seed
  # once at start-up if reproducible detection is required.
  try:
    s_lang = detect(s)
  except LangDetectException:
    s_lang = None

  # switch to the French stop-words and stemmer if the text is French,
  # in every other case use the English ones
  if s_lang == "fr":
    stop_words, stemmer = STOPWORDS_FR, STEMMER_FR
  else:
    stop_words, stemmer = STOPWORDS_EN, STEMMER_EN

  # replace every ASCII punctuation character with a space
  s_clean = s.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
  # tokenize the string into words
  s_tokens = word_tokenize(s_clean)
  # remove the stop-word tokens
  s_tokens_no_stop = [word for word in s_tokens if word not in stop_words]
  # stem the remaining ones
  s_tokens_stemmed = [stemmer.stem(word) for word in s_tokens_no_stop]
  # join the stemmed tokens together and ASCII fold
  s_ascii = unidecode(" ".join(s_tokens_stemmed))

  return s_ascii