# Strategy for text treatment

**Language detection** - for now the default language is English and we switch to French if cld3 detects it. Should we consider other languages too?

**Remove punctuation and special characters**

**Tokenization**

**Stop-word removal** - stop-word removal is language-specific and is done before stemming; otherwise the stemmed tokens might no longer match the entries of the stop-word list.

**Stemming** - stemming is used instead of lemmatization, as we're going to be working mainly with names rather than entire sentences. Since lemmatization depends on the sentence context, it would not be a good option here.

**ASCII folding**

As we want to make the text prep functions more flexible and allow passing arguments such as a custom blacklist of words or custom regex replacements, the new approach is to curry the function with those parameters and then apply it to a Spark DataFrame column.

In [18]:
# Snowball stemmer was chosen in favor of Porter Stemmer which is a bit more aggressive and tends to remove too much from a word
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
 
# unidecode is the library needed for ASCII folding
from unidecode import unidecode
import string
# Compact Language Detector v3 is a very fast and performant algorithm by Google for language detection.
# NOTE(review): no cld3 import is present in this cell although `detect` is called in
# text_prep_sdf below -- presumably it is provided by the `%run ./function_tools.ipynb`
# cell; confirm.
import nltk
import re
import pyspark.sql.functions as F
from typing import List, Dict, Optional, Callable
# Fetch the NLTK resources used on the driver: the "punkt" tokenizer models
# (for word_tokenize) and the "stopwords" corpus (for stopwords.words).
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to C:\Users\Salif
[nltk_data]     SAWADOGO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Salif
[nltk_data]     SAWADOGO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [25]:
%run ./function_tools.ipynb

In [19]:
@F.pandas_udf('string')
@function_vectorizer
def get_arabic(text_input: str) -> Optional[str]:
    """Keep only the Arabic-script characters (and plain spaces) of a string.

    Characters are selected purely by their Unicode code point. The accepted
    ranges are the Unicode blocks Arabic, Arabic Supplement, Arabic Extended-A,
    Arabic Presentation Forms-A and -B, Rumi Numeral Symbols and Arabic
    Mathematical Alphabetic Symbols. The ASCII space is kept as well so that
    words stay separated.

    Arguments:
        text_input {str} -- The string to be parsed.

    Returns:
        Optional[str] -- The retained characters, in their original order and
        with leading/trailing whitespace stripped, or None if the input was
        None. A string without any Arabic character yields "".
    """
    if text_input is None:
        return None

    # (first, last) code points of every accepted block, both ends inclusive.
    arabic_ranges = (
        ('\u0600', '\u06FF'),
        ('\u0750', '\u077F'),
        ('\u08A0', '\u08FF'),
        ('\uFB50', '\uFDFF'),
        ('\uFE70', '\uFEFF'),
        ('\U00010E60', '\U00010E7F'),
        ('\U0001EE00', '\U0001EEFF'),
    )
    kept = ''.join(
        ch for ch in text_input
        if ch == " " or any(first <= ch <= last for first, last in arabic_ranges)
    )
    # The filtered text was previously built but never returned, so the
    # function always produced None. Return it, stripped like get_latin does.
    return kept.strip()

In [20]:
@F.pandas_udf('string')
@function_vectorizer
def get_latin(text_input: str) -> Optional[str] :
    """Keep only the Latin-script characters of a string.

    Characters are selected purely by their Unicode code point. The accepted
    ranges are the Unicode blocks Basic Latin, Latin-1 Supplement, Latin
    Extended-A, Latin Extended-B and Latin Extended Additional. Basic Latin
    covers the whole ASCII range, so digits, ASCII punctuation and the space
    character are retained too.

    Arguments:
        text_input {str} -- The string to be parsed.

    Returns:
        Optional[str] -- The retained characters, in their original order and
        with leading/trailing whitespace stripped, or None if the input was
        None.
    """
    if text_input is None:
        return None

    # (first, last) code points of every accepted block, both ends inclusive.
    # The space character needs no separate test: it lies in the first range.
    latin_ranges = (
        ('\u0000', '\u007F'),
        ('\u0080', '\u00FF'),
        ('\u0100', '\u017F'),
        ('\u0180', '\u024F'),
        ('\u1E00', '\u1EFF'),
    )
    latin_only = ''.join(
        symbol for symbol in text_input
        if any(first <= symbol <= last for first, last in latin_ranges)
    )
    return latin_only.strip()



In [21]:
@F.pandas_udf('string')
@function_vectorizer
def translate_text(text_input: str, language: str = "en",
    subscription_key: str = "2fe8ca9b5c5b459982d622eb088f831a") -> str:
    """Make a call to Microsoft's Text Translation API in order to translate
    the text input.

    Arguments:
        text_input {str} -- The text to be translated.

    Keyword Arguments:
        language {str} -- The language code for the translation output.
        subscription_key {str} -- The subscription key for the service.

    Returns:
        str -- The translation of 'text_input' into the language specified
        in 'language'. None and "" are returned unchanged without calling
        the service.

    Raises:
        requests.HTTPError -- If the service answers with an error status.
    """
    # SECURITY(review): a subscription key is hard-coded as the default value
    # above. It is kept only so existing callers keep working; it should be
    # rotated and supplied by the caller (e.g. from a secret store) instead.

    # Local import so the block does not rely on `uuid` being imported by
    # another cell. `requests` is expected to be in scope already.
    # NOTE(review): `requests` is not imported in this notebook's import cell;
    # presumably it comes from `%run ./function_tools.ipynb` -- confirm.
    import uuid

    if text_input is None or text_input == "":
        return text_input

    endpoint = 'https://api-nam.cognitive.microsofttranslator.com/translate'
    # Let requests build and URL-encode the query string instead of
    # concatenating the raw language code into the URL.
    query = {'api-version': '3.0', 'to': language}

    headers = {
        'Ocp-Apim-Subscription-Key': subscription_key,
        'Content-type': 'application/json',
        'X-ClientTraceId': str(uuid.uuid4())
    }

    # You can pass more than one object in body.
    body = [{
        'text': text_input
    }]
    # A timeout keeps a stalled connection from blocking the worker forever.
    response = requests.post(endpoint, params=query, headers=headers,
                             json=body, timeout=30)
    # Fail with the HTTP error itself rather than with an opaque
    # KeyError/TypeError while indexing an error payload below.
    response.raise_for_status()
    return response.json()[0]['translations'][0]['text']


In [22]:
def make_text_prep_func(word_blacklist: Optional[List[str]] = None,
                        regex_replace: Optional[Dict[str, str]] = None
                       ) -> Callable:
    """Build a Spark pandas UDF that normalizes a text column.

    The returned function is curried with the given parameters and applies,
    in order: lower-casing, language detection (English by default, French
    when `detect` reports "fr"), replacement of ASCII punctuation by spaces,
    tokenization, stop-word and blacklist removal, Snowball stemming, ASCII
    folding with unidecode, the custom regex replacements, and a final strip.

    Keyword Arguments:
        word_blacklist {Optional[List[str]]} -- Extra words to drop together
            with the stop words. The text is lower-cased before the
            comparison. None means no extra words.
        regex_replace {Optional[Dict[str, str]]} -- Mapping of regex pattern
            to replacement, applied in dict order to the folded text. None
            means no replacements.

    Returns:
        Callable -- The UDF, to be applied on a Spark DataFrame column.
    """
    # None sentinels replace the former mutable default arguments.
    word_blacklist = [] if word_blacklist is None else word_blacklist
    regex_replace = {} if regex_replace is None else regex_replace
    # The translation table does not depend on the input, so build it once.
    punctuation_to_space = str.maketrans(string.punctuation,
                                         ' ' * len(string.punctuation))

    @F.pandas_udf("string")
    @function_vectorizer
    def text_prep_sdf(s: str) -> str:
        """Normalize one string; None and "" both give "".

        The NLTK resources are loaded (and, if missing, downloaded) inside the
        body, because the Spark cluster would otherwise have no access to
        them. The download is only attempted when a resource is not found.
        """
        # Nothing to do for empty input, so skip loading the NLTK resources.
        if s is None or s == "":
            return ""

        try:
            stopwords_en = stopwords.words("english")
            stopwords_fr = stopwords.words("french")
        except LookupError:
            # NLTK raises LookupError when a resource is not installed.
            nltk.download("punkt")
            nltk.download("stopwords")
            stopwords_en = stopwords.words("english")
            stopwords_fr = stopwords.words("french")

        s = s.lower()

        # English is the default; switch to French only if it is detected.
        if detect(s) == "fr":
            language_stop_words = stopwords_fr
            stemmer = SnowballStemmer(language='french')
        else:
            language_stop_words = stopwords_en
            stemmer = SnowballStemmer(language='english')
        # A set makes the per-token membership test constant-time.
        stop_words = set(language_stop_words).union(word_blacklist)

        s_clean = s.translate(punctuation_to_space)
        try:
            s_tokens = word_tokenize(s_clean)
        except LookupError:
            # The tokenizer models can be missing even when the stopwords
            # corpus is present, so this case needs its own download.
            nltk.download("punkt")
            s_tokens = word_tokenize(s_clean)

        # Stop words are removed before stemming, as described in the notes
        # at the top of this notebook.
        s_tokens_stemmed = [stemmer.stem(word) for word in s_tokens
                            if word not in stop_words]
        s_ascii = unidecode(" ".join(s_tokens_stemmed))

        for regex, replace in regex_replace.items():
            s_ascii = re.sub(regex, replace, s_ascii)
        return s_ascii.strip()

    return text_prep_sdf

@F.pandas_udf("string")
@function_vectorizer
def ta_remove_address_tail(address: str) -> str:
    """Drop the last comma-separated component of an address.

    The remaining components are re-joined with ", ". Each component keeps
    its own whitespace, so "a, b, c" becomes "a,  b" (two spaces).

    Arguments:
        address {str} -- The address to shorten.

    Returns:
        str -- The address without its last component; the input unchanged
        when it contains no comma; None when the input is None.
    """
    if address is None:
        return None
    # Split once at the last comma: everything before it is what we keep.
    leading, last_comma, _tail = address.rpartition(",")
    if not last_comma:
        return address
    return ", ".join(leading.split(","))
