In [1]:
import emoji
from nltk.stem import PorterStemmer
from string import punctuation
from nltk.corpus import stopwords
import re
import ray
import modin.experimental.pandas as md
import pandas as pd
import sqlite3
import os
from sspipe import p, px
import warnings
warnings.filterwarnings("ignore")

Please note that some of these APIs deviate from pandas in order to provide improved performance.


Set up Modin to use all CPU cores when tokenizing (plain pandas runs on a single core).

In [None]:
# Select Ray as the Modin execution engine.
os.environ['MODIN_ENGINE'] = 'ray'
# Start Ray with one worker slot per CPU core. ignore_reinit_error makes the
# cell safe to re-run in a live kernel: without it a second ray.init() call
# raises because Ray is already initialised.
ray.init(num_cpus=os.cpu_count(), ignore_reinit_error=True)

In [2]:
# Open a connection to the SQLite file `database.db` (path is relative to the
# working directory); `db` is reused by the later read and write cells.
db = sqlite3.connect('database.db')

Query the data that will be tokenized.

In [8]:
# Load only the columns needed for tokenization into a Modin DataFrame.
review = md.read_sql(
    'SELECT `business_id`, `review_id`, `date`, `stars`, `text`, `name` FROM resturants_review',
    db,
)
# Display five random rows as a quick sanity check of the load.
review.sample(5)

Unnamed: 0,business_id,review_id,date,stars,text,name
850381,KB-lnnyWsusCPTQ0KhYoaQ,50NigUjcwHSHRDsiT69vhA,2019-04-23 23:05:33,4,Sakura is always tasty! They do have a great ...,Sakura Japanese Restaurant
2912614,uM0ljy6pQIegLJtBUw9dOQ,qq1OvFlWvzHLBsjrDk470g,2014-01-26 01:28:57,4,Ever wonder what the world would look like if ...,Steel Wheels Pizzeria
1859344,EvjOPotlf9pXRXzVvN4GaA,OQBudWxPFK5v9vELTOGefg,2010-12-13 08:31:35,2,"acceptable food, bad order accuracy, they seem...",Caesar's Bistro
1916408,khH0QtNyUjcExh9i2CwGfg,gskrVpxfBFOc8PLnnuKAkA,2015-02-17 00:13:17,5,This place will be number one on the best of p...,Serpico
1707063,MzFhaFNbE03zF84BPkN7yQ,YP9ciaV5YPGXIKNIs2y7Kg,2011-12-22 17:40:34,4,I've been here a couple of times and have been...,Sang Kee Noodle House


In [None]:
class TokenCleaner():
    """
    Turn a raw text value into a cleaned, stemmed sequence of tokens.

    ``CleanText`` applies these steps in order:
      1. surround each emoji character with spaces and drop zero-width joiners
      2. strip punctuation (``#`` is kept)
      3. split on whitespace
      4. lowercase the tokens and drop stop words / empty tokens
      5. stem the remaining tokens with ``PorterStemmer``

    Parameters
    ----------
    remove_stopwords : bool
        When True, NLTK's English stop words are filtered out.
    return_as_string : bool
        When True, ``CleanText`` returns the tokens joined by single spaces;
        otherwise it returns the list of tokens.
    """

    def __init__(self, remove_stopwords=True, return_as_string=True):

        # Some punctuation variations
        self.punctuation = set(punctuation)  # speeds up comparison
        self.punct_set = self.punctuation - {"#"}
        self.punct_pattern = \
            re.compile("[" + re.escape("".join(self.punct_set)) + "]")
        self.stemmer = PorterStemmer()
        self.ZERO_WIDTH_JOINER = '\u200d'

        # Stopwords, held in a frozenset so each membership test is O(1).
        # '\ufe0f' is the invisible variation selector that can trail an emoji;
        # it is written as an escape so it is visible in the source.
        # NOTE(review): punctuation is stripped from the text before this
        # filter runs, so stop words that contain an apostrophe can presumably
        # never match -- confirm whether that matters for this corpus.
        if remove_stopwords:
            self.sw = frozenset(stopwords.words("english")) | {'\ufe0f', '', ' '}
        else:
            self.sw = frozenset()

        # Two useful regex
        self.whitespace_pattern = re.compile(r"\s+")
        self.hashtag_pattern = re.compile(r"^#[0-9a-zA-Z]+")
        self.CleanText_return_format = return_as_string

    def CleanText(self, _text):
        """
        Clean one text value.

        Parameters
        ----------
        _text : str, bytes, float, None or any object
            ``str`` is used as is, ``bytes``/``bytearray`` are decoded as
            UTF-8, ``None`` and float NaN are treated as missing, and any
            other value is converted with ``str()``.

        Returns
        -------
        str or list of str
            The cleaned tokens, joined by spaces when the cleaner was built
            with ``return_as_string=True``, else as a list. Missing input
            gives ``''`` or ``[]`` respectively.
        """
        # Missing values (None, or NaN which is the only float != itself)
        # produce an empty result instead of the literal token "nan".
        if _text is None or (isinstance(_text, float) and _text != _text):
            self.text, self.tokens = '', []
            return '' if self.CleanText_return_format else []

        if isinstance(_text, str):
            text = _text
        elif isinstance(_text, (bytes, bytearray)):
            # this is for the case of tweets which are saved as bytes
            text = _text.decode("utf-8")
        else:
            # floats, ints and anything else: fall back to the string form
            text = str(_text)

        text = self.__add_space_before_and_after_emoji(text)
        text = self.__RemovePunctuation(text)
        tokens = self.__TokenizeText(text)
        # Stop words are removed BEFORE stemming: the stemmer rewrites words
        # such as "this" and "was", after which they no longer match the list.
        tokens = self.__RemoveStopWords(tokens)
        tokens = self.__StemEachToken(tokens)

        # Keep the last processed text/tokens on the instance, as before.
        self.text, self.tokens = text, tokens

        if self.CleanText_return_format:
            return ' '.join(tokens)
        return tokens

    def __StemEachToken(self, tokens):
        """
        Perform stemming on each token
        (i.e. working, worked, works are all converted to work)
        ---------------------------------------------------------
        input: a list of tokens
        output: a list with every token replaced by its stem
        """
        return [self.stemmer.stem(token) for token in tokens]

    def __add_space_before_and_after_emoji(self, text):
        """
        Surround every emoji character with spaces so it becomes its own
        token, and drop zero-width joiner characters.
        ---------------------------------------------------------
        input: original text
        output: text with emoji padded by spaces and no zero-width joiners
        """
        pieces = []
        for char in text:
            if char == self.ZERO_WIDTH_JOINER:
                continue
            pieces.append((' ' + char + ' ') if emoji.is_emoji(char) else char)
        return ''.join(pieces)

    def __RemovePunctuation(self, text):
        """
        Remove every punctuation character except '#'.
        ---------------------------------------------------------
        input: original text
        output: text without punctuation
        """
        return self.punct_pattern.sub('', text)

    def __TokenizeText(self, text):
        """
        Tokenize by splitting the text by white space
        ---------------------------------------------------------
        input: text without punctuation
        output: A list of non-empty tokens
        """
        # Leading/trailing whitespace makes re.split yield '' entries; drop them.
        return [item for item in self.whitespace_pattern.split(text) if item]

    def __RemoveStopWords(self, tokens):
        """
        Lowercase every token and drop the ones that are stop words
        ---------------------------------------------------------
        input: a list of tokens
        output: A list of lower case tokens with stop words removed
        """
        lowered = (token.lower() for token in tokens)
        return [token for token in lowered if token and token not in self.sw]


def add_space_after_emoji(text):
    """
    Surround every emoji character in ``text`` with spaces and remove
    zero-width joiner characters.

    Parameters
    ----------
    text : str
        The text to process.

    Returns
    -------
    str
        ``text`` with each emoji character padded by one space on both
        sides and every zero-width joiner removed.
    """
    # This is a plain function, so there is no `self` to read the joiner
    # from; the constant lives here instead.
    zero_width_joiner = '\u200d'

    pieces = []
    for char in text:
        if char == zero_width_joiner:
            continue
        pieces.append((' ' + char + ' ') if emoji.is_emoji(char) else char)

    return ''.join(pieces)


def clean_string(text):
    """
    Clean ``text`` into a list of lowercase tokens without stop words.

    Pipes and newlines are replaced by spaces, punctuation is removed,
    emoji are padded with spaces via ``add_space_after_emoji``, and the
    text is split on single spaces.

    Parameters
    ----------
    text : str or null
        The text to clean. Null values (as judged by ``pd.isnull``) are
        returned unchanged.

    Returns
    -------
    list of str
        The remaining tokens, or ``text`` itself when it is null.
    """
    if pd.isnull(text):
        return text

    # A set gives O(1) membership tests. '\ufe0f' is the invisible variation
    # selector that can trail an emoji, written as an escape for visibility.
    remove_words = set(stopwords.words("english")) | {'\ufe0f', '', ' '}

    # Built locally because no module-level `punct_pattern` exists.
    # NOTE(review): this mirrors TokenCleaner (all punctuation except '#');
    # confirm that is the intended rule here too.
    punct_pattern = re.compile(
        "[" + re.escape("".join(sorted(set(punctuation) - {"#"}))) + "]")

    text = text.replace('|', ' ').replace('\n', ' ')
    text = punct_pattern.sub('', text)
    text = add_space_after_emoji(text)

    lowered = (word.lower() for word in text.split(' '))
    return [word for word in lowered if word not in remove_words]

In [None]:
# Build a cleaner with the default options (stop words removed, string output).
tc = TokenCleaner()

# Clean every review text and store the result in a new `tokenized` column.
review['tokenized'] = review['text'].apply(tc.CleanText)
# Display five random rows to eyeball the tokenized output.
review.sample(5)

In [None]:
# Convert the frame to plain pandas and write it back to SQLite.
# NOTE(review): if_exists='replace' overwrites the `resturants_review` table with
# the frame as it stands in this kernel -- confirm no other columns of the
# original table are still needed.
# NOTE(review): `_to_pandas` is underscore-prefixed, presumably a private Modin
# API -- verify it is supported in the installed version.
review._to_pandas().to_sql('resturants_review', db, if_exists='replace', index=False)
# Persist the write, then release the connection; later cells must reconnect.
db.commit()
db.close()