In [22]:
import string

Building a word-level tokenizer

https://medium.com/analytics-vidhya/tokenization-building-a-tokenizer-and-a-sentencizer-c19a00393c19

In [23]:
class Sentencizer:
    """Split raw text into sentences at sentence-ending characters.

    By default a sentence ends at every period, question mark, exclamation
    point or colon. The splitting character stays attached to the end of the
    sentence it terminates. The result is available as ``self.sentences``.

    Instances are also single-pass iterators over ``self.sentences``:
    ``__iter__`` returns the instance itself and ``self.index`` is never
    reset, so a second ``for`` loop over the same instance yields nothing.

    Parameters:
        input_text: text to split; it is converted with ``str()``.
        split_chars: iterable of strings after which a sentence ends.
        delimiter_token: temporary marker inserted after each split string;
            it should not occur in the input text itself.
    """

    def __init__(self, input_text, split_chars=['.', '?', '!', ':'], delimiter_token = '<SPLIT>'):
        # our output sentences after splitting
        self.sentences = []

        # raw text to split
        self.raw = str(input_text)

        # copy, so the shared default list is never aliased by an instance
        self.split_chars = list(split_chars)
        self.delimiter_token = delimiter_token

        # position of the next sentence returned by __next__
        self.index = 0
        self.sentencize()

    def sentencize(self):
        """Populate ``self.sentences`` from ``self.raw``."""
        marked_text = self.raw

        # append the delimiter token after every splitting character
        for char in self.split_chars:
            marked_text = marked_text.replace(char, char + self.delimiter_token)

        # strip first, then drop empties, so whitespace-only pieces
        # (e.g. trailing spaces after the last period) are not kept as ''
        pieces = (piece.strip() for piece in marked_text.split(self.delimiter_token))
        self.sentences = [piece for piece in pieces if piece]

    # iterator
    def __iter__(self):
        return self

    def __next__(self):
        """Return the next sentence, or raise StopIteration when exhausted."""
        if self.index < len(self.sentences):
            result = self.sentences[self.index]
            self.index += 1
            return result
        raise StopIteration

In [24]:
# take in sentence and turn it into tokens
class Tokenizer:
    """Split a sentence into word-level tokens.

    Every punctuation character is first padded with spaces so it separates
    from adjacent words, then the text is split at every string in
    ``token_split`` (spaces and dashes by default). The result is available
    as ``self.tokens``.

    With the default arguments '-' is both a punctuation character and a
    split string, so dashes act purely as separators and never appear as
    tokens.

    Instances are also single-pass iterators over ``self.tokens``:
    ``__iter__`` returns the instance itself and ``self.index`` is never
    reset, so a second ``for`` loop over the same instance yields nothing.

    Parameters:
        sentence: text to tokenize; it is converted with ``str()``.
        token_split: iterable of strings at which the text is split.
        punctuations: iterable of strings to isolate as their own tokens.
        delimiter_token: temporary marker substituted for each split string;
            it should not occur in the input text itself.
    """

    def __init__(self, sentence, token_split=[' ', '-'], punctuations=string.punctuation, delimiter_token = '<SPLIT>'):
        # our output tokens after splitting
        self.tokens = []

        # raw text to split
        self.raw = str(sentence)

        # copy, so the shared default list is never aliased by an instance
        self.token_split = list(token_split)
        self.delimiter_token = delimiter_token

        # punctuation characters that are separated from words before splitting
        self.punctuations = punctuations

        # position of the next token returned by __next__
        self.index = 0
        self.tokenize()

    def tokenize(self):
        """Populate ``self.tokens`` from ``self.raw``."""
        marked_text = self.raw

        # surround every punctuation character with whitespace so it
        # becomes a token of its own once the text is split on spaces
        for punctuation in self.punctuations:
            marked_text = marked_text.replace(punctuation, " " + punctuation + " ")

        # replace every split string with the delimiter token
        for delimiter in self.token_split:
            marked_text = marked_text.replace(delimiter, self.delimiter_token)

        # strip first, then drop empties, so whitespace-only pieces
        # (tabs, newlines) do not survive as '' tokens
        pieces = (piece.strip() for piece in marked_text.split(self.delimiter_token))
        self.tokens = [piece for piece in pieces if piece]

    # iterator
    def __iter__(self):
        return self

    def __next__(self):
        """Return the next token, or raise StopIteration when exhausted."""
        if self.index < len(self.tokens):
            result = self.tokens[self.index]
            self.index += 1
            return result
        raise StopIteration

In [25]:
# Demo: split a paragraph into sentences, then tokenize each sentence.
# The Sentencizer gets its own name so the loop variable below no longer
# rebinds it to a plain string, which made the loop fail when re-run.
# The literal has no placeholders, so it is a plain string, not an f-string.
sentencizer = Sentencizer("Gregor Samsa wakes up one morning to find himself transformed into a 'monstrous vermin'. He initially considers the transformation to be temporary and slowly ponders the consequences of this metamorphosis. Stuck on his back and unable to get up and leave the bed, Gregor reflects on his job as a traveling salesman and cloth merchant, which he characterizes as being full of 'temporary and constantly changing human relationships, which never come from the heart'. He sees his employer as a despot and would quickly quit his job if he were not his family's sole breadwinner and working off his bankrupt father's debts. While trying to move, Gregor finds that his office manager, the chief clerk, has shown up to check on him, indignant about Gregor's unexcused absence. Gregor attempts to communicate with both the manager and his family, but all they can hear from behind the door is incomprehensible vocalizations. Gregor laboriously drags himself across the floor and opens the door. The clerk, upon seeing the transformed Gregor, flees the apartment. Gregor's family is horrified, and his father drives him back into his room, injuring his side by shoving him when he gets stuck in the doorway.")
print(sentencizer.sentences)

# one list of tokens per sentence
for sentence in sentencizer.sentences:
    print(Tokenizer(sentence).tokens)

["Gregor Samsa wakes up one morning to find himself transformed into a 'monstrous vermin'.", 'He initially considers the transformation to be temporary and slowly ponders the consequences of this metamorphosis.', "Stuck on his back and unable to get up and leave the bed, Gregor reflects on his job as a traveling salesman and cloth merchant, which he characterizes as being full of 'temporary and constantly changing human relationships, which never come from the heart'.", "He sees his employer as a despot and would quickly quit his job if he were not his family's sole breadwinner and working off his bankrupt father's debts.", "While trying to move, Gregor finds that his office manager, the chief clerk, has shown up to check on him, indignant about Gregor's unexcused absence.", 'Gregor attempts to communicate with both the manager and his family, but all they can hear from behind the door is incomprehensible vocalizations.', 'Gregor laboriously drags himself across the floor and opens the