# Deep learning for text

In [3]:
import string

In [27]:
class Vectorizer:
    """Minimal word-level text vectorizer.

    Text is lowercased, stripped of ASCII punctuation (``string.punctuation``)
    and split on whitespace. Each distinct token is assigned an integer index
    in order of first appearance. Two indices are reserved:

    * ``0`` -> ``""`` (empty token)
    * ``1`` -> ``"[UNK]"`` (any token missing from the vocabulary)

    Attributes:
        vocabulary: dict mapping token -> integer index.
        inverse_vocabulary: dict mapping integer index -> token.
    """

    # Reserved tokens and the indices they always occupy.
    PAD_TOKEN = ""
    UNK_TOKEN = "[UNK]"
    PAD_INDEX = 0
    UNK_INDEX = 1

    def __init__(self):
        # Seed both mappings with the reserved entries so encode()/decode()
        # work (mapping everything to "[UNK]") even before make_vocabulary()
        # has been called, instead of raising AttributeError.
        self.vocabulary = {self.PAD_TOKEN: self.PAD_INDEX, self.UNK_TOKEN: self.UNK_INDEX}
        self.inverse_vocabulary = {self.PAD_INDEX: self.PAD_TOKEN, self.UNK_INDEX: self.UNK_TOKEN}

    def standardize(self, text):
        """Return ``text`` lowercased with every ``string.punctuation`` character removed."""
        text = text.lower()
        return "".join(char for char in text if char not in string.punctuation)

    def tokenize(self, text):
        """Standardize ``text`` and split it on whitespace into a list of tokens."""
        return self.standardize(text).split()

    def make_vocabulary(self, dataset):
        """Build ``vocabulary`` and ``inverse_vocabulary`` from an iterable of strings.

        Any previously built vocabulary is discarded. Both mappings are
        printed once they have been built.

        Args:
            dataset: iterable of text strings.
        """
        self.vocabulary = {self.PAD_TOKEN: self.PAD_INDEX, self.UNK_TOKEN: self.UNK_INDEX}

        for text in dataset:
            # tokenize() already standardizes, so no separate standardize() pass.
            for token in self.tokenize(text):
                if token not in self.vocabulary:
                    # Next free index == current vocabulary size.
                    self.vocabulary[token] = len(self.vocabulary)
        print('Before Reversing: ', self.vocabulary)
        self.inverse_vocabulary = {index: token for token, index in self.vocabulary.items()}
        print('After Reversing: ', self.inverse_vocabulary)

    def encode(self, text):
        """Convert ``text`` into a list of vocabulary indices.

        Tokens that are not in the vocabulary are encoded as ``UNK_INDEX`` (1).
        """
        return [self.vocabulary.get(token, self.UNK_INDEX) for token in self.tokenize(text)]

    def decode(self, int_sequence):
        """Convert an iterable of indices back into a space-joined string.

        Indices that are not in ``inverse_vocabulary`` are rendered as ``"[UNK]"``.
        """
        return " ".join(self.inverse_vocabulary.get(i, self.UNK_TOKEN) for i in int_sequence)

In [28]:
# Toy corpus: three short lines of text used to build the vocabulary.
dataset = [
    "I write, erase, rewrite",
    "Erase again, and then",
    "A poppy blooms.",
]

vectorizer = Vectorizer()
vectorizer.make_vocabulary(dataset)

Before Reversing:  {'': 0, '[UNK]': 1, 'i': 2, 'write': 3, 'erase': 4, 'rewrite': 5, 'again': 6, 'and': 7, 'then': 8, 'a': 9, 'poppy': 10, 'blooms': 11}
After Reversing:  {0: '', 1: '[UNK]', 2: 'i', 3: 'write', 4: 'erase', 5: 'rewrite', 6: 'again', 7: 'and', 8: 'then', 9: 'a', 10: 'poppy', 11: 'blooms'}


In [29]:
# "still" does not occur in `dataset`, so it has no vocabulary entry and is
# encoded with the fallback index 1 ("[UNK]").
test_sentence = "I write, rewrite, and still rewrite again"
encoded_sentence = vectorizer.encode(test_sentence)
print(encoded_sentence)

[2, 3, 5, 7, 1, 5, 6]


In [7]:
# Map the indices back to tokens. The round trip is lossy: case and punctuation
# were removed by standardization, and unknown words come back as "[UNK]".
decoded_sentence = vectorizer.decode(encoded_sentence)
print(decoded_sentence)

i write rewrite and [UNK] rewrite again
