# Dealing with text

In general, deep learning models need numeric input. The bulk of the digital information in the world, however, is in text format. So, we need a way to convert from text to numbers.

The usual sequence for this is as follows:
 - standardization
 - tokenization
 - indexing
 - encoding/embedding

<img src="text_process.png" width=600 align="center">

(image source: *Deep Learning with Python, 2nd edition, F. Chollet*)

## Standardization

### Punctuation

In [None]:
import string

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
string_with_punc = "This string, and by no means is it the only one, has some punctuation."

print(f"Remove puncuation:\n{str([c for c in string_with_punc if c not in string.punctuation])}\n")

print(f"Reform sentence by joining:\n{''.join(c for c in string_with_punc if c not in string.punctuation)}")

Remove puncuation:
['T', 'h', 'i', 's', ' ', 's', 't', 'r', 'i', 'n', 'g', ' ', 'a', 'n', 'd', ' ', 'b', 'y', ' ', 'n', 'o', ' ', 'm', 'e', 'a', 'n', 's', ' ', 'i', 's', ' ', 'i', 't', ' ', 't', 'h', 'e', ' ', 'o', 'n', 'l', 'y', ' ', 'o', 'n', 'e', ' ', 'h', 'a', 's', ' ', 's', 'o', 'm', 'e', ' ', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n']

Reform sentence by joining:
This string and by no means is it the only one has some punctuation


### Lower case

In [None]:
string_with_upper = "THIs iS A meSSeD uP SeNTenCE"

string_lower = string_with_upper.lower()
string_lower

'this is a messed up sentence'

### Special characters

In [None]:
!pip install nltk



In [None]:
import nltk

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
import unidecode

accent = u'México'

no_accent = unidecode.unidecode(accent)
no_accent

ModuleNotFoundError: ignored

### Stemming

In [None]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

words = ["study", "studies", "studying", "studied"]

for w in words:
    print(f"Original word:  {w}, after stemming:  {stemmer.stem(w)}")

### Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

for w in words:
    print(f"Original word: {w}, after lemmatization: {lemmatizer.lemmatize(w, pos='v')}")

## Tokenization

### Character-level

In [None]:
str(list(string_lower)) # str() is only used here to get the list to print nicely across the screen

### Word-level

In [None]:
string_lower.split()

### N-grams

In [None]:
# !pip install nltk  # uncomment and install nltk if needed

In [None]:
from nltk import ngrams

for bigram in ngrams(string_lower.split(), 2):
    print(bigram)

## Vocabulary indexing

In [None]:
vocabulary = {}

text = string_lower.split()

for token in text:
    if token not in vocabulary:
        vocabulary[token] = len(vocabulary)

In [None]:
vocabulary

{'a': 2, 'is': 1, 'messed': 3, 'sentence': 5, 'this': 0, 'up': 4}

The vocabulary is created using the training data, so one thing that can occur is that there will be words in your test data that are not in your vocabulary. We will assign a vocabulary entry of {"[UNK]": 0} to these "unknown" words.
Thus, we can adjust our code above to the following:

In [None]:
vocabulary = {"[UNK]": 0}

text = string_lower.split()

for token in text:
    if token not in vocabulary:
        vocabulary[token] = len(vocabulary)

vocabulary

{'[UNK]': 0, 'a': 3, 'is': 2, 'messed': 4, 'sentence': 6, 'this': 1, 'up': 5}

You can now encode sentences using this vocabulary.

In [None]:
test_sentence = "This sentence is not messed up."
test_sentence = test_sentence.lower()
test_sentence = "".join(char for char in test_sentence if char not in string.punctuation)
test_sentence = test_sentence.split()
test_sentence

['this', 'sentence', 'is', 'not', 'messed', 'up']

In [None]:
encoded = [vocabulary.get(token, 0) for token in test_sentence]
encoded

[1, 6, 2, 0, 4, 5]

We will also want the ability to decode text that has been encoded. To do this, we create an inverse vocabulary:

In [None]:
inverse_vocabulary = {}

for k, v in vocabulary.items():
    inverse_vocabulary[v] = k

inverse_vocabulary

{0: '[UNK]', 1: 'this', 2: 'is', 3: 'a', 4: 'messed', 5: 'up', 6: 'sentence'}

Now if we have encoded text, we can get the original text:

In [None]:
test_inv = [2, 3, 4, 5]

orig_sentence = " ".join((inverse_vocabulary.get(i, "[UNK]") for i in test_inv))

orig_sentence

'is a messed up'

## Putting it all together

In [None]:
# TO DO: complete the make_vocabulary function

import string

class Vectorizer:
    def standardize(self, input_text):
        text = input_text.lower()
        return "".join(char for char in text if char not in string.punctuation)

    def tokenize(self, input_text):
        text = self.standardize(input_text)
        return text.split()

    def make_vocabulary(self, input_text):
        self.vocabulary = {"[UNK]": 0}
        for text in input_text:
            text = self.standardize(input_text)
            tokens = self.tokenize(input_text)
            for token in tokens:
                if token not in self.vocabulary:
                    self.vocabulary[token] = len(self.vocabulary)
        self.inverse_vocabulary = dict((v, k) for k, v in self.vocabulary.items())


    def encode(self, input_text):
        text = self.standardize(input_text)
        tokens = self.tokenize(input_text)
        return [self.vocabulary.get(token, 0) for token in tokens]

    def decode(self, int_sequence):
        return " ".join(self.inverse_vocabulary.get(i, "[UNK]") for i in int_sequence)

Create a sentence and test out all the methods of this vectorizer class.

In [None]:
v1 = Vectorizer()

input_text = "THIs iS A meSSeD, uP SeNTenCE!."

text = v1.standardize(input_text)
print(text)

tokens = v1.tokenize(input_text)
print(tokens)

inv_vocab = v1.make_vocabulary(input_text)
print(inv_vocab)

encode = v1.encode(input_text)
print(encode)

int_sequence = [2,4,6]

decode = v1.decode(int_sequence)
print(decode)


this is a messed up sentence
['this', 'is', 'a', 'messed', 'up', 'sentence']
None
[1, 2, 3, 4, 5, 6]
is messed sentence


### One-hot encoding

Now create a function that will one-hot encode each token of a sentence, once a vocabulary has been created.

In [None]:
input_text = 'The cat is on the car!'

vectorizer = Vectorizer()

vectorizer.make_vocabulary(input_text)


vocabulary = vectorizer.vocabulary
vocabulary

{'[UNK]': 0, 'the': 1, 'cat': 2, 'is': 3, 'on': 4, 'car': 5}

In [None]:
# TO DO: complete the function

import numpy as np

def one_hot_encode_token(token):
    vector = np.zeros(len(vocabulary),)
    vector[token] = 1
    return vector


In [None]:
int_vec = v1.encode('The cat is on the car!')
int_vec

[0, 0, 2, 0, 0, 0]

In [None]:
one_hot_encoded = [one_hot_encode_token(token) for token in int_vec]
one_hot_encoded

[array([1., 0., 0., 0., 0., 0.]),
 array([1., 0., 0., 0., 0., 0.]),
 array([0., 0., 1., 0., 0., 0.]),
 array([1., 0., 0., 0., 0., 0.]),
 array([1., 0., 0., 0., 0., 0.]),
 array([1., 0., 0., 0., 0., 0.])]

## Using Keras

Using the [TextVectorization](https://keras.io/api/layers/preprocessing_layers/text/text_vectorization/) layer in Keras:

In [None]:
from tensorflow.keras.layers import TextVectorization

text_vectorization = TextVectorization(output_mode="int")

In [None]:
sentence = "The cat sat on the mat"

text_vectorization.adapt([sentence])

In [None]:
text_vectorization.get_vocabulary()

['', '[UNK]', 'the', 'sat', 'on', 'mat', 'cat']

In [None]:
new_sentence = "The cat did not sit on the mat"

text_vectorization([new_sentence]).numpy()

array([[2, 6, 1, 1, 1, 4, 2, 5]], dtype=int64)

### N-grams

In [None]:
from tensorflow.keras.layers import TextVectorization

text_vectorization_bigram = TextVectorization(ngrams=2, output_mode="int")

In [None]:
sentence = "The cat sat on the mat"

text_vectorization_bigram.adapt([sentence])

In [None]:
text_vectorization_bigram.get_vocabulary()

['',
 '[UNK]',
 'the',
 'the mat',
 'the cat',
 'sat on',
 'sat',
 'on the',
 'on',
 'mat',
 'cat sat',
 'cat']

In [None]:
new_sentence = "The cat did not sit on the mat"

text_vectorization_bigram([new_sentence]).numpy()

array([[ 2, 11,  1,  1,  1,  8,  2,  9,  4,  1,  1,  1,  1,  7,  3]],
      dtype=int64)

### One-hot encoding

In [None]:
import tensorflow as tf

vectorized_text = text_vectorization(["The cat did not sit on the mat"])
vectorized_text

one_hot = tf.one_hot(vectorized_text, depth=8)
one_hot.numpy()

array([[[0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0.]]], dtype=float32)

### Learning embeddings

In [None]:
vectorized_text = text_vectorization(["The cat did not sit on the mat"])
vectorized_text.numpy()

array([[2, 6, 1, 1, 1, 4, 2, 5]], dtype=int64)

In [None]:
from tensorflow.keras.layers import Embedding

embedded = Embedding(input_dim=8, output_dim=4)(vectorized_text)
embedded.numpy()

array([[[-0.01100979,  0.02564209,  0.04117367,  0.04404965],
        [-0.02440178, -0.03419872, -0.04710355, -0.02102966],
        [ 0.04030227,  0.01847314,  0.02640769,  0.01122208],
        [ 0.04030227,  0.01847314,  0.02640769,  0.01122208],
        [ 0.04030227,  0.01847314,  0.02640769,  0.01122208],
        [-0.04717664, -0.01785877, -0.03373032,  0.0059191 ],
        [-0.01100979,  0.02564209,  0.04117367,  0.04404965],
        [ 0.00040773, -0.04087483, -0.01782596,  0.0468341 ]]],
      dtype=float32)

### Pretrained embeddings

In [None]:
import pandas as pd

In [None]:
glove = []

with open("glove.6B.50d.txt") as file:
    i = 0
    for line in file:
        glove.append(line.rstrip())
        i += 1
        if i > 20: break

In [None]:
glove_dict ={}

for word in glove:
    vec = word.split()
    glove_dict[vec[0]] = vec[1:]

In [None]:
glove_df = pd.DataFrame(data=glove_dict).transpose()
glove_df.sort_index(axis=0)[5:]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
a,0.21705,0.46515,-0.46757,0.10082,1.0135,0.74845,-0.53104,-0.26256,0.16812,0.13182,...,0.13813,0.36973,-0.64289,0.024142,-0.039315,-0.26037,0.12017,-0.043782,0.41013,0.1796
and,0.26818,0.14346,-0.27877,0.016257,0.11384,0.69923,-0.51332,-0.47368,-0.33075,-0.13834,...,-0.069043,0.36885,0.25168,-0.24517,0.25381,0.1367,-0.31178,-0.6321,-0.25028,-0.38097
as,0.20782,0.12713,-0.30188,-0.23125,0.30175,0.33194,-0.52776,-0.44042,-0.48348,0.03502,...,-0.15768,0.39606,-0.23646,-0.095054,0.07859,-0.012305,-0.49879,-0.35301,0.05058,0.019495
for,0.15272,0.36181,-0.22168,0.066051,0.13029,0.37075,-0.75874,-0.44722,0.22563,0.10208,...,0.020339,0.2142,0.044097,0.14003,-0.20079,0.074794,-0.36076,0.43382,-0.084617,0.1214
he,-0.20092,-0.060271,-0.61766,-0.8444,0.5781,0.14671,-0.86098,0.6705,-0.86556,-0.18234,...,-0.16925,0.10228,-0.62143,0.19829,-0.36147,-0.24769,-0.38989,-0.33317,-0.041659,-0.013171
in,0.33042,0.24995,-0.60874,0.10923,0.036372,0.151,-0.55083,-0.074239,-0.092307,-0.32821,...,-0.48609,-0.0080272,0.031184,-0.36576,-0.42699,0.42164,-0.11666,-0.50703,-0.027273,-0.53285
is,0.6185,0.64254,-0.46552,0.3757,0.74838,0.53739,0.0022239,-0.60577,0.26408,0.11703,...,-0.016573,0.312,-0.33189,-0.026001,-0.38203,0.19403,-0.12466,-0.27557,0.30899,0.48497
it,0.61183,-0.22072,-0.10898,-0.052967,0.50804,0.34684,-0.33558,-0.19152,-0.035865,0.1051,...,0.050059,-0.10058,-0.017907,0.11142,-0.71798,0.491,-0.099974,-0.043688,-0.097922,0.16806
of,0.70853,0.57088,-0.4716,0.18048,0.54449,0.72603,0.18157,-0.52393,0.10381,-0.17566,...,-0.34727,0.28483,0.075693,-0.062178,-0.38988,0.22902,-0.21617,-0.22562,-0.093918,-0.80375
on,0.30045,0.25006,-0.16692,0.1923,0.026921,-0.079486,-0.91383,-0.1974,-0.053413,-0.40846,...,-0.089032,0.062001,-0.19946,-0.38863,-0.18232,0.060751,0.098603,-0.07131,0.23052,-0.51939


In [None]:
 class Vectorizer:
    def standardize(self, input_text):
        text = input_text.lower()
        return "".join(char for char in text if char not in string.punctuation)

    def tokenize(self, input_text):
        text = self.standardize(input_text)
        return text.split()

    def make_vocabulary(self, input_text):
        self.vocabulary = {"[UNK]": 0}
        for text in input_text:
            text = self.standardize(input_text)
            tokens = self.tokenize(input_text)
            for token in tokens:
                if token not in self.vocabulary:
                    self.vocabulary[token] = len(self.vocabulary)
        self.inverse_vocabulary = dict((v, k) for k, v in self.vocabulary.items())


    def encode(self, input_text):
        text = self.standardize(input_text)
        tokens = self.tokenize(input_text)
        return [self.vocabulary.get(token, 0) for token in tokens]

    def decode(self, int_sequence):
        return " ".join(self.inverse_vocabulary.get(i, "[UNK]") for i in int_sequence)

v1 = Vectorizer()
x = x.apply(lambda x : x.to_string())
vocabulary = x.apply(lambda x : v1.make_vocabulary(x))
encode = x.apply(lambda x : v1.encode(x))