# STARTER CODE: Tokenizing Text Dataset for Modeling

---

## Setting Up

### Import Libraries

In [None]:
import numpy as np
from tqdm.notebook import tqdm
import tensorflow as tf

### Print Directory Items

In [None]:
import os

# Lazily build the full path of every file found anywhere under the
# '/kaggle/input' directory tree, then print the paths one per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

### Read in Data

In [None]:
import pandas as pd


def _is_text(value):
    """Return True only when `value` is a `str` instance."""
    return isinstance(value, str)


# Load the corpus and keep only the rows whose 'Text' cell holds an actual
# string; rows with any other value type in that column are dropped.
# The explicit comparison against True yields the boolean row mask.
data = (
    pd.read_csv('/kaggle/input/political-though-work-corpus/all-data.csv')
    .loc[lambda frame: frame['Text'].apply(_is_text).eq(True)]
)
data.head(3)

---

## Vectorize Data

This script takes a list of texts and converts it into a padded, tokenized TensorFlow dataset. The unique-word count is computed with `tf.strings` operations, while tokenization and padding use the Keras `Tokenizer` and `pad_sequences` utilities. Even for large quantities of text, the whole cell runs quickly (about two-thirds of a minute).

In [None]:
import time
start = time.perf_counter()  # monotonic clock, suited to measuring elapsed time

# ====================================================================================
# START OF RELEVANT TOKENIZATION SCRIPT
# ====================================================================================

# importing necessary functions
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# collect training data as a plain list of strings
train_data = data['Text'].tolist()

# quickly count the number of unique whitespace-separated words.
# The texts are joined with a single space: with no separator the last word of
# one text fuses with the first word of the next, which distorts the count.
complete_text = tf.strings.join(train_data, separator=' ')
y, idx, count = tf.unique_with_counts(tf.strings.split(complete_text))

# set important parameters
# NOTE: this whitespace-based count only approximates the vocabulary the
# Tokenizer builds, because the Tokenizer applies its own normalisation
# (by default lower-casing and punctuation filtering) before splitting.
num_words = y.shape[0]
oov_token = '<UNK>'
pad_type = 'post'
trunc_type = 'post'

# define and fit tokenizer, then map every text to a list of integer ids
tokenizer = Tokenizer(num_words=num_words, oov_token=oov_token)
tokenizer.fit_on_texts(train_data)
train_sequences = tokenizer.texts_to_sequences(train_data)

# pad every sequence (at the end) up to the length of the longest one
maxlen = max(len(sequence) for sequence in train_sequences)
train_padded = pad_sequences(train_sequences, padding=pad_type, truncating=trunc_type, maxlen=maxlen)
train_padded = tf.constant(train_padded)

# create tensorflow dataset
# NOTE: this rebinds `data` from the DataFrame to a tf.data.Dataset, so this
# cell cannot be re-run on its own; re-run the "Read in Data" cell first.
data = tf.data.Dataset.from_tensor_slices(train_padded)

# ====================================================================================
# END OF RELEVANT TOKENIZATION SCRIPT
# ====================================================================================

end = time.perf_counter()
print(f'Took {round(end-start,3)} seconds.')

You can 'detokenize' vectorized sequences by passing them (as a list or 2-D array of sequences) through `tokenizer.sequences_to_texts`.

In [None]:
# Decode only the first document. Index 0 is the value pad_sequences pads with
# by default and is never assigned to a word by the Keras Tokenizer, so it is
# dropped before decoding; otherwise the padding can surface in the decoded
# text as out-of-vocabulary tokens.
first_sequence = train_padded.numpy()[0]
decoded_string = tokenizer.sequences_to_texts([first_sequence[first_sequence != 0]])[0]
decoded_string[:1000]