# Loading Text Data using Keras Utility functions.

In [20]:
import tensorflow as tf
import numpy as np

import os
import pathlib

from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Report the TensorFlow version this notebook is running against.
print(f'Tensorflow version: {tf.__version__}')

Tensorflow version: 2.4.1


## Downloading Dataset using Keras Utility


In [2]:
url = 'https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz'
# NOTE(review): absolute, machine-specific path; adjust for your environment.
cache_dir = '/work/DeepLearning/Notes/datasets'

# Download the archive straight into cache_dir (no sub-directory) and extract it.
archive_path = tf.keras.utils.get_file(
    fname='stackoverflow_ds',
    origin=url,
    extract=True,
    cache_dir=cache_dir,
    cache_subdir='',
)

archive_size_mb = int(os.path.getsize(archive_path) / (1024 * 1024))
print('Size of the downloaded file: {} MB'.format(archive_size_mb))

# Delete the downloaded archive; the dataset root is the directory that held it.
os.remove(archive_path)
dataset_dir = pathlib.Path(archive_path).parent
print(os.listdir(dataset_dir))

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz
Size of the downloaded file: 5 MB
['README.md', 'test', 'train']


In [3]:
# Locate the train and test directories of the dataset and list their contents.

ds_dir = dataset_dir
train_dir = os.path.join(ds_dir, 'train')
test_dir = os.path.join(ds_dir, 'test')

for heading, directory in (
    ('Content of train directory: ', train_dir),
    ('Content of test directory: ', test_dir),
):
    print(heading)
    print(os.listdir(directory))

Content of train directory: 
['java', 'python', 'csharp', 'javascript']
Content of test directory: 
['javascript', 'java', 'csharp', 'python']


In [4]:
# Let's read the content of a file.

fname = train_dir + '/python/1755.txt'
# Open the file in a context manager so the handle is closed after reading
# (the bare open() call previously left it open).
with open(fname, "r") as file:
    print(file.read())

why does this blank program print true x=true.def stupid():.    x=false.stupid().print x



## Loading the dataset off disk

We'll use a Keras utility function for this purpose. This function returns a `tf.data.Dataset`.
For more information, look at the "tf.data.Dataset" file.

In [5]:
# Load the dataset off disk.
# tf.keras.preprocessing.text_dataset_from_directory returns a tf.data.Dataset object.
# Its validation_split / subset arguments can also carve a validation set out of
# the same directory.

batch_size = 32
seed = 42

# Create the training dataset: the 'training' subset of a 0.2 validation split.
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    directory=train_dir,
    subset='training',
    validation_split=0.2,
    seed=seed,
    batch_size=batch_size,
)

Found 8000 files belonging to 4 classes.
Using 6400 files for training.


In [6]:
# Create the validation dataset: the 'validation' subset of the same 0.2 split,
# built with the same seed as the training subset.
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    directory=train_dir,
    subset='validation',
    validation_split=0.2,
    seed=seed,
    batch_size=batch_size,
)

Found 8000 files belonging to 4 classes.
Using 1600 files for validation.


In [7]:
# Let's look into the training dataset. This is a tf.data.Dataset object, and
# this cell shows how to consume one.
# 1. As reported when the dataset was created, 6400 files are used for training.
# 2. len(raw_train_ds) = 200, because 6400 / 32 = 200: the 6400 files are
#    divided into batches of 32 files each.
# 3. Each element is a tuple of length 2: the first item contains the text
#    strings and the second contains the labels. Take one element and print its
#    first five questions and their labels.

for text_batch, label_batch in raw_train_ds.take(1):
    # Convert each tensor to NumPy once instead of on every loop iteration.
    texts = text_batch.numpy()
    labels = label_batch.numpy()
    for i in range(5):
        print('Question: {}'.format(texts[i]))
        print('Label: {}\n'.format(labels[i]))

Question: b'"my tester is going to the wrong constructor i am new to programming so if i ask a question that can be easily fixed, please forgive me. my program has a tester class with a main. when i send that to my regularpolygon class, it sends it to the wrong constructor. i have two constructors. 1 without perameters..public regularpolygon().    {.       mynumsides = 5;.       mysidelength = 30;.    }//end default constructor...and my second, with perameters. ..public regularpolygon(int numsides, double sidelength).    {.        mynumsides = numsides;.        mysidelength = sidelength;.    }// end constructor...in my tester class i have these two lines:..regularpolygon shape = new regularpolygon(numsides, sidelength);.        shape.menu();...numsides and sidelength were declared and initialized earlier in the testing class...so what i want to happen, is the tester class sends numsides and sidelength to the second constructor and use it in that class. but it only uses the default cons

In [8]:
# Labels are encoded as integers. Print which integer corresponds to which
# class name.
class_names = raw_train_ds.class_names
for class_index, class_name in enumerate(class_names):
    print('Label ', class_index, 'corressponds to ', class_name)

Label  0 corressponds to  csharp
Label  1 corressponds to  java
Label  2 corressponds to  javascript
Label  3 corressponds to  python


In [9]:
# Create the test dataset from the separate test directory (no validation split).

raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    directory=test_dir,
    batch_size=batch_size,
)

Found 8000 files belonging to 4 classes.


## Let's Preprocess the Data

TextVectorization ref - https://www.tensorflow.org/versions/r2.4/api_docs/python/tf/keras/layers/experimental/preprocessing/TextVectorization

In [10]:
# Useful keyword: bag-of-words.
# Preprocessing of text data consists of the following steps:
# 1. Standardization.
# 2. Tokenization.
# 3. Vectorization.

# Two TextVectorization layers are used: one in 'binary' mode and one in 'int' mode.
VOCAB_SIZE = 10000

# Binary-mode TextVectorization layer.
binary_vectorize_layer = TextVectorization(
    output_mode='binary',
    max_tokens=VOCAB_SIZE,
)

In [21]:
# Int-mode TextVectorization layer with a fixed output sequence length.
MAX_SEQUENCE_LENGTH = 250

int_vectorize_layer = TextVectorization(
    output_sequence_length=MAX_SEQUENCE_LENGTH,
    output_mode='int',
    max_tokens=VOCAB_SIZE,
)

In [22]:
# Adapt both layers to the training data. adapt() needs a text-only dataset,
# but each element of raw_train_ds is a (text, label) tuple, so drop the labels
# first.

train_text = raw_train_ds.map(lambda batch_text, batch_labels: batch_text)
for vectorize_layer in (binary_vectorize_layer, int_vectorize_layer):
    vectorize_layer.adapt(train_text)

In [13]:
# To see what the layers do, pull one batch from the training dataset and keep
# its first question and label as a running example.

train_iterator = iter(raw_train_ds)
text_batch, label_batch = next(train_iterator)
question, label = text_batch[0], label_batch[0]

In [23]:
# Show the example question and its label.
for caption, value in (('Question: ', question), ('Label: ', label)):
    print(caption, value)

Question:  tf.Tensor(b'"function expected error in blank for dynamically created check box when it is clicked i want to grab the attribute value.it is working in ie 8,9,10 but not working in ie 11,chrome shows function expected error..&lt;input type=checkbox checked=\'checked\' id=\'symptomfailurecodeid\' tabindex=\'54\' style=\'cursor:pointer;\' onclick=chkclickevt(this);  failurecodeid=""1"" &gt;...function chkclickevt(obj) { .    alert(obj.attributes(""failurecodeid""));.}"\n', shape=(), dtype=string)
Label:  tf.Tensor(2, shape=(), dtype=int32)


In [27]:
# Let's implement TextVectorization.

def binary_vectorize_text(text, label):
    """Vectorize `text` with `binary_vectorize_layer`; return `label` unchanged.

    A trailing axis is appended to `text` with `tf.expand_dims` before the
    layer is applied.

    Returns:
        A `(vectorized_text, label)` tuple.
    """
    expanded_text = tf.expand_dims(text, axis=-1)
    vectorized_text = binary_vectorize_layer(expanded_text)
    return vectorized_text, label

def int_vectorize_text(text, label):
    """Vectorize `text` with `int_vectorize_layer`; return `label` unchanged.

    A trailing axis is appended to `text` with `tf.expand_dims` before the
    layer is applied.

    Returns:
        A `(vectorized_text, label)` tuple.
    """
    expanded_text = tf.expand_dims(text, axis=-1)
    vectorized_text = int_vectorize_layer(expanded_text)
    return vectorized_text, label

In [25]:
# Run the example question through the binary TextVectorization layer and
# display the vectorized text (the label is ignored here).
binary_vectorized_question = binary_vectorize_text(question, label)[0]
binary_vectorized_question

<tf.Tensor: shape=(1, 10000), dtype=float32, numpy=array([[1., 1., 1., ..., 0., 0., 0.]], dtype=float32)>

In [28]:
# Same example question, this time through the int TextVectorization layer.
int_vectorized_question = int_vectorize_text(question, label)[0]
int_vectorized_question

<tf.Tensor: shape=(1, 250), dtype=int64, numpy=
array([[  38,  450,   65,    7,   16,   12,  892,  265,  186,  451,   44,
          11,    6,  685,    3,   46,    4, 2062,    2,  485,    1,    6,
         158,    7,  479,    1,   26,   20,  158,    7,  479,    1,  502,
          38,  450,    1, 1767, 1763,    1,    1,    1,    1,    1,    1,
           1,    1,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,   

In [33]:
# Explore the vocabulary of the int vectorization layer: print the tokens
# behind the first few indices of the vectorized question.

# Fetch the vocabulary once instead of calling get_vocabulary() for every lookup.
vocabulary = int_vectorize_layer.get_vocabulary()

print('Here are few indexes and their corressponding tokens: ')
for prefix, token_index in (
    ('38 >> ', 38),
    ('450 >>', 450),
    ('65 >> ', 65),
    ('7 >> ', 7),
    ('16 >> ', 16),
):
    print(prefix, vocabulary[token_index])

Here are few indexes and their corressponding tokens: 
38 >>  function
450 >> expected
65 >>  error
7 >>  in
16 >>  blank


In [34]:
# Vectorize the train / validation / test splits with each layer.
raw_splits = (raw_train_ds, raw_val_ds, raw_test_ds)

binary_train_ds, binary_val_ds, binary_test_ds = (
    raw_split.map(binary_vectorize_text) for raw_split in raw_splits
)

int_train_ds, int_val_ds, int_test_ds = (
    raw_split.map(int_vectorize_text) for raw_split in raw_splits
)

## Configure dataset for performance

More info at: https://www.tensorflow.org/guide/data_performance

In [35]:
AUTOTUNE = tf.data.AUTOTUNE


def configure_dataset(dataset):
    """Return `dataset` with caching and prefetching applied.

    The dataset is cached first, then prefetched with
    `buffer_size=AUTOTUNE`.
    """
    cached_dataset = dataset.cache()
    return cached_dataset.prefetch(buffer_size=AUTOTUNE)

In [37]:
# Apply cache + prefetch to every vectorized split, rebinding the same names.
binary_train_ds, binary_val_ds, binary_test_ds = (
    configure_dataset(binary_split)
    for binary_split in (binary_train_ds, binary_val_ds, binary_test_ds)
)

int_train_ds, int_val_ds, int_test_ds = (
    configure_dataset(int_split)
    for int_split in (int_train_ds, int_val_ds, int_test_ds)
)

## Train Model

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=aa728bc6-d481-4c01-a2d7-f40078d4b3b7' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>