# Loading Text Data using Keras Utility functions.

In [1]:
import tensorflow as tf
import numpy as np

import os
import pathlib

from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Report which TensorFlow release this notebook is running against.
tf_version = tf.__version__
print('Tensorflow version: {}'.format(tf_version))

Tensorflow version: 2.4.1


## Downloading Dataset using Keras Utility


In [3]:
# Where the archive comes from and the folder it is downloaded into.
url = 'https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz'
cache_dir = '/work/DeepLearning/Notes/datasets'

# get_file returns the path of the downloaded archive. With extract=True the
# archive is also unpacked; cache_subdir='' places the file directly in cache_dir.
archive_path = tf.keras.utils.get_file(
    fname='stackoverflow_ds',
    origin=url,
    extract=True,
    cache_dir=cache_dir,
    cache_subdir='',
)

# Report the archive size in whole megabytes.
archive_size_mb = os.path.getsize(archive_path) // (1024 * 1024)
print('Size of the downloaded file: {} MB'.format(archive_size_mb))

# Drop the archive itself and keep working with the folder that contained it.
os.remove(archive_path)
dataset_dir = pathlib.Path(archive_path).parent
print(os.listdir(dataset_dir))

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz
Size of the downloaded file: 5 MB
['README.md', 'test', 'train']


In [None]:
# Locate the train/ and test/ folders of the dataset and list their contents.
#
# The folders live in `dataset_dir`, which the download cell sets, so that
# path is used here rather than the kernel's current working directory: the
# working directory only matches when the notebook happens to be started
# from inside the dataset folder.
ds_dir = os.fspath(dataset_dir)  # plain str, so later string concatenation on the paths keeps working

print('Content of train directory: ')
train_dir = os.path.join(ds_dir, 'train')
print(os.listdir(train_dir))

print('Content of test directory: ')
test_dir = os.path.join(ds_dir, 'test')
print(os.listdir(test_dir))

Content of train directory: 
['javascript', 'java', 'csharp', 'python']
Content of test directory: 
['csharp', 'python', 'java', 'javascript']


In [None]:
# Let's read the content of a single training file.

fname = train_dir + '/python/1755.txt'
# The `with` block closes the file handle as soon as the read is finished,
# instead of leaving it open for the lifetime of the kernel.
with open(fname, "r") as file:
    print(file.read())

why does this blank program print true x=true.def stupid():.    x=false.stupid().print x



## Loading the dataset off disk

We'll use a Keras utility function for this purpose. This function returns a `tf.data.Dataset`.
For more information, see the "tf.data.Dataset" file.

In [None]:
# Build the training split directly from the files on disk.
# tf.keras.preprocessing.text_dataset_from_directory returns a tf.data.Dataset
# and can also carve a validation split out of the same directory:
# validation_split sets the held-out fraction and subset selects which part
# this call returns.

batch_size = 32
seed = 42

# Training portion (the remaining 80%) of the files under train_dir.
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    directory=train_dir,
    subset='training',
    validation_split=0.2,
    seed=seed,
    batch_size=batch_size,
)

Found 8000 files belonging to 4 classes.
Using 6400 files for training.


In [None]:
# Validation portion (the held-out 20%) of the files under train_dir.
# It is requested with the same validation_split and seed as the training
# call; only `subset` differs.
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    directory=train_dir,
    subset='validation',
    validation_split=0.2,
    seed=seed,
    batch_size=batch_size,
)

Found 8000 files belonging to 4 classes.
Using 1600 files for validation.


In [None]:
# Let's look into the training dataset. It is a tf.data.Dataset object, so
# this is also a first example of how to consume one.
# 1. As shown above, there are 6400 files in the training split.
# 2. len(raw_train_ds) is 200 because 6400 / 32 = 200: the files are grouped
#    into batches of 32 files each.
# 3. Each element of the dataset is a tuple of length 2: the first item holds
#    the text strings of one batch and the second item holds their labels.
#    Print the first few questions together with their labels.

for text_batch, label_batch in raw_train_ds.take(1):
    # Convert each tensor to NumPy once instead of on every loop iteration.
    texts = text_batch.numpy()
    labels = label_batch.numpy()
    # Slicing (rather than indexing 0..4) also copes with a batch that holds
    # fewer than five items.
    for text, label in zip(texts[:5], labels[:5]):
        print('Question: {}'.format(text))
        print('Label: {}\n'.format(label))

Question: b'"blank8 why is my solution faster than the neat solution? (hackerrank chocolate feast) edit: simplified my solution..edit: removed opinion based secondary question...background: atarted learning blank a week or two ago using hackerranks problems as exercises and stackoverflow search + google as my teacher, i\'ve had some limited experience learning other languages...i did the exercise my own ""noobish learner way"" which i can\'t help but feel is a ""botched job"" when i see ""neat &amp; short"" solutions...however, when submitting both solutions one after another a couple of times i found the ""neat"" solution was quite a bit slower. ..i vaguely remember something about % operations being costly, is mine faster because of no % operations or is there more to it than just that?..exercise: https://www.hackerrank.com/challenges/chocolate-feast..neat solution from discussion:..import blank.io.*;.import blank.util.*;..public class solution {.    static int cc; .    public static

In [None]:
# Labels are encoded as integers; list which class name each integer stands for.
class_names = raw_train_ds.class_names
for idx, name in enumerate(class_names):
    print('Label ', idx, 'corressponds to ', name)

Label  0 corressponds to  csharp
Label  1 corressponds to  java
Label  2 corressponds to  javascript
Label  3 corressponds to  python


In [None]:
# Build the test dataset from the files under test_dir (no validation split here).

raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    directory=test_dir,
    batch_size=batch_size,
)

Found 8000 files belonging to 4 classes.


## Let's Preprocess the Data


In [None]:
# Key term: bag-of-words.
# Preprocessing of text data consists of the following steps:
#   1. Standardization
#   2. Tokenization
#   3. Vectorization

# Two TextVectorization layers will be used: one in 'binary' mode and one in 'int' mode.
VOCAB_SIZE = 10000

# The binary-mode layer, with its vocabulary capped at VOCAB_SIZE tokens.
binary_vectorize_layer = TextVectorization(
    output_mode='binary',
    max_tokens=VOCAB_SIZE,
)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=aa728bc6-d481-4c01-a2d7-f40078d4b3b7' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>