<a href="https://colab.research.google.com/github/soohyunme/TensorFlow_Tutorial/blob/main/Code/18_Customdata_text/1_customdata_text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


In [2]:
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
import tensorflow as tf
import pandas as pd
import tensorflow_datasets as tfds
from tensorflow import keras
from tensorflow.keras import layers
import pickle

IMDB Dataset  
https://www.kaggle.com/dataset/ff33c576e11e20d0c3a77853f8bf473e066664f572ad1659bec66ea334221e74/version/1

# Load TextDataset

In [3]:
def filter_train(line):
    split_line = tf.strings.split(line, ",", maxsplit=4)
    dataset_belonging = split_line[1] # train, test
    sentiment_category = split_line[2] # pos, neg, unsup

    return (
        True
        if dataset_belonging == 'train' and sentiment_category != 'unsup'
        else False
    )

In [4]:
def filter_test(line):
    split_line = tf.strings.split(line, ",", maxsplit=4)
    dataset_belonging = split_line[1] # train, test
    sentiment_category = split_line[2] # pos, neg, unsup

    return (
        True
        if dataset_belonging == 'test' and sentiment_category != 'unsup'
        else False
    )

In [5]:
ds_train = tf.data.TextLineDataset("imdb.csv").filter(filter_train)
ds_test = tf.data.TextLineDataset("imdb.csv").filter(filter_test)

# TODO
1. Create vocabulary
2. Numericalize text str -> indices (TokenTextEncoder)
3. Pad the batches so we can send in to an RNN for example

In [6]:
tokenizer = tfds.deprecated.text.Tokenizer()

'i love banana' -> ['i', 'love', 'banana'] -> [0, 1, 2]

In [7]:
def build_vocabulary(ds_train, threshold=200):
    """ Build a vocabulary """
    frequencies = {}
    vocabulary = set()
    vocabulary.update(('sostoken'))
    vocabulary.update(('eostoken'))

    for line in ds_train.skip(1):
        split_line = tf.strings.split(line, ',', maxsplit=4)
        review = split_line[4]
        tokenized_text = tokenizer.tokenize(review.numpy().lower())

        for word in tokenized_text:
            if word not in frequencies:
                frequencies[word] = 1
            
            else:
                frequencies[word] += 1
            
            # if we've reached the threshold
            if frequencies[word] == threshold:
                vocabulary.update(tokenized_text)
    return vocabulary



# Build vocabulary and save

In [8]:
vocabulary = build_vocabulary(ds_train)
vocab_file = open('vocabulary.obj', 'wb')
pickle.dump(vocabulary, vocab_file)

# Loading the vocabulary

In [9]:
vocab_file = open('vocabulary.obj', 'rb')
vocabulary = pickle.load(vocab_file)

In [10]:
encoder = tfds.deprecated.text.TokenTextEncoder(
    list(vocabulary), oov_token='<UNK>', lowercase=True, tokenizer=tokenizer,
)

In [11]:
def my_encoder(text_tensor, label):
    encoded_text = encoder.encode(text_tensor.numpy())
    return encoded_text, label

In [12]:
def encode_map_fn(line):
    split_line = tf.strings.split(line, ',', maxsplit=4)
    label_str = split_line[2] # neg, pos
    review = 'sostoken' + split_line[4] + 'eostoken'
    label = 1 if label_str == 'pos' else 0

    (encoded_text, label) = tf.py_function(
        my_encoder, inp=[review, label], Tout=(tf.int64, tf.int32),
    )
    encoded_text.set_shape([None])
    label.set_shape([])
    return encoded_text, label

In [13]:
AUTOTUNE = tf.data.experimental.AUTOTUNE
ds_train = ds_train.map(encode_map_fn, num_parallel_calls=AUTOTUNE).cache()
ds_train = ds_train.shuffle(25000)
ds_train = ds_train.padded_batch(32, padded_shapes=([None],()))

In [14]:
model = keras.Sequential([
    layers.Masking(mask_value=0),
    layers.Embedding(input_dim=len(vocabulary)+2, output_dim=32,),
    layers.GlobalAveragePooling1D(),
    layers.Dense(64, activation='relu'),
    layers.Dense(1),
])

In [15]:
model.compile(
    loss = keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer = keras.optimizers.Adam(lr=3e-4, clipnorm=1),
    metrics = ['accuracy'],
)

model.fit(ds_train, epochs=15, verbose=2)

  super(Adam, self).__init__(name, **kwargs)


Epoch 1/15
782/782 - 36s - loss: 0.6751 - accuracy: 0.5072 - 36s/epoch - 46ms/step
Epoch 2/15
782/782 - 2s - loss: 0.4974 - accuracy: 0.7254 - 2s/epoch - 3ms/step
Epoch 3/15
782/782 - 2s - loss: 0.3448 - accuracy: 0.8559 - 2s/epoch - 3ms/step
Epoch 4/15
782/782 - 2s - loss: 0.2790 - accuracy: 0.8876 - 2s/epoch - 3ms/step
Epoch 5/15
782/782 - 2s - loss: 0.2427 - accuracy: 0.9042 - 2s/epoch - 3ms/step
Epoch 6/15
782/782 - 3s - loss: 0.2160 - accuracy: 0.9172 - 3s/epoch - 3ms/step
Epoch 7/15
782/782 - 3s - loss: 0.1929 - accuracy: 0.9266 - 3s/epoch - 3ms/step
Epoch 8/15
782/782 - 3s - loss: 0.1758 - accuracy: 0.9344 - 3s/epoch - 3ms/step
Epoch 9/15
782/782 - 2s - loss: 0.1593 - accuracy: 0.9426 - 2s/epoch - 3ms/step
Epoch 10/15
782/782 - 3s - loss: 0.1445 - accuracy: 0.9495 - 3s/epoch - 3ms/step
Epoch 11/15
782/782 - 3s - loss: 0.1322 - accuracy: 0.9548 - 3s/epoch - 3ms/step
Epoch 12/15
782/782 - 3s - loss: 0.1202 - accuracy: 0.9601 - 3s/epoch - 3ms/step
Epoch 13/15
782/782 - 3s - loss: 0

<keras.callbacks.History at 0x7fd7166447d0>

#  Dataset in Several Files
Example if you have multiple files

In [16]:
file_names = ['test_example1.csv','test_example2.csv','test_example3.csv']
dataset = tf.data.TextLineDataset(file_names)

dataset1 = tf.data.TextLineDataset('test_example1.csv').skip(1)#.map(preprocess1)
dataset2 = tf.data.TextLineDataset('test_example2.csv').skip(2)#.map(preprocess2)
dataset3 = tf.data.TextLineDataset('test_example3.csv').skip(3)#.map(preprocess3)

dataset = dataset1.concatenate(dataset2).concatenate(dataset3)

for line in dataset:
    print(line)

tf.Tensor(b'0,test,neg,0_2.txt,"Once again Mr. Costner has dragged out a movie for far longer than necessary. Aside from the terrific sea rescue sequences, of which there are very few I just did not care about any of the characters. Most of us have ghosts in the closet, and Costner\'s character are realized early on, and then forgotten until much later, by which time I did not care. The character we should really care about is a very cocky, overconfident Ashton Kutcher. The problem is he comes off as kid who thinks he\'s better than anyone else around him and shows no signs of a cluttered closet. His only obstacle appears to be winning over Costner. Finally when we are well past the half way point of this stinker, Costner tells us all about Kutcher\'s ghosts. We are told why Kutcher is driven to be the best with no prior inkling or foreshadowing. No magic here, it was all I could do to keep from turning it off an hour in."', shape=(), dtype=string)
tf.Tensor(b'1,test,neg,10000_4.txt,"T

# Sketch Load Translation Dataset


In [17]:
tokenizer = tfds.deprecated.text.Tokenizer()

english = tf.data.TextLineDataset("english.csv")
swedish = tf.data.TextLineDataset("swedish.csv")
dataset = tf.data.Dataset.zip((english, swedish))

for eng, swe in dataset.skip(1):
    print(tokenizer.tokenize(eng.numpy()))
    print(tokenizer.tokenize(swe.numpy().decode("UTF-8")))

['i', 'love', 'tuna']
['jag', 'älskar', 'tonfisk']
['i', 'love', 'potato']
['jag', 'älskar', 'potatis']
['i', 'love', 'bacon']
['jag', 'älskar', 'bacon']


# TODO:
1. vocabulary (for each language)
2. tokenize and numericalize words
3. padded_batch, create model