### NLP

Previous example: [/examples/keras_applications/transfer_learning.ipynb](https://github.com/serhatsoyer/py4ML/blob/main/examples/keras_applications/transfer_learning.ipynb)  
Modified from: [NLP section of Tensorflow Udemy Course from Jose Portilla - Pierian Training](https://www.udemy.com/course/complete-tensorflow-2-and-keras-deep-learning-bootcamp/)  
Next example: [/examples/autoencoders/intro.ipynb](https://github.com/serhatsoyer/py4ML/blob/main/examples/autoencoders/intro.ipynb)

In [1]:
import sys
sys.path.insert(0, '../../') # To be able to reach 'datasets' folder
from pathlib import Path
import re
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Embedding, GRU, Dense, Dropout
from keras.losses import sparse_categorical_crossentropy
from keras.callbacks import EarlyStopping

In [2]:
# Locate the Dostoyevski corpus two directory levels above the notebook's working directory
dataset_path = Path.cwd().parent.parent / 'datasets' / 'dostoyevski'
# Build the list directly; a comprehension should not be used only for its append() side effect
book_names = [item for item in dataset_path.iterdir() if item.suffix == '.txt']
print(book_names)

[PosixPath('/Users/serhatsoyer/Repos/py4ML/datasets/dostoyevski/The Brothers Karamazov.txt'), PosixPath('/Users/serhatsoyer/Repos/py4ML/datasets/dostoyevski/The Idiot.txt'), PosixPath('/Users/serhatsoyer/Repos/py4ML/datasets/dostoyevski/The Possessed.txt'), PosixPath('/Users/serhatsoyer/Repos/py4ML/datasets/dostoyevski/Poor Folk.txt'), PosixPath('/Users/serhatsoyer/Repos/py4ML/datasets/dostoyevski/Crime and Punishment.txt'), PosixPath('/Users/serhatsoyer/Repos/py4ML/datasets/dostoyevski/Notes from the Underground.txt'), PosixPath('/Users/serhatsoyer/Repos/py4ML/datasets/dostoyevski/White Nights and Other Stories.txt'), PosixPath('/Users/serhatsoyer/Repos/py4ML/datasets/dostoyevski/Short Stories.txt'), PosixPath('/Users/serhatsoyer/Repos/py4ML/datasets/dostoyevski/The House of the Dead.txt'), PosixPath('/Users/serhatsoyer/Repos/py4ML/datasets/dostoyevski/The Gambler.txt')]


In [3]:
# Read every book and concatenate them, separated by a single space.
# Path.read_text opens and closes each file itself (no handle leaks if a read fails),
# and the explicit encoding keeps the non-ASCII characters of the corpus from being
# decoded with whatever the platform default happens to be.
# The entries of book_names are already full paths (they come from dataset_path.iterdir()).
text = ' '.join(book_path.read_text(encoding='utf-8') for book_path in book_names)

# Free the names that are no longer needed
del dataset_path, book_names
def print_text(text):
    """Print the length of ``text`` and a preview of its first 95 characters."""
    print(f'{len(text) = }\n{text[:95] = }')
# Show the total character count and the opening characters of the concatenated corpus
print_text(text)

len(text) = 8474359
text[:95] = 'The Brothers Karamazov\n\nPART I\n\n\n\n\nBook I. The History Of A Family\n\n\n\n\nChapter I.\nFyodor Pavlov'


In [4]:
def get_chars():
    """Print and return the sorted unique characters of the module-level ``text``."""
    chars = sorted(set(text))
    print(f"{len(chars) = }\n{''.join(chars) = }")
    return chars
# Reduce the corpus to a small fixed alphabet. get_chars() is called after every
# step so the shrinking character set is printed along the way.
chars = get_chars()
text = text.lower()
chars = get_chars()
# Single pass over the corpus: digits 1-9 -> '0', the marks ? ! ; : -> '.', newline -> ' '
text = text.translate(str.maketrans('123456789?!;:\n', '000000000.... '))
chars = get_chars()
# Drop every non-ASCII character
text = text.encode('ascii', errors='ignore').decode('ascii')
chars = get_chars()
# Every remaining character outside the target alphabet becomes the '*' placeholder
allowed = set('abcdefghijklmnopqrstuvwxyz0.,* ')
text = text.translate({ord(char): '*' for char in chars if char not in allowed})
del allowed
chars = get_chars()
# Lookup tables between characters and their integer ids (position in the sorted alphabet)
char_to_idx = {char: idx for idx, char in enumerate(chars)}
idx_to_char = np.array(chars)
print_text(text)
# Collapse runs of the same filler character into a single occurrence.
# re.escape builds the pattern safely; the previous non-raw '\.' / '\,' / '\*'
# literals were invalid escape sequences that Python warns about.
for temp in ' 0.,*':
    text = re.sub(f'{re.escape(temp)}+', temp, text)
print_text(text)
del temp

len(chars) = 108
''.join(chars) = '\n !"\'()*,-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyzÀÆÈÉÏàâäæçèéêëîïôöùüŒœ‐—‘’“”'
len(chars) = 76
''.join(chars) = '\n !"\'()*,-.0123456789:;=?[]_abcdefghijklmnopqrstuvwxyzàâäæçèéêëîïôöùüœ‐—‘’“”'
len(chars) = 62
''.join(chars) = ' "\'()*,-.0=[]_abcdefghijklmnopqrstuvwxyzàâäæçèéêëîïôöùüœ‐—‘’“”'
len(chars) = 40
''.join(chars) = ' "\'()*,-.0=[]_abcdefghijklmnopqrstuvwxyz'
len(chars) = 31
''.join(chars) = ' *,.0abcdefghijklmnopqrstuvwxyz'
len(text) = 8414469
text[:95] = 'the brothers karamazov  part i     book i. the history of a family     chapter i. fyodor pavlov'
len(text) = 8362974
text[:95] = 'the brothers karamazov part i book i. the history of a family chapter i. fyodor pavlovitch kara'


In [5]:
seq_len = 128  # Number of characters in each model input sequence
batch_size = 256
buffer_size = 15000 # Not a critical value
# Encode the whole corpus as integer character ids
encoded = np.array([char_to_idx[char] for char in text])
print(f'{encoded.shape = }, {encoded[:5] = }')
dataset = tf.data.Dataset.from_tensor_slices(encoded)
# Chunks hold seq_len + 1 ids so that input and target can be offset by one character
seqs = dataset.batch(seq_len + 1, drop_remainder=True)


def get_in_and_out(seq):
    """Split a chunk into (input, target), where target is the input shifted by one position."""
    return seq[:-1], seq[1:]


dataset = seqs.map(get_in_and_out)
print(f'Complete dataset length: {len(dataset) = }')
# The first 1/16 of the sequences is held out for validation, the rest is used for training
cutoff = round(len(dataset) / 16)
test = dataset.take(cutoff)
train = dataset.skip(cutoff)
print(f'Train dataset length: {len(train) = }')
print(f'Test dataset length: {len(test) = }')
# Show one (input, target) pair as ids and as text.
# The loop names deliberately avoid shadowing the builtin input().
for sample_in, sample_out in train.take(1):
    print(sample_in.numpy()[:5], sample_in.numpy()[-5:], '\n', ''.join(idx_to_char[sample_in.numpy()]))
    print(sample_out.numpy()[:5], sample_out.numpy()[-5:], '\n', ''.join(idx_to_char[sample_out.numpy()]))

# drop_remainder=True keeps every batch at exactly batch_size sequences
train = train.shuffle(buffer_size).batch(batch_size, drop_remainder=True)
test = test.shuffle(buffer_size).batch(batch_size, drop_remainder=True)
del dataset, buffer_size, cutoff, encoded, sample_in, sample_out, seqs, text
train

encoded.shape = (8362974,), encoded[:5] = array([24, 12,  9,  0,  6])
Metal device set to: Apple M2

systemMemory: 8.00 GB
maxCacheSize: 2.67 GB

Complete dataset length: len(dataset) = 64829
Train dataset length: len(train) = 60777
Test dataset length: len(test) = 4052


2022-12-26 12:46:30.640239: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-12-26 12:46:30.640672: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2022-12-26 12:46:30.705190: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


[19 19  8  6 29] [ 9 26  9 18  0] 
 oodby. and alyosha ran downstairs and into the street. chapter ii. smerdyakov with a guitar he had no time to lose indeed. even 
[19  8  6 29  3] [26  9 18  0 27] 
 odby. and alyosha ran downstairs and into the street. chapter ii. smerdyakov with a guitar he had no time to lose indeed. even w


<BatchDataset element_spec=(TensorSpec(shape=(256, 128), dtype=tf.int64, name=None), TensorSpec(shape=(256, 128), dtype=tf.int64, name=None))>

In [6]:
# Output dimension of the Embedding layer built in create_model
embed_size = 64
def sparse_cat_loss(y_true, y_pred):
    """Sparse categorical cross-entropy with ``y_pred`` interpreted as raw logits."""
    return sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
def create_model(batch_size=batch_size):
    """Build and compile the character-level model.

    The stack is: embedding -> two stateful GRU layers -> ReLU dense layer ->
    dropout -> dense layer with one output per character in ``chars``.
    The last layer has no activation, which matches the ``from_logits=True``
    setting of ``sparse_cat_loss``.

    Args:
        batch_size: Fixed batch size baked into the input shape. It defaults
            to the module-level ``batch_size`` at definition time.

    Returns:
        The compiled ``Sequential`` model.
    """
    vocab_size = len(chars)
    layers = [
        Embedding(vocab_size, embed_size, batch_input_shape=[batch_size, None]),
        GRU(256, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
        GRU(256, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
        Dense(256, activation='relu'),
        Dropout(.25),
        Dense(vocab_size),
    ]
    model = Sequential(layers)
    model.compile(optimizer='adam', loss=sparse_cat_loss)
    return model


# Build the training model with the default batch size and print its layer summary
model = create_model()
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (256, None, 64)           1984      
                                                                 
 gru (GRU)                   (256, None, 256)          247296    
                                                                 
 gru_1 (GRU)                 (256, None, 256)          394752    
                                                                 
 dense (Dense)               (256, None, 256)          65792     
                                                                 
 dropout (Dropout)           (256, None, 256)          0         
                                                                 
 dense_1 (Dense)             (256, None, 31)           7967      
                                                                 
Total params: 717,791
Trainable params: 717,791
Non-trai

In [7]:
# Sanity check before training: run one training batch through the model and sample
# one character per time step for the first sequence of the batch.
# The batch is not called `input`, so the builtin input() is never shadowed;
# the printed label is spelled out so the output text is unchanged.
for batch_in, target in train.take(1):
    pred_0 = model(batch_in)
    # pred_0[0] holds the per-step logits of the first sequence; draw one id per step
    pred_1 = tf.random.categorical(pred_0[0], num_samples=1)
    pred = tf.squeeze(pred_1, axis=-1).numpy()
    print(f'input.shape = {batch_in.shape!r}\n{target.shape = }\n{pred_0.shape = }\n{pred_1.shape = }\n{pred.shape = }')
    print(f"{batch_in[0].numpy()[:5]}, {batch_in[0].numpy()[-5:]}\n{''.join(idx_to_char[batch_in[0].numpy()])}")
    print(f"{target[0].numpy()[:5]}, {target[0].numpy()[-5:]}\n{''.join(idx_to_char[target[0].numpy()])}")
    print(f"{pred[:5]}, {pred[-5:]}\n{''.join(idx_to_char[pred])}")

del batch_in, target, pred_0, pred_1, pred

input.shape = TensorShape([256, 128])
target.shape = TensorShape([256, 128])
pred_0.shape = TensorShape([256, 128, 31])
pred_1.shape = TensorShape([128, 1])
pred.shape = (128,)
[27  0 11 16  5], [ 0 27  5 23  0]
w glad she will be, how delighted. he muttered, but lapsed into silence again. and indeed it was not to please grushenka he was 
[ 0 11 16  5  8], [27  5 23  0 24]
 glad she will be, how delighted. he muttered, but lapsed into silence again. and indeed it was not to please grushenka he was t
[17 15 27 16  8], [20 30  4 26 13]
mkwldbrbhvgsiyrm0xa sh0xiltojry*km0q zdjpfujyf*kmbkkvrz nvaobzkwmmsab y0qcwfnsbmi*n,vts0sxpmtr ad,emsytn,cxowmt,xaf.pnazxlgpz0vi


In [8]:
# Train for at most 128 epochs. The early-stopping callback watches the validation
# loss with a patience of 4 epochs and restores the best weights when it stops.
model.fit(
    train,
    validation_data=test,
    epochs=128,
    callbacks=[
        EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4, restore_best_weights=True),
    ],
)
# Save the trained model to an HDF5 file
model_name = 'dost1.h5'
model.save(model_name)

Epoch 1/128


2022-12-26 12:46:34.080325: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-12-26 12:46:35.328979: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-12-26 12:46:35.569703: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-12-26 12:46:35.929443: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-12-26 12:46:36.287898: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2022-12-26 12:47:26.906242: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-12-26 12:47:27.236956: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-12-26 12:47:27.493480: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/128
Epoch 3/128
Epoch 4/128
Epoch 5/128
Epoch 6/128
Epoch 7/128
Epoch 8/128
Epoch 9/128
Epoch 10/128
Epoch 11/128
Epoch 12/128
Epoch 13/128
Epoch 14/128
Epoch 15/128
Epoch 16/128
Epoch 17/128
Epoch 18/128
Epoch 19/128
Epoch 19: early stopping


In [9]:
# Rebuild the same architecture with a batch size of 1 (text is generated one
# sequence at a time) and load the trained weights from the file saved above
test_model = create_model(1)
test_model.load_weights(model_name)
test_model.build(tf.TensorShape([1, None]))

In [10]:
def dostoyevski_writes(model, seed, num_chars=512, temperature=0.95):
    """Print ``seed`` followed by ``num_chars`` characters sampled from ``model``.

    The seed is fed to the model as one batch of character ids. After that only
    the most recently sampled character is fed back, so the model has to carry
    its recurrent state between calls (``create_model`` builds stateful GRUs).

    Args:
        model: Model with a ``reset_states`` method. When called on a batch of
            one id sequence it returns per-step logits over the characters.
        seed: Non-empty starting text. Every character must be a key of
            ``char_to_idx``.
        num_chars: Number of characters to generate after the seed.
        temperature: Positive divisor applied to the logits before sampling.
            Values below 1 sharpen the distribution (less surprising text),
            values above 1 flatten it (more surprising text).

    Raises:
        ValueError: If ``seed`` is empty or ``temperature`` is not positive.
        KeyError: If ``seed`` contains a character missing from ``char_to_idx``.
    """
    if not seed:
        raise ValueError('seed must contain at least one character')
    if temperature <= 0:
        raise ValueError('temperature must be positive')

    next_input = tf.expand_dims([char_to_idx[char] for char in seed], 0)
    generated = []
    model.reset_states()  # Start from a clean recurrent state
    for _ in range(num_chars):
        # Remove the batch dimension and rescale the logits by the temperature
        logits = tf.squeeze(model(next_input), 0) / temperature
        # Only the sample drawn for the last time step is kept
        pred_idx = tf.random.categorical(logits, num_samples=1)[-1, 0].numpy()
        next_input = tf.expand_dims([pred_idx], 0)
        generated.append(idx_to_char[pred_idx])

    print(seed + ''.join(generated))

In [11]:
# Generate and print text that continues the seed 'poor'
dostoyevski_writes(test_model, 'poor')

poormanionaless for the shamperic child thas known on the wurnst to le*t one around his way a call. he ben* nothing mart all. they all ect is work and you as, the less confition. she here.* he elets and another. i did not wronge how surpally that ou*like at such that has a last. we she dare her the room. the might there would more roublesical shar. riched dehigg means and alexanov very days, ponky with a ran at the suddenly unders her crieds delicour* brohor feet, but one will grow more would say of factioned f


Previous example: [/examples/keras_applications/transfer_learning.ipynb](https://github.com/serhatsoyer/py4ML/blob/main/examples/keras_applications/transfer_learning.ipynb)  
Modified from: [NLP section of Tensorflow Udemy Course from Jose Portilla - Pierian Training](https://www.udemy.com/course/complete-tensorflow-2-and-keras-deep-learning-bootcamp/)  
Next example: [/examples/autoencoders/intro.ipynb](https://github.com/serhatsoyer/py4ML/blob/main/examples/autoencoders/intro.ipynb)