In [1]:
import os
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.data import Dataset
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (
    Embedding,
    Dense,
    LSTM,
    Input,
    GlobalAveragePooling1D,
    Dropout,
    BatchNormalization,
    LayerNormalization,
    MultiHeadAttention,
    Add
)
import numpy as np

In [16]:
print("Built with CUDA:", tf.test.is_built_with_cuda())

Built with CUDA: False


In [2]:
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]

In [3]:
window_size = 128
stride = 1
embedding_dim = 512
num_layers = 16
num_heads = 8
ff_dim = 1024

In [4]:
def load_essays_from_folder(folder_path):
    essays = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as f:
                essays.append(f.read())
    return essays 

essays = load_essays_from_folder('rawdata/essays')
essays[:5]

['I remember the first time I grasped the power of natural language processing (NLP). In 9th grade, I was toying around with Google Translate when I began wondering how it works. To an end user, it might look simple—type in a sentence, hit enter, and voilà, the machine gives you a translation for any of its 234 languages. I couldn\'t help but think, How is this even possible? Language is so messy and full of nuances.\nThis fascination resonated with my love for math—I find myself searching for numerical patterns, whether factoring street numbers or playing "24" with words (where A=1, B=2, and so on). Through NLP, I discovered how computers could transform language into mathematical representations, combining my love of patterns with real-world impact.\nMy passion grew through hands-on projects. After completing Stanford\'s NLP course on Coursera, I sought mentorship from Krishna Chintalapudi, a Principal Researcher at Microsoft, to improve text classification techniques. For Technology

In [5]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(essays)

In [6]:
vocab_size = len(tokenizer.word_index) + 1
vocab_size

2273

In [7]:
def generate_sliding_window_data(sequence):
    inputs = []
    labels = []

    for i in range(0, len(sequence) - window_size, stride):
        window_sequence = sequence[i:i + window_size]
        next_tokens = sequence[i + 1:i + window_size + 1]  # Shift by one for labels
        inputs.append(window_sequence)
        labels.append(next_tokens)

    return inputs, labels

In [8]:
def prepare_data(essays):
    inputs = []
    labels = []

    for essay in essays:
        tokenized_essay = tokenizer.texts_to_sequences([essay])[0]
        essay_inputs, essay_labels = generate_sliding_window_data(tokenized_essay)
        inputs.extend(essay_inputs)
        labels.extend(essay_labels)

    return np.array(inputs), np.array(labels)

In [9]:
inputs, labels= prepare_data(essays)

In [10]:
train_dataset = Dataset.from_tensor_slices((inputs, labels))
train_dataset = train_dataset.shuffle(10000).batch(8).prefetch(tf.data.experimental.AUTOTUNE)

In [11]:
for batch in train_dataset.take(1):
    print(batch)
    input_shape = batch[0].shape
    label_shape = batch[1].shape
    print(f'Input shape: {input_shape}')
    print(f'Label shape: {label_shape}')

(<tf.Tensor: shape=(8, 128), dtype=int32, numpy=
array([[   7,   14,   45, ...,  144,    3,  532],
       [ 671,    9, 2227, ...,   13,   23,  109],
       [ 217, 1440,   10, ...,    7, 1467,  997],
       ...,
       [ 413,   24,  229, ...,   14,  128,    8],
       [1516,   20,  476, ...,  142, 1539,  202],
       [ 625,  482,   11, ...,    5, 1691,    1]])>, <tf.Tensor: shape=(8, 128), dtype=int32, numpy=
array([[  14,   45,  885, ...,    3,  532,    7],
       [   9, 2227,    1, ...,   23,  109,   71],
       [1440,   10,    5, ..., 1467,  997,  789],
       ...,
       [  24,  229,    4, ...,  128,    8,    6],
       [  20,  476,  805, ..., 1539,  202,  478],
       [ 482,   11,    4, ..., 1691,    1,  371]])>)
Input shape: (8, 128)
Label shape: (8, 128)


In [12]:
inputs = Input(shape=(window_size,))
embedding = Embedding(vocab_size, embedding_dim)(inputs) 
positional_encoding = Embedding(input_dim=window_size, output_dim=embedding_dim)(tf.range(start=0, limit=window_size))
positional_encoding = tf.expand_dims(positional_encoding, 0)
embedding = Add()([embedding, positional_encoding])

x = embedding

for _ in range(num_layers):
    attn = MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim // num_heads)(x, x)
    x = Add()([x, attn])
    x = LayerNormalization()(x)

    ff = Dense(ff_dim, activation='relu')(x)
    ff = Dense(embedding_dim)(ff)
    x = Add()([x, ff])
    x = LayerNormalization()(x)

outputs = Dense(vocab_size, activation='softmax')(x)
model = Model(inputs, outputs)

In [13]:
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])


In [14]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 128)]                0         []                            
                                                                                                  
 embedding (Embedding)       (None, 128, 512)             1163776   ['input_1[0][0]']             
                                                                                                  
 add (Add)                   (None, 128, 512)             0         ['embedding[0][0]']           
                                                                                                  
 multi_head_attention (Mult  (None, 128, 512)             1050624   ['add[0][0]',                 
 iHeadAttention)                                                     'add[0][0]']             

In [15]:
model.fit(train_dataset, epochs=100)

Epoch 1/100
 17/811 [..............................] - ETA: 15:27 - loss: 6.6843 - accuracy: 0.0317

KeyboardInterrupt: 