# Use Case Demonstration of LexiVerse, idea is to train a small generative model (from scratch using a fraction of my dataset) using LSTM to show that this dataset can be used for fine tuning LLMs or any GPTs 

In [36]:
import os
import numpy as np
import random
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import re
import gc

# Step 1: Load and Preprocess Data in Batches
# I am selecting only 200 words per file for a faster training
def load_and_preprocess_data(path, words_per_file=200):
    texts = []
    for file in os.listdir(path):
        if file.endswith('.txt'):
            with open(os.path.join(path, file), 'r', encoding='utf-8') as f:
                text = f.read().lower()
                # Basic text cleaning
                text = re.sub(r'[^\w\s]', '', text)
                # Take only the first 500 words
                words = text.split()[:words_per_file]
                texts.extend(words)
    return ' '.join(texts)

# Load all text files, taking only the first 200 words from each
data_path = '/kaggle/input/lexi-verse/'
corpus = load_and_preprocess_data(data_path)

# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([corpus])
total_words = len(tokenizer.word_index) + 1

input_sequences = []
for line in corpus.split('\n'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)
    gc.collect()

max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

X, y = input_sequences[:,:-1], input_sequences[:,-1]
y = to_categorical(y, num_classes=total_words)

model = Sequential()
model.add(Embedding(total_words, 200, input_length=max_sequence_len-1))
model.add(LSTM(200, return_sequences=False))  
model.add(Dense(total_words, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [43]:
print (total_words)
print (X)
print (y)

2146
[[   0    0    0 ...    0    0    2]
 [   0    0    0 ...    0    2  665]
 [   0    0    0 ...    2  665    4]
 ...
 [   0    0    2 ... 2142 2143 2144]
 [   0    2  665 ... 2143 2144 2145]
 [   2  665    4 ... 2144 2145  664]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [38]:
# Training the Model in Batches
batch_size = 64
def batch_generator(X, y, batch_size):
    while True:
        for i in range(0, len(X), batch_size):
            yield X[i:i+batch_size], y[i:i+batch_size]

model.fit(batch_generator(X, y, batch_size), steps_per_epoch=len(X)//batch_size, epochs=60, verbose=1)

Epoch 1/60
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 265ms/step - accuracy: 0.0130 - loss: 7.6667
Epoch 2/60
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 265ms/step - accuracy: 0.0419 - loss: 7.5287
Epoch 3/60
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 263ms/step - accuracy: 0.0066 - loss: 7.6381
Epoch 4/60
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 264ms/step - accuracy: 0.0145 - loss: 7.5681
Epoch 5/60
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 264ms/step - accuracy: 0.0188 - loss: 7.4075
Epoch 6/60
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 264ms/step - accuracy: 0.0190 - loss: 7.2574
Epoch 7/60
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 264ms/step - accuracy: 0.0175 - loss: 7.1415
Epoch 8/60
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 264ms/step - accuracy: 0.0181 - loss: 7.2453
Epoch 9/60
[1m62/62[0m [32m━━

<keras.src.callbacks.history.History at 0x7f798741e470>

In [56]:
# Step 4: Text Generation Function
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = np.argmax(model.predict(token_list), axis=-1)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text

# Demonstration
print("Generating text from different seeds:")
seeds = [
    
    "The environment is crucial because",
    "Health is important because"
]

for seed in seeds:
    generated_text = generate_text(seed, 6, model, max_sequence_len)
    print(f"\nSeed: {seed}")
    print(f"Generated: {generated_text}")


Generating text from different seeds:
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step

Seed: The environment is crucial because
Generated: The environment is crucial because self decade ago says already destination
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

## Could be improved using Word2Vec or GLOVE and also due to preprocessing our dataset do not ober grammar of English language. 
## httpswwwhealthcom is printed due to multiple links as metadata in our file on which our model is trained

## This code runs only a sampled version of the dataset, still performs decent enough without any hyperparameter tuning. So, the full dataset can be used for Fine Tuning any SOTA GPTs.