In [1]:
# Import necessary libraries and modules. TensorFlow and Keras for model building, Pandas for data manipulation,
# Numpy for numerical operations, and regular expressions for text processing.
import tensorflow as tf
import pandas as pd
import numpy as np
import re
from pathlib import Path
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout, LSTM, Dense
from tensorflow.keras.utils import Sequence
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint




In [2]:
# Location of the review CSV, given relative to the notebook's working directory.
data_path = Path('b.csv')
# Read the CSV into a DataFrame, decoding the file as ISO-8859-1 rather than the UTF-8 default.
df = pd.read_csv(filepath_or_buffer=data_path, encoding='ISO-8859-1')

In [3]:
# Define a function to preprocess the text data by converting to lowercase, removing extra whitespace,
# and stripping punctuation. This standardization aids in reducing the complexity of the model's input space.
def clean_text(text):
    """
    Normalise a piece of review text for character-level modelling.

    The text is lowercased, every character that is neither a word character
    nor whitespace is removed, runs of whitespace are collapsed to a single
    space, and leading/trailing whitespace is stripped.

    Args:
        text: The raw text. ``None`` and float NaN (how pandas represents a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str()`` first.

    Returns:
        str: The cleaned text (possibly empty).
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)
    text = text.lower()
    # Strip punctuation BEFORE collapsing whitespace: removing a punctuation mark that sat
    # between two spaces would otherwise leave a double space behind.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [4]:
# Apply the cleaning function to every review. Missing reviews (which pandas reads as NaN)
# are replaced with empty text first so that clean_text always receives a string.
df['Cleaned_Review'] = df['Review'].fillna('').astype(str).apply(clean_text)
# Concatenate all the cleaned reviews into a single space-separated string
text_corpus = ' '.join(df['Cleaned_Review'].astype(str))

In [5]:
# Character-level vocabulary: every distinct character of the corpus, in sorted order,
# plus lookup tables in both directions (character -> index and index -> character).
chars = sorted(set(text_corpus))
char_to_index = {ch: idx for idx, ch in enumerate(chars)}
index_to_char = dict(enumerate(chars))

In [7]:
# Parameters for data processing and model training:
#   batch_size - number of sequences per batch
#   max_length - number of characters in each input sequence
#   step       - number of characters between the starts of consecutive sequences
batch_size, max_length, step = 128, 150, 5

# Define a custom data generator class that inherits from Keras' Sequence class. The text corpus itself stays
# in memory; the much larger one-hot encoded arrays are built one batch at a time, on the fly.
class CharGenerator(Sequence):
    """
    Keras ``Sequence`` that produces one-hot encoded character windows and their next-character targets.

    The corpus is cut into windows of ``max_length`` characters whose start positions are ``step``
    characters apart. Only windows that are followed by at least one more character are used, so
    every sample has a real target. Batches are encoded on demand in ``__getitem__``.

    Attributes:
        text (str): The full text corpus from which to generate sequences.
        char_to_index (dict): Mapping of characters to integer indices for one-hot encoding.
        batch_size (int): Maximum number of sequences per batch (the final batch may be smaller).
        max_length (int): Length of each input sequence.
        step (int): Number of characters to step in the corpus for the next sequence.
        chars (list): Sorted list of unique characters in the corpus.
        num_sequences (int): Number of complete (window, target) pairs available in the corpus.
        num_characters (int): Width of the one-hot encoding, i.e. the number of entries in
            ``char_to_index``.
    """
    def __init__(self, text, char_to_index, batch_size, max_length, step):
        # Initialise the base class; a no-op on Keras versions whose Sequence defines no __init__,
        # and expected by newer Keras versions.
        super().__init__()
        self.text = text
        self.char_to_index = char_to_index
        self.batch_size = batch_size
        self.max_length = max_length
        self.step = step
        self.chars = sorted(set(text))
        # A window starting at s is usable only if its target text[s + max_length] exists,
        # i.e. s <= len(text) - max_length - 1. Count the multiples of `step` in that range;
        # a corpus that is too short yields zero sequences rather than a negative count.
        usable = len(text) - max_length
        self.num_sequences = (usable - 1) // step + 1 if usable > 0 else 0
        # The encoding width must match the indices stored in char_to_index, not merely the
        # characters that happen to occur in this particular text.
        self.num_characters = len(char_to_index)

    def __len__(self):
        # Number of batches: ceiling of num_sequences / batch_size (0 when there are no sequences).
        return (self.num_sequences + self.batch_size - 1) // self.batch_size

    def __getitem__(self, idx):
        """
        Generates and returns a batch of input sequences and their corresponding target characters
        for the given batch index.

        Args:
            idx (int): Index of the batch, in ``range(len(self))``.

        Returns:
            A tuple (x, y) of boolean arrays. x has shape (rows, max_length, num_characters) and holds
            the one-hot encoded input sequences; y has shape (rows, num_characters) and holds the
            one-hot encoded target characters. ``rows`` equals ``batch_size`` except for the final
            batch, which contains only the remaining sequences.

        Raises:
            IndexError: If ``idx`` is outside ``range(len(self))``.
            KeyError: If the text contains a character missing from ``char_to_index``.
        """
        if not 0 <= idx < len(self):
            raise IndexError(f"batch index {idx} out of range for {len(self)} batches")

        first_sequence = idx * self.batch_size
        # Truncate the final batch instead of padding it with rows that have no target.
        rows = min(self.batch_size, self.num_sequences - first_sequence)

        x = np.zeros((rows, self.max_length, self.num_characters), dtype=np.bool_)
        y = np.zeros((rows, self.num_characters), dtype=np.bool_)
        positions = np.arange(self.max_length)
        for row in range(rows):
            start = (first_sequence + row) * self.step
            end = start + self.max_length  # index of the target character
            codes = [self.char_to_index[char] for char in self.text[start:end]]
            # Set one cell per time step: (row, t, code of the t-th character).
            x[row, positions, codes] = True
            y[row, self.char_to_index[self.text[end]]] = True
        return x, y

# Define the neural network architecture for sequence modeling. The model uses a single LSTM layer
# for sequence learning, a dropout layer to reduce overfitting, and a dense output layer with
# a softmax activation function for predicting the next character in the sequence.
model = Sequential()
# The LSTM consumes one-hot windows of shape (max_length, vocabulary size).
model.add(LSTM(128, input_shape=(max_length, len(chars))))
model.add(Dropout(0.2))
# One softmax probability per character in the vocabulary.
model.add(Dense(len(chars), activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Print the layer-by-layer summary of the model
model.summary()

# Callbacks: both watch the training loss (no validation data is passed to fit below).
# Training stops after 5 epochs without improvement, and the best model so far is written to disk.
callbacks = [
    EarlyStopping(monitor='loss', patience=5, restore_best_weights=True),
    ModelCheckpoint(filepath='best_model.h5', monitor='loss', save_best_only=True),
]

# Batch generator over the full corpus
generator = CharGenerator(text_corpus, char_to_index, batch_size, max_length, step)

# Train for at most 20 epochs
model.fit(generator, epochs=20, callbacks=callbacks)

# Persist the model as it stands when training ends
model.save('my_model3.h5')



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 128)               94720     
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense (Dense)               (None, 56)                7224      
                                                                 
Total params: 101944 (398.22 KB)
Trainable params: 101944 (398.22 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/20


  184/22672 [..............................] - ETA: 45:17 - loss: 3.0520 - accuracy: 0.1427

KeyboardInterrupt: 