In [None]:
def tokenize(text):
    """Lower-case ``text`` and split it into whitespace-delimited tokens.

    Punctuation is not stripped, so it stays attached to its word.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list of lower-cased tokens in their original order.
    """
    return text.lower().split()

def create_word_dictionary(word_list):
    """Build a word -> integer index vocabulary.

    Index 0 is reserved for the "UNK" entry; every other word receives the
    next free index in order of first appearance.

    Args:
        word_list: Iterable of word tokens (duplicates allowed).

    Returns:
        A dict mapping "UNK" to 0 and each distinct word to a unique index
        starting at 1.
    """
    vocabulary = {"UNK": 0}
    for token in word_list:
        # The current size of the mapping is always the next unused index.
        if token not in vocabulary:
            vocabulary[token] = len(vocabulary)
    return vocabulary

def text_to_sequence(sentence, word_dict):
    """Convert a sentence into a list of vocabulary indices.

    The sentence is lower-cased and split on whitespace. Words missing from
    ``word_dict`` are mapped to the index of its "UNK" entry (0 when the
    dictionary has no such entry) instead of raising ``KeyError``.

    Args:
        sentence: The text to encode.
        word_dict: Mapping from word to integer index.

    Returns:
        A list with one integer index per word of ``sentence``.
    """
    unknown_index = word_dict.get("UNK", 0)
    words = sentence.lower().strip().split()
    return [word_dict.get(word, unknown_index) for word in words]

def pad_sequences(sequences, max_length=None):
    """Left-pad (and, if needed, left-truncate) sequences to a common length.

    Args:
        sequences: Iterable of integer sequences.
        max_length: Target length. When ``None``, the length of the longest
            sequence is used (0 for empty input).

    Returns:
        A list of lists, each exactly ``max_length`` long. Shorter sequences
        are padded with zeros at the front; longer ones keep only their last
        ``max_length`` elements.

    Raises:
        ValueError: If ``max_length`` is negative.
    """
    # Materialise once so one-shot iterables survive both passes below.
    rows = [list(seq) for seq in sequences]

    if max_length is None:
        max_length = max((len(row) for row in rows), default=0)
    elif max_length < 0:
        raise ValueError(f"max_length must be non-negative, got {max_length}")

    padded_sequences = []
    for row in rows:
        overflow = len(row) - max_length
        if overflow > 0:
            # Drop the oldest elements so the output is never ragged.
            row = row[overflow:]
        padded_sequences.append([0] * (max_length - len(row)) + row)

    return padded_sequences

def split_sequences(sequences):
    """Split each sequence into model input and label.

    The input is everything except the last element; the label is the last
    element, kept as a one-element slice (``seq[-1:]``) so an empty sequence
    yields an empty label rather than raising.

    The input is traversed only once, so one-shot iterables such as
    generators are handled correctly.

    Args:
        sequences: Iterable of sliceable sequences.

    Returns:
        A tuple ``(xs, labels)`` of two lists aligned by position.
    """
    xs = []
    labels = []
    for seq in sequences:
        xs.append(seq[:-1])
        labels.append(seq[-1:])
    return xs, labels

def one_hot_encode_with_checks(value, corpus_size):
    """Return a one-hot list of length ``corpus_size`` with a 1 at ``value``.

    Args:
        value: Index of the hot position.
        corpus_size: Length of the encoded vector.

    Returns:
        A list of ``corpus_size`` integers, all 0 except a single 1.

    Raises:
        ValueError: If ``value`` is not in ``[0, corpus_size)``.
    """
    in_range = 0 <= value < corpus_size
    if not in_range:
        raise ValueError(f"Value {value} is out of range for corpus size {corpus_size}")

    one_hot = [0 for _ in range(corpus_size)]
    one_hot[value] = 1
    return one_hot


In [None]:
# Training corpus: song lyrics with "\n" separating the lines. The literal is
# written as two adjacent string literals inside parentheses so that it stays
# syntactically valid (a quoted string cannot contain a raw line break).
data = (
    "In the town of Athy one Jeremy Lanigan \n Battered away til he hadnt a pound. \nHis father died and made him a man again \n Left him a farm and ten acres of ground. \nHe gave a grand party for friends and relations \nWho didnt forget him when come to the wall, \nAnd if youll but listen Ill make your eyes glisten \nOf the rows and the ructions of Lanigans Ball. \nMyself to be sure got free invitation, \nFor all the nice girls and boys I might ask, \nAnd just in a minute both friends and relations \nWere dancing round merry as bees round a cask. \nJudy ODaly, that nice little milliner, \nShe tipped me a wink for to give her a call, \nAnd I soon arrived with Peggy McGilligan \nJust in time for Lanigans Ball. \nThere were lashings of punch and wine for the ladies, \nPotatoes and cakes; there was bacon and tea, \nThere were the Nolans, Dolans, OGradys \nCourting the girls and dancing away. \nSongs they went round as plenty as water, \nThe harp that once sounded in Taras old hall,\nSweet Nelly Gray and The Rat Catchers Daughter,\nAll singing together at Lanigans Ball. \nThey were doing all kinds of nonsensical polkas \nAll round the room in a whirligig. \nJulia and I, we banished their nonsense \nAnd tipped them the twist of a reel and a jig. \nAch mavrone, how the girls got all mad at me \nDanced til youd think the ceiling would fall. \nFor I spent three weeks at Brooks Academy \nLearning new steps for Lanigans Ball. \nThree long weeks I spent up in Dublin, \nThree long weeks to learn nothing at all,\n Three long weeks I spent up in Dublin, \nLearning new steps for Lanigans Ball. \nShe stepped out and I stepped in again, \nI stepped out and she stepped in again, \nShe stepped out and I stepped in again, \nLearning new steps for Lanigans Ball. \nBoys were all merry and the girls they were hearty \nAnd danced all around in couples and groups, \nTil an accident happened, young Terrance McCarthy \nPut his right leg through miss Finnertys hoops. "
    "\nPoor creature fainted and cried Meelia murther, \nCalled for her brothers and gathered them all. \nCarmody swore that hed go no further \nTil he had satisfaction at Lanigans Ball. \nIn the midst of the row miss Kerrigan fainted, \nHer cheeks at the same time as red as a rose. \nSome of the lads declared she was painted, \nShe took a small drop too much, I suppose. \nHer sweetheart, Ned Morgan, so powerful and able, \nWhen he saw his fair colleen stretched out by the wall, \nTore the left leg from under the table \nAnd smashed all the Chaneys at Lanigans Ball. \nBoys, oh boys, twas then there were runctions. \nMyself got a lick from big Phelim McHugh. \nI soon replied to his introduction \nAnd kicked up a terrible hullabaloo. \nOld Casey, the piper, was near being strangled. \nThey squeezed up his pipes, bellows, chanters and all. \nThe girls, in their ribbons, they got all entangled \nAnd that put an end to Lanigans Ball."
)

# Build the vocabulary from the whole corpus; the printed size includes the
# reserved "UNK" entry.
tokens = tokenize(data)
word_index = create_word_dictionary(tokens)
print(len(word_index))



In [None]:
# Show the complete word -> index vocabulary mapping.
print(word_index)

In [None]:
# Show the lower-cased, whitespace-split tokens of the whole corpus.
print(tokens)

In [None]:
# Each lyric line is treated as an independent training sentence.
corpus = data.lower().split("\n")

# For every line, collect all of its prefixes containing at least two
# tokens; the last token of each prefix later serves as the label.
input_sequences = []
for line in corpus:
    token_list = text_to_sequence(line, word_index)
    input_sequences.extend(
        token_list[:end] for end in range(2, len(token_list) + 1)
    )

# Left-pad every prefix to the length of the longest one.
max_sequence_len = max(map(len, input_sequences))
input_sequences = pad_sequences(input_sequences, max_sequence_len)
print(input_sequences)

In [None]:
# Separate each padded sequence into its input (all but the last index) and
# its label (the last index, kept as a one-element list).
xs, labels = split_sequences(input_sequences)
print(xs)
print(labels)

In [None]:
# One-hot encode every label over the full vocabulary. Each label is a
# one-element list, hence the [0].
vocab_size = len(word_index)
ys = [one_hot_encode_with_checks(label[0], vocab_size) for label in labels]
print(ys[0])

In [None]:
import torch
import torch.nn as nn

class LSTMPredictor(nn.Module):
    """Next-word classifier: embedding -> bidirectional LSTM -> linear -> softmax.

    ``forward`` returns probabilities (softmax already applied), not logits.
    Losses that expect raw logits, such as ``nn.CrossEntropyLoss``, must not
    be applied to this output directly.

    Args:
        total_words: Vocabulary size; also the number of output classes.
        embedding_dim: Size of each embedding vector.
        hidden_dim: LSTM hidden size per direction. When ``None``, the
            module-level ``max_sequence_len - 1`` is used, so that global
            must exist at construction time.
    """

    def __init__(self, total_words, embedding_dim=8, hidden_dim=None):
        super().__init__()

        # If hidden_dim not specified, use max_sequence_len-1 as in TF version
        if hidden_dim is None:
            hidden_dim = max_sequence_len - 1

        # Embedding layer
        self.embedding = nn.Embedding(total_words, embedding_dim)

        # Bidirectional LSTM
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            bidirectional=True,
            batch_first=True,
        )

        # Final dense layer (accounting for bidirectional LSTM)
        self.fc = nn.Linear(hidden_dim * 2, total_words)

        # Softmax over the class dimension
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return next-word probabilities for a batch of index sequences.

        Args:
            x: Integer tensor of token indices, batch dimension first.

        Returns:
            Tensor with one row of ``total_words`` probabilities per batch
            element.
        """
        embedded = self.embedding(x)

        # Use the final hidden state of each direction. Slicing the per-step
        # output at the last time step would pair the fully-read forward
        # state with a backward state that has only seen the final token.
        _, (h_n, _) = self.lstm(embedded)
        summary = torch.cat((h_n[-2], h_n[-1]), dim=1)

        logits = self.fc(summary)
        return self.softmax(logits)

# Training setup
total_words = len(word_index)
model = LSTMPredictor(total_words)
# LSTMPredictor.forward already applies softmax, so the model emits
# probabilities. nn.CrossEntropyLoss would apply log-softmax a second time;
# negative log-likelihood on the log of those probabilities is the correct
# cross-entropy for this output.
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters())

# Convert data to PyTorch tensors (xs and ys are nested Python lists)
xs_tensor = torch.LongTensor(xs)
ys_tensor = torch.FloatTensor(ys)

# Class indices recovered from the one-hot rows; constant across epochs,
# so they are computed once instead of inside the loop.
targets = ys_tensor.argmax(dim=1)
total = targets.size(0)

import matplotlib.pyplot as plt

# Lists to store metrics
train_losses = []
train_accuracies = []

# Training loop with accuracy tracking (full batch, one step per epoch)
num_epochs = 15000
model.train()

for epoch in range(num_epochs):
    # Forward pass; clamp before the log so a zero probability cannot
    # produce -inf
    outputs = model(xs_tensor)
    loss = criterion(torch.log(outputs.clamp_min(1e-12)), targets)

    # Calculate accuracy
    predicted = outputs.argmax(dim=1)
    correct = (predicted == targets).sum().item()
    accuracy = 100 * correct / total

    # Store metrics
    train_losses.append(loss.item())
    train_accuracies.append(accuracy)

    # Backward pass and optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Print progress every 100 epochs
    if (epoch + 1) % 100 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], '
              f'Loss: {loss.item():.4f}, '
              f'Accuracy: {accuracy:.2f}%')

# Plot training metrics
plt.figure(figsize=(12, 4))

# Plot loss
plt.subplot(1, 2, 1)
plt.plot(train_losses)
plt.title('Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')

# Plot accuracy
plt.subplot(1, 2, 2)
plt.plot(train_accuracies)
plt.title('Training Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy (%)')

plt.tight_layout()
plt.show()

# Print final metrics
print('\nFinal Results:')
print(f'Loss: {train_losses[-1]:.4f}')
print(f'Accuracy: {train_accuracies[-1]:.2f}%')

In [None]:
def predict_next_word(model, input_text, word_dict, sequence_length):
    """Predict the word that follows ``input_text``.

    The text is lower-cased and split on whitespace. If it contains more
    than ``sequence_length`` words, only the last ``sequence_length`` are
    used (the same windowing as ``generate_sequence``). Shorter inputs are
    left-padded with 0. Unknown words are encoded as 0.

    Args:
        model: Callable model with an ``eval()`` method; called with a
            ``(1, sequence_length)`` LongTensor, and the first row of its
            output is treated as per-word scores.
        input_text: Seed text.
        word_dict: Mapping from word to integer index.
        sequence_length: Number of token positions fed to the model.

    Returns:
        A tuple ``(predicted_word, scores)`` where ``scores`` is the first
        row of the model output.
    """
    # Set model to evaluation mode
    model.eval()

    # Convert text to lowercase and split into words
    words = input_text.lower().strip().split()

    # Keep only the most recent words when the input exceeds the window;
    # otherwise the tensor would be longer than requested.
    if len(words) > sequence_length:
        words = words[-sequence_length:]

    # Convert words to numbers using the word dictionary, use 0 for unknown words
    number_sequence = [word_dict.get(word, 0) for word in words]

    # Pad the sequence on the left
    padded_sequence = [0] * (sequence_length - len(number_sequence)) + number_sequence

    # Convert to PyTorch tensor and add batch dimension
    input_tensor = torch.LongTensor([padded_sequence])

    # Get prediction
    with torch.no_grad():  # No need to track gradients for prediction
        output = model(input_tensor)

    # Get the predicted word index (highest score)
    predicted_idx = torch.argmax(output[0]).item()
    print(predicted_idx)

    # Create reverse dictionary to convert number back to word
    reverse_dict = {v: k for k, v in word_dict.items()}

    # Convert predicted index to word
    predicted_word = reverse_dict[predicted_idx]

    # Print warning for any unknown words among those fed to the model
    unknown_words = [word for word in words if word not in word_dict]
    if unknown_words:
        print(f"Warning: Unknown words found and treated as padding: {unknown_words}")

    return predicted_word, output[0]

# Example usage:
input_text = "sweet jeremy saw dublin"  # Try adding some words that aren't in word_dict
# Training inputs are the padded sequences minus their last element, so the
# model was trained on windows of max_sequence_len - 1 tokens; pad the seed
# text to that same length.
next_word, probabilities = predict_next_word(
    model, input_text, word_index, max_sequence_len - 1
)

print(f"Input text: {input_text}")
print(f"Predicted next word: {next_word}")

# Print top 5 most likely words
_, top_indices = torch.topk(probabilities, 5)
reverse_dict = {v: k for k, v in word_index.items()}
print("\nTop 5 predictions:")
for idx in top_indices:
    word = reverse_dict[idx.item()]
    probability = probabilities[idx].item()
    print(f"{word}: {probability:.4f}")

In [None]:
def generate_sequence(model, initial_text, word_dict, sequence_length, num_words=10):
    """Greedily extend ``initial_text`` one predicted word at a time.

    At each step the text so far is lower-cased, split on whitespace,
    windowed to its last ``sequence_length`` words, encoded (unknown words
    become 0), left-padded with 0 and passed to the model. The
    highest-scoring word is appended. Progress and the top five candidates
    are printed at every step.

    Args:
        model: Callable model with an ``eval()`` method; the first row of
            its output is treated as per-word scores (at least five entries
            are required for the top-5 printout).
        initial_text: Seed text; returned unchanged as the prefix.
        word_dict: Mapping from word to integer index.
        sequence_length: Number of token positions fed to the model.
        num_words: How many words to generate.

    Returns:
        ``initial_text`` followed by the generated words, space-separated.
    """
    model.eval()

    # Inverse vocabulary for turning predicted indices back into words
    index_to_word = {index: token for token, index in word_dict.items()}

    print(f"Initial text: {initial_text}")

    text_so_far = initial_text
    for step in range(num_words):
        # Re-tokenise the running text and keep only the most recent window
        window = text_so_far.lower().strip().split()
        if len(window) > sequence_length:
            window = window[-sequence_length:]

        encoded = [word_dict.get(token, 0) for token in window]
        left_padding = [0] * (sequence_length - len(encoded))
        batch = torch.LongTensor([left_padding + encoded])

        with torch.no_grad():
            scores = model(batch)[0]

        # Greedy choice: take the single highest-scoring word
        best_word = index_to_word[torch.argmax(scores).item()]
        text_so_far = text_so_far + " " + best_word

        print(f"Generated word {step + 1}: {best_word}")

        # Report the five strongest candidates for this step
        top_values, top_indices = torch.topk(scores, 5)
        print(f"\nTop 5 predictions for step {step + 1}:")
        for candidate, value in zip(top_indices, top_values):
            print(f"{index_to_word[candidate.item()]}: {value.item():.4f}")
        print("\n" + "-"*50 + "\n")

    return text_so_far

# Example usage:
initial_text = "sweet jeremy saw dublin"
# Training inputs are the padded sequences minus their last element, so the
# model was trained on windows of max_sequence_len - 1 tokens; generate with
# that same window length.
generated_text = generate_sequence(
    model=model,
    initial_text=initial_text,
    word_dict=word_index,
    sequence_length=max_sequence_len - 1,
    num_words=10
)

print("\nFinal generated sequence:")
print(generated_text)