<a href="https://colab.research.google.com/github/ppokranguser/Artificial_Intelligence_study/blob/main/241128_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from tqdm.notebook import tqdm

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [2]:
!wget http://www.cs.cmu.edu/afs/cs/project/ai-repository/ai/areas/nlp/corpora/names/female.txt

--2024-11-28 04:14:30--  http://www.cs.cmu.edu/afs/cs/project/ai-repository/ai/areas/nlp/corpora/names/female.txt
Resolving www.cs.cmu.edu (www.cs.cmu.edu)... 128.2.42.95
Connecting to www.cs.cmu.edu (www.cs.cmu.edu)|128.2.42.95|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 35751 (35K) [text/plain]
Saving to: ‘female.txt’


2024-11-28 04:14:31 (265 KB/s) - ‘female.txt’ saved [35751/35751]



# **Data Preprocessing**

In [3]:
# Read the lines from the file
with open('female.txt', 'r') as f:
    lines = f.readlines()
print(lines)

['# List of common female names.\n', '# Copyright (c) January 1991 by Mark Kantrowitz.\n', '# 4987 names\n', '# Thanks to Bill.Ross for about 1000 additional names.\n', '# Version 1.3 (29-MAR-94)\n', '\n', 'Abagael\n', 'Abagail\n', 'Abbe\n', 'Abbey\n', 'Abbi\n', 'Abbie\n', 'Abby\n', 'Abigael\n', 'Abigail\n', 'Abigale\n', 'Abra\n', 'Acacia\n', 'Ada\n', 'Adah\n', 'Adaline\n', 'Adara\n', 'Addie\n', 'Addis\n', 'Adel\n', 'Adela\n', 'Adelaide\n', 'Adele\n', 'Adelice\n', 'Adelina\n', 'Adelind\n', 'Adeline\n', 'Adella\n', 'Adelle\n', 'Adena\n', 'Adey\n', 'Adi\n', 'Adiana\n', 'Adina\n', 'Adora\n', 'Adore\n', 'Adoree\n', 'Adorne\n', 'Adrea\n', 'Adria\n', 'Adriaens\n', 'Adrian\n', 'Adriana\n', 'Adriane\n', 'Adrianna\n', 'Adrianne\n', 'Adrien\n', 'Adriena\n', 'Adrienne\n', 'Aeriel\n', 'Aeriela\n', 'Aeriell\n', 'Ag\n', 'Agace\n', 'Agata\n', 'Agatha\n', 'Agathe\n', 'Aggi\n', 'Aggie\n', 'Aggy\n', 'Agna\n', 'Agnella\n', 'Agnes\n', 'Agnese\n', 'Agnesse\n', 'Agneta\n', 'Agnola\n', 'Agretha\n', 'Aida\n',

In [4]:
names = []
max_len = 0

# Iterate through the lines to process names
for l in lines[6:]:
    # Remove the trailing newline and convert to lowercase
    curr_name = l[:-1].lower()
    if curr_name.isalpha():
        names.append(curr_name)
        max_len = max(len(curr_name), max_len)

# Increase max_len to account for the end of signal (EOS)
# if name is "Tom" what exactly generated is "Tom<EndOfSignal>"
max_len += 1

print('Maximum Length : ' + str(max_len))
print(names)

Maximum Length : 14
['abagael', 'abagail', 'abbe', 'abbey', 'abbi', 'abbie', 'abby', 'abigael', 'abigail', 'abigale', 'abra', 'acacia', 'ada', 'adah', 'adaline', 'adara', 'addie', 'addis', 'adel', 'adela', 'adelaide', 'adele', 'adelice', 'adelina', 'adelind', 'adeline', 'adella', 'adelle', 'adena', 'adey', 'adi', 'adiana', 'adina', 'adora', 'adore', 'adoree', 'adorne', 'adrea', 'adria', 'adriaens', 'adrian', 'adriana', 'adriane', 'adrianna', 'adrianne', 'adrien', 'adriena', 'adrienne', 'aeriel', 'aeriela', 'aeriell', 'ag', 'agace', 'agata', 'agatha', 'agathe', 'aggi', 'aggie', 'aggy', 'agna', 'agnella', 'agnes', 'agnese', 'agnesse', 'agneta', 'agnola', 'agretha', 'aida', 'aidan', 'aigneis', 'aila', 'aile', 'ailee', 'aileen', 'ailene', 'ailey', 'aili', 'ailina', 'ailyn', 'aime', 'aimee', 'aimil', 'aina', 'aindrea', 'ainslee', 'ainsley', 'ainslie', 'ajay', 'alaine', 'alameda', 'alana', 'alanah', 'alane', 'alanna', 'alayne', 'alberta', 'albertina', 'albertine', 'albina', 'alecia', 'aleda'

# **Define a Dataset Class**

In [5]:
print(ord('a'), ord('b'), ord('z'))
print('Index of "c" :', ord('c')-ord('a'))

97 98 122
Index of "c" : 2


In [6]:
# Define a dataset class
class NameDataset(Dataset):
    def __init__(self, names, max_len):
        self.names = names
        self.max_len = max_len
        self.a_order = ord('a')
        self.z_order = ord('z')
        self.num_classes = 26 + 1 # a-z + include the end of signal

    def __len__(self):
        return len(self.names)

    def __getitem__(self, idx):
        # Get the current name and convert to numerical values
        # names[idx] : current alphabet name ('abe')
        # curr_name : [0, 1, 4]
        curr_name = [ord(n)-self.a_order for n in names[idx]]

        # Buffer before padding 26(index of EOS) to the name info.
        # [26, 26, ... 26]
        padded_name = [self.num_classes-1 for _ in range(self.max_len)]
        # padded name : [0, 1, 4, 26, 26, 26, .... ]
        padded_name[:len(curr_name)] = curr_name

        # Prepare the sample
        sample = dict()
        sample['input'] = torch.LongTensor(padded_name[:-1]) # Input sequence
        sample['output'] = torch.LongTensor(padded_name[1:]) # Output sequence
        sample['length'] = len(names[idx]) # Length of the name
        sample['original'] = names[idx] # Original name string

        return sample

In [7]:
batch_size = 64
dataset = NameDataset(names, max_len)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

In [8]:
# Get a sample batch from the dataloader
sample = next(iter(dataloader))
print(sample['original'][0])
print(sample['length'][0])
print(sample['input'][0])
print(sample['output'][0])
print(sample['input'].shape, sample['output'].shape)

annette
tensor(7)
tensor([ 0, 13, 13,  4, 19, 19,  4, 26, 26, 26, 26, 26, 26])
tensor([13, 13,  4, 19, 19,  4, 26, 26, 26, 26, 26, 26, 26])
torch.Size([64, 13]) torch.Size([64, 13])


In [9]:
# Sort the input sequences based on their lengths
# This could be useful with variable lengths

total_lengths = sample['length'] # lengths of multiple names

# sort the names in the descending order, sort_idx is a set of indices of total_lengths
sort_length, sort_idx = torch.sort(total_lengths, descending=True)

# sorting sample['input'][sort_idx] and assigning it to sort_input. Thus, sort_input is a sorted list.
sort_input = sample['input'][sort_idx]
sort_output = sample['output'][sort_idx]
sort_original_name = [sample['original'][idx] for idx in sort_idx]

print(sort_original_name)

['ferdinanda', 'ferdinande', 'annamaria', 'katherina', 'hermione', 'andriana', 'cariotta', 'charlott', 'nathalie', 'ursuline', 'kourtney', 'marrissa', 'juliette', 'julienne', 'gertruda', 'lucille', 'vanessa', 'annette', 'hillary', 'tiphani', 'deirdre', 'nananne', 'theresa', 'marissa', 'daniele', 'junette', 'prissie', 'tandie', 'phylis', 'kirbie', 'meagan', 'lynnet', 'karlie', 'gunvor', 'linell', 'kriste', 'oneida', 'freida', 'nancie', 'danita', 'loreen', 'maggie', 'heidie', 'jessa', 'patsy', 'gussy', 'dulce', 'andee', 'pearl', 'vilma', 'trace', 'tilly', 'anja', 'roby', 'leah', 'nady', 'cara', 'evie', 'rory', 'bell', 'rita', 'gigi', 'flo', 'roz']


# **Define the RNN Model**

In [10]:
# Define the RNN model
class RNNmodel(nn.Module):
    def __init__(self,
                 lstm_dim=256,
                 num_classes=dataset.num_classes,
                 max_len=max_len):
        super(RNNmodel, self).__init__()

        self.lstm_dim = lstm_dim
        self.num_classes = num_classes
        self.max_len = max_len

        self.char_embedding = nn.Embedding(num_embeddings=num_classes,
                                           embedding_dim=lstm_dim)

        self.lstm = nn.LSTM(input_size=lstm_dim,
                            hidden_size=lstm_dim,
                            num_layers=1,
                            batch_first=True,
                            )

        self.out_linear = nn.Linear(lstm_dim, num_classes)

    def forward(self, sort_input):
        # lstm_input: (Batch) x (Length) x (Dimension)
        lstm_input = self.char_embedding(sort_input)

        # h refers to the hidden state, and c refers to the cell state.
        lstm_out, (h, c) = self.lstm(lstm_input)

        # out: ( batch x length x num_class)-sized tensor
        out = self.out_linear(lstm_out)

        return out

    # Function to generate a name
    def generate(self, start_char):
        generated_name = list()
        generated_name.append(start_char)

        start_order = torch.LongTensor([ord(start_char)]).to(device) - ord('a')
        start_order = start_order.reshape(1, 1)
        cnt = 0

        while cnt <= self.max_len:
            curr_embed = self.char_embedding(start_order)
            if cnt == 0:
                lstm_out, (h, c) = self.lstm(curr_embed)
            else:
                lstm_out, (h, c) = self.lstm(curr_embed, (h, c))
            out = self.out_linear(lstm_out)

            # Sample the next character using the output distribution
            sample_next = torch.distributions.Categorical(logits = out[0, 0, :]).sample().item()
            if sample_next == 26:
                break
            else:
                generated_name.append(chr(ord('a')+sample_next))
                sample_next = torch.LongTensor([sample_next]).to(device)
                start_order = sample_next.reshape(1, 1)

                cnt += 1

        return ''.join(generated_name)


# **Model Training**

In [11]:
model = RNNmodel()

# Move the model to GPU if available
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
print(model)

RNNmodel(
  (char_embedding): Embedding(27, 256)
  (lstm): LSTM(256, 256, batch_first=True)
  (out_linear): Linear(in_features=256, out_features=27, bias=True)
)


In [12]:
# Define the training function
def train(model, optimizer, sample):
    optimizer.zero_grad()
    criteria = nn.CrossEntropyLoss()

    # Extract lengths of each sequence in the batch
    total_lengths = sample['length']
    # Sort the sequences based on their lengths in descending order
    sort_length, sort_idx = torch.sort(total_lengths, descending=True)

    # Retrieve the sorted input and output sequences
    sort_input = sample['input'][sort_idx].to(device)
    sort_output = sample['output'][sort_idx].to(device)
    sort_length = sort_length.to(device)

    # Forward pass through the model
    pred = model(sort_input)
    B, L, C = pred.shape

    # Compute the loss
    # Reshape the predictions and ground truth to match the CrossEntropyLoss requirements
    curr_loss = criteria(pred.reshape(B*L, C), sort_output.reshape(B*L))

    # Backpropagation
    curr_loss.backward()
    optimizer.step()

    return curr_loss.item()

# **Run the RNN Model**

In [13]:
max_epoch = 100
for epoch in tqdm(range(max_epoch)):
    total_loss = 0.0
    for sample in dataloader:
        curr_loss = train(model, optimizer, sample)
        total_loss += curr_loss / len(dataloader)

    # Test the model by generating a name by 5 times
    generated_names = []
    for _ in range(5):
        # Randomly select a starting character within the range of lowercase English letters
        start_char = chr(np.random.randint(ord('a'), ord('z')))
        generated_names.append(model.generate(start_char))

    # Print the epoch number, training loss, and a sampled name generated by the model
    print('[EPOCH {}] TRAIN LOSS: {}, SAMPLED NAMES: {}'.format(epoch,
                                                            total_loss,
                                                            ', '.join(generated_names)))

  0%|          | 0/100 [00:00<?, ?it/s]

[EPOCH 0] TRAIN LOSS: 1.9173202132567382, SAMPLED NAMES: hzmsrdlkqoofnzpa, pyoyevkyiey, fflb, uaqyqhddtwdudzhs, gjfgqpusspwpbn
[EPOCH 1] TRAIN LOSS: 1.2702803137974856, SAMPLED NAMES: ewriie, lhcpucyma, ehuar, otxela, c
[EPOCH 2] TRAIN LOSS: 1.1425157403334594, SAMPLED NAMES: pqcgzf, ketal, saypv, gjfljneis, eotnne
[EPOCH 3] TRAIN LOSS: 1.076709693823105, SAMPLED NAMES: pmtrie, aliwe, wjelisy, tsari, kel
[EPOCH 4] TRAIN LOSS: 1.0315466660719652, SAMPLED NAMES: y, lenane, ys, gylane, bnelisa
[EPOCH 5] TRAIN LOSS: 0.997556590881103, SAMPLED NAMES: ynna, torale, yu, areemrell, trine
[EPOCH 6] TRAIN LOSS: 0.9708933295347754, SAMPLED NAMES: bwela, gi, rrica, peestie, barlia
[EPOCH 7] TRAIN LOSS: 0.9487611307547645, SAMPLED NAMES: puy, vgtene, car, xuwlie, harnssa
[EPOCH 8] TRAIN LOSS: 0.930902338180786, SAMPLED NAMES: eubeda, ywi, sthevette, mralenna, shostan
[EPOCH 9] TRAIN LOSS: 0.9156749156805188, SAMPLED NAMES: btimbie, juyne, fforien, sshanlela, innera
[EPOCH 10] TRAIN LOSS: 0.90189083