In [1]:
pip install nlpaug

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: nlpaug
Successfully installed nlpaug-1.1.11


In [12]:
import os
from google.colab import drive

# Mount Google Drive inside the Colab VM.
drive.mount('/content/drive')

# Work from the project folder on Drive. The notebook's relative paths
# (e.g. ./data/...) resolve against this directory, and presumably the local
# packages imported in the next cell live here too - confirm.
os.chdir("/content/drive/MyDrive/startTalking_main")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
import torch
from torch.utils.data import TensorDataset, DataLoader, random_split
from data_loader.data_loaders import ShakespearePlaysLoader
from models.models import LSTMModel, RNNModel
from trainers.trainer import ModelTrainer
import torch.nn as nn
from tqdm import tqdm
#from test import Evaluater
from utils.augmenter import TextAugmenter
from utils.test import Evaluater

In [14]:
# Reproducibility: seed PyTorch's global RNG so weight initialisation and any
# torch-based shuffling or sampling repeat after a kernel restart.
# NOTE(review): code that draws from Python's `random` or NumPy (if any)
# needs its own seed.
random_seed = 42
torch.manual_seed(random_seed)

# Hyperparameters
input_size = 100
hidden_size = 256
embedding_size = 100
num_layers = 1
# NOTE(review): batch_size is not referenced by any cell visible in this
# notebook; the loader cells pass a literal 50 to create_loaders instead.
batch_size = 64
num_epochs = 2
learning_rate = 0.001

In [15]:
# Build the character-level data processor for the Shakespeare corpus.
# dataset_dir is a relative path, so it depends on the working directory set
# when Drive was mounted.
dataset_dir = './data/ShakespearePlays/'
# level='char' configures the loader for the character-level pipeline used by
# the cells that follow.
data_processor_char = ShakespearePlaysLoader(dataset_dir, level='char')


Reading Shakespeare Plays...


In [None]:
# Encode the corpus at character level; the two lookup tables and the
# vocabulary size are reused by the model and text-generation cells below.
(text_char_ids,
 char_to_id,
 id_to_char,
 vocab_size) = data_processor_char.preprocess_char_level()

# Only the first 100050 ids are turned into a dataset.
char_ids_prefix = text_char_ids[:100050]
dataset = data_processor_char.create_dataset(char_ids_prefix)

# NOTE(review): 0.8 / 0.1 look like the train / validation fractions and 50
# like the batch size - confirm against create_loaders.
loaders = data_processor_char.create_loaders(dataset, 0.8, 0.1, 50)
train_loader, val_loader, test_loader = loaders

print("data loaded")

Preprocessing data...
Building character vocabulary...
Creating dataset...


100%|██████████| 100000/100000 [00:00<00:00, 143585.18it/s]


data loaded


## Train LSTM

In [None]:
# Select the compute device: CUDA when PyTorch can see a GPU, otherwise CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Build the character-level LSTM model and place it on that device.
model = LSTMModel(input_size, hidden_size, vocab_size, num_layers, embedding_size)
model.to(device)

# Cross-entropy loss with its default settings (no ignore_index is set),
# optimised by Adam at the configured learning rate.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
trainer = ModelTrainer(model, train_loader, criterion, optimizer, device)

# Each epoch calls trainer.train() once and then scores val_loader.
for epoch in range(num_epochs):
    print("starting epoch", epoch+1)
    loss = trainer.train()
    validation_loss = trainer.evaluate(val_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss:.4f}, Validation Loss: {validation_loss:.4f}")

starting epoch 1


100%|██████████| 1600/1600 [04:55<00:00,  5.41it/s]


Epoch 1/2, Loss: 1.5986, Validation Loss: 1.2587
starting epoch 2


100%|██████████| 1600/1600 [04:57<00:00,  5.38it/s]


Epoch 2/2, Loss: 1.0823, Validation Loss: 0.9282


In [None]:
## Evaluate the model

# Score the model on test_loader with the same trainer used for training.
test_loss = trainer.evaluate(test_loader)
# Report the configured epoch count instead of a hard-coded "2" so the
# message stays correct when num_epochs is changed.
print(f"After {num_epochs} Epochs of Training, Testing Loss is: {test_loss:.4f}")

After 2 Epochs of Training, Testing Loss is: 0.9253


In [None]:


# Prompt and requested length for the sample generated at the end of the cell.
seed_text = "Start a discussion about coffee"
gen_length = 2000

evaluater = Evaluater(model, device)

# Perplexity on test_loader, computed with the same criterion used in training.
perplexity = evaluater.calculate_perplexity(test_loader, criterion)
print('Perplexity:', perplexity)

# Continue the prompt using the character-level lookup tables.
generated_text = evaluater.generate_text(
    seed_text,
    gen_length,
    char_to_id,
    id_to_char,
    'char',
    device,
    temperature=1,
    top_p=0,
)
print(generated_text)

Perplexity: 2.522712044019018
Start a discussion about coffees. Ay: but you look son?

First Senator:
Ay, heart-planes?

CORIOLANUS:
It in things at on
were used my country: beyond wish him, ever
sine antrict in cholens greeded, I trop'd the oak ow there?
Of Cominius!
Leat's dear the people confound, in his country;
For they now newe have, and life!
All what should the high off o'er-power of his to:
him preciustion; which else we have ment, there
but therefort what he vile their confirs.

AEdile:
If any wholedield;
Ence did with your people! Hoo! he's will.

MENENIUS:
Sir, so, welcome put
The heart of Cuill be soothing membends?

Senators, &C:oo mockery: the Volsces
With cushions we have people,
And venough mory points without as done,
I'll hath me-bretles tongues that love those these have veel
your honour of their oncers; that is not some poor
singrous countryes for the Volsces, fluind?
Now stablish yet flag, that authy; he gates, years, Marcius,
Away, speak, I will not so speak or n

## Train RNN

In [None]:
# Select the compute device: CUDA when PyTorch can see a GPU, otherwise CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Build the character-level RNN model (RNNModel, not the LSTM) and place it
# on that device.
model = RNNModel(input_size, hidden_size, vocab_size, num_layers, embedding_size)
model.to(device)

# Cross-entropy loss with its default settings (no ignore_index is set),
# optimised by Adam at the configured learning rate.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# NOTE(review): number_states=1 presumably reflects that the RNN carries one
# recurrent state where the LSTM run used the trainer's default - confirm
# against ModelTrainer.
trainer = ModelTrainer(model, train_loader, criterion, optimizer, device, number_states=1)

# Each epoch calls trainer.train() once and then scores val_loader.
for epoch in range(num_epochs):
    print("starting epoch", epoch+1)
    loss = trainer.train()
    validation_loss = trainer.evaluate(val_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss:.4f}, Validation Loss: {validation_loss:.4f}")

starting epoch 1


100%|██████████| 1600/1600 [01:24<00:00, 18.89it/s]


Epoch 1/2, Loss: 1.5939, Validation Loss: 1.3047
starting epoch 2


100%|██████████| 1600/1600 [01:21<00:00, 19.66it/s]


Epoch 2/2, Loss: 1.1873, Validation Loss: 1.0943


In [None]:
# NOTE(review): this cell repeats the RNN training cell directly above it
# statement for statement. Running both trains a fresh RNNModel twice and
# only the second one stays bound to `model`; consider keeping a single copy.

# Select the compute device: CUDA when PyTorch can see a GPU, otherwise CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Build the character-level RNN model (RNNModel, not the LSTM) and place it
# on that device.
model = RNNModel(input_size, hidden_size, vocab_size, num_layers, embedding_size)
model.to(device)

# Cross-entropy loss with its default settings (no ignore_index is set),
# optimised by Adam at the configured learning rate.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
trainer = ModelTrainer(model, train_loader, criterion, optimizer, device, number_states=1)

# Each epoch calls trainer.train() once and then scores val_loader.
for epoch in range(num_epochs):
    print("starting epoch", epoch+1)
    loss = trainer.train()
    validation_loss = trainer.evaluate(val_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss:.4f}, Validation Loss: {validation_loss:.4f}")

starting epoch 1


100%|██████████| 1600/1600 [00:06<00:00, 242.02it/s]


Epoch 1/2, Loss: 1.5916, Validation Loss: 1.3084
starting epoch 2


100%|██████████| 1600/1600 [00:05<00:00, 314.78it/s]


Epoch 2/2, Loss: 1.1906, Validation Loss: 1.0951


In [None]:
## Evaluate the model

# Score the model on test_loader with the same trainer used for training.
test_loss = trainer.evaluate(test_loader)
# Report the configured epoch count instead of a hard-coded "2" so the
# message stays correct when num_epochs is changed.
print(f"After {num_epochs} Epochs of Training, Testing Loss is: {test_loss:.4f}")

After 2 Epochs of Training, Testing Loss is: 1.0937


In [None]:
# Prompt and requested length for the sample generated at the end of the cell.
seed_text = "Start a discussion about coffee"
gen_length = 2000

# number_states=1 matches the value passed to ModelTrainer for this RNN.
evaluater = Evaluater(model, device, number_states=1)

# Perplexity on test_loader, computed with the same criterion used in training.
perplexity = evaluater.calculate_perplexity(test_loader, criterion)
print('Perplexity:', perplexity)

# Continue the prompt using the character-level lookup tables.
generated_text = evaluater.generate_text(
    seed_text,
    gen_length,
    char_to_id,
    id_to_char,
    'char',
    device,
    temperature=1,
    top_p=0,
)
print(generated_text)

Perplexity: 2.9851783948402133
Start a discussion about coffee him be
Catuses:
Fas no more of that: ever sunger
hofest you? or us than they shall as it was with senace, have your gora wonders.

AUFIDIUS:
Condent-blied against ought follow to beg oaker.

CORIOLANUS:
And tradume have reined,
As we shall go petinius, he would and Tace: and be rotberse spions him jagany, in every minnter's
He his brain'd friends; we'll hear, my distabuol, see his reace, hey the fliers,
troth, in 'tis his find you; heals, there's at his country
you grows I stand flator:
Nather senator:
Agave him on
To our hearm at one poor of yiuld invines,
Let fough actions: tale,
I may, all his one speak.
Come, lack'd of, aVAls to threess to Rome my 'lforthits; and basting,
The blood
Their bay, I would unders,
Those clught; when, strong
And from he came modest gillip of thy absenter reation'd of hone, lets
He's say your voices more nettle stum' than dous blood won.

MENENIUS:
Thy foon,' fixed anot the greater: you have li

# Word Level


In [16]:
# Build the word-level counterpart of the data processor. dataset_dir is the
# corpus directory defined for the character-level run earlier in the notebook.
data_processor_word = ShakespearePlaysLoader(dataset_dir, level='word')

Reading Shakespeare Plays...


In [17]:
# Encode the corpus at word level. This rebinds vocab_size (and, below,
# dataset and the three loaders) that the character-level cells also use.
(text_word_ids,
 embedding_matrix,
 vocab_size) = data_processor_word.preprocess_word_level()

# Only the first 1050 word ids are turned into a dataset.
word_ids_prefix = text_word_ids[:1050]
dataset = data_processor_word.create_dataset(word_ids_prefix)

# NOTE(review): 0.8 / 0.1 look like the train / validation fractions and 50
# like the batch size - confirm against create_loaders.
loaders = data_processor_word.create_loaders(dataset, 0.8, 0.1, 50)
train_loader, val_loader, test_loader = loaders

print("data loaded")

Preprocessing data...
Building word vocabulary...
Creating dataset...


100%|██████████| 1000/1000 [00:00<00:00, 19563.35it/s]

data loaded





In [18]:
# Spot-check one vocabulary entry. The lookup key here is the string '1087',
# not the integer 1087.
# NOTE(review): confirm id_to_word is meant to be keyed by strings; a key-type
# mismatch with the ids used elsewhere would surface as a KeyError.
data_processor_word.id_to_word['1087']

'here'

## Train RNN

In [10]:
# Length of text_word_ids as returned by preprocess_word_level(); the dataset
# cell uses only a prefix slice of it.
len(text_word_ids)

202651

In [11]:
# Shape of the embedding matrix returned by preprocess_word_level(); its
# second dimension is what the word-level model cell passes as the embedding
# size.
embedding_matrix.shape

torch.Size([25672, 300])

In [19]:
# Select the compute device: CUDA when PyTorch can see a GPU, otherwise CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Build the word-level RNN model (RNNModel, not the LSTM). The embedding size
# is taken from the second dimension of embedding_matrix, which is also handed
# to the model.
word_embedding_dim = embedding_matrix.size()[1]
model = RNNModel(input_size, hidden_size, vocab_size, num_layers, word_embedding_dim, embedding_matrix, level='word')
model.to(device)

# Cross-entropy loss with its default settings (no ignore_index is set),
# optimised by Adam at the configured learning rate.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
trainer = ModelTrainer(model, train_loader, criterion, optimizer, device, number_states=1)

# Each epoch calls trainer.train() once and then scores val_loader.
for epoch in range(num_epochs):
    print("starting epoch", epoch+1)
    loss = trainer.train()
    validation_loss = trainer.evaluate(val_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss:.4f}, Validation Loss: {validation_loss:.4f}")

starting epoch 1


100%|██████████| 16/16 [00:48<00:00,  3.00s/it]


Epoch 1/2, Loss: 8.2636, Validation Loss: 6.1227
starting epoch 2


100%|██████████| 16/16 [00:46<00:00,  2.89s/it]


Epoch 2/2, Loss: 5.9401, Validation Loss: 5.8611


In [20]:
## Evaluate the model

# Score the model on test_loader with the same trainer used for training.
test_loss = trainer.evaluate(test_loader)
# Report the configured epoch count instead of a hard-coded "2" so the
# message stays correct when num_epochs is changed.
print(f"After {num_epochs} Epochs of Training, Testing Loss is: {test_loss:.4f}")

After 2 Epochs of Training, Testing Loss is: 5.8695


In [23]:
# Prompt and requested length for the sample generated at the end of the cell.
seed_text = "Start a discussion about coffee"
gen_length = 100

# number_states=1 matches the value passed to ModelTrainer for this RNN.
evaluater = Evaluater(model, device, number_states=1)

# Perplexity on test_loader, computed with the same criterion used in training.
perplexity = evaluater.calculate_perplexity(test_loader, criterion)
print('Perplexity:', perplexity)

# Continue the prompt using the word-level lookup tables held by the data
# processor.
generated_text = evaluater.generate_text(
    seed_text,
    gen_length,
    data_processor_word.word_to_id,
    data_processor_word.id_to_word,
    'word',
    device,
    temperature=1,
    top_p=0,
)
print(generated_text)

Perplexity: 354.0629242404186
Start a discussion about coffee if Who it belly's way we Consider Citizen: it too. What a patricians store-house helps them, you. for Either this show in is they A whole what more maliciously. But, thus remember, especially First stale were other That viand, receipt; up, shall being state, you The loved As rash in they if altitude a patricians the As you'll rich, come. with his and Citizen: come. thus members, mother well is, Citizen: thus must shall intend a lungs, of MENENIUS: Citizen: Citizen: which to Our poor their we speak, to As have First that is in they Appear shouts speak, MENENIUS:


In [None]:
# NOTE(review): this cell repeats the word-level training cell above. The
# progress bar recorded under it shows 1600 steps per epoch versus 16 above,
# so it was presumably run with loaders built from a larger slice than the
# [:1050] shown in the dataset cell - confirm and make that slice explicit so
# the result can be reproduced from a fresh kernel.

# Select the compute device: CUDA when PyTorch can see a GPU, otherwise CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Build the word-level RNN model (RNNModel, not the LSTM). The embedding size
# is taken from the second dimension of embedding_matrix, which is also handed
# to the model.
word_embedding_dim = embedding_matrix.size()[1]
model = RNNModel(input_size, hidden_size, vocab_size, num_layers, word_embedding_dim, embedding_matrix, level='word')
model.to(device)

# Cross-entropy loss with its default settings (no ignore_index is set),
# optimised by Adam at the configured learning rate.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
trainer = ModelTrainer(model, train_loader, criterion, optimizer, device, number_states=1)

# Each epoch calls trainer.train() once and then scores val_loader.
for epoch in range(num_epochs):
    print("starting epoch", epoch+1)
    loss = trainer.train()
    validation_loss = trainer.evaluate(val_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss:.4f}, Validation Loss: {validation_loss:.4f}")

starting epoch 1


100%|██████████| 1600/1600 [1:15:08<00:00,  2.82s/it]


Epoch 1/2, Loss: 4.7133, Validation Loss: 1.8526
starting epoch 2


100%|██████████| 1600/1600 [1:13:40<00:00,  2.76s/it]


Epoch 2/2, Loss: 0.9299, Validation Loss: 0.4809


In [None]:
## Evaluate the model

# Score the model on test_loader with the same trainer used for training.
test_loss = trainer.evaluate(test_loader)
# Report the configured epoch count instead of a hard-coded "2" so the
# message stays correct when num_epochs is changed.
print(f"After {num_epochs} Epochs of Training, Testing Loss is: {test_loss:.4f}")

After 2 Epochs of Training, Testing Loss is: 0.4830


In [None]:
# Prompt and requested length for the sample generated at the end of the cell.
seed_text = "Start a discussion about coffee"
gen_length = 50

# number_states=1 matches the value passed to ModelTrainer for this RNN.
evaluater = Evaluater(model, device, number_states=1)

# Perplexity on test_loader, computed with the same criterion used in training.
perplexity = evaluater.calculate_perplexity(test_loader, criterion)
print('Perplexity:', perplexity)

# NOTE(review): the recorded run of this cell printed the perplexity and then
# ended in a KeyError, so the failure is presumably inside generate_text
# (e.g. a token or id missing from the word-level lookup tables) - reproduce
# and fix before sharing the notebook.
generated_text = evaluater.generate_text(
    seed_text,
    gen_length,
    data_processor_word.word_to_id,
    data_processor_word.id_to_word,
    'word',
    device,
    temperature=1,
    top_p=0,
)
print(generated_text)

Perplexity: 1.620922608377538


KeyError: ignored

# Byte Pair Encoding

# Data Augmentation