In [6]:
import torch
from torch.utils.data import TensorDataset, DataLoader, random_split
from feature_engineering import DataProcessor
from LSTM import LSTMModel
from train import ModelTrainer
import torch.nn as nn



# Hyperparameters
# NOTE(review): every value below except batch_size is reassigned in the
# training cell (In [9]) before the model is built (input_size becomes 50 and
# num_layers becomes 2 there), and batch_size is not referenced again in the
# visible cells -- treat these as stale defaults rather than the values used.
input_size = 100
hidden_size = 256
num_layers = 1
batch_size = 64
num_epochs = 10
learning_rate = 0.001

# Load and preprocess data
# NOTE(review): absolute Colab-style path; adjust when running elsewhere.
dataset_dir = '/content/data/'
data_processor = DataProcessor(dataset_dir)

# Choose to preprocess data or load saved mappings
preprocess_data = False
char_to_id_file = 'char_to_id.json'
id_to_char_file = 'id_to_char.json'

if preprocess_data:
    # Rebuild the character <-> id mappings from the dataset and hand them to
    # save_mappings together with the two file names above.
    ids, char_to_id, id_to_char = data_processor.preprocess()
    data_processor.save_mappings(char_to_id_file, char_to_id, id_to_char_file, id_to_char)
else:
    # Reuse mappings from a previous run (presumably stored as JSON, given the
    # file names -- confirm in DataProcessor.load_mappings).
    char_to_id, id_to_char = data_processor.load_mappings(char_to_id_file, id_to_char_file)
# Printed on both branches, i.e. also when the mappings were just rebuilt.
print('Mapping loaded')
## Create Dataset sequences with pytorch
# The data_processor created above is reused here.
# `ids` is (re)computed unconditionally, so any `ids` returned by preprocess()
# in the branch above is overwritten.
dialogue_lines = data_processor.read_dialogue_lines()
# All dialogue lines are concatenated into one string, separated by single spaces.
text = ' '.join(dialogue_lines.values())
ids = data_processor.text_to_ids(text, char_to_id)
print(len(ids))
# Only the first 1,000,050 ids are used. The next cell reports a dataset length
# of 1,000,000, so the extra 50 ids presumably cover the sequence window --
# TODO confirm in DataProcessor.create_dataset.
dataset = data_processor.create_dataset(ids[:1000050])



Mapping loaded
17146310


In [7]:
# Sanity check: number of samples in the dataset built from the truncated id list.
len(dataset)

1000000

In [9]:
from train import ModelTrainer

# Split the dataset into three loaders.
# NOTE(review): the positional 0.8 / 0.1 / 0.1 look like train / validation /
# test fractions and the trailing 50 like a batch size -- TODO confirm against
# DataProcessor.create_loaders.
train_loader, val_loader, test_loader=DataProcessor.create_loaders(dataset, 0.8, 0.1, 0.1, 50)
print('Data Loaded')
## Train and Evaluate the model
# Initialize the LSTM model
# These assignments replace the values set in the first cell; input_size and
# embedding_size are bound to the same value (50).
input_size = embedding_size = 50
hidden_size = 256
num_layers = 2
# One entry per character known to the mapping.
vocab_size = len(char_to_id)
model = LSTMModel(input_size, hidden_size, vocab_size, num_layers)

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define the loss function, learning rate, and optimizer
criterion = nn.CrossEntropyLoss()
learning_rate = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# The trainer receives only the training loader; validation and test loaders
# are passed explicitly to evaluate() below.
trainer=ModelTrainer(model, train_loader, criterion, optimizer, device)

print('start training')
# Training loop

num_epochs = 10
for epoch in range(num_epochs):
    # Zero-based epoch index; the summary line below prints the one-based number.
    print(epoch)
    # presumably one pass over train_loader, returning the training loss that
    # is reported below -- confirm in ModelTrainer.train
    loss = trainer.train()
    # Evaluate the model on the validation set
    validation_loss = trainer.evaluate(val_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss:.4f}, Validation Loss: {validation_loss:.4f}")

# Evaluation on test set
# The result is stored but not printed in this cell; a later cell reports it.
test_loss = trainer.evaluate(test_loader)


Data Loaded
start training
0
Epoch 1/10, Loss: 1.4919, Validation Loss: 1.3284
1
Epoch 2/10, Loss: 1.2815, Validation Loss: 1.2465
2
Epoch 3/10, Loss: 1.2167, Validation Loss: 1.1975
3
Epoch 4/10, Loss: 1.1737, Validation Loss: 1.1590
4
Epoch 5/10, Loss: 1.1430, Validation Loss: 1.1331
5
Epoch 6/10, Loss: 1.1204, Validation Loss: 1.1170
6
Epoch 7/10, Loss: 1.1038, Validation Loss: 1.1024
7
Epoch 8/10, Loss: 1.0910, Validation Loss: 1.0909
8
Epoch 9/10, Loss: 1.0812, Validation Loss: 1.0838
9
Epoch 10/10, Loss: 1.0734, Validation Loss: 1.0776


In [10]:
# Report the test-set loss computed at the end of the training cell.
print('Loss on testing data:', test_loss)

Loss on testing data: 1.0789489044249059


In [13]:
from LSTM import LSTMModel

In [15]:
# Save model
# An earlier variant called trainer.model.save_model('lstm_2layers_10epochs');
# it is disabled in favour of torch.save below.
# NOTE(review): torch.save on the module object pickles the whole model, so
# loading it later requires the LSTMModel class to be importable under the
# same module path. Saving model.state_dict() is the more portable option if
# the loading code can be changed accordingly.
torch.save(trainer.model, 'lstm2layers_10epochs')

In [18]:
# Rebind `model` to the model held by the trainer so the evaluation and
# generation cells use it.
# NOTE(review): presumably the same object that was passed to ModelTrainer in
# the training cell -- confirm that ModelTrainer does not wrap or copy it.
model=trainer.model


In [19]:
from evaluate import Evaluater
# Evaluate performance
# Perplexity is computed on the test loader with the same criterion that was
# used for training.
# NOTE(review): the printed value (~2.9416) equals exp(test loss ~1.0789)
# reported earlier, so this is presumably exp(mean cross-entropy) -- confirm
# in Evaluater.calculate_perplexity.
evaluater = Evaluater(model, device)
perplexity= evaluater.calculate_perplexity(test_loader, criterion)
print('Perplexity:' , perplexity)

Perplexity: 2.9415860374933525


In [25]:
# Sampling configuration: the prompt to continue, the number of characters to
# generate, and the softmax temperature passed to generate_text.
seed_text = "Start a discussion about coffee"
gen_length = 2000
temperature = 0.8

# NOTE(review): generate_text is defined in the cell that follows this one in
# the file (execution count In [24], i.e. it was run before this cell). On a
# fresh kernel that definition must be executed first, otherwise this call
# raises NameError.
# The Evaluater.generate_text method is an alternative entry point that is
# currently disabled:
#generated_text = evaluater.generate_text(seed_text, gen_length, char_to_id, id_to_char, device, temperature)
generated_text = generate_text(model, seed_text, gen_length, char_to_id, id_to_char, device, temperature)
print(generated_text)


Start a discussion about coffee this time, Coset So and the prick for this point. What did you have no fantasy regarding the bathroom with your finds out.  Let's talk to you. You know what you mean to me. I don't be so bad as an itty thousand days. How many. What you are the one of Seah? But first, or freewilut in the arrangements. Thank you. The first thing was on his pardon? Are you gonna tell me about him. He's damn it. What about some kind of guy.  He's not finished - not so...  ...did you know? No. Baid who do you want to see Mr. Dallas. So you didn't shoot West. He did it is, ashamed. I have a finger out of here, chamber if it gets all week. I don't know... Emmoner, Dave to use a failure you going? How can you be so kind. All right, little being out there. What did you say, Carner. C'mon, we're going in this? They think they didn't work at the contemand right away? Now one. So her manualive... This does not we?  I may can't. I don't know. I was crazy, I really didn't like him by 

In [24]:
import torch.nn.functional as F

def generate_text(model, seed_text, gen_length, char_to_id, id_to_char, device, temperature=1.0):
    """Continue ``seed_text`` by sampling ``gen_length`` characters from ``model``.

    The whole seed is fed to the model in one call; afterwards each sampled
    character is fed back as a single-step input together with the hidden
    state returned by the previous call.

    Args:
        model: Object providing ``eval()``, ``init_hidden(batch_size)`` and
            ``model(input_seq, hidden) -> (outputs, hidden)``.
        seed_text: Non-empty prompt; every character must be a key of
            ``char_to_id``.
        gen_length: Number of characters to append to the seed.
        char_to_id: Mapping from character to integer id.
        id_to_char: Mapping from id to character. Keys may be the string form
            of the id (as produced by a JSON round trip) or the integer id.
        device: Device the input tensors are moved to.
        temperature: Positive divisor applied to the logits before softmax;
            smaller values make sampling more deterministic.

    Returns:
        ``seed_text`` followed by the ``gen_length`` sampled characters.

    Raises:
        ValueError: If ``seed_text`` is empty or ``temperature`` is not positive.
        KeyError: If ``seed_text`` contains a character missing from
            ``char_to_id``.
    """
    if not seed_text:
        raise ValueError("seed_text must contain at least one character")
    if temperature <= 0:
        # Dividing the logits by zero (or a negative value) yields NaN or
        # inverted probabilities, so reject it up front.
        raise ValueError(f"temperature must be positive, got {temperature}")
    unknown_chars = sorted({char for char in seed_text if char not in char_to_id})
    if unknown_chars:
        raise KeyError(f"seed_text contains characters missing from char_to_id: {unknown_chars}")

    model.eval()

    # Encode the seed as a (1, len(seed_text)) tensor: a batch with one sequence.
    seed_ids = [char_to_id[char] for char in seed_text]
    input_seq = torch.tensor(seed_ids, dtype=torch.long).unsqueeze(0).to(device)

    # Hidden state for a batch of size 1.
    # NOTE(review): used exactly as returned by init_hidden; it is not moved
    # to `device` here -- confirm init_hidden places it correctly.
    hidden = model.init_hidden(1)

    # Collect the pieces and join once at the end instead of growing a string
    # inside the loop.
    pieces = [seed_text]
    with torch.no_grad():
        for _ in range(gen_length):
            outputs, hidden = model(input_seq, hidden)
            # Only the last row of the output is used for the next character.
            # assumes `outputs` is 2-D with one row of vocabulary logits per
            # input position -- TODO confirm against LSTMModel.forward
            char_probs = F.softmax(outputs[-1, :] / temperature, dim=0)

            # Sample the next character id from the temperature-scaled distribution.
            char_idx = torch.multinomial(char_probs, 1).item()

            # JSON-loaded mappings have string keys; fall back to the integer
            # key for mappings built in memory.
            str_key = str(char_idx)
            pieces.append(id_to_char[str_key] if str_key in id_to_char else id_to_char[char_idx])

            # Feed only the newly sampled character back in; the context is
            # carried by `hidden`.
            input_seq = torch.tensor([[char_idx]], dtype=torch.long).to(device)

    return "".join(pieces)