# 0. Environment Setting

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import random
import time
import math
import re
import numpy as np
import copy
import os

# Enable CUDA debugging
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
# Disable cuDNN to isolate if the issue is with cuDNN
torch.backends.cudnn.enabled = False
# NOTE(review): benchmark autotuning is a cuDNN feature; with cuDNN disabled on
# the line above this flag presumably has no effect — confirm and keep only one.
torch.backends.cudnn.benchmark = True

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Relative paths to the train/dev/test CSV splits and to the 50-d GloVe vectors
TRAIN_PATH = 'Cooking_Dataset/train.csv'
DEV_PATH = 'Cooking_Dataset/dev.csv'
TEST_PATH = 'Cooking_Dataset/test.csv'
GLOVE_PATH = 'glove.6B.50d.txt'

Mounted at /content/drive/


# 1. Data Preparation

## 1.1 Read data

In [None]:
# Basic data loading and processing
def clean_text(df, columns):
    """Normalise the text in ``columns`` of ``df`` in place and return ``df``.

    Missing values become ``'unknown'`` and every value is cast to ``str``.
    Tabs are then turned into spaces, every character that is not an ASCII
    letter, a digit or whitespace is removed, and the result is lower-cased
    and stripped.
    """
    def _normalise(text):
        without_tabs = re.sub(r'\t', ' ', text)
        return re.sub(r'[^A-Za-z0-9\s]', '', without_tabs).lower().strip()

    for name in columns:
        as_text = df[name].fillna('unknown').astype(str)
        df[name] = as_text.apply(_normalise)
    return df

# Read each CSV split and normalise its two text columns
columns = ['Ingredients', 'Recipe']
train_data, dev_data, test_data = (
    clean_text(pd.read_csv(csv_path), columns)
    for csv_path in (TRAIN_PATH, DEV_PATH, TEST_PATH)
)

# Train on a reproducible 50% sample of the training split
train_data_sampling = train_data.sample(frac=0.5, random_state=33)


Test DataFrame:
                                             Ingredients  \
9450   1 ea egg beaten\t13 c  sugar\t14 c  vinegar\t1...   
34585  9 oz elbow macaroni\t34 lb mild or hot italian...   
59368  1 12 c  uncle bens converted\trice\t12 c  wate...   
28309  1    whole red snapper 2 lb or whole sea bass ...   
36372  13 c  olive oil\t2 tb cider vinegar\t1    gree...   

                                                  Recipe  
9450   in small saucepan  beat egg and sugar until we...  
34585  cook the macaroni in water according to direct...  
59368  preheat oven to 350 degrees  combine ingredien...  
28309  season fish inside and out with salt and peppe...  
36372  whisk together the oil  vinegar  scallion  mus...  


## 1.2 Word2idx

In [None]:
# Basic Word2idx Vocab
def build_vocab(data):
    """Build a word -> index mapping from the 'Ingredients' and 'Recipe' columns.

    Indices 0-3 are reserved for ``<pad>``, ``<unk>``, ``<sos>`` and ``<eos>``;
    every distinct whitespace-separated token then gets the next free index.

    Args:
        data: DataFrame with string columns 'Ingredients' and 'Recipe'.

    Returns:
        dict mapping each token to a unique integer index.
    """
    words = set()
    for text in pd.concat([data['Ingredients'], data['Recipe']]):
        words.update(text.split())

    word2idx = {"<pad>": 0, "<unk>": 1, "<sos>": 2, "<eos>": 3}
    # Never let a corpus token overwrite a reserved index.
    words.difference_update(word2idx)
    # Sort before numbering: set iteration order of strings changes between
    # interpreter runs, which would make saved models and reloaded vocabularies
    # disagree on what each index means.
    for i, word in enumerate(sorted(words), len(word2idx)):
        word2idx[word] = i
    return word2idx

# Baseline vocabulary, built from the sampled training split
vocab = build_vocab(train_data_sampling)

# Reverse lookup: index -> token
idx2word = {index: token for token, index in vocab.items()}

print("vocab size:", len(vocab))
# Preview of the first 20 entries
print(dict(list(vocab.items())[:20]))

Ini_vocab size: 4917
{'<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3, 'arsley': 4, 'freeform': 5, 'cherry': 6, 'opptional': 7, 'pref': 8, 'blended': 9}


## 1.3 Recipe 

In [None]:
MAX_LENGTH = 150
BATCH_SIZE = 32


# Setting the recipe
class RecipeDataset(Dataset):
    """Paired (ingredients, recipe) sequences encoded as fixed-length id lists.

    Both texts are encoded once at construction time; ``__getitem__`` converts
    the stored id lists to tensors.

    Args:
        ingredients: iterable of ingredient strings.
        recipes: iterable of recipe strings, aligned with ``ingredients``.
        vocab: token -> index mapping containing ``<sos>``, ``<eos>``,
            ``<unk>`` and ``<pad>``.
        max_length: length (>= 1) every encoded sequence is padded or cut to.
    """

    def __init__(self, ingredients, recipes, vocab, max_length=MAX_LENGTH):
        self.ingredients = [self.encode(ing, vocab, max_length) for ing in ingredients]
        self.recipes = [self.encode(rec, vocab, max_length) for rec in recipes]
        self.vocab = vocab

    def encode(self, text, vocab, max_length):
        """Return ``<sos> tokens... <eos>`` as ids, exactly ``max_length`` long.

        Unknown tokens map to ``<unk>``. Short sequences are right-padded with
        ``<pad>``. Over-long sequences are cut so that the final position is
        still ``<eos>`` (plain slicing used to drop the end marker, leaving
        long targets without an end-of-sequence signal).
        """
        eos = vocab['<eos>']
        unk = vocab['<unk>']
        encoded = [vocab['<sos>']] + [vocab.get(word, unk) for word in text.split()] + [eos]
        if len(encoded) > max_length:
            encoded = encoded[:max_length - 1] + [eos]
        else:
            encoded += [vocab['<pad>']] * (max_length - len(encoded))
        return encoded

    def __len__(self):
        return len(self.ingredients)

    def __getitem__(self, idx):
        return torch.tensor(self.ingredients[idx]), torch.tensor(self.recipes[idx])


# Encode every split with the baseline vocabulary
train_dataset, dev_dataset, test_dataset = (
    RecipeDataset(frame['Ingredients'].tolist(), frame['Recipe'].tolist(), vocab)
    for frame in (train_data_sampling, dev_data, test_data)
)

# Only the training batches are shuffled
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
dev_loader = DataLoader(dataset=dev_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE)

# 2. Baseline

## 2.1.  Baseline 1: Sequence-to-Sequence model

In [None]:
NUM_LAYER = 2


# Basic Encoder
class Encoder(nn.Module):
    """Token embedding followed by a batch-first, ``NUM_LAYER``-layer LSTM."""

    def __init__(self, input_dim, emb_dim, hid_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        self.rnn = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=NUM_LAYER,
            batch_first=True,
        )
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """Return ``(outputs, hidden, cell)`` of the LSTM for the id tensor ``src``."""
        token_vectors = self.embedding(src)
        outputs, final_state = self.rnn(self.dropout(token_vectors))
        hidden, cell = final_state
        return outputs, hidden, cell

# Basic Decoder
class Decoder(nn.Module):
    """Single-step LSTM decoder that maps one token per row to vocabulary scores."""

    def __init__(self, output_dim, emb_dim, hid_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=emb_dim)
        self.output_dim = output_dim
        self.rnn = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=NUM_LAYER,
            batch_first=True,
        )
        self.fc_out = nn.Linear(in_features=hid_dim, out_features=output_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input, hidden, cell):
        """Run one decoding step and return ``(prediction, hidden, cell)``."""
        # Add a length-1 sequence axis so the batch-first LSTM sees one time step
        step_tokens = input.unsqueeze(1)
        step_vectors = self.dropout(self.embedding(step_tokens))
        rnn_out, (hidden, cell) = self.rnn(step_vectors, (hidden, cell))
        prediction = self.fc_out(rnn_out.squeeze(1))
        return prediction, hidden, cell

In [None]:
TEACHING_FORCING_RATIO = 0.5


# Basic SEQ2SEQ
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one step at a time."""

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=TEACHING_FORCING_RATIO):
        """Return per-step vocabulary scores; position 0 is left as zeros.

        At every step one random draw decides whether the next decoder input
        is the ground-truth token (teacher forcing) or the decoder's own
        highest-scoring token.
        """
        n_rows, n_steps = trg.shape[0], trg.shape[1]
        vocab_dim = self.decoder.output_dim
        outputs = torch.zeros(n_rows, n_steps, vocab_dim).to(self.device)

        # Only the final encoder state seeds this decoder
        _, hidden, cell = self.encoder(src)

        step_input = trg[:, 0]
        for t in range(1, n_steps):
            scores, hidden, cell = self.decoder(step_input, hidden, cell)
            outputs[:, t, :] = scores
            if random.random() < teacher_forcing_ratio:
                step_input = trg[:, t]
            else:
                step_input = scores.argmax(1)

        return outputs

## 2.2. Baseline 2: Sequence-to-Sequence model with Attention

In [None]:
# Attention Mechanism
class Attention(nn.Module):
    """Additive attention: scores each encoder position against a decoder state."""

    def __init__(self, hid_dim):
        super().__init__()
        self.attn = nn.Linear(in_features=2 * hid_dim, out_features=hid_dim)
        self.v = nn.Parameter(torch.rand(hid_dim))

    def forward(self, hidden, encoder_outputs):
        """Return softmax weights of shape [batch_size, src_len].

        hidden: [batch_size, hid_dim]
        encoder_outputs: [batch_size, src_len, hid_dim]
        """
        n_positions = encoder_outputs.shape[1]

        # Pair the decoder state with every encoder position
        tiled_hidden = hidden.unsqueeze(1).expand(-1, n_positions, -1)
        paired = torch.cat((tiled_hidden, encoder_outputs), dim=2)

        energy = torch.tanh(self.attn(paired))  # [batch_size, src_len, hid_dim]
        scores = (self.v * energy).sum(dim=2)   # [batch_size, src_len]

        return torch.softmax(scores, dim=1)


# Attention Decoder
class DecoderWithAttention(nn.Module):
    """Single-step LSTM decoder that attends over the encoder outputs."""

    def __init__(self, output_dim, emb_dim, hid_dim, attention, dropout):
        super().__init__()
        # output_dim is read by the surrounding seq2seq wrapper
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=emb_dim)
        # The LSTM consumes the token embedding concatenated with the context vector
        self.rnn = nn.LSTM(
            input_size=hid_dim + emb_dim,
            hidden_size=hid_dim,
            num_layers=NUM_LAYER,
            batch_first=True,
        )
        # The projection sees LSTM output + context vector + token embedding
        self.fc_out = nn.Linear(in_features=hid_dim + hid_dim + emb_dim, out_features=output_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input, hidden, cell, encoder_outputs):
        """Run one decoding step and return ``(prediction, hidden, cell)``."""
        token_emb = self.dropout(self.embedding(input.unsqueeze(1)))  # [batch_size, 1, emb_dim]

        # Attention is computed from the top LSTM layer's hidden state
        weights = self.attention(hidden[-1], encoder_outputs)  # [batch_size, src_len]
        context = torch.bmm(weights.unsqueeze(1), encoder_outputs)  # [batch_size, 1, hid_dim]

        rnn_out, (hidden, cell) = self.rnn(
            torch.cat((token_emb, context), dim=2), (hidden, cell)
        )  # rnn_out: [batch_size, 1, hid_dim]

        features = torch.cat(
            (rnn_out.squeeze(1), context.squeeze(1), token_emb.squeeze(1)), dim=1
        )
        prediction = self.fc_out(features)  # [batch_size, output_dim]

        return prediction, hidden, cell

In [None]:
# Seq2Seq with Attention
class Seq2SeqWithAttention(nn.Module):
    """Encoder-decoder wrapper whose decoder receives all encoder outputs each step."""

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=TEACHING_FORCING_RATIO):
        """Return per-step vocabulary scores; position 0 is left as zeros."""
        n_rows, n_steps = trg.shape[0], trg.shape[1]
        vocab_dim = self.decoder.output_dim
        outputs = torch.zeros(n_rows, n_steps, vocab_dim).to(self.device)

        encoder_outputs, hidden, cell = self.encoder(src)

        step_input = trg[:, 0]
        for t in range(1, n_steps):
            scores, hidden, cell = self.decoder(step_input, hidden, cell, encoder_outputs)
            outputs[:, t, :] = scores
            # One random draw per step picks ground truth vs. the model's own guess
            if random.random() < teacher_forcing_ratio:
                step_input = trg[:, t]
            else:
                step_input = scores.argmax(1)
        return outputs

# 3. Extension

## 3.1 Extension 1: Preprocessing data (<num> replacement and frequency pruning)

In [None]:
# Clean num
def clean_num(df, columns):
    """Return a copy of ``df`` with every run of digits replaced by ``<num>``.

    The input frame is left untouched: it is shared with the baseline pipeline
    (dev/test frames and their statistics), so rewriting it in place would
    silently turn the baseline data into the ``<num>`` version as well.

    Args:
        df: DataFrame whose ``columns`` hold strings.
        columns: names of the columns to rewrite.

    Returns:
        A new DataFrame; only ``columns`` differ from ``df``.
    """
    digit_run = re.compile(r'\d+')
    cleaned = df.copy()
    for col in columns:
        cleaned[col] = cleaned[col].apply(lambda x: digit_run.sub('<num>', x))
    return cleaned

# Clean data
# Copies are passed so the baseline frames (train_data_sampling, dev_data,
# test_data) keep their original digits for the baseline statistics below,
# and `columns_ex` is used so this cell does not depend on whatever the
# notebook-level `columns` currently holds.
columns_ex = ['Ingredients', 'Recipe']
train_data_ex = clean_num(train_data_sampling.copy(), columns_ex)
dev_data_ex = clean_num(dev_data.copy(), columns_ex)
test_data_ex = clean_num(test_data.copy(), columns_ex)

# Word2idx vocab keeping words seen at least min_freq times, plus <num>
def build_vocab_ex(data, min_freq=2):
    """Build a frequency-pruned word -> index mapping with a ``<num>`` token.

    Indices 0-4 are reserved for ``<pad>``, ``<unk>``, ``<sos>``, ``<eos>``
    and ``<num>``. Every other token occurring at least ``min_freq`` times in
    the 'Ingredients' and 'Recipe' columns gets the next free index, in order
    of first appearance. A trailing ``<pad_extra>`` entry is kept so the
    vocabulary size matches what earlier versions of this function produced.

    Args:
        data: DataFrame with string columns 'Ingredients' and 'Recipe'.
        min_freq: minimum number of occurrences for a token to be kept.

    Returns:
        dict mapping each token to a unique index in ``range(len(result))``.
    """
    from collections import Counter

    word_freq = Counter()
    for text in pd.concat([data['Ingredients'], data['Recipe']]):
        word_freq.update(text.split())

    # Prune vocabulary based on frequency
    word2idx = {"<pad>": 0, "<unk>": 1, "<sos>": 2, "<eos>": 3, "<num>": 4}
    for word, freq in word_freq.items():
        # Skip reserved tokens: "<num>" occurs in the corpus itself, and
        # re-adding it moved it off index 4, leaving a hole that made
        # "<pad_extra>" share an index with the last real word.
        if freq >= min_freq and word not in word2idx:
            word2idx[word] = len(word2idx)

    # Extra trailing entry, kept for size compatibility
    word2idx["<pad_extra>"] = len(word2idx)

    return word2idx

# Extended vocabulary, built from the <num>-normalised training sample
vocab_ex = build_vocab_ex(train_data_ex)

# Reverse lookup: index -> token
idx2word_ex = {index: token for token, index in vocab_ex.items()}

print("Vocab-ex size:", len(vocab_ex))
# Preview of the first 20 entries
print(dict(list(vocab_ex.items())[:20]))

In [None]:
# Encode every <num>-normalised split with the extended vocabulary
train_dataset_ex, dev_dataset_ex, test_dataset_ex = (
    RecipeDataset(frame['Ingredients'].tolist(), frame['Recipe'].tolist(), vocab_ex)
    for frame in (train_data_ex, dev_data_ex, test_data_ex)
)

# Only the training batches are shuffled
train_loader_ex = torch.utils.data.DataLoader(
    train_dataset_ex, batch_size=BATCH_SIZE, shuffle=True
)
dev_loader_ex = torch.utils.data.DataLoader(
    dev_dataset_ex, batch_size=BATCH_SIZE, shuffle=False
)
test_loader_ex = torch.utils.data.DataLoader(
    test_dataset_ex, batch_size=BATCH_SIZE, shuffle=False
)

## 3.2. Extension 2: Using Pretrained GloVe Embeddings

In [None]:
EMBED_DIM_GLOVE = 50

def load_embeddings(vocab,  embedding_dim, glove_data):
    """Build an embedding matrix for ``vocab`` from a GloVe-style text file.

    Each line of ``glove_data`` is split on whitespace into a word followed by
    its coefficients. Row ``i`` of the result holds the vector of the vocab
    word with index ``i``; words missing from the file keep an all-zero row,
    and the last row is always reset to zeros.

    Returns:
        float32 tensor of shape ``(len(vocab), embedding_dim)``.
    """
    glove_vectors = {}
    with open(glove_data, 'r', encoding='utf-8') as handle:
        for row in handle:
            fields = row.split()
            glove_vectors[fields[0]] = np.asarray(fields[1:], dtype='float32')

    n_rows = len(vocab)  # one matrix row per vocabulary entry
    print(f"Vocabulary size: {n_rows}")
    matrix = np.zeros((n_rows, embedding_dim))

    for token, row_idx in vocab.items():
        vector = glove_vectors.get(token)
        if vector is None:
            continue
        if row_idx >= n_rows:
            # Report, but do not write, indices that fall outside the matrix
            print(f"Index {row_idx} for word {token} is out of bounds with vocab size {n_rows}")
            continue
        matrix[row_idx] = vector

    matrix[-1] = np.zeros(embedding_dim)

    return torch.tensor(matrix, dtype=torch.float32)

# Embedding matrix for the extended vocabulary, one EMBED_DIM_GLOVE-wide row per entry
pretrained_embeddings = load_embeddings(vocab_ex, EMBED_DIM_GLOVE, GLOVE_PATH)
print(f"Pretrained embedding matrix shape: {pretrained_embeddings.shape}")

In [None]:
# Encoder and decoder for the pretrained-embedding extension
class EncoderWithEmbeddings(nn.Module):
    """LSTM encoder whose embedding layer starts from pretrained vectors.

    Args:
        input_dim: vocabulary size (kept for signature parity with ``Encoder``;
            the embedding shape is taken from ``embeddings``).
        emb_dim: width of the embedding vectors fed to the LSTM.
        hid_dim: LSTM hidden size.
        dropout: dropout probability applied to the embedded tokens.
        embeddings: 2-D float tensor of pretrained vectors.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, dropout, embeddings):
        super().__init__()
        # Clone first: from_pretrained wraps the tensor it is given, so passing
        # the same matrix to several modules with freeze=False would make them
        # all train one shared storage (while on CPU) and mutate the caller's
        # matrix in place.
        self.embedding = nn.Embedding.from_pretrained(embeddings.clone(), freeze=False)
        self.rnn = nn.LSTM(emb_dim, hid_dim, num_layers=NUM_LAYER, batch_first=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Return ``(outputs, hidden, cell)`` of the LSTM for the id tensor ``src``."""
        embedded = self.dropout(self.embedding(src))
        outputs, (hidden, cell) = self.rnn(embedded)
        return outputs, hidden, cell

class DecoderWithAttentionAndEmbeddings(nn.Module):
    """Attention decoder whose embedding layer starts from pretrained vectors.

    Args:
        output_dim: size of the output vocabulary.
        emb_dim: width of the embedding vectors.
        hid_dim: LSTM hidden size.
        attention: module called as ``attention(top_hidden, encoder_outputs)``.
        dropout: dropout probability applied to the embedded token.
        embeddings: 2-D float tensor of pretrained vectors.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, attention, dropout, embeddings):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention
        # Clone first: from_pretrained wraps the tensor it is given, so passing
        # the same matrix to several modules with freeze=False would make them
        # all train one shared storage (while on CPU) and mutate the caller's
        # matrix in place.
        self.embedding = nn.Embedding.from_pretrained(embeddings.clone(), freeze=False)
        # The LSTM consumes the token embedding concatenated with the context vector
        self.rnn = nn.LSTM(hid_dim + emb_dim, hid_dim, num_layers=NUM_LAYER, batch_first=True)
        # The projection sees LSTM output + context vector + token embedding
        self.fc_out = nn.Linear(hid_dim * 2 + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell, encoder_outputs):
        """Run one decoding step and return ``(prediction, hidden, cell)``."""
        input = input.unsqueeze(1)
        embedded = self.dropout(self.embedding(input))
        # Attention is computed from the top LSTM layer's hidden state
        a = self.attention(hidden[-1], encoder_outputs).unsqueeze(1)
        weighted = torch.bmm(a, encoder_outputs)
        rnn_input = torch.cat((embedded, weighted), dim=2)
        output, (hidden, cell) = self.rnn(rnn_input, (hidden, cell))
        embedded = embedded.squeeze(1)
        output = output.squeeze(1)
        weighted = weighted.squeeze(1)
        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim=1))
        return prediction, hidden, cell

In [None]:
# Seq2Seq with Attention and Pretrained Embeddings
class Seq2SeqWithAttentionAndEmbeddings(nn.Module):
    """Encoder-decoder wrapper for the pretrained-embedding attention model."""

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=TEACHING_FORCING_RATIO):
        """Return per-step vocabulary scores; position 0 is left as zeros."""
        n_rows, n_steps = trg.shape[0], trg.shape[1]
        vocab_dim = self.decoder.output_dim
        outputs = torch.zeros(n_rows, n_steps, vocab_dim).to(self.device)

        encoder_outputs, hidden, cell = self.encoder(src)

        step_input = trg[:, 0]
        for t in range(1, n_steps):
            scores, hidden, cell = self.decoder(step_input, hidden, cell, encoder_outputs)
            outputs[:, t, :] = scores
            # One random draw per step picks ground truth vs. the model's own guess
            if random.random() < teacher_forcing_ratio:
                step_input = trg[:, t]
            else:
                step_input = scores.argmax(1)
        return outputs

In [None]:
class Seq2SeqWithContentPlanning(nn.Module):
    """Encoder + content planner + step-wise sequence generator."""

    def __init__(self, encoder, content_planner, sequence_generator, device):
        super().__init__()
        self.encoder = encoder
        self.content_planner = content_planner
        self.sequence_generator = sequence_generator
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=TEACHING_FORCING_RATIO):
        """Return per-step vocabulary scores; position 0 is left as zeros."""
        n_rows, n_steps = trg.shape[0], trg.shape[1]
        vocab_dim = self.sequence_generator.output_dim
        outputs = torch.zeros(n_rows, n_steps, vocab_dim).to(self.device)

        # Encode the source, then average the encoder outputs over dim 1
        encoder_outputs, hidden, cell = self.encoder(src)
        pooled = encoder_outputs.mean(dim=1)
        # NOTE(review): the averaged outputs are cast to integers before being
        # handed to the planner; presumably the planner expects index input —
        # confirm, since the cast discards the fractional part.
        pooled = pooled.long()
        content_plan = self.content_planner(pooled)

        # The generator starts from the encoder's final hidden and cell states
        step_input = trg[:, 0]
        for t in range(1, n_steps):
            scores, hidden, cell = self.sequence_generator(step_input, hidden, cell, content_plan)
            outputs[:, t, :] = scores
            if random.random() < teacher_forcing_ratio:
                step_input = trg[:, t]
            else:
                step_input = scores.argmax(1)

        return outputs


# 4. Train & Evaluation

## 4.1 train & evaluation function

In [None]:
def train(model, iterator, optimizer, criterion, clip):
    """Run one training epoch and return the mean per-batch loss.

    For every ``(src, trg)`` batch the model output and the target both drop
    their first position before being flattened for ``criterion``. Gradients
    are clipped to norm ``clip`` before each optimizer step.
    """
    model.train()
    running_loss = 0

    for src, trg in iterator:
        src = src.to(device)
        trg = trg.to(device)

        optimizer.zero_grad()
        scores = model(src, trg)
        vocab_dim = scores.shape[-1]

        # Position 0 is never predicted, so it is excluded from the loss
        flat_scores = scores[:, 1:].reshape(-1, vocab_dim)
        flat_targets = trg[:, 1:].reshape(-1)

        loss = criterion(flat_scores, flat_targets)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(iterator)

def evaluate(model, iterator, criterion):
    """Return the mean per-batch loss over ``iterator`` without updating the model.

    The model is put in eval mode, gradients are disabled, and teacher forcing
    is switched off by passing a ratio of 0.
    """
    model.eval()
    running_loss = 0

    with torch.no_grad():
        for src, trg in iterator:
            src = src.to(device)
            trg = trg.to(device)

            scores = model(src, trg, 0)  # Turn off teacher forcing
            vocab_dim = scores.shape[-1]

            # Position 0 is never predicted, so it is excluded from the loss
            flat_scores = scores[:, 1:].reshape(-1, vocab_dim)
            flat_targets = trg[:, 1:].reshape(-1)

            running_loss += criterion(flat_scores, flat_targets).item()

    return running_loss / len(iterator)

## 4.2 Configurations

In [None]:
# Model & Training Configurations
# Input and output vocabularies are the same mapping, so both dims are its size
INPUT_DIM = len(vocab)
OUTPUT_DIM = len(vocab)
INPUT_DIM_EX = len(vocab_ex)
OUTPUT_DIM_EX = len(vocab_ex) 
EMB_DIM = 256
HID_DIM = 256
DROPOUT = 0.1
LEARNING_RATE = 0.001
# NOTE(review): NUM_STAGES is not referenced anywhere else in the visible notebook
NUM_STAGES = 5 
PATIENCE = 5
N_EPOCHS = 10
CLIP = 1

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# NOTE(review): each model below gets its own optimizerN/criterionN, but the
# training loop in this notebook builds a fresh optimizer and criterion per
# model — confirm whether these are still needed.

# Model 1: Seq2Seq
encoder1 = Encoder(INPUT_DIM, EMB_DIM, HID_DIM, DROPOUT)
decoder1 = Decoder(OUTPUT_DIM, EMB_DIM, HID_DIM, DROPOUT)
model1 = Seq2Seq(encoder1, decoder1, device).to(device)

optimizer1 = optim.Adam(model1.parameters(), lr=LEARNING_RATE)
# Padding positions do not contribute to the loss
criterion1 = nn.CrossEntropyLoss(ignore_index=vocab['<pad>'])

# Model 2: Seq2Seq with Attention
attn2 = Attention(HID_DIM)
encoder2 = Encoder(INPUT_DIM, EMB_DIM, HID_DIM, DROPOUT)
decoder2 = DecoderWithAttention(OUTPUT_DIM, EMB_DIM, HID_DIM, attn2, DROPOUT)
model2 = Seq2SeqWithAttention(encoder2, decoder2, device).to(device)

optimizer2 = optim.Adam(model2.parameters(), lr=LEARNING_RATE)
criterion2 = nn.CrossEntropyLoss(ignore_index=vocab['<pad>'])

# Model 3: Processed Data with Attention
# Same architecture as model 2, sized for the extended (<num>, pruned) vocabulary
attn3 = Attention(HID_DIM)
encoder3 = Encoder(INPUT_DIM_EX, EMB_DIM, HID_DIM, DROPOUT)
decoder3 = DecoderWithAttention(OUTPUT_DIM_EX, EMB_DIM, HID_DIM, attn3, DROPOUT)
model3 = Seq2SeqWithAttention(encoder3, decoder3, device).to(device)

optimizer3 = optim.Adam(model3.parameters(), lr=LEARNING_RATE)
criterion3 = nn.CrossEntropyLoss(ignore_index=vocab_ex['<pad>'])

# Model 4: Embedding and Processed Data with Attention
# Encoder and decoder are both initialised from the same pretrained matrix,
# so the embedding width is EMBED_DIM_GLOVE rather than EMB_DIM
attn4 = Attention(HID_DIM)
encoder4 = EncoderWithEmbeddings(INPUT_DIM_EX, EMBED_DIM_GLOVE, HID_DIM, DROPOUT, pretrained_embeddings)
decoder4 = DecoderWithAttentionAndEmbeddings(OUTPUT_DIM_EX, EMBED_DIM_GLOVE, HID_DIM, attn4, DROPOUT, pretrained_embeddings)
model4 = Seq2SeqWithAttentionAndEmbeddings(encoder4, decoder4, device).to(device)

optimizer4 = optim.Adam(model4.parameters(), lr=LEARNING_RATE)
criterion4 = nn.CrossEntropyLoss(ignore_index=vocab_ex['<pad>'])

In [None]:
class EarlyStopping:
    """Stop training when the validation loss stops improving.

    Call the instance once per epoch with the validation loss and the model.
    An epoch counts as an improvement only when the loss drops by at least
    ``delta`` below the best loss so far; each improvement saves the model's
    ``state_dict`` to ``path`` and resets the counter. After ``patience``
    consecutive non-improving epochs ``early_stop`` becomes True.

    Args:
        patience: number of non-improving epochs tolerated.
        verbose: print a line on every counter increase and checkpoint save.
        delta: minimum loss decrease that counts as an improvement.
        path: checkpoint file. Defaults to the previously hard-coded
            'checkpoint.pt'; pass a distinct path per model so several
            trackers do not overwrite each other's best weights.
    """

    def __init__(self, patience=PATIENCE, verbose=False, delta=0.01, path='checkpoint.pt'):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = float('inf')
        self.delta = delta
        self.path = path

    def __call__(self, val_loss, model):
        # Higher score == lower loss
        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            if self.verbose:
                print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        """Write the model's state_dict to ``self.path`` and record the loss."""
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and seconds."""
    elapsed_time = end_time - start_time
    whole_minutes = int(elapsed_time / 60)
    leftover_seconds = int(elapsed_time - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [None]:
import pickle

# NOTE: `model5` used to be listed here, but no `model5` is constructed before
# this cell, so building the list raised NameError. Add it back once a fifth
# model is actually defined.
models = [model1, model2, model3, model4]
train_losses = [[] for _ in models]
valid_losses = [[] for _ in models]
training_times = [0 for _ in models]  # one slot per model, filled in below

# Early stopping initialization: one tracker per model
early_stoppings = [EarlyStopping(patience=PATIENCE, verbose=True) for _ in models]


# Training each model with early stopping and timing
for i, model in enumerate(models):
    # Models 1-2 use the baseline vocabulary and loaders, later models the
    # extended ("_ex") ones
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    criterion = nn.CrossEntropyLoss(ignore_index=vocab['<pad>'] if i < 2 else vocab_ex['<pad>'])
    train_loader_current = train_loader if i < 2 else train_loader_ex
    dev_loader_current = dev_loader if i < 2 else dev_loader_ex

    # Initialize total training time for the current model
    training_time = 0

    for epoch in range(N_EPOCHS):
        start_time = time.time()

        train_loss = train(model, train_loader_current, optimizer, criterion, CLIP)
        valid_loss = evaluate(model, dev_loader_current, criterion)

        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        # Accumulate total training time
        training_time += (end_time - start_time)

        train_losses[i].append(train_loss)
        valid_losses[i].append(valid_loss)

        print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

        early_stoppings[i](valid_loss, model)

        if early_stoppings[i].early_stop:
            print("Early stopping")
            break

    # Save the final model
    torch.save(model.state_dict(), f'model{i+1}.pth')

    # Store the training time in this model's own slot. Appending to the
    # pre-filled list left slot i at 0, and slot i is what gets pickled below.
    training_times[i] = training_time
    total_mins, total_secs = epoch_time(0, training_time)
    print(f'Total training time for model {i+1}: {total_mins}m {total_secs}s')

    # Save the losses and training times to files after each model is trained
    with open(f'train_losses_model{i+1}.pkl', 'wb') as f:
        pickle.dump(train_losses[i], f)

    with open(f'valid_losses_model{i+1}.pkl', 'wb') as f:
        pickle.dump(valid_losses[i], f)

    with open(f'training_time_model{i+1}.pkl', 'wb') as f:
        pickle.dump(training_times[i], f)

# 4.5 loading

In [None]:
# import pickle

# model1 = Seq2Seq(encoder1, decoder1, device).to(device)
# model2 = Seq2SeqWithAttention(encoder2, decoder2, device).to(device)
# model3 = Seq2SeqWithAttention(encoder3, decoder3, device).to(device)
# model4 = Seq2SeqWithAttentionAndEmbeddings(encoder4, decoder4, device).to(device)

# model1.load_state_dict(torch.load('model1.pth', map_location=torch.device('cpu')))
# model2.load_state_dict(torch.load('model2.pth', map_location=torch.device('cpu')))
# model3.load_state_dict(torch.load('model3.pth', map_location=torch.device('cpu')))
# model4.load_state_dict(torch.load('model4.pth', map_location=torch.device('cpu')))

# #List of models
# models = [model1, model2, model3, model4, model5]

# # Reading train_losses
# train_losses = []
# for i in range(len(models)):
#     with open(f'train_losses_model{i+1}.pkl', 'rb') as f:
#         train_losses.append(pickle.load(f))

# # Reading valid_losses
# valid_losses = []
# for i in range(len(models)):
#     with open(f'valid_losses_model{i+1}.pkl', 'rb') as f:
#         valid_losses.append(pickle.load(f))

# # Reading training_times
# training_times = []
# for i in range(len(models)):
#     with open(f'training_time_model{i+1}.pkl', 'rb') as f:
#         training_times.append(pickle.load(f))

# 5. Analyze

## 5.1 Statistic

In [None]:
# compute statistics
def compute_statistics(data, vocab, special_tokens):
    """Summarise token counts of the 'Ingredients' and 'Recipe' columns.

    Tokens are whitespace-separated; tokens listed in ``special_tokens`` are
    not counted. The reported vocabulary size is ``len(vocab)`` minus the
    number of special tokens.

    Returns:
        ``(ingredients_stats, recipes_stats)``, two dicts with the keys
        num_samples, vocab_size, min_length, max_length and avg_length.
    """
    sample_count = len(data)
    effective_vocab = len(vocab) - len(special_tokens)

    def _token_counts(series):
        # Number of non-special tokens in each text
        return series.apply(
            lambda text: sum(1 for token in text.split() if token not in special_tokens)
        )

    def _summarise(lengths):
        return {
            'num_samples': sample_count,
            'vocab_size': effective_vocab,
            'min_length': lengths.min(),
            'max_length': lengths.max(),
            'avg_length': lengths.mean(),
        }

    ingredients_stats = _summarise(_token_counts(data['Ingredients']))
    recipes_stats = _summarise(_token_counts(data['Recipe']))
    return ingredients_stats, recipes_stats

# Tokens excluded from the length and vocabulary counts
special_tokens_standard = {'<pad>', '<sos>', '<eos>', '<unk>'}
special_tokens_extended = special_tokens_standard | {'<num>'}

# Baseline data, measured against the baseline vocabulary
train_stats_ing, train_stats_rec = compute_statistics(
    train_data, vocab, special_tokens_standard)
dev_stats_ing, dev_stats_rec = compute_statistics(
    dev_data, vocab, special_tokens_standard)
test_stats_ing, test_stats_rec = compute_statistics(
    test_data, vocab, special_tokens_standard)

# <num>-normalised data, measured against the extended vocabulary
train_stats_ing_ex, train_stats_rec_ex = compute_statistics(
    train_data_ex, vocab_ex, special_tokens_extended)
dev_stats_ing_ex, dev_stats_rec_ex = compute_statistics(
    dev_data_ex, vocab_ex, special_tokens_extended)
test_stats_ing_ex, test_stats_rec_ex = compute_statistics(
    test_data_ex, vocab_ex, special_tokens_extended)

# Function to prepare statistics for table
def statistics_table(stats, dataset_name):
    """Turn the 'Ingredients' and 'Recipes' stats dicts into two table rows.

    Each row is ``[dataset_name, category, num_samples, vocab_size,
    min_length, max_length, avg_length]``.
    """
    value_keys = ('num_samples', 'vocab_size', 'min_length', 'max_length', 'avg_length')
    return [
        [dataset_name, category] + [stats[category][key] for key in value_keys]
        for category in ('Ingredients', 'Recipes')
    ]

# Collect two table rows (ingredients, recipes) per dataset, in display order
all_stats = []
for dataset_label, ing_stats, rec_stats in (
    ('Training Data', train_stats_ing, train_stats_rec),
    ('Dev Data', dev_stats_ing, dev_stats_rec),
    ('Test Data', test_stats_ing, test_stats_rec),
    ('Training Data (Ex)', train_stats_ing_ex, train_stats_rec_ex),
    ('Dev Data (Ex)', dev_stats_ing_ex, dev_stats_rec_ex),
    ('Test Data (Ex)', test_stats_ing_ex, test_stats_rec_ex),
):
    all_stats.extend(
        statistics_table({'Ingredients': ing_stats, 'Recipes': rec_stats}, dataset_label)
    )

# Render the rows as a table without the index column
columns = ['Dataset', 'Category', 'Num Samples', 'Vocab Size', 'Min Length', 'Max Length', 'Avg Length']
stats_df = pd.DataFrame(all_stats, columns=columns)

print(stats_df.to_string(index=False))

## 5.2 Diagram Analysis

In [None]:
import matplotlib.pyplot as plt

model_names = ['Baseline 1', 'Baseline 2', 'Extension 1', 'Extension 2']
colors = ['blue', 'red', 'green', 'purple']
linestyles = ['solid', 'dashed']  # solid = training, dashed = validation

plt.figure(figsize=(10, 6))

# The loop variable is `model_name`, not `model`: at notebook level `model`
# refers to a trained network, and rebinding it to a label string here would
# leave it pointing at 'Extension 2' for any cell run afterwards.
for i, (model_name, color) in enumerate(zip(model_names, colors)):
    plt.plot(train_losses[i], color=color, linestyle=linestyles[0], label=f'{model_name} - Training')
    plt.plot(valid_losses[i], color=color, linestyle=linestyles[1], label=f'{model_name} - Validation')

plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.show()

## 5.3 Evaluation Metrics

In [None]:
%pip install nltk

In [None]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score

def avg_percentage_given_items(ingredients, generated):
    """Return the percentage of distinct given items that also appear in ``generated``.

    Both arguments are iterables of hashable items; duplicates are ignored.
    Returns 0.0 when ``ingredients`` is empty instead of dividing by zero.
    """
    given_items = set(ingredients)
    if not given_items:
        return 0.0
    common_items = given_items.intersection(generated)
    return len(common_items) / len(given_items) * 100

def avg_extra_items(ingredients, generated):
    """Count the distinct items in ``generated`` that are not among ``ingredients``."""
    known = set(ingredients)
    return len({item for item in generated if item not in known})

In [None]:
def calculate_metrics(reference, candidate):
    """Score one generated token sequence against its reference.

    Ingredient mentions are the tokens wrapped in angle brackets
    (e.g. ``<sugar>``); the brackets are stripped before comparison.

    Args:
        reference: List of string tokens of the gold recipe.
        candidate: List of string tokens of the generated recipe.

    Returns:
        A tuple ``(bleu_4, meteor, avg_given, avg_extra)`` where
        ``avg_given`` is the fraction (0-1, not multiplied by 100) of distinct
        reference ingredients that also appear in the candidate, or 0 when the
        reference mentions none, and ``avg_extra`` is the number of distinct
        candidate ingredients that the reference never mentions.
    """
    bleu_4 = sentence_bleu([reference], candidate, weights=(0.25, 0.25, 0.25, 0.25))
    meteor = meteor_score([reference], candidate)

    # Work on sets on both sides: an ingredient mentioned several times must not
    # shrink the coverage ratio or be reported as an "extra" item.
    reference_ingredients = {word[1:-1] for word in reference if word.startswith('<') and word.endswith('>')}
    candidate_ingredients = {word[1:-1] for word in candidate if word.startswith('<') and word.endswith('>')}

    common_ingredients = reference_ingredients & candidate_ingredients
    avg_given = len(common_ingredients) / len(reference_ingredients) if reference_ingredients else 0
    avg_extra = len(candidate_ingredients - reference_ingredients)

    return bleu_4, meteor, avg_given, avg_extra

test_results = []

# Display names for the rows of the results table, in the order of `models`.
# Only as many labels as there are evaluated models are used, so the table
# still builds when `models` holds fewer entries than this list.
model_labels = ["Baseline 1", "Baseline 2", "Extension 1", "Extension 2", "Content Plan"]

for i, model in enumerate(models):
    model.eval()
    references = []
    candidates = []

    # The first two models use the base loader/vocabulary, the rest the "_ex" variants.
    test_loader_current = test_loader if i < 2 else test_loader_ex
    idx2word_current = idx2word if i < 2 else idx2word_ex
    vocab_current = vocab if i < 2 else vocab_ex

    # Token ids dropped from both reference and candidate. Built once per model
    # instead of once per token; -1 never matches when '<num>' is not in the vocab.
    special_ids = {
        vocab_current['<pad>'],
        vocab_current['<sos>'],
        vocab_current['<eos>'],
        vocab_current.get('<num>', -1),
    }

    with torch.no_grad():
        for batch in test_loader_current:
            src, trg = batch[0].to(device), batch[1].to(device)
            output = model(src, trg, 0)  # Turn off teacher forcing
            output = output.argmax(dim=-1)

            # assumes batch-first tensors (one row per sample) — TODO confirm against the model
            for ref, cand in zip(trg.cpu().tolist(), output.cpu().tolist()):
                ref_text = [idx2word_current[idx] for idx in ref if idx not in special_ids]
                cand_text = [idx2word_current[idx] for idx in cand if idx not in special_ids]

                references.append(ref_text)
                candidates.append(cand_text)

    bleu_4_scores = []
    meteor_scores = []
    avg_given_scores = []
    avg_extra_scores = []

    for ref, cand in zip(references, candidates):
        bleu_4, meteor, avg_given, avg_extra = calculate_metrics(ref, cand)
        bleu_4_scores.append(bleu_4)
        meteor_scores.append(meteor)
        avg_given_scores.append(avg_given)
        avg_extra_scores.append(avg_extra)

    # One row per model: every metric averaged over all test samples.
    test_results.append({
        "BLEU-4": np.mean(bleu_4_scores),
        "METEOR": np.mean(meteor_scores),
        "Avg. % given items": np.mean(avg_given_scores),
        "Avg. extra items": np.mean(avg_extra_scores)
    })

# Convert to DataFrame for easier presentation
results_df = pd.DataFrame(test_results, index=model_labels[:len(test_results)])
results_df = results_df.round(10)
print(results_df)

In [None]:
# gold sample
ingredients = "2 c sugar, 1/4 c lemon juice, 1 c water, 1/3 c orange juice, 8 c strawberries"
gold_recipe = "combine <sugar> and <water> in medium saucepan . Heat , stirring , until <sugar> dissolves , then boil 5 minutes . cool . force <strawberries> through food mill or blend in blender or food processor .  strain to remove seeds , if desired . blend the puree and <lemon juice> and <orange juice> into syrup .  pour into freezer trays and freeze . remove from freezer 20 minutes before serving . turn into bowl and stir until smooth ."
generated_recipe = "Combine <sugar> and <water> in a medium saucepan . Heat, stirring, until <sugar> dissolves . Bring to a boil and let simmer for 5 minutes . Remove from heat and allow to cool . In a blender or food processor , combine <strawberries> and <cantaloupe> . Blend until smooth . Strain the mixture to remove any seeds and fibers, if desired. Stir the puree into the cooled syrup along with the <lemon juice> and <orange juice> . Pour the mixture into a large bowl and gently fold in the <vanilla ice cream> until well mixed . Freeze in a container for at least 4 hours . Before serving , let it sit at room temperature for 20 minutes to soften . Stir well to achieve a smooth consistency and serve chilled ."

# Tokenize the gold and generated recipes.
# A plain str.split() would cut multi-word tags such as "<lemon juice>" into
# "<lemon" and "juice>", which calculate_metrics would not recognise as an
# ingredient. Keep each "<...>" span as one token; split everything else on whitespace.
tag_or_word = re.compile(r'<[^<>]+>|\S+')
gold_recipe_tokens = tag_or_word.findall(gold_recipe)
generated_recipe_tokens = tag_or_word.findall(generated_recipe)

# Calculate the metrics
sample_results = calculate_metrics(gold_recipe_tokens, generated_recipe_tokens)

# Create a DataFrame for the sample results
sample_results_df = pd.DataFrame([sample_results], columns=["BLEU-4", "METEOR", "Avg. % given items", "Avg. extra items"], index=["Sample"])
sample_results_df = sample_results_df.round(10)
print(sample_results_df)

## 5.4 Qualitative Recipe Generation

In [None]:
def generate_recipe(model, ingredient, vocab, idx2word, max_length=MAX_LENGTH):
    """Greedily decode a recipe for one free-text ingredient list.

    Args:
        model: Seq2seq model called as ``model(src, trg, teacher_forcing_ratio)``.
        ingredient: Raw ingredient string, e.g. ``"2 c sugar, 1 c water"``.
        vocab: Mapping from token to index; must contain ``<unk>``, ``<pad>``,
            ``<sos>`` and ``<eos>``.
        idx2word: Mapping from index back to token.
        max_length: Number of target positions handed to the decoder.

    Returns:
        The generated tokens joined by single spaces, cut at the first
        ``<eos>`` and with ``<pad>``/``<sos>`` removed.
    """
    model.eval()

    # Apply the same normalisation as clean_text() (strip punctuation, lowercase)
    # so that tokens like "sugar," are not mapped to <unk>.
    normalized = re.sub(r'[^A-Za-z0-9\s]', '', ingredient).lower().strip()
    encoded_ingredient = [vocab.get(word, vocab['<unk>']) for word in normalized.split()] + [vocab['<eos>']]
    ingredient_tensor = torch.tensor([encoded_ingredient], dtype=torch.long).to(device)

    # Placeholder target: it fixes the decode length to max_length (previously the
    # output could never be longer than the ingredient list) and supplies <sos> as
    # the first decoder input. With teacher forcing off the remaining values are unused.
    # assumes the model decodes one step per target position — TODO confirm in forward()
    placeholder_target = torch.zeros((1, max_length), dtype=torch.long, device=device)
    placeholder_target[0, 0] = vocab['<sos>']

    with torch.no_grad():
        # General Seq2Seq model
        output = model(ingredient_tensor, placeholder_target, 0)  # No teacher forcing
        token_ids = output.squeeze(0).argmax(dim=1).tolist()

    eos_id = vocab['<eos>']
    skipped_ids = {vocab['<pad>'], vocab['<sos>']}
    words = []
    for token_id in token_ids:
        # Anything produced after the end-of-sequence marker is not part of the recipe.
        # NOTE(review): if the model leaves step 0 as an all-zero row, this relies on
        # index 0 not being <eos> — confirm against the vocabulary.
        if token_id == eos_id:
            break
        if token_id not in skipped_ids:
            words.append(idx2word[token_id])
    return ' '.join(words)

# Sample ingredient list
ingredients_sample = "2 c sugar, 1/4 c lemon juice, 1 c water, 1/3 c orange juice, 8 c strawberries"

# Display names for the rows of the table, in the order of `models`.
model_labels = ["Baseline 1", "Baseline 2", "Extension 1", "Extension 2", "Content Plan"]

# Generate recipes using all models
generated_recipes = []

for i, model in enumerate(models):
    # The first two models use the base vocabulary, the rest the "_ex" vocabulary.
    if i < 2:
        recipe = generate_recipe(model, ingredients_sample, vocab, idx2word)
    else:
        recipe = generate_recipe(model, ingredients_sample, vocab_ex, idx2word_ex)
    generated_recipes.append(recipe)

# Create a DataFrame to store the results.
# Use only as many labels as there are generated recipes, so both columns
# have the same length even when `models` holds fewer entries than the label list.
qualitative_results = pd.DataFrame({
    "Model": model_labels[:len(generated_recipes)],
    "Generated Recipe": generated_recipes
})


print("Qualitative Samples from all models on the given ingredient list:")
print(qualitative_results)

# Save the results to a CSV file
qualitative_results.to_csv("generated_recipe.csv", index=False)

# Print every recipe in full (the DataFrame printout above truncates long text)
for _, row in qualitative_results.iterrows():
    print(f"\nModel: {row['Model']}")
    print(f"Generated Recipe: {row['Generated Recipe']}")