## T5-Recipe-Generation Model

### Importing Libraries

In [None]:
import json
import os
import glob
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config, get_scheduler
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

### Dataset for the recipe data

In [None]:
class RecipeDataset(Dataset):
    """Map-style dataset that tokenizes recipe prompts and targets for T5.

    Each row of the backing dataframe must provide an ``input`` column (the
    prompt) and an ``output`` column (the target text). In ``val``/``test``
    mode the ``id``, ``title``, ``ingredients`` and ``recipe`` columns are
    returned alongside the tensors as well.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        mode,
        tokenizer: T5Tokenizer,
        text_max_token_len: int = 512,
        summary_max_token_len: int = 1000,
    ):
        """Store the dataframe, tokenizer and length limits.

        Parameters:
            data (pd.DataFrame): Dataframe containing the recipe data
            mode (str): Mode of the dataset (train, val, test)
            tokenizer (T5Tokenizer): Tokenizer for the T5 model
            text_max_token_len (int): Length the input text is padded/truncated to
            summary_max_token_len (int): Length the output text is padded/truncated to
        """
        self.data = data
        self.mode = mode
        self.tokenizer = tokenizer
        self.text_max_token_len = text_max_token_len
        self.summary_max_token_len = summary_max_token_len

    def __len__(self):
        """Return the number of rows in the backing dataframe."""
        return len(self.data)

    def __getitem__(self, index: int):
        """Tokenize one row and return the tensors the model consumes.

        Parameters:
            index (int): Positional index of the data row to be returned
        Returns:
            dict: ``input_ids``, ``attention_mask``, ``labels`` and
                ``decoder_attention_mask`` as 1-D tensors; in ``val``/``test``
                mode also the raw ``id``, ``title``, ``ingredients`` and
                ``recipe`` values of the row.
        """
        row = self.data.iloc[index]

        # Both sides are tokenized the same way; only the text and the
        # target length differ.
        shared_kwargs = dict(
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt',
        )
        source = self.tokenizer(
            row['input'],
            max_length=self.text_max_token_len,
            **shared_kwargs,
        )
        target = self.tokenizer(
            row['output'],
            max_length=self.summary_max_token_len,
            **shared_kwargs,
        )

        # Replace padding positions in the targets with -100 (the ignore
        # index of the cross-entropy loss) so padding does not contribute.
        labels = target['input_ids']
        labels[labels == self.tokenizer.pad_token_id] = -100

        sample = {
            'input_ids': source['input_ids'].flatten(),
            'attention_mask': source['attention_mask'].flatten(),
            'labels': labels.flatten(),
            'decoder_attention_mask': target['attention_mask'].flatten(),
        }

        # Evaluation modes also carry the raw row fields through.
        if self.mode in ('val', 'test'):
            for column in ('id', 'title', 'ingredients', 'recipe'):
                sample[column] = row[column]

        return sample


### Load the data

In [None]:
def load_data(tokenizer, ingredients_path='ingrs.json', layer_path='layer.json', batch_size=12):
    """Summary: Load data from JSON files and create train and val dataloaders

    Args:
        tokenizer (T5Tokenizer): T5 tokenizer
        ingredients_path (str): JSON file with a list of entries, each having
            an 'id' and an 'ingredients' list of {'text': ...} items
        layer_path (str): JSON file with a list of recipes, each having 'id',
            'title', 'partition', 'ingredients' and 'instructions'
        batch_size (int): Batch size used for both dataloaders
    Returns:
        train_dataloader (DataLoader): shuffled train dataloader
        val_dataloader (DataLoader): unshuffled val dataloader
    Raises:
        KeyError: if a train/val recipe id has no entry in the ingredients file
    """
    # Extract data from the JSON files (explicit encoding so the result does
    # not depend on the platform's locale default).
    with open(ingredients_path, encoding='utf-8') as json_file:
        ingredients_dictionary = json.load(json_file)

    with open(layer_path, encoding='utf-8') as json_file:
        layer_dictionary = json.load(json_file)

    # Recipe id -> comma-separated ingredient names.
    ingredients_by_id = {
        item['id']: ', '.join(x['text'] for x in item['ingredients'])
        for item in ingredients_dictionary
    }

    records = {'train': [], 'val': []}
    for item in layer_dictionary:
        partition = item['partition']
        # Only the train and val partitions are used; skip everything else
        # before doing any work on it.
        if partition not in records:
            continue

        title = item['title']
        ingredients = ingredients_by_id[item['id']]
        recipe = ' \n'.join(x['text'] for x in item['instructions'])
        record = {
            'id': item['id'],
            'title': title,
            'ingredients': ingredients,
            'recipe': recipe,
            'output': f"Title: {title} \nRecipe: {recipe}",
            'input': f"Title: {title} \nIngredients: {ingredients}",
        }
        if partition == 'train':
            # Training prompts additionally carry the ingredient texts taken
            # from the layer file itself.
            # NOTE(review): validation prompts do not get this line, so train
            # and val inputs differ in format - confirm this is intended.
            quantities = ', '.join(x['text'] for x in item['ingredients'])
            record['input'] += f"\nIngredients with quantity: {quantities}"
        records[partition].append(record)

    train_df = pd.DataFrame(records['train'])
    val_df = pd.DataFrame(records['val'])
    print(len(train_df), len(val_df))

    train_dataset = RecipeDataset(data=train_df, mode='train', tokenizer=tokenizer)
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)

    val_dataset = RecipeDataset(data=val_df, mode='val', tokenizer=tokenizer)
    val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)

    return train_dataloader, val_dataloader


### Load the model and tokenizer

In [None]:

def load_model_tokenizer():
    """Summary: Build the tokenizer and a freshly initialized T5 model

    The tokenizer comes from the pretrained "t5-base" checkpoint. The model is
    NOT loaded from pretrained weights: it is constructed from a custom
    T5Config (d_model=300) and therefore starts from random initialization.

    Returns:
        model (T5ForConditionalGeneration): Untrained T5 model built from the custom config
        tokenizer (T5Tokenizer): T5 tokenizer
        device (torch.device): Device to be used for training ("cuda" if available, else "cpu")
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)

    tokenizer = T5Tokenizer.from_pretrained("t5-base", model_max_length=100)

    # The original code first loaded the pretrained "t5-base" weights and then
    # immediately replaced that model with the one built below, so the
    # pretrained load only cost download time and memory. It is omitted here;
    # the returned model is the same config-built network as before.
    config = T5Config(
        vocab_size=tokenizer.vocab_size,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        # The pad token id is reused as the decoder start token.
        decoder_start_token_id=tokenizer.pad_token_id,
        d_model=300,
    )
    model = T5ForConditionalGeneration(config)

    # Report the model size (previously computed but silently discarded).
    trainable_millions = sum(p.numel() for p in model.parameters() if p.requires_grad) / 1_000_000
    print(f"Trainable parameters: {trainable_millions:.2f}M")

    return model, tokenizer, device


### Train the model

In [None]:
def train(model, device, train_dataloader, val_dataloader, num_epochs=30, learning_rate=3e-4):
    """Summary: Train the model

    Runs `num_epochs` epochs of AdamW training with a linearly decaying
    learning rate, validates after every epoch, writes a checkpoint per epoch
    plus a "best" checkpoint whenever the validation loss improves, and
    finally saves the loss curves to 'loss_graph.png'.

    Args:
        model (T5ForConditionalGeneration): T5 model
        device (torch.device): Device to be used for training
        train_dataloader (DataLoader): train dataloader
        val_dataloader (DataLoader): val dataloader
        num_epochs (int): Number of passes over the training data
        learning_rate (float): Initial learning rate for AdamW
    Returns:
        model (torch.nn.DataParallel): Trained T5 model wrapped in DataParallel
        train_loss_history (list): Average training loss per epoch
        val_loss_history (list): Average validation loss per epoch
    """
    # Only these entries of a batch are fed to the model; validation batches
    # also carry raw metadata (id, title, ...) that must not be passed on.
    forward_keys = ("input_ids", "attention_mask", "labels", "decoder_attention_mask")

    num_training_steps = num_epochs * len(train_dataloader)
    progress_bar = tqdm(range(num_training_steps))

    min_loss = float("inf")
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps
    )

    # NOTE: the state dicts saved below come from the DataParallel wrapper,
    # so a loader must wrap the model the same way before load_state_dict.
    model = torch.nn.DataParallel(model)
    model = model.to(device)
    train_loss_history = []
    val_loss_history = []
    for epoch in range(num_epochs):
        print(f'epoch: {epoch + 1}')
        train_loss = 0
        model.train()
        for batch in train_dataloader:
            inputs = {key: batch[key].to(device) for key in forward_keys}
            outputs = model(**inputs)
            # DataParallel can return one loss per replica; summing collapses
            # them into the scalar that backward() requires.
            t_loss = torch.sum(outputs.loss)
            train_loss += t_loss.item()
            t_loss.backward()

            optimizer.step()
            lr_scheduler.step()

            optimizer.zero_grad()
            progress_bar.update()

        train_loss /= len(train_dataloader)
        train_loss_history.append(train_loss)
        print(f"Avg training loss = {train_loss}")

        # Validate
        model.eval()
        with torch.no_grad():
            val_loss = 0
            for batch in tqdm(val_dataloader):
                inputs = {key: batch[key].to(device) for key in forward_keys}
                outputs = model(**inputs)
                v_loss = torch.sum(outputs.loss)
                val_loss += v_loss.item()
            val_loss /= len(val_dataloader)
            print(f"Avg validation loss = {val_loss}")
            val_loss_history.append(val_loss)

        # Keep the best-so-far weights separately from the per-epoch snapshots.
        if val_loss < min_loss:
            print("model saved with val_loss = ", val_loss)
            torch.save(model.state_dict(), "t5_recipe_jun_13_best.pt")
            min_loss = val_loss
        torch.save(model.state_dict(), f"t5_recipe_jun_13_epoch_{epoch}.pt")

    plt.plot(train_loss_history, label='train_loss')
    plt.plot(val_loss_history, label='val_loss')
    plt.legend()
    # Save before showing: once a figure has been shown it may be cleared,
    # which would leave an empty image on disk.
    plt.savefig('loss_graph.png')
    plt.show()
    return model, train_loss_history, val_loss_history


### Load best model

In [None]:
def load_best_model(model, device, path = 't5_recipe_jun_13.pt'):
    """Summary: Load saved weights into the model

    The checkpoint is expected to have been saved from a DataParallel-wrapped
    model (as train() in this file does), so the model is wrapped the same way
    before the state dict is loaded.

    NOTE(review): train() in this file writes its best checkpoint to
    't5_recipe_jun_13_best.pt', which differs from the default `path` here;
    pass the path explicitly when loading a checkpoint produced by train().

    Args:
        model (T5ForConditionalGeneration): T5 model
        device (torch.device): Device the model and the weights are placed on
        path (str): Path to the saved state dict
    Returns:
        model (torch.nn.DataParallel): Model wrapped in DataParallel with the saved weights loaded
    """
    model = torch.nn.DataParallel(model)
    model = model.to(device)
    # map_location lets a checkpoint saved on GPU be restored on a CPU-only
    # machine (and vice versa) instead of failing while deserializing.
    state_dict = torch.load(path, map_location=device)
    model.load_state_dict(state_dict)
    return model


### Main function

In [None]:

def main():
    """Build the model and dataloaders, run training, and print the loss histories."""
    model, tokenizer, device = load_model_tokenizer()
    # To start from saved weights instead, call load_best_model(model, device) here.

    loaders = load_data(tokenizer)
    train_dataloader, val_dataloader = loaders

    results = train(model, device, train_dataloader, val_dataloader)
    model, train_loss_history, val_loss_history = results

    print(f"Train Loss History: {train_loss_history}")
    print(f"Val Loss History: {val_loss_history}")
    print("Training complete!")


# Run training only when executed directly (script or notebook cell), not
# when this file is imported for its classes and functions.
if __name__ == "__main__":
    main()