**Downloading, Installing & Importing Required Libraries**

In [None]:
import os
import h5py
import math
import torch
from torch.utils.data import Dataset

In [None]:
!pip install transformers

In [None]:
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoModelWithLMHead,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    PreTrainedTokenizer,
    Trainer,
    TrainerCallback,
    TrainingArguments,
    set_seed,
)

**Mounting Google Drive for importing the Data Files which will be used in the Tokenization**

In [None]:
# `google.colab` is provided by the Colab runtime; this cell is not expected to
# work outside of it.
from google.colab import drive
# Mount Google Drive at /content/drive. The HDF5 data file and the output
# directory used later in this notebook both live under /content/drive/MyDrive.
drive.mount('/content/drive')

**Selecting the GPU to Train the Model**

In [None]:
# Set both CUDA-related environment variables in one call:
#   CUDA_VISIBLE_DEVICES="0"  -> only GPU index 0 is exposed to this process
#   CUDA_LAUNCH_BLOCKING="0"  -> value "0" is written explicitly
os.environ.update({
    "CUDA_VISIBLE_DEVICES": "0",
    "CUDA_LAUNCH_BLOCKING": "0",
})

**Defining the PyTorch-Compatible Dataset Class**

In [None]:
class H5Dataset(Dataset):
    """Map-style PyTorch dataset backed by one dataset stored in an HDF5 file.

    All rows stored under the requested key are read into memory once, at
    construction time; items are then served from that in-memory array.
    """

    def __init__(self, tokenizer, file_path='/content/drive/MyDrive/Monsoon22_conditional_recipe_gen/train_temp', block_size=512,
                 cached_features_file="/content/drive/MyDrive/Monsoon22_conditional_recipe_gen/data_temp.h5"):
        """Read every row stored under ``file_path`` in the cached HDF5 file.

        Args:
            tokenizer: Accepted for interface compatibility; not used here.
            file_path: Key of the dataset *inside* the HDF5 file (the train and
                test splits are stored under different keys).
            block_size: Accepted for interface compatibility; not used here.
            cached_features_file: Path of the HDF5 file on disk. Defaults to
                the location that was previously hard-coded.
        """
        # Format the message instead of printing a (format, argument) tuple.
        print("Loading features from cached file %s" % cached_features_file)
        with h5py.File(cached_features_file, 'r') as f:
            # The train and test keys are loaded in exactly the same way, so no
            # branching on `file_path` is needed. `[:]` reads the whole dataset
            # so the file can be closed right after.
            # NOTE(review): the original comment described the test key as
            # "a dev set, 30% of a test set" - confirm against the data prep.
            self.samples = f[file_path][:]

    def __len__(self):
        """Return the number of rows that were loaded."""
        return len(self.samples)

    def __getitem__(self, item):
        """Return row ``item`` of the loaded samples as a ``torch.Tensor``."""
        return torch.tensor(self.samples[item])

In [None]:
def get_dataset(tokenizer, evaluate=False, local_rank=-1):
    """Build the H5Dataset for the test key when ``evaluate`` is true, else for the train key.

    ``local_rank`` is accepted but not used.
    """
    if evaluate:
        split_key = "/content/drive/MyDrive/Monsoon22_conditional_recipe_gen/test_temp"
    else:
        split_key = "/content/drive/MyDrive/Monsoon22_conditional_recipe_gen/train_temp"
    return H5Dataset(tokenizer=tokenizer, file_path=split_key)

**Performing Transformer Configuration**

In [None]:
# Load the configuration of the pretrained 'gpt2' checkpoint; downloaded files
# are kept in the relative 'cache' directory.
config = AutoConfig.from_pretrained('gpt2', cache_dir='cache')
# Fix the random seed (20) before the model and trainer are created.
set_seed(20)

**Defining the Tokenizer for the Model Training**

In [None]:
# Load the tokenizer that matches the 'gpt2' checkpoint, using the same
# relative 'cache' directory as the config.
tokenizer = AutoTokenizer.from_pretrained('gpt2', cache_dir= 'cache')

**Initialising the GPT2 Model**

In [None]:
# Load the pretrained GPT-2 weights with a language-modelling head.
# AutoModelForCausalLM replaces the deprecated AutoModelWithLMHead and is the
# causal-LM auto class that applies to the 'gpt2' checkpoint.
model = AutoModelForCausalLM.from_pretrained('gpt2', config=config, cache_dir='cache')

**Defining the Special Recipe Tokens to Add to the Tokenizer**

In [None]:
# Marker tokens for the recipe format, grouped by the section their names refer
# to (recipe, input, ingredient, instruction, title). The order of the list is
# unchanged so the tokens are registered in the same sequence as before.
special_tokens = {
    "additional_special_tokens": [
        '<RECIPE_START>',
        '<INPUT_START>', '<NEXT_INPUT>', '<INPUT_END>',
        '<INGR_START>', '<NEXT_INGR>', '<INGR_END>',
        '<INSTR_START>', '<NEXT_INSTR>', '<INSTR_END>',
        '<TITLE_START>', '<TITLE_END>',
        '<RECIPE_END>',
    ]
}

**Adding the Special Tokens to the Tokenizer and Resizing the Model to Fit It**

In [None]:
# Register the recipe marker tokens with the tokenizer. This must happen before
# the resize below, because the new size is taken from len(tokenizer).
tokenizer.add_special_tokens(special_tokens)
# Resize the model's token embeddings to the tokenizer's new length so the
# added tokens have embedding rows.
model.resize_token_embeddings(len(tokenizer))

**Loading the Train and Validation Data as PyTorch Datasets so that They Can Be Given to the Model as Input for Training**

In [None]:
# Training split (default key) and evaluation split (evaluate=True selects the
# test key); both are built through get_dataset.
train_dataset = get_dataset(tokenizer=tokenizer)
eval_dataset = get_dataset(tokenizer=tokenizer, evaluate=True)

('Loading features from cached file %s', '/content/drive/MyDrive/Monsoon22_conditional_recipe_gen/data_temp.h5')
('Loading features from cached file %s', '/content/drive/MyDrive/Monsoon22_conditional_recipe_gen/data_temp.h5')


**To be able to build batches, data collators may apply some processing (like padding). Some of them (like DataCollatorForLanguageModeling) also apply some random data augmentation (like random masking) on the formed batch.
Data collators are objects that will form a batch by using a list of dataset elements as input. These elements are of the same type as the elements of train_dataset or eval_dataset. The next cell creates the data collator that forms the training batches.
Source: huggingface.co**

In [None]:
# mlm=False selects causal (next-token) language modelling rather than masked
# language modelling. The masked-LM-only `mlm_probability` argument is omitted
# because it does not apply in this mode.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
training_args = TrainingArguments(
    # Output directory on the mounted Drive (same folder the tokenizer is saved to).
    output_dir="/content/drive/MyDrive/Monsoon22_conditional_recipe_gen/outputs",
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # 4 samples per step x 8 accumulation steps = 32 samples per optimizer
    # update on each device.
    gradient_accumulation_steps=8,
    evaluation_strategy="steps",
    # Mixed-precision training.
    fp16=True,
    fp16_opt_level='O1',
    # A step count is an integer; the float literal 1e2 is written as 100.
    warmup_steps=100,
    learning_rate=5e-4,
    adam_epsilon=1e-8,
    weight_decay=0.01,
    save_total_limit=1,
    load_best_model_at_end=True,
)

**Initializing PyTorch Trainer**

In [None]:

# Wire the model, the training arguments, the batch collator and both dataset
# splits into a Trainer. The tokenizer is not passed here; it is saved
# separately with tokenizer.save_pretrained.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)


**Saving the Tokenizer, Starting Training, and Saving the Model after Training Finishes**

In [None]:
# Save the tokenizer (including the added special tokens) before training
# starts, into the same folder that is used as the training output directory.
tokenizer.save_pretrained('/content/drive/MyDrive/Monsoon22_conditional_recipe_gen/outputs')
# Run the fine-tuning loop configured by training_args.
trainer.train()
# No path is given; presumably the model is written to training_args.output_dir
# - TODO confirm.
trainer.save_model() 

**Saving the Tokenizer**

In [None]:
# Save the tokenizer to the outputs folder again. This repeats the call made
# just before trainer.train() with the same path.
tokenizer.save_pretrained('/content/drive/MyDrive/Monsoon22_conditional_recipe_gen/outputs')