In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m96.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m103.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1


In [3]:
import torch
import torch.nn as nn
from transformers import GPT2TokenizerFast, GPT2LMHeadModel

from transformers import Trainer, TrainingArguments

from tqdm.auto import tqdm

import pandas as pd
import numpy as np

In [4]:
model_name = 'gpt2'

In [5]:
tokenizer = GPT2TokenizerFast.from_pretrained(model_name,
                                              bos_token='<|startoftext|>',
                                              eos_token='<|endoftext|>',
                                              unk_token='<|unknown|>',
                                              pad_token='<|pad|>'
                                             )
model = GPT2LMHeadModel.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Embedding(50260, 768)

In [6]:
model_save_path = './drashtiGPT'

In [7]:
tokenizer.save_pretrained(model_save_path)

('./drashtiGPT/tokenizer_config.json',
 './drashtiGPT/special_tokens_map.json',
 './drashtiGPT/vocab.json',
 './drashtiGPT/merges.txt',
 './drashtiGPT/added_tokens.json',
 './drashtiGPT/tokenizer.json')

In [8]:
tokenizer.convert_tokens_to_ids(['<|pad|>'])

[50259]

In [9]:
def generate(prompt):
    inputs = tokenizer.encode_plus(prompt, return_tensors='pt')
    output = model.generate(**inputs,max_length=256,do_sample=True,pad_token_id=50259)
    print(tokenizer.decode(output[0]))

In [10]:
tokenizer.special_tokens_map

{'bos_token': '<|startoftext|>',
 'eos_token': '<|endoftext|>',
 'unk_token': '<|unknown|>',
 'pad_token': '<|pad|>'}

In [11]:
tokenizer.convert_tokens_to_ids(['<|startoftext|>'],)

[50257]

In [12]:
clean = pd.read_csv('Cleaned_Indian_Food_Dataset.csv')
clean = clean.sample(frac=1)
clean.reset_index(drop=True,inplace=True)

In [13]:
def print_recipe(idx):
    print(f"{clean['ingredients'][idx]}\n\n{clean['instructions'][idx]}")

In [14]:
def form_string(ingredient,instruction):
    s = f"<|startoftext|>Ingredients:\n{ingredient.strip()}\n\nInstructions:\n{instruction.strip()}<|endoftext|>"
    return s

In [15]:
data = clean.apply(lambda x:form_string(x['TranslatedIngredients'],x['TranslatedInstructions']),axis=1).to_list()

In [16]:
train_size = 0.85
train_len = int(train_size * len(data))
train_data = data[:train_len]
val_data = data[train_len:]

In [17]:
class RecipeDataset:
    def __init__(self,data):
        self.data = data
        self.input_ids = []
        self.attn_masks = []
        
        for item in tqdm(data):
            encodings = tokenizer.encode_plus(item,
                                              truncation=True,
                                              padding='max_length',
                                              max_length=1024,
                                              return_tensors='pt'
                                             )
            self.input_ids.append(torch.squeeze(encodings['input_ids'],0))
            self.attn_masks.append(torch.squeeze(encodings['attention_mask'],0))
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self,idx):
        return self.input_ids[idx], self.attn_masks[idx] 

In [18]:
def collate_fn(batch):
    return {
        'input_ids': torch.stack([item[0] for item in batch]),
        'attention_mask': torch.stack([item[1] for item in batch]),
        'labels': torch.stack([item[0] for item in batch])
    }

In [19]:
train_ds = RecipeDataset(train_data)
val_ds = RecipeDataset(val_data)

  0%|          | 0/5047 [00:00<?, ?it/s]

  0%|          | 0/891 [00:00<?, ?it/s]

In [20]:
args = TrainingArguments(output_dir=model_save_path,
                         per_device_train_batch_size=2,
                         per_device_eval_batch_size=2,
                         gradient_accumulation_steps=2,
                         report_to='none',
                         num_train_epochs=3,
                         save_strategy='no'
                        )

In [21]:
optim = torch.optim.AdamW(model.parameters(),lr=5e-5)
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optim,20,eta_min=1e-7)

In [22]:
trainer = Trainer(model,
                  args,
                  train_dataset=train_ds,
                  eval_dataset=val_ds,
                  data_collator=collate_fn,
                  optimizers=(optim,scheduler)
                 )

In [23]:
trainer.train()

Step,Training Loss
500,1.1802
1000,0.8316
1500,0.7639
2000,0.7364
2500,0.7286
3000,0.6973
3500,0.6978


TrainOutput(global_step=3786, training_loss=0.7959782902172994, metrics={'train_runtime': 5145.2888, 'train_samples_per_second': 2.943, 'train_steps_per_second': 0.736, 'total_flos': 7912445313024000.0, 'train_loss': 0.7959782902172994, 'epoch': 3.0})

In [24]:
trainer.save_model()

In [25]:
from transformers import pipeline

In [28]:
pl = pipeline(task='text-generation',model='./drashtiGPT')

In [29]:
def create_prompt(ingredients):
    ingredients = ','.join([x.strip().lower() for x in ingredients.split(',')])
    ingredients = ingredients.strip().replace(',','\n')
    s = f"<|startoftext|>Ingredients:\n{ingredients}\n"
    return s

In [49]:
ingredients = ['Rice,Potatoes,Tomatoes,Spinach,red bell peppers','chicken,tomatoes,aloo,jeera,curry powder']

In [50]:
for ing in ingredients:
    prompt = create_prompt(ing)
    print(pl(prompt,
         max_new_tokens=512,
         penalty_alpha=0.6,
         top_k=4,
         pad_token_id=50259
        )[0]['generated_text'])

<|startoftext|>Ingredients:
rice
potatoes
tomatoes
spinach
red bell peppers

parsley leaves - few,2 tablespoons water,1 cup spinach leaves,1/4 cup water,parsley leaves,1/4 cup green beans,1/2 teaspoon salt,4 cloves garlic

 irresistible

Instructions:
To begin making the Vegetables With Vegetables Recipe, we first need to get all of the vegetables ready for the dish.
Soak the spinach leaves in a pressure cooker for about 2 whistles, until soft and mushy.
Once soft, drain the water.Now heat some oil on a medium flame and add in the spinach leaves and cook for about 2 minutes.
Turn off the flame.
Once spinach leaves have cooked, add in the garlic, green beans, spinach, red bell pepper, coriander, salt and cook until the beans are cooked through.
You can optionally add a little water if required to adjust the consistency.
Turn off the flame.Serve Vegetables With Vegetables Recipe along with Steamed Rice, Steamed Rice and a cup of coffee to make it a complete meal for the weeknight dinner.