In [1]:
import pandas as pd
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM

import logging
logging.getLogger().setLevel(logging.CRITICAL)

import warnings
warnings.filterwarnings('ignore')

device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

In [2]:
device

'cuda'

In [3]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
model = model.to(device)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [4]:
def choose_from_top(probs, n=5):
    ind = np.argpartition(probs, -n)[-n:]
    top_prob = probs[ind]
    top_prob = top_prob / np.sum(top_prob) # Normalize
    choice = np.random.choice(n, 1, p = top_prob)
    token_id = ind[choice][0]
    return int(token_id)

In [5]:
from torch.utils.data import Dataset
from torch.utils.data import Dataset, DataLoader
import os
import json
import csv

class YelpDataset(Dataset):
    def __init__(self):
        super().__init__()

        self.food_list = []
        self.end_of_text_token = "<|endoftext|>"

        with open("/kaggle/input/yelp-for-sentiment/yelp.csv") as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')

            x = 0
            for row in csv_reader:
                joke_str = f"recepies:{row[0]}{self.end_of_text_token}"
                self.food_list.append(joke_str)

    def __len__(self):
       return len(self.food_list)

    def __getitem__(self, item):
        return self.food_list[item]

In [6]:
dataset = YelpDataset()
yelp_loader = DataLoader(dataset, batch_size=1, shuffle=True)

In [7]:
BATCH_SIZE = 4
EPOCHS = 7
LEARNING_RATE = 3e-5
WARMUP_STEPS = 5000
MAX_SEQ_LEN = 800
from transformers import AdamW, get_linear_schedule_with_warmup
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

In [8]:
device

'cuda'

In [9]:
model = model.to(device)
model.train()
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps = -1)
proc_seq_count = 0
sum_loss = 0.0
batch_count = 0

tmp_yelp_tens = None
models_folder = "trained_models"
if not os.path.exists(models_folder):
    os.mkdir(models_folder)

for epoch in range(EPOCHS):
    
    print(f"EPOCH {epoch} started" + '=' * 30)
    
    for idx,yelp in enumerate(yelp_loader):
        
        #################### "Fit as many recipes sequences into MAX_SEQ_LEN sequence as possible" logic start ####
        yelp_tens = torch.tensor(tokenizer.encode(yelp[0])).unsqueeze(0).to(device)
        #Skip sample from dataset if it is longer than MAX_SEQ_LEN
        if yelp_tens.size()[1] > MAX_SEQ_LEN:
            continue
        
        #The first recipes sequence in the sequence
        if not torch.is_tensor(tmp_yelp_tens):
            tmp_yelp_tens = yelp_tens
            continue
        else:
            #The next recipes does not fit in so we process the sequence and leave the last recipes 
            #as the start for next sequence 
            if tmp_yelp_tens.size()[1] + yelp_tens.size()[1] > MAX_SEQ_LEN:
                work_yelp_tens = tmp_yelp_tens
                tmp_yelp_tens = yelp_tens
            else:
                #Add the recipes to sequence, continue and try to add more
                tmp_yelp_tens = torch.cat([tmp_yelp_tens, yelp_tens[:,1:]], dim=1)
                continue
        ################## Sequence ready, process it trough the model ##################
            
        outputs = model(work_yelp_tens, labels=work_yelp_tens)
        loss, logits = outputs[:2]                        
        loss.backward()
        sum_loss = sum_loss + loss.detach().data
                       
        proc_seq_count = proc_seq_count + 1
        if proc_seq_count == BATCH_SIZE:
            proc_seq_count = 0    
            batch_count += 1
            optimizer.step()
            scheduler.step() 
            optimizer.zero_grad()
            model.zero_grad()

        if batch_count == 100:
            print(f"sum loss {sum_loss}")
            batch_count = 0
            sum_loss = 0.0
    
    # Store the model after each epoch to compare the performance of them
torch.save(model.state_dict(), os.path.join(models_folder, f"gpt2_medium_food_{epoch}.pt"))

sum loss 2048.149658203125
sum loss 1827.6429443359375
sum loss 1697.4271240234375
sum loss 1672.11474609375


In [19]:
import os
import torch
import numpy as np
from transformers import GPT2Tokenizer, GPT2LMHeadModel

def choose_from_top(probs, n=5):
    ind = np.argpartition(probs, -n)[-n:]
    top_probs = probs[ind]
    top_probs = top_probs / np.sum(top_probs)  # Normalize
    chosen_index = np.random.choice(ind, 1, p=top_probs)
    return chosen_index[0]

MODEL_EPOCH = 6
models_folder = "trained_models"
model_path = os.path.join(models_folder, f"gpt2_medium_food_{MODEL_EPOCH}.pt")

# Ensure the device is properly set
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model and tokenizer
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
model.load_state_dict(torch.load(model_path, map_location=device))
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')

model.to(device)
model.eval()

yelp_output_file_path = f'generated_{MODEL_EPOCH}.yelp'

if os.path.exists(yelp_output_file_path):
    os.remove(yelp_output_file_path)

yelp_num = 0
with torch.no_grad():
    for yelp_idx in range(5):
        yelp_finished = False
        cur_ids = torch.tensor(tokenizer.encode("yelp review: ")).unsqueeze(0).to(device)

        for i in range(100):
            outputs = model(cur_ids)
            logits = outputs.logits
            softmax_logits = torch.softmax(logits[0, -1], dim=0)  # Take the first batch and the last predicted embedding

            if i < 3:
                n = 20
            else:
                n = 3

            next_token_id = choose_from_top(softmax_logits.to('cpu').numpy(), n=n)  # Randomly select the next word
            cur_ids = torch.cat([cur_ids, torch.ones((1, 1)).long().to(device) * next_token_id], dim=1)  # Add the last word to the running sequence

            # Token to check for stopping condition (adjust this as needed)
            if next_token_id == tokenizer.eos_token_id:
                yelp_finished = True
                break

        output_list = list(cur_ids.squeeze().to('cpu').numpy())
        output_text = tokenizer.decode(output_list)

        # Print the generated text
        print(f"Generated Text {yelp_num + 1}: {output_text}")

        with open(yelp_output_file_path, 'a') as f:
            f.write(f"{output_text} \n\n")

        # Print confirmation of saving the file
        print(f"Text {yelp_num + 1} saved to {yelp_output_file_path}")

        yelp_num += 1


Generated Text 1: yelp review:  @shaunmccarthy
Posted by Shaun McCarthy at 12:00 am<|endoftext|>
Text 1 saved to generated_6.yelp
Generated Text 2: yelp review:  It's a good book, and it's not a bad book. I think it could have used a little more of a twist or two to keep things interesting, and it could have been a little more fun. I think it would have been a lot more fun to see the book's ending.<|endoftext|>
Text 2 saved to generated_6.yelp
Generated Text 3: yelp review:  a great read.
Posted by  the_lady at 11:34 AM<|endoftext|>
Text 3 saved to generated_6.yelp
Generated Text 4: yelp review:  "I've been a fan of the book since it was published, so when it came out, I was excited to see what it would do. It's a fun read, and I'm glad I read it."<|endoftext|>
Text 4 saved to generated_6.yelp
Generated Text 5: yelp review:  http://www.youtube.com/watch?v=_0QQQQ-0__Q4&feature=youtu.be
Posted by  Lilith at 12:03 PM<|endoftext|>
Text 5 saved to generated_6.yelp
