<a href="https://colab.research.google.com/github/rajaamani/Zocket_Task/blob/main/Zocket_Task2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [52]:
# Install necessary libraries
!pip install transformers wandb
!pip install torch torchvision



In [53]:
!pip install transformers[torch] accelerate>=0.20.1


In [54]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling
import wandb
import tensorflow_datasets as tfds


In [55]:
# Fetch the training split of the "tiny_shakespeare" corpus via TensorFlow Datasets
dataset = tfds.load(name="tiny_shakespeare", split="train")

In [56]:
# Decode the "text" field of every record to a Python string and
# concatenate the pieces into a single newline-separated corpus.
decoded_examples = (
    record["text"].numpy().decode("utf-8")
    for record in dataset
)
raw_text = "\n".join(decoded_examples)

In [38]:
# Tokenize the text using GPT2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# GPT-2 has no padding token by default. Reuse the end-of-text token for padding
# instead of registering a brand-new '[PAD]' entry: a new entry gets an id beyond
# the pretrained embedding matrix, which is out of range for the model unless its
# embeddings are resized to match.
tokenizer.pad_token = tokenizer.eos_token

# NOTE: truncation=True with max_length=1024 keeps only the first 1024 tokens of
# raw_text; the result is a single (1, <=1024) tensor, not the whole corpus.
tokenized_text = tokenizer(raw_text, return_tensors="pt", max_length=1024, truncation=True, padding=True)

In [39]:
# Save the training corpus to a file
# TextDataset reads this file as plain text and runs the tokenizer over its contents
# itself. The file therefore has to contain the raw text: writing space-separated
# token ids here would make the dataset tokenize the *digits* of those ids, and the
# model would be trained on number strings instead of Shakespeare.
# The file name is kept unchanged because the dataset cell opens this exact path.
with open("tokenized_data.txt", "w", encoding="utf-8") as file:
    file.write(raw_text)

In [40]:
# Create TextDataset and DataCollatorForLanguageModeling
# TextDataset tokenizes the file at file_path and splits the token stream into
# fixed-size blocks of block_size tokens.
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="tokenized_data.txt",
    block_size=128,
    # TextDataset stores its tokenized blocks in an on-disk cache and reuses them on
    # later runs. Rebuild the cache every time so that a changed corpus file is never
    # silently ignored in favour of stale features.
    overwrite_cache=True,
)

# mlm=False selects causal (next-token) language modeling instead of masked LM.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)


In [44]:
# Load the pretrained GPT-2 model that will be fine-tuned
model = GPT2LMHeadModel.from_pretrained('gpt2')

# If special tokens were added to the tokenizer, it now has more entries than the
# pretrained embedding matrix has rows. Grow the embeddings in that case so every
# token id the tokenizer can emit is a valid index for the model.
if len(tokenizer) > model.config.vocab_size:
    model.resize_token_embeddings(len(tokenizer))


In [45]:
# Training configuration
# Where the fine-tuned weights are written.
output_dir = "./prefix_lm_fine_tuned"

# Optimisation schedule: number of passes over the data and examples per step.
num_train_epochs = 3
per_device_train_batch_size = 2

# Maximum number of checkpoints to keep.
save_total_limit = 2

In [50]:
# Training loop
# Fine-tunes `model` on `train_dataset` with the causal language-modeling loss.
# Each dataset item is a 1-D tensor of token ids; the DataLoader stacks
# `per_device_train_batch_size` of them into a (batch, block_size) tensor.
LEARNING_RATE = 5e-5

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()  # enable dropout etc. for training

optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=per_device_train_batch_size,
    shuffle=True,
)

for epoch in range(num_train_epochs):
    epoch_loss = 0.0
    num_batches = 0

    for batch in train_loader:
        input_ids = batch.to(device)

        # Forward pass. The labels are the inputs themselves: the model shifts them
        # by one position internally, so shifting them here as well would train it
        # to predict the token two steps ahead.
        outputs = model(input_ids=input_ids, labels=input_ids)
        loss = outputs.loss

        # Backward pass and parameter update; without these the weights never change.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        num_batches += 1

    # max(..., 1) guards against an empty dataset.
    print(f"epoch {epoch + 1}/{num_train_epochs} - mean loss: {epoch_loss / max(num_batches, 1):.4f}")



In [51]:
# Save the model
model.save_pretrained(output_dir)
# Save the tokenizer next to the weights as well: its special-token setup was changed
# above, so the checkpoint directory must carry it to be reloadable on its own.
tokenizer.save_pretrained(output_dir)