In [2]:
!pip install transformers[torch]

Defaulting to user installation because normal site-packages is not writeable
Collecting transformers[torch]
  Using cached transformers-4.39.3-py3-none-any.whl (8.8 MB)
Collecting huggingface-hub<1.0,>=0.19.3
  Using cached huggingface_hub-0.22.2-py3-none-any.whl (388 kB)
Collecting safetensors>=0.4.1
  Downloading safetensors-0.4.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 2.5 MB/s eta 0:00:01
[?25hCollecting tokenizers<0.19,>=0.14
  Downloading tokenizers-0.15.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[K     |████████████████████████████████| 3.6 MB 101.9 MB/s eta 0:00:01
Collecting accelerate>=0.21.0
  Using cached accelerate-0.29.1-py3-none-any.whl (297 kB)
Collecting torch
  Downloading torch-2.2.2-cp38-cp38-manylinux1_x86_64.whl (755.5 MB)
[K     |█████████████▏                  | 310.7 MB 159.7 MB/s eta 0:00:03

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



[K     |████████████████████████████▋   | 676.7 MB 139.6 MB/s eta 0:00:01

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



[K     |████████████████████████████████| 755.5 MB 27 kB/s /s eta 0:00:01
Collecting fsspec>=2023.5.0
  Using cached fsspec-2024.3.1-py3-none-any.whl (171 kB)
Collecting nvidia-cusparse-cu12==12.1.0.106
  Using cached nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl (196.0 MB)
Collecting nvidia-cuda-cupti-cu12==12.1.105
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cufft-cu12==11.0.2.54
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-cublas-cu12==12.1.3.1
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cusolver-cu12==11.4.5.107
  Using cached nvidia_c

In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW, get_linear_schedule_with_warmup

In [2]:
# Report the PyTorch version that the kernel imported.
if not torch.__version__:
    print("PyTorch is not installed.")
else:
    print("PyTorch is installed. Version:", torch.__version__)

PyTorch is installed. Version: 2.2.2+cu121


In [3]:

# Select the compute device: CUDA when a GPU is visible, otherwise the CPU.
if not torch.cuda.is_available():
    device = torch.device("cpu")
    print("GPU is not available, falling back to CPU")
else:
    device = torch.device("cuda")
    print("GPU is available")

# Smoke test: allocate a small random tensor on the chosen device and report where it lives.
tensor = torch.randn(3, 3).to(device)
print("Tensor device:", tensor.device)

GPU is available
Tensor device: cuda:0


In [4]:
import json
import os
import pandas as pd


In [5]:
# Use the GPU when CUDA is available; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [6]:
# Load the pre-trained "gpt2" tokenizer and language-model head.
try:
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained("gpt2")
except Exception as e:
    # Report the failure, then re-raise: every later cell needs `tokenizer` and `model`,
    # so continuing would only surface a confusing NameError further down.
    print("An error occurred:", e)
    raise
else:
    print("Tokenizer and model imported successfully.")

# Reuse the end-of-sequence token as the padding token so that padded encoding works.
tokenizer.pad_token = tokenizer.eos_token

Tokenizer and model imported successfully.


In [7]:

class HTMLDataset(Dataset):
    """
    Dataset that pairs each prompt with fixed-size character chunks of its HTML output.

    Every HTML output is split into consecutive slices of at most `max_length`
    characters; each slice becomes one sample, paired with the prompt it came from.
    Both the prompt and the slice are tokenized to exactly `max_length` token ids
    (padded or truncated), so inputs and labels always have matching shapes.

    NOTE(review): chunking is done on characters while padding/truncation is done on
    tokens, so a chunk usually fills only part of its `max_length` token budget.
    Chunking on token ids instead would change the number of samples, so it is left
    as-is here.
    """

    def __init__(self, prompts, html_outputs, max_length=1024, tokenizer=None):
        """
        Initializes the HTMLDataset class.

        Args:
        - prompts (list): List of prompts (input sequences).
        - html_outputs (list): List of HTML outputs (target sequences); entry i belongs to prompts[i].
        - max_length (int): Maximum number of characters per HTML chunk, and the token
          length every encoded prompt and chunk is padded/truncated to.
        - tokenizer: Optional tokenizer exposing `encode(...)` and `pad_token_id`.
          When omitted, the module-level `tokenizer` is looked up at encode time.

        Raises:
        ValueError: If `max_length` is not positive, or if there are more HTML outputs than prompts.

        Returns:
        None
        """
        if max_length < 1:
            raise ValueError("max_length must be a positive integer")
        if len(html_outputs) > len(prompts):
            # Such samples would otherwise fail with an IndexError on first access.
            raise ValueError("html_outputs has more entries than prompts")

        self.prompts = prompts
        self.html_chunks = []
        self.prompt_chunk_mapping = []
        self.max_length = max_length
        self._tokenizer = tokenizer

        for i, html_output in enumerate(html_outputs):
            # Split each HTML output into character slices of at most `max_length`.
            chunks = [html_output[j:j + max_length] for j in range(0, len(html_output), max_length)]
            self.html_chunks.extend(chunks)

            # Store the index of the prompt corresponding to each chunk.
            self.prompt_chunk_mapping.extend([i] * len(chunks))

    def _resolve_tokenizer(self):
        """Returns the tokenizer passed to the constructor, or the module-level `tokenizer`."""
        if self._tokenizer is not None:
            return self._tokenizer
        return tokenizer

    def _encode(self, text):
        """Encodes `text` to a 1-D tensor of exactly `self.max_length` token ids."""
        encoded = self._resolve_tokenizer().encode(
            text,
            return_tensors="pt",
            padding="max_length",
            truncation=True,  # a character chunk may tokenize to more than max_length ids
            max_length=self.max_length,
        )
        # `return_tensors="pt"` yields shape (1, seq_len); drop the leading axis so that
        # `pad_sequence` in `collate_fn` pads along the sequence dimension.
        return encoded.squeeze(0)

    def __len__(self):
        """
        Returns the total number of samples (HTML chunks) in the dataset.

        Returns:
        int: Total number of samples
        """
        return len(self.html_chunks)

    def __getitem__(self, idx):
        """
        Returns a prompt and its corresponding HTML chunk for the given index.

        Args:
            idx (int): The index of the item to retrieve.

        Returns:
            tuple: (prompt tensor, HTML chunk tensor), each 1-D with `max_length` token ids.
        """
        prompt_idx = self.prompt_chunk_mapping[idx]
        prompt = self._encode(self.prompts[prompt_idx])
        html_chunk = self._encode(self.html_chunks[idx])
        return prompt, html_chunk

    def collate_fn(self, batch):
        """
        Collates a batch of data samples into a format suitable for the model.

        Args:
        - batch (list): List of samples, where each sample is a tuple containing the prompt and HTML chunk.

        Returns:
            tuple: (prompts, HTML chunks), each stacked to shape (batch_size, seq_len).
        """
        prompts, html_outputs = zip(*batch)
        pad_id = self._resolve_tokenizer().pad_token_id

        prompts = pad_sequence(prompts, batch_first=True, padding_value=pad_id)
        html_outputs = pad_sequence(html_outputs, batch_first=True, padding_value=pad_id)

        return prompts, html_outputs

In [8]:
# Placeholders, filled once the JSON file has been read:
#   prompts      - input prompts
#   html_outputs - HTML output that corresponds to each prompt
prompts, html_outputs = [], []

In [9]:
# Path (relative to the notebook's working directory) of the JSON file holding the training pairs.
file1 = "files/105000.json"

In [10]:
# Read the JSON file into a pandas DataFrame, then copy its 'prompt' and 'output'
# columns into plain Python lists.
# The encoding is passed explicitly so the result does not depend on the machine's
# locale; this assumes the file is UTF-8 (the usual JSON encoding) — TODO confirm.
with open(file1, 'r', encoding='utf-8') as f:
    data = pd.read_json(f)

prompts = data['prompt'].tolist()
html_outputs = data['output'].tolist()

In [17]:
# Peek at the final prompt in the list.
prompts[-1]

'create a Arts website for Rick Astley Headquarters Intro'

In [12]:
# Number of prompts currently held in the `prompts` list.
len(prompts)

500

In [11]:
# Keep at most the first 500 prompt/output pairs for fine-tuning, then show how many remain.
prompts, html_outputs = prompts[:500], html_outputs[:500]
len(html_outputs)

In [15]:
# Character length of the longest entry in `html_outputs`.
max(map(len, html_outputs))

1563941

In [14]:
# Build the chunked prompt/HTML dataset from the two parallel lists.
dataset = HTMLDataset(prompts=prompts, html_outputs=html_outputs)

In [15]:
# The batch size for the dataloader
batch_size = 3

# Create a DataLoader object to load data in batches:
#  - `dataset`: the dataset to draw samples from
#  - `batch_size`: the number of samples in each batch
#  - `shuffle=True`: visit the samples in a new random order every epoch, so that
#    consecutive chunks of the same HTML document do not always share a batch
#  - `collate_fn=dataset.collate_fn`: the HTMLDataset method that combines the
#    individual (prompt, HTML chunk) samples into one padded batch
dataloader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=dataset.collate_fn,
)


In [17]:
# Sanity check: show the class of the `dataset` object.
type(dataset)

__main__.HTMLDataset

In [16]:
# Number of full passes over the dataloader during training; also used to
# compute the scheduler's total step count.
num_epochs = 2

In [17]:
# Optimizer for updating the model parameters.
#  - torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and weight_decay
#    are set explicitly to the old class's defaults so only the learning rate changes.
#  - lr=0.01 is orders of magnitude too high for fine-tuning a pre-trained transformer
#    and tends to destroy the pre-trained weights; 5e-5 is the usual starting point.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Total number of optimizer steps: batches per epoch times the number of epochs.
total_steps = len(dataloader) * num_epochs

# Linear learning-rate schedule. With num_warmup_steps=0 there is no warm-up phase:
# the learning rate starts at `lr` and decays linearly to 0 over `total_steps` steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)



In [18]:
# Move the model to the selected device (CPU or GPU) and switch it to training mode.
model.to(device)
model.train()

# NOTE(review): GPT2LMHeadModel shifts `labels` against its own input positions, so the
# labels are normally the input ids themselves. Here the prompt tokens are the input and
# the HTML tokens are the labels, which pairs prompt position i with HTML position i+1.
# Confirm this is intended; the usual setup feeds "prompt + HTML" as one sequence.

for epoch in range(num_epochs):
    print('epoch: ', epoch)
    count = 0  # number of batches processed in this epoch (stays 0 for an empty dataloader)

    for count, batch in enumerate(dataloader, start=1):
        # Loop-local names: the global `prompts` / `html_outputs` lists must not be
        # rebound to tensors, otherwise the dataset cells cannot be re-run.
        prompt_ids, label_ids = batch

        # Move the batch to the same device as the model.
        prompt_ids = prompt_ids.to(device)
        label_ids = label_ids.to(device)

        # Forward pass; passing `labels` makes the model return the language-modelling loss.
        outputs = model(prompt_ids, labels=label_ids)
        loss = outputs.loss

        # Backward pass, parameter update, learning-rate update, then reset the gradients
        # so they do not accumulate into the next batch.
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    print(count)

# Save the fine-tuned weights and config in the Hugging Face format.
model.save_pretrained("gpt2-html-generator")

# Also pickle the whole model object. This does NOT include the optimizer or scheduler
# state; save their state_dict() separately if training needs to be resumed.
torch.save(model, 'entire_model.pt')

epoch:  0


KeyboardInterrupt: 

In [102]:
# memory_summary() returns a multi-line report string; print it so the table is rendered
# with real line breaks instead of the escaped repr a bare expression would display.
print(torch.cuda.memory_summary(device=None, abbreviated=False))




In [101]:
# Ask PyTorch to release the CUDA memory it is caching (see the torch.cuda.empty_cache docs);
# presumably run here to free GPU memory after the interrupted training run.
torch.cuda.empty_cache()
