In [1]:
!pip install transformers torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import urllib.request

# Load the dataset
url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
response = urllib.request.urlopen(url)
data = response.read()
text = data.decode('utf-8')

In [3]:
from transformers import GPT2Tokenizer
import torch

# Initialize the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Encode the text
encoded_text = tokenizer.encode(text)

# Create input sequences
input_sequences = []
for i in range(0, len(encoded_text) - 1024, 1024):
    input_sequences.append(encoded_text[i:i+1024])

# Pad the sequences
max_length = max([len(seq) for seq in input_sequences])
padded_sequences = []
for seq in input_sequences:
    padded_sequences.append(seq + [tokenizer.pad_token_id] * (max_length - len(seq)))

# Convert to PyTorch tensors
input_ids = torch.tensor(padded_sequences)

Token indices sequence length is longer than the specified maximum sequence length for this model (338025 > 1024). Running this sequence through the model will result in indexing errors


In [4]:
!apt-get install -y build-essential libffi-dev

Reading package lists... Done
Building dependency tree       
Reading state information... Done
libffi-dev is already the newest version (3.3-4).
build-essential is already the newest version (12.8ubuntu1.1).
0 upgraded, 0 newly installed, 0 to remove and 38 not upgraded.


In [5]:
import torch
from transformers import GPT2LMHeadModel, GPT2Config
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.nn.utils.rnn import pad_sequence



# Initialize the model
model_config = GPT2Config.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2', config=model_config)

# Convert the model to the desired device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Fine-tune the model
optimizer = Adam(model.parameters(), lr=5e-5, weight_decay=0.001)
scheduler = CosineAnnealingLR(optimizer, T_max=25)  
batch_size = 8  
gradient_accumulation_steps = 8  
max_sequence_length = 256  

model.train()

for epoch in range(25):  
    total_loss = 0.0
    total_batches = 0
    accumulated_batches = 0

    print(f"Starting epoch {epoch}...")

    for i in range(0, len(input_ids), batch_size):
        batch_input_ids = input_ids[i:i+batch_size]

        # Truncate or split the input to the maximum sequence length
        batch_input_ids = [ids[:max_sequence_length] for ids in batch_input_ids]

        optimizer.zero_grad()

        # Convert the batch_input_ids to tensor and move to device
        batch_input_ids = pad_sequence([torch.tensor(ids) for ids in batch_input_ids], batch_first=True).to(device)

        outputs = model(input_ids=batch_input_ids, labels=batch_input_ids)
        loss = outputs.loss
        loss = loss / gradient_accumulation_steps  # Scale the loss
        loss.backward()

        accumulated_batches += 1
        total_loss += loss.item()

        if accumulated_batches == gradient_accumulation_steps:
            optimizer.step()
            scheduler.step()
            accumulated_batches = 0

        total_batches += 1

        if total_batches % 100 == 0:
            average_loss = total_loss / total_batches
            print(f'Epoch {epoch}, Batch {total_batches}, Average Loss {average_loss:.4f}')

   
    if accumulated_batches > 0:
        optimizer.step()
        scheduler.step()

    average_loss = total_loss / total_batches
    print(f'Epoch {epoch}, Average Loss {average_loss:.4f}')

    # Clear cache to free up GPU memory
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    print(f"Finished epoch {epoch}.")

# Save the further fine-tuned model
model.save_pretrained('/gpt2-shakespeare')
tokenizer.save_pretrained('./gpt2-shakespeare')


Starting epoch 0...


  batch_input_ids = pad_sequence([torch.tensor(ids) for ids in batch_input_ids], batch_first=True).to(device)


Epoch 0, Average Loss 0.5378
Finished epoch 0.
Starting epoch 1...
Epoch 1, Average Loss 0.5106
Finished epoch 1.
Starting epoch 2...
Epoch 2, Average Loss 0.5054
Finished epoch 2.
Starting epoch 3...
Epoch 3, Average Loss 0.5028
Finished epoch 3.
Starting epoch 4...
Epoch 4, Average Loss 0.5025
Finished epoch 4.
Starting epoch 5...
Epoch 5, Average Loss 0.5022
Finished epoch 5.
Starting epoch 6...
Epoch 6, Average Loss 0.5010
Finished epoch 6.
Starting epoch 7...
Epoch 7, Average Loss 0.4986
Finished epoch 7.
Starting epoch 8...
Epoch 8, Average Loss 0.4951
Finished epoch 8.
Starting epoch 9...
Epoch 9, Average Loss 0.4940
Finished epoch 9.
Starting epoch 10...
Epoch 10, Average Loss 0.4929
Finished epoch 10.
Starting epoch 11...
Epoch 11, Average Loss 0.4919
Finished epoch 11.
Starting epoch 12...
Epoch 12, Average Loss 0.4921
Finished epoch 12.
Starting epoch 13...
Epoch 13, Average Loss 0.4921
Finished epoch 13.
Starting epoch 14...
Epoch 14, Average Loss 0.4918
Finished epoch 14.


('./gpt2-shakespeare12/tokenizer_config.json',
 './gpt2-shakespeare12/special_tokens_map.json',
 './gpt2-shakespeare12/vocab.json',
 './gpt2-shakespeare12/merges.txt',
 './gpt2-shakespeare12/added_tokens.json')

In [8]:
# TO BE OR NOT TO BE(TEST!!!!!!)
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer


model_path = './gpt2-shakespeare'
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)


model.eval()


prompt = "juliet"
input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)

output = model.generate(input_ids, max_length=100, num_return_sequences=1)

generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


julét, and the king's son, and the queen's daughter, and the queen's daughter, and the queen's daughter, and the queen's daughter, and the queen's daughter, and the queen's daughter, and the queen's daughter, and the queen's daughter, and the queen's daughter, and the queen's daughter, and the queen's daughter, and the queen's daughter, and the queen's daughter, and the queen's daughter, and the queen's daughter,


In [28]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
import shutil

source_path = '/content/gpt2-shakespeare12'  # Replace with the actual path of the zip file
destination_path = '/content/drive/MyDrive/my_directory12.zip'  # Replace with the desired destination path in your Google Drive

shutil.move(source_path, destination_path)

'/content/drive/MyDrive/my_directory12.zip'