In [6]:
from datasets import load_dataset
import torch
import os, sys, json
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW
from torch.utils.data import Dataset, DataLoader, random_split
import tqdm.notebook as tqdm
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm

# Load the dataset
# dataset = load_dataset("declip/Minecraft-Server-Chat")
# print(dataset)

# # Use a small portion of the dataset for testing
# subset = dataset['train'].select(range(2000))

# # Concatenate username and message
# subset = subset.map(lambda x: {'input_text': x['username'] + ': ' + x['content']})

# Actually, we're gonna use a local dataset for this example
# it's a text file with already concatenated username and message, all we need to do is split it by newline
dataset = load_dataset('text', data_files='data/minecraft_chat.txt')
subset = dataset['train']
subset = subset.map(lambda x: {'input_text': x['text']})

# Tokenization
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
subset = subset.map(lambda x: {'input_ids': tokenizer.encode(x['input_text'])}, remove_columns=['input_text'])

# Custom Dataset
class MinecraftChatDataset(Dataset):
    def __init__(self, data):
        self.data = data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return torch.tensor(self.data[idx]['input_ids'], dtype=torch.long)

# Create dataset and split into train and validation
dataset = MinecraftChatDataset(subset)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Set pad_token_id to eos_token_id
tokenizer.pad_token = tokenizer.eos_token

def collate_fn(batch):
    input_ids = pad_sequence(batch, batch_first=True, padding_value=tokenizer.pad_token_id)
    return input_ids, input_ids

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

# Print sample from train_loader
for batch in train_loader:
    print(batch)
    break

# Load pre-trained GPT-2 model
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.train()

# Training parameters
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
for epoch in range(3):  # Adjust the number of epochs as needed
    for batch in (bar := tqdm(train_loader)):
        optimizer.zero_grad()
        inputs, labels = batch
        outputs = model(inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        bar.set_description(f"Epoch: {epoch}, Loss: {loss.item()}")

    # Validation step (optional)
    model.eval()
    with torch.no_grad():
        val_loss = 0
        for batch in val_loader:
            inputs, labels = batch
            outputs = model(inputs, labels=labels)
            val_loss += outputs.loss.item()
        val_loss /= len(val_loader)
        print(f"Validation Loss: {val_loss}")
    model.train()


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

(tensor([[   47,   405, 35435,    25,   545,  1682,   319, 50256, 50256, 50256,
         50256, 50256, 50256],
        [   47, 35916,  6420,    25,   645, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256],
        [   47,    23,  2548,  5332,    25,   326,   373, 30806,   508,  2840,
         50256, 50256, 50256],
        [   47, 41583,  2154,    25,  1011,   326, 50256, 50256, 50256, 50256,
         50256, 50256, 50256],
        [   47,  2931,    23,  4869,    25,   582,    68,  1521,   673,   466,
           326, 50256, 50256],
        [   47,  4524, 48564,    25,  1494,   502,   611,   345,   765,  1312,
           760,   345,   481],
        [   47,  3388, 23815,    25,  1521, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256],
        [   47,  1731,  4089,    19,    25,   334,  4656,   257,  1256, 50256,
         50256, 50256, 50256]]), tensor([[   47,   405, 35435,    25,   545,  1682,   319, 50256, 50256, 50256,
         50256, 50256, 50256],
       

Epoch: 0, Loss: 2.164546489715576: 100%|██████████| 1000/1000 [19:34<00:00,  1.17s/it]


Validation Loss: 2.0488290493488313


Epoch: 1, Loss: 2.5219545364379883:  50%|████▉     | 497/1000 [09:46<09:53,  1.18s/it]


KeyboardInterrupt: 

In [27]:
tokenizer.encode("Hello, world!")

[15496, 11, 995, 0]

In [12]:
from nanoid import generate
import string

username_map = {}
def get_anon_username(username):
    if username not in username_map:
        username_map[username] = f"P{generate(string.digits, 5)}"
    return username_map[username]

In [25]:
# Save the fine-tuned model
save_path = "data/fine_tuned_gpt2_minecraft"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

# Function to generate responses
def generate_response(input_text):
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    output_ids = model.generate(input_ids, max_new_tokens=5, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id, num_beams=5, do_sample=True, top_k=50, top_p=0.95)
    response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    print(f"${output_ids=}\n{response=}$")
    return response

# Example usage
response = generate_response(f"{get_anon_username('Notch')}: Hello!\nfdsdffdsdsf sdf sDf43t4 3t4[]")
print(response)

$output_ids=tensor([[   47,    23,  1433,  1270,    25, 18435,     0,   198,    69,  9310,
            67,   487,  9310,  9310,    69,   264,  7568,   264,    35,    69,
          3559,    83,    19,   513,    83,    19, 21737, 50256]])
response='P81630: Hello!\nfdsdffdsdsf sdf sDf43t4 3t4[]'$
P81630: Hello!
fdsdffdsdsf sdf sDf43t4 3t4[]
