In [2]:
# here we have 2 models to talk each other after fine tuning


In [1]:
import numpy as np
import matplotlib.pyplot as plt


import torch
import torch.nn as nn
import torch.nn.functional as F

import textwrap

from transformers import AutoModelForCausalLM, AutoTokenizer
from torchinfo import summary
import requests

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Eletuther's tokenizer
tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neo-125m')
tokenizer.pad_token_id = tokenizer.encode(' ')[0]

# load in 2 GPTneos and push to GPU
modelAlice = AutoModelForCausalLM.from_pretrained('EleutherAI/gpt-neo-125m')
modelEdgar = AutoModelForCausalLM.from_pretrained('EleutherAI/gpt-neo-125m')

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
modelAlice = modelAlice.to(device)
modelEdgar = modelEdgar.to(device)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Loading weights: 100%|█| 160/160 [00:00<00:00, 1579.92it/s, Materializing param=
[1mGPTNeoForCausalLM LOAD REPORT[0m from: EleutherAI/gpt-neo-125m
Key                                                   | Status     |  | 
------------------------------------------------------+------------+--+-
transformer.h.{0, 2, 4, 6, 8, 10}.attn.attention.bias | UNEXPECTED |  | 
transformer.h.{0...11}.attn.attention.masked_bias     | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
Loading weights: 100%|█| 160/160 [00:00<00:00, 1924.12it/s, Materializing param=
[1mGPTNeoForCausalLM LOAD REPORT[0m from: EleutherAI/gpt-neo

In [3]:
# throught the looking glass (aka alice in wonderland)
text = requests.get('https://www.gutenberg.org/cache/epub/11/pg11.txt').text
aliceTokens = torch.tensor(tokenizer.encode(text), dtype=torch.long)

# edgar allan Poe
text = requests.get('https://www.gutenberg.org/cache/epub/2148/pg2148.txt').text
edgarTokens = torch.tensor(tokenizer.encode(text), dtype=torch.long)

Token indices sequence length is longer than the specified maximum sequence length for this model (52954 > 2048). Running this sequence through the model will result in indexing errors


In [4]:
# ALICE optmizer
optimizerAlice = torch.optim.AdamW(modelAlice.parameters(), lr=5e-5, weight_decay=.01)
optimizerEdgar = torch.optim.AdamW(modelEdgar.parameters(), lr=5e-5, weight_decay=.01)

In [5]:
seq_len = 256
batch_size = 16
num_samples = 100


In [6]:
tokenProbs = np.zeros((num_samples,3))

lossAlice = np.zeros(num_samples)
lossEdgar = np.zeros(num_samples)

for sampli in range(num_samples):
    # init batch losses to accumulate

    # ALICE fine tuning
    # get a batch of data
    ix = torch.randint(len(aliceTokens)-seq_len, size = (batch_size,))
    X = aliceTokens[ix[:,None] + torch.arange(seq_len)].to(device)

    #fwd pass and get loss
    modelAlice.zero_grad()
    outputs = modelAlice(X, labels=X)

    # backprop and store loss
    outputs.loss.backward()
    optimizerAlice.step()
    lossAlice[sampli] = outputs.loss.item()


    #EDGAR fine tuning
    ix = torch.randint(len(edgarTokens)-seq_len, size = (batch_size,))
    X = edgarTokens[ix[:,None] + torch.arange(seq_len)].to(device)

    #fwd pass and get loss
    modelEdgar.zero_grad()
    outputs = modelEdgar(X, labels=X)

    # backprop and store loss
    outputs.loss.backward()
    optimizerAlice.step()
    lossEdgar[sampli] = outputs.loss.item()

    if sampli%25==0:
        print(f'Sample: {sampli}/{num_samples}, losses (Alice/eDgar): {lossAlice[sampli]} / {lossEdgar[sampli]}')

Sample: 0/100, losses (Alice/eDgar): 2.551490068435669 / 2.704116106033325
Sample: 25/100, losses (Alice/eDgar): 1.97757887840271 / 2.6722307205200195
Sample: 50/100, losses (Alice/eDgar): 1.627395749092102 / 2.6514389514923096
Sample: 75/100, losses (Alice/eDgar): 1.4499162435531616 / 2.6057238578796387


Have the models chat with each other

In [7]:
# kick off the convo
outAlice = tokenizer.encode('Hello, my name is Alice.', return_tensors = 'pt').to(device)
print('\n\n** Alice says:\n', tokenizer.decode(outAlice[0].cpu()))

for _ in range(5):
    #Edgars turn
    outEdgar = modelEdgar.generate(outAlice, max_new_tokens = 50, do_sample=True, pad_token_id=50257)
    print(f'\n\n** Edgar says (total token count: {len(outEdgar[0])}):\n',
        tokenizer.decode(outEdgar[0][len(outAlice[0]):].cpu()))

    # alice turn
    outAlice = modelAlice.generate(outEdgar, max_new_tokens = 50, do_sample=True, pad_token_id=50257)
    print(f'\n\n** Alice says (total token count: {len(outAlice[0])}):\n',
        tokenizer.decode(outAlice[0][len(outEdgar[0]):].cpu()))



** Alice says:
 Hello, my name is Alice.


** Edgar says (total token count: 57):
  I'm a small boy who can read the signs and write messages. I'm a very busy man, living up to 20 years old. However, I am currently living in a home. I'm not quite that interested in playing games, nor do


** Alice says (total token count: 107):
  I want to draw a picture, so I sit and listen.




CHAPTER IX.
The Mock Turtle

Alice thought most of the day after dark—after work, in fact—


** Edgar says (total token count: 157):
 had started. Her brother, Bob, had been with her ever since they grew up. He'd always been very, very interested in what she could do to help him and she'd seen him as a different person to him and this made him change


** Alice says (total token count: 207):
  all the time: he'd always been a small fellow, with a large goatee and brown eyes, and he wore his tie the right size for all that makes him look so small and round; and it always struck her that Alice had always


** 