In [1]:
import math
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

# excetions show up normally
%xmode plain

# plots show up inline
%matplotlib inline

def get_device(cpu_only=True):
    """
    Returns one of cuda / mps / cpu based on availablity
    """
    if cpu_only is True:
        return torch.device("cpu")
    elif torch.cuda.is_available():
        return torch.device("cuda")
    elif torch.backends.mps.is_available():
        return torch.device("mps")
    else:
        return torch.device("cpu")


# Choose the default device
cpu_only = False
default_device = get_device(cpu_only)


# Needed to import modules from src
import sys
sys.path.append('..')


# Get Project Root (needed for reading config)
import os
projectRoot = os.path.dirname(os.getcwd())
print(projectRoot)


p = lambda X,s:print(f"{s}:{X.shape}")

Exception reporting mode: Plain
e:\Karpathy-GPT


In [5]:
%%time
from transformers import GPT2LMHeadModel

# model_hf = GPT2LMHeadModel.from_pretrained("gpt2-xl")  # Original GPT2 large 1.2B param model
model_hf = GPT2LMHeadModel.from_pretrained("gpt2")

CPU times: total: 1.89 s
Wall time: 455 ms


In [6]:
sd_hf = model_hf.state_dict()

for k,v in sd_hf.items():
    print(k, v.shape)

transformer.wte.weight torch.Size([50257, 768])
transformer.wpe.weight torch.Size([1024, 768])
transformer.h.0.ln_1.weight torch.Size([768])
transformer.h.0.ln_1.bias torch.Size([768])
transformer.h.0.attn.c_attn.weight torch.Size([768, 2304])
transformer.h.0.attn.c_attn.bias torch.Size([2304])
transformer.h.0.attn.c_proj.weight torch.Size([768, 768])
transformer.h.0.attn.c_proj.bias torch.Size([768])
transformer.h.0.ln_2.weight torch.Size([768])
transformer.h.0.ln_2.bias torch.Size([768])
transformer.h.0.mlp.c_fc.weight torch.Size([768, 3072])
transformer.h.0.mlp.c_fc.bias torch.Size([3072])
transformer.h.0.mlp.c_proj.weight torch.Size([3072, 768])
transformer.h.0.mlp.c_proj.bias torch.Size([768])
transformer.h.1.ln_1.weight torch.Size([768])
transformer.h.1.ln_1.bias torch.Size([768])
transformer.h.1.attn.c_attn.weight torch.Size([768, 2304])
transformer.h.1.attn.c_attn.bias torch.Size([2304])
transformer.h.1.attn.c_proj.weight torch.Size([768, 768])
transformer.h.1.attn.c_proj.bias 

In [9]:
%%time

from transformers import pipeline, set_seed
set_seed(42)
generator = pipeline("text-generation", model='gpt2')

generator("Hello, how are you doing", max_length=30, num_return_sequences=5)

Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


CPU times: total: 2.11 s
Wall time: 1.54 s


[{'generated_text': 'Hello, how are you doing?" you asked. "It\'s really difficult to have such hard feelings."\n\nThe second night passed like a miracle'},
 {'generated_text': 'Hello, how are you doing today? And how do I respond to it?\n\nJOSHUA: Well, for those of you unfamiliar with'},
 {'generated_text': "Hello, how are you doing here? I want more people to buy the game. [00:20] <RothV> (You're"},
 {'generated_text': 'Hello, how are you doing my son?\n\n"Mm, you can go back," she replied.\n\nAnd with that she went'},
 {'generated_text': "Hello, how are you doing? You are being accepted into the student housing community. Why don't we do a survey here? Let's get you"}]