In [1]:
import numpy as np
import torch
import torch.nn as nn
import gpt_model
import train_model
import tiktoken
import load_pretrained_weights
from gpt_download import download_and_load_gpt2
# Download the GPT-2 "124M" checkpoint files into gpt2/124M/ (files already present are reused,
# per the "already exists and is up-to-date" messages in the recorded output).
# `settings` is the hyperparameter dict and `params` the weight dict
# (keys: blocks, b, g, wpe, wte), as shown by the inspection cell that follows.
settings, params = download_and_load_gpt2(model_size="124M", models_dir="gpt2")

checkpoint: 100%|██████████| 77.0/77.0 [00:00<00:00, 24.1kiB/s]
encoder.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 3.00MiB/s]
hparams.json: 100%|██████████| 90.0/90.0 [00:00<00:00, 68.9kiB/s]
model.ckpt.data-00000-of-00001: 100%|██████████| 498M/498M [00:17<00:00, 28.7MiB/s] 
model.ckpt.index: 100%|██████████| 5.21k/5.21k [00:00<00:00, 1.05MiB/s]
model.ckpt.meta: 100%|██████████| 471k/471k [00:00<00:00, 1.73MiB/s]
vocab.bpe: 100%|██████████| 456k/456k [00:00<00:00, 1.74MiB/s]


Output:
 Every effort moves you toward finding an ideal new way to practice something!

What makes us want to be on top of that?


File already exists and is up-to-date: gpt2/124M/checkpoint
File already exists and is up-to-date: gpt2/124M/encoder.json
File already exists and is up-to-date: gpt2/124M/hparams.json
File already exists and is up-to-date: gpt2/124M/model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2/124M/model.ckpt.index
File already exists and is up-to-date: gpt2/124M/model.ckpt.meta
File already exists and is up-to-date: gpt2/124M/vocab.bpe


In [2]:
# Inspect what the checkpoint loader returned before wiring it into the model.
# `settings` carries the architecture hyperparameters (n_vocab, n_ctx, n_embd, n_head, n_layer).
print("Settings: ", settings)
print()
# Top-level entries of the weight dictionary.
print("Params keys: ", params.keys())
# "wte" is the token-embedding matrix; the array repr is abbreviated with "..." in the output.
print("Params token embedding weights: ", params["wte"])
# Recorded shape is (50257, 768), i.e. (n_vocab, n_embd) from `settings`.
print("Token embedding weights shape: ", params["wte"].shape)

Settings:  {'n_vocab': 50257, 'n_ctx': 1024, 'n_embd': 768, 'n_head': 12, 'n_layer': 12}

Params keys:  dict_keys(['blocks', 'b', 'g', 'wpe', 'wte'])
Params token embedding weights:  [[-0.11010301 -0.03926672  0.03310751 ... -0.1363697   0.01506208
   0.04531523]
 [ 0.04034033 -0.04861503  0.04624869 ...  0.08605453  0.00253983
   0.04318958]
 [-0.12746179  0.04793796  0.18410145 ...  0.08991534 -0.12972379
  -0.08785918]
 ...
 [-0.04453601 -0.05483596  0.01225674 ...  0.10435229  0.09783269
  -0.06952604]
 [ 0.1860082   0.01665728  0.04611587 ... -0.09625227  0.07847701
  -0.02245961]
 [ 0.05135201 -0.02768905  0.0499369  ...  0.00704835  0.15519823
   0.12067825]]
Token embedding weights shape:  (50257, 768)


In [3]:
# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
# NOTE(review): `device` is not referenced again in the visible cells, so the model and
# inputs stay wherever they were created — confirm whether a `.to(device)` is intended.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = tiktoken.get_encoding("gpt2")

# Take the architecture from the downloaded checkpoint's hyperparameters instead of
# repeating the numbers by hand, so the model cannot drift out of sync with the
# weights that are loaded into it later in the notebook.
gpt = gpt_model.GPTModel(
    vocab_size=settings["n_vocab"],
    context_length=settings["n_ctx"],
    emb_dim=settings["n_embd"],
    num_heads=settings["n_head"],
    num_layers=settings["n_layer"],
    drop_rate=0.0,   # every Dropout layer in the repr below shows p=0.0
    qkv_bias=True,   # query/key/value projections carry bias terms (bias=True in the repr)
)
# Switch to inference mode; as the last expression this also displays the module tree.
gpt.eval()

GPTModel(
  (token_embedding): Embedding(50257, 768)
  (position_embedding): Embedding(1024, 768)
  (dropout_embedding): Dropout(p=0.0, inplace=False)
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (attention): Attention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.0, inplace=False)
    )
    (1): TransformerBlock(
      (attention): Attention(
        (

In [4]:
import pandas as pd
# Relative path to the SMS Spam Collection file (resolved against the working directory).
data_file_path = "SMSSpamCollection"
# The file is tab-separated and has no header row, so the two column names are supplied here.
# NOTE(review): with pandas' default quoting, an unbalanced double quote inside a message can
# merge several lines into a single field. Consider quoting=csv.QUOTE_NONE and compare the
# resulting row count with the dataset's documented size — TODO confirm.
df = pd.read_csv(data_file_path, sep="\t", header=None, names=["Label", "Text"])

In [5]:
# Class distribution of the raw data; the recorded output shows far more "ham" than "spam".
df["Label"].value_counts()

Label
ham     4825
spam     747
Name: count, dtype: int64

In [8]:
def create_balanced_dataset(df, random_state=123):
    """Return a frame with equally many "ham" and "spam" rows.

    The larger of the two classes is randomly down-sampled to the size of the
    smaller one. The result lists the ham rows first, followed by the spam rows;
    the original index labels are kept.

    Args:
        df: DataFrame with a "Label" column containing "ham" / "spam".
        random_state: Seed forwarded to ``DataFrame.sample`` so the selection is
            reproducible. Defaults to 123, the value previously hard-coded.

    Returns:
        A new DataFrame containing the balanced subset.
    """
    # Filter each class exactly once.
    spam_rows = df[df["Label"] == "spam"]
    ham_rows = df[df["Label"] == "ham"]

    # Down-sample whichever class is larger. When ham is at least as large as spam this is
    # the original behaviour (ham sampled, spam left in file order); the else branch covers
    # the case where ham is the minority, which used to raise ValueError inside sample().
    if len(ham_rows) >= len(spam_rows):
        ham_rows = ham_rows.sample(len(spam_rows), random_state=random_state)
    else:
        spam_rows = spam_rows.sample(len(ham_rows), random_state=random_state)

    return pd.concat([ham_rows, spam_rows])

# Build the balanced frame and confirm both classes now have the same number of rows.
balanced_df = create_balanced_dataset(df)
print(balanced_df["Label"].value_counts())

Label
ham     747
spam    747
Name: count, dtype: int64


In [9]:
# split dataset: 70% for training, 10% for validation, 20% for testing

def random_split(df, train_frac, validation_frac):
    """Shuffle ``df`` and cut it into train, validation and test partitions.

    Rows are shuffled with a fixed seed (123) and re-indexed from 0. The first
    ``int(len(df) * train_frac)`` rows form the training set, the next
    ``int(len(df) * validation_frac)`` rows the validation set, and all
    remaining rows the test set.

    Returns:
        Tuple ``(train_df, validation_df, test_df)``.
    """
    shuffled = df.sample(frac=1, random_state=123).reset_index(drop=True)
    total_rows = len(shuffled)

    # Positional cut points; the test partition takes whatever is left over.
    first_cut = int(total_rows * train_frac)
    second_cut = first_cut + int(total_rows * validation_frac)

    return (
        shuffled.iloc[:first_cut],
        shuffled.iloc[first_cut:second_cut],
        shuffled.iloc[second_cut:],
    )

# 70% of the balanced rows go to training and 10% to validation; the remainder (about 20%) is the test split.
train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1)

In [13]:
# Persist each split next to the notebook so the dataset class can load it from disk.
# index=False states the intent explicitly (the `index` parameter is a bool); the row index
# is just 0..n-1 after the shuffle/reset and is not needed in the files.
train_df.to_csv("train.csv", index=False)
validation_df.to_csv("validation.csv", index=False)
test_df.to_csv("test.csv", index=False)

In [14]:
import spam_dataset

# Training set: max_length is left as None here, yet train_dataset.max_length prints as 120
# below — presumably SpamDataset derives it from the longest training text; confirm in spam_dataset.
train_dataset = spam_dataset.SpamDataset(
    csv_file="train.csv",
    max_length=None,
    tokenizer=tokenizer
)

# Validation and test sets reuse the training set's max_length so all three share one length setting.
val_dataset = spam_dataset.SpamDataset(
    csv_file="validation.csv",
    max_length=train_dataset.max_length,
    tokenizer=tokenizer
)

test_dataset = spam_dataset.SpamDataset(
    csv_file="test.csv",
    max_length=train_dataset.max_length,
    tokenizer=tokenizer
)

# Show the length value that the validation and test sets inherited.
print(train_dataset.max_length)

120


In [None]:
from torch.utils.data import DataLoader
# num_workers=0 keeps data loading in the main process.
num_workers = 0
batch_size = 8
# Seed PyTorch's global RNG so the shuffling of the training loader is repeatable.
torch.manual_seed(123)

# Training loader: shuffled, and the final incomplete batch is dropped so every batch has batch_size items.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    drop_last=True
)

# Validation loader: fixed order, keeps the last (possibly smaller) batch so no sample is skipped.
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False
)

# Test loader: same settings as the validation loader.
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False
)

# Disabled sanity check: iterate the training loader once and report the shape of the last batch.
# for input_batch, target_batch in train_loader:
#     pass

# print("Input batch dimensions: ", input_batch.shape)
# print("Label batch dimensions: ", target_batch.shape)
print()
# len() of a DataLoader is its number of batches.
print(f"{len(train_loader)} training batches")
print(f"{len(val_loader)} validation batches")
print(f"{len(test_loader)} testing batches")

130 training batches
19 validation batches
38 testing batches


In [23]:
import train_model
# Presumably copies the downloaded GPT-2 arrays in `params` into `gpt`'s parameters — confirm in load_pretrained_weights.
load_pretrained_weights.load_weights_into_gpt(gpt, params)
# Keep the model in inference mode for generation.
gpt.eval()

# Smoke test: continue a short prompt with the pretrained weights.
test_text = "Every effort moves you"
# NOTE(review): the recorded run of this cell ended with "NameError: name 'probas' is not defined".
# `probas` is not referenced anywhere in this cell, so the failure presumably originates inside
# train_model.generate — confirm and fix it there.
token_ids = train_model.generate(
    model=gpt,
    index=train_model.text_to_token_ids(test_text, tokenizer),
    max_new_tokens=15,
    context_size=1024
)

print(train_model.token_ids_to_text(token_ids, tokenizer))

NameError: name 'probas' is not defined