In [1]:
from transformers import GPT2Tokenizer, GPT2Model
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pretrained GPT-2 tokenizer and base model, then put the model in
# evaluation mode since this notebook only runs inference with it.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')
model.eval()

# Tokenize one sample sentence into PyTorch tensors.
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, truncation=True, return_tensors='pt')

# No gradients are needed for a plain forward pass.
with torch.no_grad():
    output = model(**encoded_input)

# One hidden vector per input token; for this sentence the shape printed
# below is (1, 10, 768).
token_embeddings = output.last_hidden_state
print(token_embeddings.shape, token_embeddings, sep="\n")


torch.Size([1, 10, 768])
tensor([[[ 0.1629, -0.2166, -0.1410,  ..., -0.2619, -0.0819,  0.0092],
         [ 0.4628,  0.0248, -0.0785,  ..., -0.0859,  0.5122, -0.3939],
         [-0.0644,  0.1551, -0.6306,  ...,  0.2488,  0.3691,  0.0833],
         ...,
         [-0.5591, -0.4490, -1.4540,  ...,  0.1650, -0.1302, -0.3740],
         [ 0.1400, -0.3875, -0.7916,  ..., -0.1780,  0.1824,  0.2185],
         [ 0.1721, -0.2420, -0.1124,  ..., -0.1068,  0.1205, -0.3213]]])


In [3]:
# Mean-pool over the token axis (dim=1) and drop the singleton batch axis to
# get a single vector for the whole sentence.
# Pool from `output.last_hidden_state` instead of from `token_embeddings`
# itself so the cell is idempotent: re-running a self-assignment would call
# mean(dim=1) on an already 1-D tensor and raise.
# The name `token_embeddings` is kept because the following cell displays it.
token_embeddings = output.last_hidden_state.mean(dim=1).squeeze()
token_embeddings.shape

torch.Size([768])

In [4]:
# Display the pooled sentence vector produced by the previous cell.
token_embeddings

tensor([-1.4419e-01, -6.5792e-02, -6.6249e-01,  8.4424e-02, -6.6759e-03,
         1.5365e-01,  3.7242e+00, -2.5630e-01,  2.9933e-02, -1.7607e-01,
         1.7791e-01, -2.1023e-01, -2.4490e-01,  5.7506e-02, -2.2517e-01,
        -2.3024e-01, -7.9503e-03, -3.9460e-01,  3.9133e-01,  1.4674e-02,
        -1.0418e-02, -1.7368e-01, -1.5253e-01,  4.1727e-02,  1.0535e-01,
         1.0266e-02, -5.3535e-01, -6.1404e-02,  2.4219e-01,  2.6487e-01,
        -9.8582e-02, -8.7298e-02, -1.1706e-01, -4.0085e-01, -2.9286e-01,
        -4.3727e-01,  6.1490e+01,  2.2263e-01,  1.6192e-01,  3.4314e-01,
        -3.0768e-01,  2.0968e-01,  1.7426e-01, -1.6834e-01, -3.3847e-02,
        -2.6481e-01, -9.5405e-02, -5.4566e-01, -1.0857e-01,  9.8272e-01,
         1.0347e-01,  2.5673e-01, -1.5698e-01,  1.5115e-01,  1.9587e-02,
         4.2705e-01,  3.4368e-02,  4.2566e-02,  3.0398e-02, -1.1355e-01,
         1.5174e-01,  7.8775e-02,  6.1441e-02, -1.0152e-01, -1.0989e+00,
         9.3133e-02,  2.1198e-01, -4.3320e-01, -3.6

In [5]:
# Pick the best available backend instead of hard-coding 'mps', so the
# notebook also runs on CUDA machines and on CPU-only machines.
device = (
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Reuse the end-of-sequence token as the padding token so that batches of
# different-length sentences can be padded.
tokenizer.pad_token = tokenizer.eos_token

model = GPT2Model.from_pretrained('gpt2').to(device)
model.eval()

def get_embedding_tensor(text):
    """Embed text with GPT-2 by mean-pooling the final hidden states.

    Args:
        text: A single string, or a list/tuple of strings (one batch).

    Returns:
        A tensor on the model's device. Shape is (hidden,) when `text` is a
        single string and (batch, hidden) when it is a list/tuple of strings,
        including a list that holds just one string.
    """
    # Follow whatever device the model lives on rather than assuming 'mps'.
    encoded_input = tokenizer(
        text, return_tensors='pt', padding=True, truncation=True
    ).to(model.device)

    with torch.no_grad():
        output = model(**encoded_input)
    hidden_states = output.last_hidden_state  # (batch, seq, hidden)

    # Average over real tokens only: positions introduced by padding=True have
    # attention_mask == 0 and must not dilute the sentence vector.
    mask = encoded_input['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)  # guard against an all-padding row
    sentence_embeddings = (hidden_states * mask).sum(dim=1) / token_counts

    # Only drop the batch axis for a bare string. A blanket .squeeze() would
    # also flatten a one-sentence batch and break callers expecting (batch, hidden).
    if isinstance(text, str):
        return sentence_embeddings.squeeze(0)
    return sentence_embeddings

In [6]:
from torch import nn

class Model(nn.Module):
    """Feed-forward binary classifier head: 768 -> 256 -> 64 -> 16 -> 1.

    A ReLU follows each of the first three linear layers and a sigmoid follows
    the last one, so the output is a single value between 0 and 1 per row.
    """

    def __init__(self):
        super().__init__()

        # Consecutive entries are the (in_features, out_features) of
        # layer1 .. layer4, created in that order.
        widths = (768, 256, 64, 16, 1)
        for index, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:]), start=1):
            setattr(self, f"layer{index}", nn.Linear(fan_in, fan_out, bias=True))

        # A single ReLU instance is shared by all hidden layers.
        self.activation = nn.ReLU()

        pipeline = [
            self.layer1, self.activation,
            self.layer2, self.activation,
            self.layer3, self.activation,
            self.layer4, nn.Sigmoid(),
        ]
        self.layers = nn.Sequential(*pipeline)

    def forward(self, x):
        """Map features of shape (batch, 768) to outputs of shape (batch, 1)."""
        return self.layers(x)

In [7]:
from torch.utils.data import DataLoader
from datasets import load_dataset
from torch.optim import AdamW

In [8]:
# Load the SST-2 dataset. The repr shown below lists its train / validation /
# test splits, each with 'idx', 'sentence' and 'label' columns.
dataset = load_dataset("stanfordnlp/sst2")
dataset

DatasetDict({
    train: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 872
    })
    test: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 1821
    })
})

In [9]:
# Pull the three splits out of the DatasetDict under explicit names.
dataset_train, dataset_test, dataset_validation = (
    dataset[split] for split in ('train', 'test', 'validation')
)

In [10]:
# Inspect the first training example to see the record layout.
dataset_train[0]

{'idx': 0,
 'sentence': 'hide new secretions from the parental units ',
 'label': 0}

In [11]:
# Training batches are shuffled and hold 32 examples each. The evaluation
# loaders keep dataset order and use DataLoader's default batch size of 1,
# spelled out here so the choice is visible.
train_loader = DataLoader(dataset=dataset_train, shuffle=True, batch_size=32)
val_loader = DataLoader(dataset=dataset_validation, batch_size=1, shuffle=False)
test_loader = DataLoader(dataset=dataset_test, batch_size=1, shuffle=False)

In [12]:
# Choose the compute backend in order of preference: CUDA, Apple MPS, CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Build the classifier head and move its parameters onto that backend.
mlp = Model()
mlp.to(device)

print(device, mlp, sep="\n")

mps
Model(
  (layer1): Linear(in_features=768, out_features=256, bias=True)
  (layer2): Linear(in_features=256, out_features=64, bias=True)
  (layer3): Linear(in_features=64, out_features=16, bias=True)
  (layer4): Linear(in_features=16, out_features=1, bias=True)
  (activation): ReLU()
  (layers): Sequential(
    (0): Linear(in_features=768, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=16, bias=True)
    (5): ReLU()
    (6): Linear(in_features=16, out_features=1, bias=True)
    (7): Sigmoid()
  )
)


In [13]:
# Optimisation setup: an upper bound of 100 epochs, binary cross-entropy on
# the classifier's sigmoid outputs, and AdamW over the classifier's weights.
epochs = 100
criterion = nn.BCELoss()
optimizer = AdamW(params=mlp.parameters(), lr=1e-3)

In [14]:
def train_loop(dataloader=train_loader, model=mlp, loss_fn=criterion, optimizer=optimizer, log_every=100):
    """Train `model` for one epoch and return the mean per-sample training loss.

    Args:
        dataloader: Yields dict batches with 'sentence' (list of strings) and
            'label' (integer tensor) entries.
        model: Classifier mapping (batch, 768) embeddings to (batch, 1) outputs.
        loss_fn: Loss taking (predictions, targets). The epoch average assumes
            it returns the mean over the batch, which is BCELoss's default.
        optimizer: Optimizer bound to `model`'s parameters.
        log_every: Print the running batch loss every this many batches;
            0 or None disables logging.

    Returns:
        float: Sample-weighted mean loss over the epoch, or NaN if the
        dataloader produced no batches.

    Note:
        The defaults are bound when this cell is executed; re-run this cell
        after re-creating the loader, model, criterion or optimizer.
    """
    size = len(dataloader.dataset)
    seen = 0
    running_loss = 0.0

    model.train()
    for batch_idx, batch in enumerate(dataloader):
        text = batch['sentence']
        # BCELoss requires floating-point targets; the dataset labels are integers.
        y = batch['label'].float().to(device)

        X = get_embedding_tensor(text).to(device)
        if X.dim() == 1:
            # A one-sentence batch may come back without its batch axis.
            X = X.unsqueeze(0)

        pred = model(X).squeeze(1)  # (batch, 1) -> (batch,)
        loss = loss_fn(pred, y)

        # Clear gradients before backward so stale ones never leak into a step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Keep the tensor and the Python float under separate names; rebinding
        # `loss` to a float is what made the original `return loss.item()` fail.
        batch_loss = loss.item()
        batch_size = y.shape[0]
        running_loss += batch_loss * batch_size
        seen += batch_size

        if log_every and batch_idx % log_every == 0:
            print(f"Loss: {batch_loss}  [{seen}/{size}]")

    if seen == 0:
        return float("nan")
    return running_loss / seen


def validation_loop(dataloader=val_loader, model=mlp, loss_fn=criterion):
    """Evaluate `model` on `dataloader` and return the mean per-sample loss.

    Args:
        dataloader: Yields dict batches with 'sentence' (list of strings) and
            'label' (integer tensor) entries, like the training loader.
        model: Classifier mapping (batch, 768) embeddings to (batch, 1) outputs.
        loss_fn: Loss taking (predictions, targets). The average assumes it
            returns the mean over the batch, which is BCELoss's default.

    Returns:
        float: Sample-weighted mean loss, or NaN if the dataloader produced
        no batches.

    Note:
        The defaults are bound when this cell is executed; re-run this cell
        after re-creating the loader, model or criterion.
        NOTE(review): targets are passed to the loss as-is, so this presumably
        only makes sense for splits whose labels are 0/1 -- confirm before
        using it on an unlabeled split.
    """
    model.eval()

    total_loss = 0.0
    seen = 0

    with torch.no_grad():
        for batch in dataloader:
            # Batches are dicts, not (X, y) pairs: embed the sentences first.
            y = batch['label'].float().to(device)  # BCELoss needs float targets
            X = get_embedding_tensor(batch['sentence']).to(device)
            if X.dim() == 1:
                # A one-sentence batch may come back without its batch axis.
                X = X.unsqueeze(0)

            pred = model(X).squeeze(1)  # (batch, 1) -> (batch,) to match y

            # Weight each batch mean by its size so the result is a true
            # per-sample average for any batch size.
            batch_size = y.shape[0]
            total_loss += loss_fn(pred, y).item() * batch_size
            seen += batch_size

    if seen == 0:
        return float("nan")
    return total_loss / seen

In [15]:
# Training loop with early stopping on the validation loss.

# Stop once the validation loss has failed to improve on its best value for
# more than this many consecutive epochs.
PATIENCE = 5

early_stop_ctr = 0
best_val_loss = float("inf")
curr_val_loss = 10000
prev_val_loss = 10000

train_losses = []
val_losses = []

for epoch in range(epochs):

    print(f"Epoch {epoch + 1}\n-------------------------------")
    train_loss = train_loop()
    val_loss = validation_loop()

    train_losses.append(train_loss)
    val_losses.append(val_loss)

    # Keep the last two validation losses for inspection after the run.
    prev_val_loss = curr_val_loss
    curr_val_loss = val_loss

    # Compare against the best loss seen so far. The original compared with
    # `prev_val_loss` before it was refreshed, i.e. with the loss from two
    # epochs earlier, so the counter did not track real improvement.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        early_stop_ctr = 0
    else:
        early_stop_ctr += 1

    if early_stop_ctr > PATIENCE:
        print(f"Early stopping after epoch {epoch + 1}")
        break

Epoch 1
-------------------------------


: 