<a href="https://colab.research.google.com/github/santiagoLopez1712/Agilesmanagementprojekt/blob/main/Generative_ki5_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer
import requests
from tqdm.notebook import tqdm
import wandb
import math
import torch.nn.functional as F

# Inicializar Weights & Biases para el seguimiento de experimentos
wandb.init(project="generative-ki2")

import requests

url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"

try:
    response = requests.get(url)
    response.raise_for_status()
    text = response.text
    print("Dataset Tiny Shakespeare descargado exitosamente.")
except requests.exceptions.RequestException as e:
    print(f"Error al descargar el dataset: {e}")
    text = None

if text:
    print(f"Primeros 100 caracteres del dataset:\n{text[:100]}")
else:
    print("No se pudo cargar el dataset, revisa el error.")

from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Tokenizer GPT-2 cargado exitosamente.")
print(f"Tamaño del vocabulario del tokenizer: {tokenizer.vocab_size}")
print(f"Token de padding: '{tokenizer.pad_token}' con ID: {tokenizer.pad_token_id}")
print(f"Token de fin de secuencia: '{tokenizer.eos_token}' con ID: {tokenizer.eos_token_id}")

if text is not None:
    encoded_text = tokenizer.encode(text)
    print(f"Texto completo tokenizado. Longitud: {len(encoded_text)} tokens.")
    print(f"Primeros 50 tokens: {encoded_text[:50]}")
else:
    print("No hay texto para tokenizar. Asegúrate de haber descargado el dataset correctamente.")
    encoded_text = None

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mslopezotalvaro1712[0m ([33mslopezotalvaro1712-hochschule-hannover[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Dataset Tiny Shakespeare descargado exitosamente.
Primeros 100 caracteres del dataset:
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Tokenizer GPT-2 cargado exitosamente.
Tamaño del vocabulario del tokenizer: 50257
Token de padding: '<|endoftext|>' con ID: 50256
Token de fin de secuencia: '<|endoftext|>' con ID: 50256


Token indices sequence length is longer than the specified maximum sequence length for this model (338025 > 1024). Running this sequence through the model will result in indexing errors


Texto completo tokenizado. Longitud: 338025 tokens.
Primeros 50 tokens: [5962, 22307, 25, 198, 8421, 356, 5120, 597, 2252, 11, 3285, 502, 2740, 13, 198, 198, 3237, 25, 198, 5248, 461, 11, 2740, 13, 198, 198, 5962, 22307, 25, 198, 1639, 389, 477, 12939, 2138, 284, 4656, 621, 284, 1145, 680, 30, 198, 198, 3237, 25, 198, 4965, 5634, 13]


In [2]:
import torch
from torch.utils.data import Dataset

class ShakespeareDataset(Dataset):
    def __init__(self, data, block_size):
        self.data = torch.tensor(data, dtype=torch.long)
        self.block_size = block_size

    def __len__(self):
        return len(self.data) - self.block_size

    def __getitem__(self, idx):
        x = self.data[idx:idx + self.block_size]
        y = self.data[idx + 1:idx + self.block_size + 1]
        return x, y

block_size = 128

if encoded_text is not None:
    dataset = ShakespeareDataset(encoded_text, block_size)
    print(f"Dataset de Shakespeare creado. Número total de bloques: {len(dataset)}")
    first_input, first_target = dataset[0]
    print(f"Primer input (tamaño {first_input.shape}): {first_input[:10]}")
    print(f"Primer target (tamaño {first_target.shape}): {first_target[:10]}")
else:
    print("No se pudo crear el dataset porque no se tokenizó el texto.")
    dataset = None

from torch.utils.data import DataLoader, random_split

if dataset is not None:
    train_size = int(0.9 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

    batch_size = 32

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

    print(f"Tamaño del dataset de entrenamiento: {len(train_dataset)} bloques.")
    print(f"Tamaño del dataset de validación: {len(val_dataset)} bloques.")
    print(f"Tamaño del batch de entrenamiento: {batch_size}")
    print(f"Tamaño del batch de validación: {batch_size}")

    for batch in train_dataloader:
        inputs, targets = batch
        print(f"Shape del batch de entrada: {inputs.shape}")
        print(f"Shape del batch de objetivo: {targets.shape}")
        break
else:
    print("No se pueden crear los dataloaders porque el dataset no se creó.")
    train_dataloader = None
    val_dataloader = None

Dataset de Shakespeare creado. Número total de bloques: 337897
Primer input (tamaño torch.Size([128])): tensor([ 5962, 22307,    25,   198,  8421,   356,  5120,   597,  2252,    11])
Primer target (tamaño torch.Size([128])): tensor([22307,    25,   198,  8421,   356,  5120,   597,  2252,    11,  3285])
Tamaño del dataset de entrenamiento: 304107 bloques.
Tamaño del dataset de validación: 33790 bloques.
Tamaño del batch de entrenamiento: 32
Tamaño del batch de validación: 32
Shape del batch de entrada: torch.Size([32, 128])
Shape del batch de objetivo: torch.Size([32, 128])


In [3]:
import torch
import torch.nn as nn
import math

class DecoderOnlyTransformer(nn.Module):
    def __init__(self, vocab_size, embed_size, num_layers, heads, dropout, forward_expansion, max_len, device):
        super().__init__()
        self.device = device
        self.embed_size = embed_size
        self.num_layers = num_layers
        self.nhead = heads
        self.feedforward_dim = forward_expansion * embed_size
        self.max_len = max_len
        self.dropout = dropout
        self.vocab_size = vocab_size

        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_len, embed_size)
        self.dropout_layer = nn.Dropout(dropout)

        decoder_layer = nn.TransformerDecoderLayer(d_model=embed_size, nhead=self.nhead,
                                                    dim_feedforward=self.feedforward_dim, dropout=dropout,
                                                    batch_first=False)

        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)

        self.fc_out = nn.Linear(embed_size, vocab_size)

    def forward(self, x, mask):
        N, seq_len = x.shape
        positions = torch.arange(0, seq_len).expand(N, seq_len).to(self.device)
        embedded = self.dropout_layer(self.embedding(x) + self.position_embedding(positions))

        tgt = embedded.transpose(0, 1)
        memory = tgt

        tgt_mask = mask.bool()

        decoder_output = self.transformer_decoder(tgt=tgt, memory=memory, tgt_mask=tgt_mask)

        decoder_output = decoder_output.transpose(0, 1)

        output = self.fc_out(decoder_output)
        return output

    def generate(self, x, max_new_tokens):
        self.eval()
        with torch.no_grad():
            for _ in range(max_new_tokens):
                seq_len = x.shape[1]
                mask = torch.tril(torch.ones((seq_len, seq_len), device=self.device)).bool()
                output = self(x, mask)
                predictions = output[:, -1, :]
                predicted_id = torch.argmax(predictions, dim=-1).unsqueeze(-1)
                x = torch.cat((x, predicted_id), dim=1)
        self.train()
        return x

# --- Instantiation of the Model (using the new class) ---
vocab_size = tokenizer.vocab_size
embed_size = 8
num_layers = 1
heads = 1
dropout = 0.1
forward_expansion = 1
max_len = block_size
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = DecoderOnlyTransformer(
    vocab_size,
    embed_size,
    num_layers,
    heads,
    dropout,
    forward_expansion,
    max_len,
    device
).to(device)

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Number of parameters (DecoderOnlyTransformer): {count_parameters(model)}")

Number of parameters (DecoderOnlyTransformer): 856161


In [4]:
def create_causal_mask(seq_len, device):
    mask = torch.tril(torch.ones((seq_len, seq_len), device=device)).bool()
    return mask

mask = create_causal_mask(block_size, device)
print(f"Shape of the causal mask (new): {mask.shape}")


Shape of the causal mask (new): torch.Size([128, 128])


In [5]:
import torch.optim as optim
import wandb
from tqdm.notebook import tqdm
import torch.nn.utils as nn_utils

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-4)

wandb.config.update({
    "embed_size": model.embed_size,
    "num_layers": model.num_layers,
    "heads": model.nhead,
    "dropout": model.dropout,
    "forward_expansion": forward_expansion,
    "max_len": model.max_len,
    "batch_size": train_dataloader.batch_size,
    "learning_rate": 1e-4,
    "optimizer": "AdamW"
})

def train_epoch(model, dataloader, optimizer, criterion, device, epoch):
    model.train()
    total_loss = 0
    progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1} Training")
    for batch_idx, (data, targets) in enumerate(progress_bar):
        data = data.to(device)
        targets = targets.to(device)

        seq_len = data.shape[1]
        mask = create_causal_mask(seq_len, device)

        outputs = model(data, mask)

        loss = criterion(outputs.view(-1, outputs.shape[-1]), targets.view(-1))

        optimizer.zero_grad()
        loss.backward()
        nn_utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        total_loss += loss.item()
        progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})
        wandb.log({"train_loss_step": loss.item()})

    avg_loss = total_loss / len(dataloader)
    wandb.log({"train_loss_epoch": avg_loss, "epoch": epoch + 1})
    return avg_loss

def evaluate_epoch(model, dataloader, criterion, device, epoch):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1} Evaluation")
        for batch_idx, (data, targets) in enumerate(progress_bar):
            data = data.to(device)
            targets = targets.to(device)

            seq_len = data.shape[1]
            mask = create_causal_mask(seq_len, device)

            outputs = model(data, mask)

            loss = criterion(outputs.view(-1, outputs.shape[-1]), targets.view(-1))

            total_loss += loss.item()
            progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})
            wandb.log({"val_loss_step": loss.item()})

    avg_loss = total_loss / len(dataloader)
    wandb.log({"val_loss_epoch": avg_loss, "epoch": epoch + 1})
    return avg_loss

In [6]:
num_epochs = 5

for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_dataloader, optimizer, criterion, device, epoch)
    val_loss = evaluate_epoch(model, val_dataloader, criterion, device, epoch)
    print(f"Epoch {epoch+1}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}")

# Finalizar el seguimiento de wandb
wandb.finish()

Epoch 1 Training:   0%|          | 0/9504 [00:00<?, ?it/s]

Epoch 1 Evaluation:   0%|          | 0/1056 [00:00<?, ?it/s]

Epoch 1: Train Loss = 6.9440, Val Loss = 5.9973


Epoch 2 Training:   0%|          | 0/9504 [00:00<?, ?it/s]

Epoch 2 Evaluation:   0%|          | 0/1056 [00:00<?, ?it/s]

Epoch 2: Train Loss = 5.8031, Val Loss = 5.5285


Epoch 3 Training:   0%|          | 0/9504 [00:00<?, ?it/s]

Epoch 3 Evaluation:   0%|          | 0/1056 [00:00<?, ?it/s]

Epoch 3: Train Loss = 5.4300, Val Loss = 5.1615


Epoch 4 Training:   0%|          | 0/9504 [00:00<?, ?it/s]

Epoch 4 Evaluation:   0%|          | 0/1056 [00:00<?, ?it/s]

Epoch 4: Train Loss = 5.1417, Val Loss = 4.9110


Epoch 5 Training:   0%|          | 0/9504 [00:00<?, ?it/s]

Epoch 5 Evaluation:   0%|          | 0/1056 [00:00<?, ?it/s]

Epoch 5: Train Loss = 4.9572, Val Loss = 4.7580


0,1
epoch,▁▁▃▃▅▅▆▆██
train_loss_epoch,█▄▃▂▁
train_loss_step,█▄▄▄▄▄▃▃▃▃▃▃▂▂▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▂▁▁▁▁▁▁
val_loss_epoch,█▅▃▂▁
val_loss_step,██▇██▇████▅▅▅▆▆▅▆▅▄▃▄▄▃▄▃▄▄▃▂▂▂▁▂▂▂▂▁▂▁▁

0,1
epoch,5.0
train_loss_epoch,4.95724
train_loss_step,4.84831
val_loss_epoch,4.75796
val_loss_step,4.92963


In [7]:
# Pon el modelo en modo de evaluación
model.eval()

# Elige un punto de inicio para la generación (un token o una secuencia corta)
# Puedes usar el ID de un token específico o codificar una pequeña frase.
start_text = "The "
start_tokens = tokenizer.encode(start_text, return_tensors="pt").to(model.device)

# Genera una secuencia de tokens
max_new_tokens = 100
generated_tokens = model.generate(start_tokens, max_new_tokens=max_new_tokens)

# Decodifica los tokens generados a texto
generated_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)

print(f"Texto generado a partir de '{start_text}':\n{generated_text}")

# Vuelve a poner el modelo en modo de entrenamiento
model.train()

Texto generado a partir de 'The ':
The ,
And,
And,
And I'll be I'll be I'll be I
And
And
And
And
And
And
And
And
And
And
And I'll be I'll be I'll be I'll be I'll be I'll be I'll BOLINGHAM:
And
And
And
And
And
And
And
And
And
And
And
And
And
And I'll be I'll be


DecoderOnlyTransformer(
  (embedding): Embedding(50257, 8)
  (position_embedding): Embedding(128, 8)
  (dropout_layer): Dropout(p=0.1, inplace=False)
  (transformer_decoder): TransformerDecoder(
    (layers): ModuleList(
      (0): TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=8, out_features=8, bias=True)
        )
        (multihead_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=8, out_features=8, bias=True)
        )
        (linear1): Linear(in_features=8, out_features=8, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=8, out_features=8, bias=True)
        (norm1): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
        (norm3): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
  