# GPT Techniques Examples
This notebook demonstrates 10 key techniques implemented using GPT-style models in PyTorch.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim


## 1. Instruction Tuning with GPT

In [None]:
class GPTInstruction(nn.Module):
    """Small Transformer that maps a token-id sequence to next-token logits.

    Token embeddings are summed with a learned positional table, passed
    through a stack of ``nn.TransformerEncoderLayer`` blocks, and the hidden
    state at the last sequence position is projected to vocabulary logits.

    Args:
        vocab_size: Number of distinct token ids (embedding rows and output
            logits).
        embed_size: Width of the token embeddings and of the Transformer.
        num_heads: Number of attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        hidden_dim: Width of the feed-forward sub-layer in each encoder layer.
        max_len: Number of positions in the learned positional table; input
            sequences must not be longer than this. Defaults to 100.
    """

    def __init__(self, vocab_size, embed_size, num_heads, num_layers, hidden_dim, max_len=100):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.positional_encoding = nn.Parameter(torch.randn(1, max_len, embed_size))
        self.transformer = nn.TransformerEncoder(
            # batch_first=True is required: forward() feeds (batch, seq, embed)
            # tensors. With the default (seq, batch, embed) layout every token
            # would be treated as its own length-1 sequence and attention
            # would never mix information across positions.
            nn.TransformerEncoderLayer(
                embed_size, num_heads, hidden_dim, batch_first=True
            ),
            num_layers,
        )
        self.fc = nn.Linear(embed_size, vocab_size)

    def forward(self, x):
        """Return logits of shape (batch, vocab_size) for ids ``x`` of shape (batch, seq)."""
        seq_len = x.size(1)
        x = self.embedding(x) + self.positional_encoding[:, :seq_len, :]
        x = self.transformer(x)
        # Only the final position is used to predict the response token.
        return self.fc(x[:, -1, :])

# Example vocabulary and encoding: each distinct word gets an integer id in
# order of appearance.
vocab = {word: i for i, word in enumerate("Translate Hello to French World Bonjour Monde".split())}


def encode(sentence):
    """Convert a sentence into a 1-D LongTensor of vocabulary ids.

    The sentence is split on whitespace. A token is looked up verbatim first;
    if that fails, surrounding punctuation is stripped and the lookup is
    retried, so "Translate:" / "'Hello'" / "French." resolve to their
    vocabulary words instead of being dropped. Tokens that still do not match
    are skipped.

    Args:
        sentence: Text to encode.

    Returns:
        A 1-D ``torch.long`` tensor of ids (empty if no token matched).
    """
    strip_chars = ".,:;!?\"'()[]"
    ids = []
    for word in sentence.split():
        if word not in vocab:
            word = word.strip(strip_chars)
        if word in vocab:
            ids.append(vocab[word])
    # Explicit dtype: an empty list would otherwise yield a float tensor,
    # which nn.Embedding cannot index with.
    return torch.tensor(ids, dtype=torch.long)

# Model, loss, and optimizer for the instruction-tuning example.
model = GPTInstruction(len(vocab), embed_size=16, num_heads=2, num_layers=2, hidden_dim=64)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# (instruction, expected response) pairs; each response is a single word.
data = [
    ("Translate: 'Hello' to French.", "Bonjour"),
    ("Translate: 'World' to French.", "Monde"),
]

# Training Loop
model.train()  # make the training mode (dropout active) explicit
for epoch in range(10):
    total_loss = 0
    for instruction, response in data:
        inputs = encode(instruction).unsqueeze(0)  # add a batch dimension
        targets = encode(response)  # ids of the response words
        optimizer.zero_grad()
        output = model(inputs)
        loss = criterion(output, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")

# Inference: switch to eval mode so dropout does not perturb the prediction,
# and skip gradient tracking since no backward pass follows.
model.eval()
test_input = encode("Translate: 'Hello' to French.").unsqueeze(0)
with torch.no_grad():
    output = model(test_input)
predicted = torch.argmax(output, dim=1)
decoded = {idx: word for word, idx in vocab.items()}  # id -> word
print(f"Predicted: {[decoded[idx.item()] for idx in predicted]}")

## 2. Self-Consistency Decoding with GPT

In [None]:
# Three phrasings of the same question, each paired with the same answer.
data = [
    (question, "Paris")
    for question in (
        "What is the capital of France?",
        "Capital of France?",
        "France's capital?",
    )
]

class GPTSelfConsistency(nn.Module):
    """Small Transformer that maps a token-id sequence to answer-token logits.

    Token embeddings are summed with a learned positional table, passed
    through a stack of ``nn.TransformerEncoderLayer`` blocks, and the hidden
    state at the last sequence position is projected to vocabulary logits.

    Args:
        vocab_size: Number of distinct token ids (embedding rows and output
            logits).
        embed_size: Width of the token embeddings and of the Transformer.
        num_heads: Number of attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        hidden_dim: Width of the feed-forward sub-layer in each encoder layer.
        max_len: Number of positions in the learned positional table; input
            sequences must not be longer than this. Defaults to 100.
    """

    def __init__(self, vocab_size, embed_size, num_heads, num_layers, hidden_dim, max_len=100):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.positional_encoding = nn.Parameter(torch.randn(1, max_len, embed_size))
        self.transformer = nn.TransformerEncoder(
            # batch_first=True is required: forward() feeds (batch, seq, embed)
            # tensors. With the default (seq, batch, embed) layout every token
            # would be treated as its own length-1 sequence and attention
            # would never mix information across positions.
            nn.TransformerEncoderLayer(
                embed_size, num_heads, hidden_dim, batch_first=True
            ),
            num_layers,
        )
        self.fc = nn.Linear(embed_size, vocab_size)

    def forward(self, x):
        """Return logits of shape (batch, vocab_size) for ids ``x`` of shape (batch, seq)."""
        seq_len = x.size(1)
        x = self.embedding(x) + self.positional_encoding[:, :seq_len, :]
        x = self.transformer(x)
        # Only the final position is used to predict the answer token.
        return self.fc(x[:, -1, :])

# Rebuild the vocabulary from this section's data. The questions and the
# answer share no words with the translation vocabulary used earlier, so
# encode() would return empty tensors and the forward pass / loss would fail.
# Tokens are the same whitespace-separated pieces encode() splits on;
# dict.fromkeys de-duplicates while keeping first-seen order.
vocab = {
    word: i
    for i, word in enumerate(
        dict.fromkeys(w for q, a in data for w in (*q.split(), *a.split()))
    )
}

model = GPTSelfConsistency(len(vocab), embed_size=16, num_heads=2, num_layers=2, hidden_dim=64)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

model.train()  # make the training mode (dropout active) explicit
for epoch in range(10):
    total_loss = 0
    for question, answer in data:
        inputs = encode(question).unsqueeze(0)  # add a batch dimension
        targets = encode(answer)  # ids of the answer words
        optimizer.zero_grad()
        output = model(inputs)
        loss = criterion(output, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")

# Self-consistency: draw several independent samples from the model's output
# distribution and keep the most frequent answer. The distribution is computed
# once in eval mode; the variation between samples comes from explicit
# sampling rather than from dropout noise left on at inference time.
model.eval()
test_question = encode("What is the capital of France?").unsqueeze(0)
with torch.no_grad():
    probs = torch.softmax(model(test_question), dim=1)
outputs = [torch.multinomial(probs, num_samples=1).squeeze(1) for _ in range(3)]
decoded = {idx: word for word, idx in vocab.items()}  # id -> word
responses = ["".join([decoded[idx.item()] for idx in output]) for output in outputs]
# Vote over the list (not a set) so ties resolve to the earliest response
# instead of depending on set iteration order.
most_consistent = max(responses, key=responses.count)
print(f"Responses: {responses}\nMost Consistent: {most_consistent}")