In [None]:

import torch
import torch.nn as nn
import torch.optim as optim

# Dummy dataset (instruction-response pairs)
data = [
    ("Translate: 'Hello' to French.", "Bonjour"),
    ("Translate: 'World' to French.", "Monde"),
]

# Vocabulary and encoding
vocab = {word: i for i, word in enumerate("Translate Hello to French World Bonjour Monde".split())}

# Characters trimmed from both ends of every whitespace-separated token before lookup.
STRIP_CHARS = ".,:;!?'\""


def encode(sentence):
    """Convert a sentence into a 1-D tensor of vocabulary ids.

    The sentence is split on whitespace and surrounding punctuation is removed
    from each piece, so "Translate:" and "'Hello'" resolve to the vocabulary
    entries "Translate" and "Hello". Tokens that are still not in ``vocab``
    are dropped.

    Args:
        sentence: Raw text to encode.

    Returns:
        A ``torch.long`` tensor with one id per recognised token (possibly empty).
    """
    tokens = (piece.strip(STRIP_CHARS) for piece in sentence.split())
    # dtype is explicit so an empty result is still a valid index tensor
    # (torch.tensor([]) would otherwise default to float32).
    return torch.tensor([vocab[tok] for tok in tokens if tok in vocab], dtype=torch.long)

# GPT-like Transformer Model
class GPT(nn.Module):
    """Small Transformer that maps a token-id sequence to one row of vocabulary logits.

    Args:
        vocab_size: Number of distinct token ids (embedding rows and output logits).
        embed_size: Width of the token and positional embeddings.
        num_heads: Attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        hidden_dim: Width of each layer's feed-forward sub-network.
        max_len: Longest sequence the learned positional table supports.
    """

    def __init__(self, vocab_size, embed_size, num_heads, num_layers, hidden_dim, max_len=100):
        super().__init__()
        self.max_len = max_len
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # Learned positional table, sliced to the actual sequence length in forward().
        self.positional_encoding = nn.Parameter(torch.randn(1, max_len, embed_size))
        # batch_first=True is required: forward() feeds (batch, seq, embed) tensors.
        # With the default (seq, batch, embed) layout the layers would attend
        # across the batch axis instead of across the tokens of each sequence.
        encoder_layer = nn.TransformerEncoderLayer(
            embed_size, num_heads, hidden_dim, batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)
        self.fc = nn.Linear(embed_size, vocab_size)

    def forward(self, x):
        """Return logits of shape (batch, vocab_size) from the final sequence position.

        Args:
            x: Integer tensor of token ids with shape (batch, seq_len).

        Raises:
            ValueError: If ``seq_len`` is 0 or exceeds ``max_len``.
        """
        seq_len = x.size(1)
        if not 0 < seq_len <= self.max_len:
            raise ValueError(
                f"sequence length must be between 1 and {self.max_len}, got {seq_len}"
            )
        x = self.embedding(x) + self.positional_encoding[:, :seq_len, :]
        x = self.transformer(x)
        # Only the last position's hidden state is projected to the vocabulary.
        return self.fc(x[:, -1, :])

# Initialize model
# Seed first so weight initialisation and dropout are reproducible on a fresh kernel.
torch.manual_seed(0)
model = GPT(len(vocab), embed_size=16, num_heads=2, num_layers=2, hidden_dim=64)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training loop
model.train()  # make the mode explicit; dropout is active while fitting
for epoch in range(10):
    total_loss = 0.0
    for instruction, response in data:
        inputs = encode(instruction).unsqueeze(0)  # add a batch dimension of 1
        targets = encode(response)  # one target id per single-word response
        optimizer.zero_grad()
        output = model(inputs)
        loss = criterion(output, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")

# Test
model.eval()  # disable dropout so the prediction is deterministic
test_input = encode("Translate: 'Hello' to French.").unsqueeze(0)
with torch.no_grad():  # no gradients are needed for inference
    output = model(test_input)
predicted = torch.argmax(output, dim=1)
decoded = {idx: word for word, idx in vocab.items()}
print(f"Predicted: {[decoded[idx.item()] for idx in predicted]}")



In [None]:

Self-Consistency Decoding with a Transformer
Generate multiple outputs and evaluate their consistency.


In [None]:

import torch
import torch.nn as nn
import torch.optim as optim

# Dummy dataset
data = [
    ("What is the capital of France?", "Paris"),
    ("Capital of France?", "Paris"),
    ("France's capital?", "Paris"),
]

# Vocabulary and encoding
vocab = {word: i for i, word in enumerate("What is the capital of France Paris".split())}

# Characters trimmed from both ends of every whitespace-separated token before lookup.
STRIP_CHARS = ".,:;!?'\""
# Case-insensitive fallback so "Capital" resolves to the "capital" entry.
vocab_lowercase = {word.lower(): idx for word, idx in vocab.items()}


def encode(sentence):
    """Convert a sentence into a 1-D tensor of vocabulary ids.

    Each whitespace-separated piece is normalised before lookup: surrounding
    punctuation is stripped ("France?" -> "France"), a trailing possessive
    "'s" is removed ("France's" -> "France"), and if the exact spelling is not
    in ``vocab`` a case-insensitive match is tried ("Capital" -> "capital").
    Pieces that still do not match are dropped.

    Args:
        sentence: Raw text to encode.

    Returns:
        A ``torch.long`` tensor with one id per recognised token (possibly empty).
    """
    ids = []
    for piece in sentence.split():
        token = piece.strip(STRIP_CHARS)
        if token.lower().endswith("'s"):
            token = token[:-2]
        # Prefer the exact spelling; fall back to the lowercase table.
        idx = vocab.get(token, vocab_lowercase.get(token.lower()))
        if idx is not None:
            ids.append(idx)
    # dtype is explicit so an empty result is still a valid index tensor.
    return torch.tensor(ids, dtype=torch.long)

# GPT Model
class GPT(nn.Module):
    """Small Transformer that maps a token-id sequence to one row of vocabulary logits.

    Args:
        vocab_size: Number of distinct token ids (embedding rows and output logits).
        embed_size: Width of the token and positional embeddings.
        num_heads: Attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        hidden_dim: Width of each layer's feed-forward sub-network.
        max_len: Longest sequence the learned positional table supports.
    """

    def __init__(self, vocab_size, embed_size, num_heads, num_layers, hidden_dim, max_len=100):
        super().__init__()
        self.max_len = max_len
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # Learned positional table, sliced to the actual sequence length in forward().
        self.positional_encoding = nn.Parameter(torch.randn(1, max_len, embed_size))
        # batch_first=True is required: forward() feeds (batch, seq, embed) tensors.
        # With the default (seq, batch, embed) layout the layers would attend
        # across the batch axis instead of across the tokens of each sequence.
        encoder_layer = nn.TransformerEncoderLayer(
            embed_size, num_heads, hidden_dim, batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)
        self.fc = nn.Linear(embed_size, vocab_size)

    def forward(self, x):
        """Return logits of shape (batch, vocab_size) from the final sequence position.

        Args:
            x: Integer tensor of token ids with shape (batch, seq_len).

        Raises:
            ValueError: If ``seq_len`` is 0 or exceeds ``max_len``.
        """
        seq_len = x.size(1)
        if not 0 < seq_len <= self.max_len:
            raise ValueError(
                f"sequence length must be between 1 and {self.max_len}, got {seq_len}"
            )
        x = self.embedding(x) + self.positional_encoding[:, :seq_len, :]
        x = self.transformer(x)
        # Only the last position's hidden state is projected to the vocabulary.
        return self.fc(x[:, -1, :])

# Initialize model
# Seed first so initialisation, dropout and sampling are reproducible on a fresh kernel.
torch.manual_seed(0)
model = GPT(len(vocab), embed_size=16, num_heads=2, num_layers=2, hidden_dim=64)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training loop
model.train()  # make the mode explicit; dropout is active while fitting
for epoch in range(10):
    total_loss = 0.0
    for question, answer in data:
        inputs = encode(question).unsqueeze(0)  # add a batch dimension of 1
        if inputs.numel() == 0:
            # The model cannot run on an empty sequence; report and move on
            # instead of crashing the whole loop.
            print(f"Skipping {question!r}: no in-vocabulary tokens")
            continue
        targets = encode(answer)
        optimizer.zero_grad()
        output = model(inputs)
        loss = criterion(output, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")

# Generate multiple responses for consistency
NUM_SAMPLES = 3
TEMPERATURE = 1.0  # values above 1 flatten the distribution, below 1 sharpen it
model.eval()  # diversity should come from sampling, not from leftover dropout noise
test_question = encode("What is the capital of France?").unsqueeze(0)
with torch.no_grad():  # no gradients are needed for inference
    probs = torch.softmax(model(test_question) / TEMPERATURE, dim=1)
# Draw independent samples from the predicted token distribution; each entry
# keeps the shape (1,) that argmax over dim=1 would have produced.
outputs = [torch.multinomial(probs, num_samples=1).squeeze(1) for _ in range(NUM_SAMPLES)]

# Decode responses
decoded = {idx: word for word, idx in vocab.items()}
responses = ["".join(decoded[idx.item()] for idx in output) for output in outputs]

# Find most consistent
# Iterating the list (not a set) makes ties resolve to the earliest response.
most_consistent = max(responses, key=responses.count)
print(f"Responses: {responses}\nMost Consistent: {most_consistent}")



In [None]:

Chain of Thought (CoT) Reasoning with GPT


In [None]:

import torch
import torch.nn as nn
import torch.optim as optim

# Dummy dataset for reasoning
data = [
    ("Solve: (2+3) * 4", "20"),
    ("Solve: (3+5) * 2", "16"),
]

# Vocabulary and encoding
vocab = {word: i for i, word in enumerate("Solve 2 3 4 5 + * 20 16".split())}

# Symbols that may be glued to neighbouring characters, e.g. "(2+3)" or "Solve:".
SYMBOLS = "()+*:"


def encode(sentence):
    """Convert an arithmetic prompt into a 1-D tensor of vocabulary ids.

    Every character in ``SYMBOLS`` is padded with spaces before splitting, so
    "(2+3)" yields the separate tokens "(", "2", "+", "3", ")" and "Solve:"
    yields "Solve", ":". Tokens that are not in ``vocab`` (here the
    parentheses and the colon) are dropped.

    Args:
        sentence: Raw text to encode.

    Returns:
        A ``torch.long`` tensor with one id per recognised token (possibly empty).
    """
    for symbol in SYMBOLS:
        sentence = sentence.replace(symbol, f" {symbol} ")
    # dtype is explicit so an empty result is still a valid index tensor.
    return torch.tensor(
        [vocab[tok] for tok in sentence.split() if tok in vocab], dtype=torch.long
    )

# GPT Model
class GPT(nn.Module):
    """Small Transformer that maps a token-id sequence to one row of vocabulary logits.

    Args:
        vocab_size: Number of distinct token ids (embedding rows and output logits).
        embed_size: Width of the token and positional embeddings.
        num_heads: Attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        hidden_dim: Width of each layer's feed-forward sub-network.
        max_len: Longest sequence the learned positional table supports.
    """

    def __init__(self, vocab_size, embed_size, num_heads, num_layers, hidden_dim, max_len=100):
        super().__init__()
        self.max_len = max_len
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # Learned positional table, sliced to the actual sequence length in forward().
        self.positional_encoding = nn.Parameter(torch.randn(1, max_len, embed_size))
        # batch_first=True is required: forward() feeds (batch, seq, embed) tensors.
        # With the default (seq, batch, embed) layout the layers would attend
        # across the batch axis instead of across the tokens of each sequence.
        encoder_layer = nn.TransformerEncoderLayer(
            embed_size, num_heads, hidden_dim, batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)
        self.fc = nn.Linear(embed_size, vocab_size)

    def forward(self, x):
        """Return logits of shape (batch, vocab_size) from the final sequence position.

        Args:
            x: Integer tensor of token ids with shape (batch, seq_len).

        Raises:
            ValueError: If ``seq_len`` is 0 or exceeds ``max_len``.
        """
        seq_len = x.size(1)
        if not 0 < seq_len <= self.max_len:
            raise ValueError(
                f"sequence length must be between 1 and {self.max_len}, got {seq_len}"
            )
        x = self.embedding(x) + self.positional_encoding[:, :seq_len, :]
        x = self.transformer(x)
        # Only the last position's hidden state is projected to the vocabulary.
        return self.fc(x[:, -1, :])

# Initialize model
# Seed first so weight initialisation and dropout are reproducible on a fresh kernel.
torch.manual_seed(0)
model = GPT(len(vocab), embed_size=16, num_heads=2, num_layers=2, hidden_dim=64)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Train
model.train()  # make the mode explicit; dropout is active while fitting
for epoch in range(10):
    total_loss = 0.0
    for question, answer in data:
        inputs = encode(question).unsqueeze(0)  # add a batch dimension of 1
        targets = encode(answer)  # one target id per single-token answer
        optimizer.zero_grad()
        output = model(inputs)
        loss = criterion(output, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")

# Test: predict the final answer token for one prompt.
# NOTE: this code emits only the answer token; no intermediate reasoning
# steps are generated or decoded here.
model.eval()  # disable dropout so the prediction is deterministic
test_input = encode("Solve: (2+3) * 4").unsqueeze(0)
with torch.no_grad():  # no gradients are needed for inference
    output = model(test_input)
predicted = torch.argmax(output, dim=1)
decoded = {idx: word for word, idx in vocab.items()}
print(f"Predicted: {[decoded[idx.item()] for idx in predicted]}")



In [None]:

Mixture of Experts (MoE) with Transformers


In [None]:

import torch
import torch.nn as nn

# Define two simple transformer models as experts
class ExpertModel(nn.Module):
    """Small Transformer expert: token-id sequence in, one row of vocabulary logits out.

    Args:
        vocab_size: Number of distinct token ids (embedding rows and output logits).
        embed_size: Width of the token and positional embeddings.
        num_heads: Attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        hidden_dim: Width of each layer's feed-forward sub-network.
        max_len: Longest sequence the learned positional table supports.
    """

    def __init__(self, vocab_size, embed_size, num_heads, num_layers, hidden_dim, max_len=100):
        super().__init__()
        self.max_len = max_len
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # Learned positional table, sliced to the actual sequence length in forward().
        self.positional_encoding = nn.Parameter(torch.randn(1, max_len, embed_size))
        # batch_first=True is required: forward() feeds (batch, seq, embed) tensors.
        # With the default (seq, batch, embed) layout the layers would attend
        # across the batch axis instead of across the tokens of each sequence.
        encoder_layer = nn.TransformerEncoderLayer(
            embed_size, num_heads, hidden_dim, batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)
        self.fc = nn.Linear(embed_size, vocab_size)

    def forward(self, x):
        """Return logits of shape (batch, vocab_size) from the final sequence position.

        Args:
            x: Integer tensor of token ids with shape (batch, seq_len).

        Raises:
            ValueError: If ``seq_len`` is 0 or exceeds ``max_len``.
        """
        seq_len = x.size(1)
        if not 0 < seq_len <= self.max_len:
            raise ValueError(
                f"sequence length must be between 1 and {self.max_len}, got {seq_len}"
            )
        x = self.embedding(x) + self.positional_encoding[:, :seq_len, :]
        x = self.transformer(x)
        # Only the last position's hidden state is projected to the vocabulary.
        return self.fc(x[:, -1, :])

# Build the two experts; both share one hyper-parameter set.
vocab_size = 50
expert_1 = ExpertModel(vocab_size, embed_size=16, num_heads=2, num_layers=2, hidden_dim=64)
expert_2 = ExpertModel(vocab_size, embed_size=16, num_heads=2, num_layers=2, hidden_dim=64)


# Hand-written router: the task label alone decides which expert runs.
def route(task_type, input_tensor):
    """Forward ``input_tensor`` through the expert registered for ``task_type``.

    "task_1" selects ``expert_1`` and "task_2" selects ``expert_2``.

    Raises:
        ValueError: If ``task_type`` is neither "task_1" nor "task_2".
    """
    # Reject unknown labels up front, then pick between the two known experts.
    if task_type not in ("task_1", "task_2"):
        raise ValueError("Unknown task type")
    chosen_expert = expert_1 if task_type == "task_1" else expert_2
    return chosen_expert(input_tensor)


# Push one random sequence of 10 token ids through the router.
dummy_input = torch.randint(0, vocab_size, (1, 10))
task_type = "task_1"
output = route(task_type, dummy_input)
print(f"Output Shape: {output.shape}")



In [None]:

Retrieval-Augmented Generation (RAG) with Transformers


In [None]:

import torch
import torch.nn as nn

# Dummy database and retrieval
documents = [
    "The Eiffel Tower is located in Paris.",
    "Paris is the capital of France.",
    "France is in Europe."
]
query = "Where is the Eiffel Tower?"


def retrieve_document(query, documents, fallback="No relevant document found."):
    """Return the document that shares the most words with ``query``.

    Toy keyword retrieval: both texts are split on whitespace, surrounding
    punctuation is stripped and words are lowercased; the score is the number
    of distinct words in common. Ties go to the earliest document.

    Args:
        query: The user's question.
        documents: Iterable of candidate document strings.
        fallback: Value returned when no document shares any word with the query.

    Returns:
        The best-matching document string, or ``fallback``.
    """
    def keywords(text):
        words = {piece.strip(".,:;!?'\"").lower() for piece in text.split()}
        words.discard("")  # pieces made only of punctuation strip to ""
        return words

    query_terms = keywords(query)
    best_doc, best_overlap = fallback, 0
    for doc in documents:
        overlap = len(query_terms & keywords(doc))
        # Strict '>' keeps the earliest document on ties and the fallback on zero overlap.
        if overlap > best_overlap:
            best_doc, best_overlap = doc, overlap
    return best_doc


# Retrieve relevant document (driven by the query, not a hard-coded phrase)
retrieved_doc = retrieve_document(query, documents)

# Combine query and retrieved document as input
combined_input = f"Query: {query}\nContext: {retrieved_doc}"

# Simple transformer for generation
class GPT(nn.Module):
    """Small Transformer that maps a token-id sequence to one row of vocabulary logits.

    Args:
        vocab_size: Number of distinct token ids (embedding rows and output logits).
        embed_size: Width of the token and positional embeddings.
        num_heads: Attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        hidden_dim: Width of each layer's feed-forward sub-network.
        max_len: Longest sequence the learned positional table supports.
    """

    def __init__(self, vocab_size, embed_size, num_heads, num_layers, hidden_dim, max_len=100):
        super().__init__()
        self.max_len = max_len
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # Learned positional table, sliced to the actual sequence length in forward().
        self.positional_encoding = nn.Parameter(torch.randn(1, max_len, embed_size))
        # batch_first=True is required: forward() feeds (batch, seq, embed) tensors.
        # With the default (seq, batch, embed) layout the layers would attend
        # across the batch axis instead of across the tokens of each sequence.
        encoder_layer = nn.TransformerEncoderLayer(
            embed_size, num_heads, hidden_dim, batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)
        self.fc = nn.Linear(embed_size, vocab_size)

    def forward(self, x):
        """Return logits of shape (batch, vocab_size) from the final sequence position.

        Args:
            x: Integer tensor of token ids with shape (batch, seq_len).

        Raises:
            ValueError: If ``seq_len`` is 0 or exceeds ``max_len``.
        """
        seq_len = x.size(1)
        if not 0 < seq_len <= self.max_len:
            raise ValueError(
                f"sequence length must be between 1 and {self.max_len}, got {seq_len}"
            )
        x = self.embedding(x) + self.positional_encoding[:, :seq_len, :]
        x = self.transformer(x)
        # Only the last position's hidden state is projected to the vocabulary.
        return self.fc(x[:, -1, :])

# Example combined input
vocab = {word: i for i, word in enumerate("Query Context Eiffel Tower Paris".split())}
# Strip surrounding punctuation so "Query:", "Tower?", "Context:" and "Paris."
# match their vocabulary entries instead of being silently dropped.
tokens = [piece.strip(".,:;!?'\"") for piece in combined_input.split()]
input_ids = torch.tensor([[vocab[tok] for tok in tokens if tok in vocab]], dtype=torch.long)

# Initialize and pass through model
torch.manual_seed(0)  # reproducible weight initialisation on a fresh kernel
model = GPT(len(vocab), embed_size=16, num_heads=2, num_layers=2, hidden_dim=64)
model.eval()  # inference only: disable dropout
with torch.no_grad():  # no gradients are needed for a forward-only demo
    output = model(input_ids)
print(f"Output Shape: {output.shape}")

