<a href="https://colab.research.google.com/github/sathya8998/GPT3/blob/main/Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install torch-xla



In [None]:
!pip install langid



In [None]:
import traceback
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, TensorDataset
import os
import torch_xla
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp
import torch_xla.distributed.parallel_loader as pl
import langid
import logging  # Added import for logging

# ---------------------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------------------
# Batching and training schedule
BATCH_SIZE = 64
MAX_ITERS = 5000
EVAL_INTERVAL = 500
EVAL_ITERS = 200
LEARNING_RATE = 3e-4

# Model dimensions
BLOCK_SIZE = 256
N_EMBD = 384
N_HEAD = 6
N_LAYER = 6
DROPOUT = 0.2

# Device handle obtained from torch-xla; tensors and the model are moved here.
DEVICE = xm.xla_device()

# Fixed seed so that repeated runs draw the same random numbers.
torch.manual_seed(1337)

# ---------------------------------------------------------------------------
# Corpus loading
# ---------------------------------------------------------------------------
with open('/content/Data.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# Split on '.' into stripped chunks, then split every chunk on whitespace.
sentences = list(map(str.strip, text.split('.')))
all_tokens = [word for chunk in sentences for word in chunk.split()]

# The vocabulary is the set of distinct whitespace-delimited tokens.
english_german_vocab = set(all_tokens)
VOCAB_SIZE = len(english_german_vocab)

# BLOCK_SIZE is (re)assigned to the same fixed value and reported.
BLOCK_SIZE = 256
print("Updated BLOCK_SIZE:", BLOCK_SIZE)

def encode_multilingual(s, vocab_size, lang, max_len=None):
    """Encode *s* as a fixed-length list of character ids in ``[0, vocab_size)``.

    Each of the first ``max_len`` characters is mapped to
    ``ord(char) % vocab_size``; the result is right-padded with ``0`` so the
    returned list always has exactly ``max_len`` entries.

    Args:
        s: Text to encode.
        vocab_size: Exclusive upper bound for the produced ids; must be > 0.
        lang: Language code of *s* (e.g. ``'en'`` or ``'de'``). Accepted for
            interface compatibility. The previous German branch added
            ``vocab_size`` to every id and then kept only ids below
            ``vocab_size``, which discarded every character and returned a
            list shorter than ``max_len``. Both languages now share the same
            in-range mapping.
        max_len: Length of the returned list. ``None`` (the default) means the
            module-level ``BLOCK_SIZE``, looked up at call time.

    Returns:
        A list of ``max_len`` ints, each in ``[0, vocab_size)``.

    Raises:
        ValueError: If ``vocab_size`` is not positive.
    """
    if max_len is None:
        max_len = BLOCK_SIZE
    if vocab_size <= 0:
        raise ValueError(f"vocab_size must be positive, got {vocab_size}")

    ids = [ord(ch) % vocab_size for ch in s[:max_len]]
    # Pad with the id 0 up to the requested length (no-op when already full).
    ids.extend([0] * (max_len - len(ids)))
    return ids

# Vocabulary size is the number of distinct tokens collected from the corpus.
VOCAB_SIZE = len(english_german_vocab)

# Every corpus token becomes one fixed-length row of character ids.
max_len = BLOCK_SIZE
tokenized_text = [encode_multilingual(word, VOCAB_SIZE, 'en') for word in all_tokens]
padded_text = [row + [0] * (max_len - len(row)) for row in tokenized_text]

# One tensor holding all rows, split 90% / 10% into train and validation.
data = torch.tensor(padded_text, dtype=torch.long)
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

# Each example pairs a row (input) with the row that follows it (target).
train_dataset = TensorDataset(train_data[:-1], train_data[1:])
val_dataset = TensorDataset(val_data[:-1], val_data[1:])

# Training batches are shuffled; validation batches keep corpus order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Function to decode the generated tokens into a string
def decode_multilingual(tokens):
    """Convert an iterable of integer code points into a string.

    Every element is passed through ``chr`` and the resulting characters are
    concatenated in iteration order.
    """
    characters = map(chr, tokens)
    return "".join(characters)

# Modified GPT Language Model for chat-like responses
class Block(nn.Module):
    """Transformer block: self-attention then an MLP, each added residually.

    LayerNorm is applied before each sub-layer (pre-norm). Inputs and outputs
    are batch-first, i.e. ``(batch, sequence, n_embd)``, matching what
    ``ChatGPTLanguageModel.forward`` passes in.

    Args:
        n_embd: Embedding width of the input and output.
        n_head: Number of attention heads (``nn.MultiheadAttention`` requires
            it to divide ``n_embd``).
        dropout: Dropout probability applied to the MLP output. ``None`` (the
            default) uses the module-level ``DROPOUT`` constant.
    """

    def __init__(self, n_embd, n_head=8, dropout=None):
        super().__init__()
        if dropout is None:
            dropout = DROPOUT
        self.ln_1 = nn.LayerNorm(n_embd)
        # batch_first=True: without it nn.MultiheadAttention interprets the
        # first axis as the sequence and would attend across batch samples.
        self.attn = nn.MultiheadAttention(n_embd, n_head, batch_first=True)
        self.ln_2 = nn.LayerNorm(n_embd)
        self.mlp = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.GELU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        """Return ``x`` after the attention and MLP residual updates."""
        # Query, key and value all come from the same normalised activations.
        normed = self.ln_1(x)
        # Attention weights are not used, so skip computing/returning them.
        # NOTE(review): no causal mask is applied, so each position can attend
        # to later positions -- confirm this is intended for this model.
        x = x + self.attn(normed, normed, normed, need_weights=False)[0]
        x = x + self.mlp(self.ln_2(x))
        return x

class ChatGPTLanguageModel(nn.Module):
    """Token + position embeddings, a stack of ``Block`` layers and an LM head.

    Args:
        vocab_size: Number of rows in the token embedding table and number of
            output logits per position.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, N_EMBD)
        self.position_embedding_table = nn.Embedding(BLOCK_SIZE, N_EMBD)
        self.blocks = nn.Sequential(*[Block(N_EMBD, n_head=N_HEAD) for _ in range(N_LAYER)])
        self.ln_f = nn.LayerNorm(N_EMBD)
        self.lm_head = nn.Linear(N_EMBD, vocab_size)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02); zero Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Compute logits and, when ``targets`` is given, the cross-entropy loss.

        Args:
            idx: Integer tensor of token ids with shape ``(B, T)``; ``T`` must
                not exceed the size of the position embedding table.
            targets: Optional integer tensor with ``B * T`` elements.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape
            ``(B, T, vocab)`` and ``loss`` is ``None``. With targets,
            ``logits`` is flattened to ``(B * T, vocab)`` and ``loss`` is a
            scalar tensor.
        """
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)
        # Build positions on the input's own device rather than a global one,
        # so the model works wherever its inputs and weights live.
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding_table(positions)
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            return logits, None

        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        # reshape (not view) so non-contiguous target tensors are accepted.
        loss = F.cross_entropy(logits, targets.reshape(B * T))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Append ``max_new_tokens`` sampled tokens to ``idx``.

        Sampling runs in eval mode (dropout disabled) and without gradient
        tracking; the module's previous train/eval mode is restored afterwards.

        Args:
            idx: Integer tensor of shape ``(B, T)`` used as the prompt.
            max_new_tokens: Number of tokens to sample.

        Returns:
            Integer tensor of shape ``(B, T + max_new_tokens)``.
        """
        # The context window equals the number of learned positions.
        context_len = self.position_embedding_table.num_embeddings
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                for _ in range(max_new_tokens):
                    idx_cond = idx[:, -context_len:]
                    logits, _ = self(idx_cond)
                    # Only the last position predicts the next token.
                    probs = F.softmax(logits[:, -1, :], dim=-1)
                    idx_next = torch.multinomial(probs, num_samples=1)
                    idx = torch.cat((idx, idx_next), dim=1)
        finally:
            self.train(was_training)
        return idx

    def respond_to_question(self, question):
        """Encode ``question``, sample 50 new tokens and return the decoded text.

        The language is detected with ``langid`` and forwarded to
        ``encode_multilingual``. The returned string is the decoding of the
        whole generated sequence (encoded prompt followed by the new tokens).
        """
        lang, _ = langid.classify(question)

        # Use the model's own vocabulary size and device so the ids are valid
        # for this embedding table and live next to the weights.
        vocab_size = self.token_embedding_table.num_embeddings
        device = next(self.parameters()).device
        token_ids = encode_multilingual(question, vocab_size, lang)
        user_input = torch.tensor(token_ids, dtype=torch.long, device=device).unsqueeze(0)

        response = self.generate(user_input, max_new_tokens=50)
        return decode_multilingual(response[0].tolist())

# Build the network and place it on the XLA device in a single expression.
model = ChatGPTLanguageModel(VOCAB_SIZE).to(DEVICE)

# Report the model size in millions of parameters.
print(sum(param.numel() for param in model.parameters()) / 1e6, 'M parameters')

# AdamW over every model parameter.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

# Re-bind train_loader to a device loader that feeds batches to DEVICE.
train_loader = pl.MpDeviceLoader(train_loader, DEVICE)

# Keep a torch-xla model wrapper around the same model instance.
# NOTE(review): xmp.MpModelWrapper appears to be meant for sharing one model
# across xmp.spawn processes -- confirm it is the right tool for this script.
model_parallel = xmp.MpModelWrapper(model)

# Send INFO-and-above log records to training.log.
logging.basicConfig(level=logging.INFO, filename='training.log')

def estimate_loss(model, dataloader, device):
    """Return the mean loss of ``model`` over the batches of ``dataloader``.

    The model is evaluated in eval mode without gradient tracking, and its
    previous train/eval mode is restored before returning, so calling this
    from a training loop does not leave dropout disabled.

    Args:
        model: An ``nn.Module`` whose ``forward(xb, yb)`` returns
            ``(logits, loss)`` and which has a ``token_embedding_table``; or a
            non-module wrapper (such as ``xmp.MpModelWrapper``) holding that
            module in its ``_model`` attribute.
        dataloader: Iterable yielding ``(xb, yb)`` tensor pairs.
        device: Device the batches are moved to before the forward pass.

    Returns:
        The average of the per-batch losses as a float, or ``nan`` when no
        batch could be evaluated (empty loader or every batch skipped).
    """
    # Unwrap holder objects that keep the real network in `_model`.
    if not isinstance(model, nn.Module) and hasattr(model, "_model"):
        model = model._model

    vocab_rows = model.token_embedding_table.weight.shape[0]
    was_training = model.training
    model.eval()
    total_loss = 0.0
    total_batches = 0

    try:
        with torch.no_grad():
            for xb, yb in dataloader:
                xb, yb = xb.to(device), yb.to(device)

                # An id >= the embedding size would fail inside the lookup,
                # so such batches are skipped with a warning instead.
                if (xb >= vocab_rows).any() or (yb >= vocab_rows).any():
                    logging.warning("Index out of range in embedding. Skipping batch.")
                    continue

                _, loss = model(xb, yb)
                total_loss += loss.item()
                total_batches += 1
    finally:
        # Put the model back into whatever mode the caller had it in.
        model.train(was_training)

    if total_batches == 0:
        logging.warning("No batch could be evaluated; returning NaN loss.")
        return float("nan")
    return total_loss / total_batches

def _first_batches(loader, limit):
    """Yield at most ``limit`` batches from ``loader``.

    ``zip`` advances ``range(limit)`` first, so no batch beyond the limit is
    fetched from the loader.
    """
    return (batch for _, batch in zip(range(limit), loader))


# One persistent iterator over the training loader; it is re-created when an
# epoch ends instead of building a brand-new iterator on every step.
train_iter = iter(train_loader)

for iteration in range(MAX_ITERS):
    try:
        # Every EVAL_INTERVAL steps (and on the last step) log the loss on a
        # bounded sample (EVAL_ITERS batches) of the train and val sets.
        if iteration % EVAL_INTERVAL == 0 or iteration == MAX_ITERS - 1:
            train_loss = estimate_loss(model, _first_batches(train_loader, EVAL_ITERS), DEVICE)
            val_loss = estimate_loss(model, _first_batches(val_loader, EVAL_ITERS), DEVICE)
            logging.info(f"step {iteration}: train loss {train_loss:.4f}, val loss {val_loss:.4f}")
            # Evaluation may have switched the model to eval mode.
            model.train()

        # Fetch the next training batch, starting a new epoch when exhausted.
        try:
            xb, yb = next(train_iter)
        except StopIteration:
            train_iter = iter(train_loader)
            xb, yb = next(train_iter)

        # The batches already hold ids produced by encode_multilingual with
        # VOCAB_SIZE (see the data preparation above), so they are fed to the
        # model as-is; encode_multilingual works on text, not on tensors.
        # The underlying model is called directly: the xmp.MpModelWrapper
        # object only holds the model and is not itself callable.
        logits, loss = model(xb, yb)
        optimizer.zero_grad()
        loss.backward()
        xm.optimizer_step(optimizer)

    except Exception as e:
        # Best-effort loop: record the failure (with traceback) and continue.
        logging.exception(f"Exception at step {iteration}: {e}")
        traceback.print_exc()

# After the loop, print the maximum index in the entire dataset
max_index_dataset = torch.max(data).item()
print("Max index in the entire dataset:", max_index_dataset)

# Print the maximum index in the embeddings
max_index_embeddings = model.token_embedding_table.weight.shape[0] - 1
print("Max index in the embeddings:", max_index_embeddings)

# Example of responding to a user's question
user_question_german = "Wie effizient sind unsere jetzigen und zukünftigen Optimierungsmaßnahmen?"
user_question_english = "How efficient are our current and future optimization measures?"

# respond_to_question already returns decoded text, so it is printed directly.
response_original_german = model.respond_to_question(user_question_german)
print("Original model response (German):", response_original_german)

# The wrapper object does not expose respond_to_question, so the wrapped
# model itself answers the English question as well.
response_parallel_english = model.respond_to_question(user_question_english)
print("Parallel model response (English):", response_parallel_english)


Updated BLOCK_SIZE: 256
13.94182 M parameters


KeyboardInterrupt: 