In [None]:
# Developed by Mujtaba Ghulami to learn multi-head attention models, with a sample PositionalEncoding implementation.

In [None]:
%%capture
!pip install datasets transformers
!pip install torchinfo
!pip install torchviz

In [None]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from datasets import load_dataset
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
#from torchviz import make_dot
#from torchinfo import summary
from transformers import GPT2Tokenizer
import torch.nn.functional as F
import time
import pandas as pd
from datasets import Dataset

In [None]:
# Positional Encoding Module
class PositionalEncoding(nn.Module):
    """Adds fixed (non-learned) sinusoidal position information to embeddings.

    The table is precomputed once for ``max_len`` positions and stored as a
    buffer named ``pe`` of shape ``(1, max_len, d_model)``, so it follows the
    module across devices and is saved in the state dict without being a
    trainable parameter.

    Args:
        max_len (int): Number of positions to precompute.
        d_model (int): Embedding width. Both even and odd widths are supported.
    """

    def __init__(self, max_len, d_model):
        super(PositionalEncoding, self).__init__()
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        # One angle per (position, frequency) pair: shape (max_len, ceil(d_model / 2)).
        angles = position * div_term

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)  # even indices
        # With an odd d_model there is one fewer odd column than even columns,
        # so drop the surplus frequency instead of failing on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])  # odd indices
        # Leading batch dimension lets the table broadcast over the batch.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` with the positional table added.

        Args:
            x (Tensor): Embeddings of shape ``(batch, seq_len, d_model)``.

        Returns:
            Tensor: Same shape as ``x``.

        Raises:
            ValueError: If ``seq_len`` exceeds the number of precomputed positions.
        """
        seq_len = x.size(1)
        max_len = self.pe.size(1)
        if seq_len > max_len:
            raise ValueError(
                f"Sequence length {seq_len} exceeds the positional table size {max_len}"
            )
        return x + self.pe[:, :seq_len, :]

# Causal Self-Attention
class MatrixModel(nn.Module):
    """One decoder block: causal multi-head self-attention, then a feed-forward network.

    Each sub-layer is wrapped in a residual connection followed by LayerNorm
    (normalization is applied after the residual add).

    Args:
        embed_dim (int): Width of the token representations.
        num_heads (int): Number of attention heads passed to ``nn.MultiheadAttention``.
        ff_dim (int): Hidden width of the feed-forward network.
    """

    def __init__(self, embed_dim, num_heads, ff_dim):
        super(MatrixModel, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads

        self.mha = nn.MultiheadAttention(embed_dim, num_heads,batch_first=True)

        # Feed-forward network following attention
        self.ffn = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.SiLU(), #nn.GELU(),
            nn.Linear(ff_dim, embed_dim)
        )
        self.layernorm1 = nn.LayerNorm(embed_dim)
        self.layernorm2 = nn.LayerNorm(embed_dim)

    def forward(self, inputs):
        """Apply the block.

        Args:
            inputs (Tensor): Shape ``(batch, seq_len, embed_dim)``.

        Returns:
            Tensor: Same shape as ``inputs``.
        """
        # Build the mask directly on the input's device; creating it on the CPU
        # and copying it over costs a host-to-device transfer on every forward
        # pass of every layer.
        causal_mask = self._generate_causal_mask(inputs.size(1), device=inputs.device)
        attn_output, _ = self.mha(inputs, inputs, inputs, attn_mask=causal_mask)

        # Residual connection and layer normalization
        out1 = self.layernorm1(inputs + attn_output)
        # Pass through feed-forward network
        ffn_output = self.ffn(out1)

        return self.layernorm2(out1 + ffn_output)

    def _generate_causal_mask(self, seq_len, device=None):
        """Return a ``(seq_len, seq_len)`` boolean mask, ``True`` where attention is blocked.

        Entries strictly above the diagonal are ``True``, so position ``i``
        cannot attend to any position ``j > i``.

        Args:
            seq_len (int): Sequence length.
            device (torch.device, optional): Where to allocate the mask.
                Defaults to PyTorch's current default device.
        """
        return torch.triu(torch.ones(seq_len, seq_len, device=device), diagonal=1).bool()



# Full Brain-Inspired Model Module (now a Causal Language Model)
class MatrixGPT(nn.Module):
    """Decoder-only language model: embedding, positional encoding, a stack of
    ``MatrixModel`` blocks, and a linear projection back to the vocabulary.

    Args:
        vocab_size (int): Number of token ids; also the width of the output logits.
        max_length (int): Number of positions in the positional-encoding table.
        embed_dim (int): Width of the token representations.
        num_layers (int): Number of stacked ``MatrixModel`` blocks.
        num_heads (int): Attention heads per block.
        ff_dim (int): Feed-forward hidden width per block.
    """

    def __init__(self, vocab_size, max_length, embed_dim, num_layers,
                 num_heads, ff_dim):
        super().__init__()
        self.max_length, self.embed_dim = max_length, embed_dim

        # Token ids -> vectors, then add position information.
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.positional_encoding = PositionalEncoding(max_length, embed_dim)

        # Stack of causal self-attention blocks, built in order.
        blocks = []
        for _ in range(num_layers):
            blocks.append(MatrixModel(embed_dim, num_heads, ff_dim))
        self.MatrixModel_layers = nn.ModuleList(blocks)

        # Projection from hidden states to per-token vocabulary logits.
        self.output_layer = nn.Linear(self.embed_dim, vocab_size)

    def forward(self, x):
        """Compute next-token logits.

        Args:
            x (Tensor): Token ids of shape ``(batch, seq_len)``.

        Returns:
            Tensor: Unnormalized logits of shape ``(batch, seq_len, vocab_size)``;
            no softmax is applied here.
        """
        hidden = self.positional_encoding(self.embedding(x))
        for block in self.MatrixModel_layers:
            hidden = block(hidden)
        return self.output_layer(hidden)



In [None]:
# Model hyperparameters and paths shared by the cells below.
vocab_size = 50259  # embedding/output size; NOTE(review): presumably GPT-2's vocabulary plus the two header tokens added in the tokenizer cell — confirm against len(tokenizer)
max_length = 1024  # positions in the positional-encoding table; the collate function truncates to max_length + 1 tokens
embed_dim = 1536
num_layers = 16   # Increase depth for better representation
num_heads = 8
ff_dim = 6144
# Checkpoint to load from (only used by the commented-out loading cell).
# Alternatives used previously: "/kaggle/input/matrix/MatrixGPT.pth", "/content/drive/MyDrive/brain_p/MatrixGPT.pth"
model_path= "/kaggle/working/MatrixGPT.pth"
# Checkpoint written by save_all() during training.
save_path = "/kaggle/working/MatrixGPT.pth"
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
%%capture
# Build the language model from the hyperparameters defined in the config cell.
# Weights are freshly initialized here; nothing is loaded from disk.
model = MatrixGPT(
    vocab_size=vocab_size,
    max_length=max_length,
    embed_dim=embed_dim,
    num_layers=num_layers,
    num_heads=num_heads,
    ff_dim=ff_dim,
)

In [None]:
%%capture
# Multi-GPU hook: currently a no-op. The DataParallel wrapping below is
# commented out, so the model always runs on the single `device`.
if torch.cuda.device_count() > 1:
    #print("Let's use", torch.cuda.device_count(), "GPUs!")
    #model = nn.DataParallel(model)
    pass

# Move parameters and buffers (including the positional table) to the target device.
model.to(device)

In [None]:
# # More efficient for multi-GPU
# from torch.nn.parallel import DistributedDataParallel as DDP

# # Setup (more complex, but worth it)
# torch.distributed.init_process_group(backend='nccl')
# local_rank = int(os.environ["LOCAL_RANK"])
# model = model.to(local_rank)
# model = DDP(model, device_ids=[local_rank])
# if isinstance(model, nn.DataParallel):
#     original_model = model.module
# else:
#     original_model = model

# # Save
# torch.save(model.module.state_dict() if isinstance(model, nn.DataParallel)
#            else model.state_dict(), "checkpoint.pth")

In [None]:
%%capture
# Marker tokens that wrap the speaker label ("system:", "user:", "assistant:")
# at the start of every message.
START_HEADER = "<|startheader|>"
END_HEADER = "<|endheader|>"

# Initialize the GPT-2 tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Reuse EOS as the padding token. Padded positions therefore carry the same id
# as real end-of-text tokens and cannot be told apart by id alone.
tokenizer.pad_token = tokenizer.eos_token  # Use EOS token as padding

# Register the header markers as special tokens so they are not split into
# sub-word pieces; the EOS token is (re)declared alongside them.
# NOTE(review): the model's vocab_size must cover the ids added here — confirm with len(tokenizer).
special_tokens_dict = {
    "eos_token": "<|endoftext|>",
    "additional_special_tokens": [START_HEADER, END_HEADER]
}
tokenizer.add_special_tokens(special_tokens_dict)

def _return_header(message) -> str:
    role = message.get("from", "")
    if role == "system":
        return "system:"
    elif role == "gpt":
        return "assistant:"
    elif role == "human":
        return "user:"
    return "unknown:"

def encode_header(message):
    """Return the speaker label for ``message`` wrapped in the header marker tokens.

    Args:
        message (dict): A conversation turn, passed on to ``_return_header``.

    Returns:
        str: ``START_HEADER`` + speaker label + ``END_HEADER``.
    """
    return "".join((START_HEADER, _return_header(message), END_HEADER))

def encode_message(message) -> str:
    """Serialize one conversation turn as header, stripped text and end-of-text marker.

    Args:
        message (dict): A conversation turn with a ``"value"`` string
            (surrounding whitespace is stripped).

    Returns:
        str: The encoded header, the stripped text, then ``"<|endoftext|>"``.
    """
    header = encode_header(message)
    body = message["value"].strip()
    # Every turn is closed with the end-of-text token.
    return "".join([header, body, "<|endoftext|>"])

def encode_dialog_prompt(dialog):
    """Encode every turn of ``dialog`` and concatenate them, with no separator, into one string.

    Args:
        dialog (iterable): Conversation turns accepted by ``encode_message``.

    Returns:
        str: The concatenated encoded turns, in order.
    """
    encoded_turns = [encode_message(turn) for turn in dialog]
    return "".join(encoded_turns)

def hermes_ins(batch):
    """Collate function: turn raw dataset items into next-token training tensors.

    Each item's ``'conversations'`` list is serialized with
    ``encode_dialog_prompt``, tokenized to at most ``max_length + 1`` tokens,
    and split into inputs (all but the last token) and labels (all but the
    first token), so ``labels[:, t]`` is the token that follows ``input_ids[:, t]``.

    Label positions that correspond to padding are set to ``-100``, the default
    ``ignore_index`` of ``nn.CrossEntropyLoss``. Because the pad token is the
    EOS token, padding could otherwise not be told apart from real end-of-text
    targets and would be trained on. With a batch size of 1 no padding is
    added, so the labels are unchanged.

    Args:
        batch (list[dict]): Dataset items, each with a ``'conversations'`` list.

    Returns:
        dict: ``"input_ids"`` and ``"labels"`` as contiguous long tensors of
        shape ``(batch, seq_len)``, and ``"text"`` with the serialized dialogs.
    """
    # Encode the conversation in each batch item
    texts = [encode_dialog_prompt(item['conversations']) for item in batch]
    tokenized = tokenizer(
        texts,
        return_tensors="pt",
        padding=True,  # pad to the longest sequence in the batch
        truncation=True,
        max_length=max_length + 1  # one extra token so inputs and labels both reach max_length
    )
    input_ids = tokenized["input_ids"].long()
    is_padding = tokenized["attention_mask"] == 0

    # .contiguous() / masked_fill() return fresh contiguous tensors, so callers
    # can flatten them with .view(-1) even when the batch holds several rows.
    inputs = input_ids[:, :-1].contiguous()
    labels = input_ids[:, 1:].masked_fill(is_padding[:, 1:], -100)
    return {"input_ids": inputs, "labels": labels, "text": texts}

# Create DataLoader


In [None]:
%%capture
#!wget https://huggingface.co/datasets/teknium/OpenHermes-2.5/resolve/main/openhermes2_5.json

In [None]:
# Load the JSON data using pandas
#df = pd.read_json("openhermes2_5.json")

# Create a Dataset from the pandas DataFrame
#OpenHermes = Dataset.from_pandas(df)

In [None]:
# Load the training split of OpenHermes-2.5 from the Hugging Face Hub.
OpenHermes = load_dataset("teknium/OpenHermes-2.5", split='train')
# One conversation per batch, reshuffled every epoch; hermes_ins serializes and tokenizes each item.
hermes_instruct = DataLoader(OpenHermes, batch_size=1, shuffle=True, collate_fn=hermes_ins)

In [None]:
def save_all(model, optimizer, loss, save_path, settle_seconds=7):
    """Write a training checkpoint (model, optimizer and last loss) to ``save_path``.

    When ``save_path`` is a filesystem path the checkpoint is first written to
    ``<save_path>.tmp`` and then moved into place with ``os.replace``, so an
    interruption in the middle of the write cannot corrupt an existing
    checkpoint at ``save_path``.

    Args:
        model (nn.Module): The model; if it exposes a ``module`` attribute
            (DataParallel / DDP wrapper) the wrapped module's state is saved.
        optimizer (Optimizer): Its ``state_dict()`` is stored.
        loss: Value stored under the ``'loss'`` key as-is.
        save_path (str | os.PathLike | file-like): Destination.
        settle_seconds (float): Pause after saving. Defaults to 7, the delay
            this function has always applied; pass 0 to skip it.
    """
    import os  # local import: the notebook's import cell does not provide it

    # Handle wrapped models (DDP/DataParallel)
    model_state = model.module.state_dict() if hasattr(model, 'module') else model.state_dict()

    checkpoint = {
        'model_state_dict': model_state,
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss,
    }

    if isinstance(save_path, (str, os.PathLike)):
        # Temp file lives next to the target so the final rename stays on one filesystem.
        tmp_path = f"{os.fspath(save_path)}.tmp"
        torch.save(checkpoint, tmp_path)
        os.replace(tmp_path, save_path)
    else:
        # File-like destination: nothing to rename, write directly.
        torch.save(checkpoint, save_path)

    if settle_seconds > 0:
        time.sleep(settle_seconds)

In [None]:
# Initialize once: re-running this cell creates a fresh optimizer and discards
# the moment estimates accumulated so far.
optimizer = optim.AdamW(model.parameters(), lr=3e-4, weight_decay=0.01)
# Token-level cross-entropy with label smoothing; targets equal to the default
# ignore_index (-100) are excluded from the loss.
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)

In [None]:
# %%capture
# # Loading
# checkpoint = torch.load(model_path, map_location=device)

# # Handle wrapped models when loading
# if hasattr(model, 'module'):
#     model.module.load_state_dict(checkpoint['model_state_dict'])
# else:
#     model.load_state_dict(checkpoint['model_state_dict'])

# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
# loss = checkpoint['loss']
# del checkpoint

In [None]:
# NOTE(review): `loss` is not defined on a fresh kernel at this point. It is only
# bound by the checkpoint-loading cell above (currently commented out) or by the
# training loop below, so this cell fails under Restart & Run All. The training
# loop also stores the loss in checkpoints as a formatted string, which has no .item().
str(loss.item())

'0.4338739812374115'

In [None]:
from transformers import get_cosine_schedule_with_warmup
import torch.nn.utils as nn_utils


# Setup
num_epochs = 10
save_every = 3500  # checkpoint interval, counted in micro-batches (global_step)

# Gradient accumulation settings
accum_steps = 16
effective_batch_size = 1 * accum_steps
print(f"Simulating effective batch size: {effective_batch_size}")

# scheduler.step() is called once per optimizer update, not once per
# micro-batch, so the schedule must be sized in optimizer updates. Sizing it in
# micro-batches would stretch warmup and the cosine decay by a factor of
# accum_steps and the learning rate would never reach the end of the schedule.
# Ceil division accounts for the partial window flushed at the end of an epoch.
batches_per_epoch = len(hermes_instruct)
updates_per_epoch = (batches_per_epoch + accum_steps - 1) // accum_steps
total_steps = num_epochs * updates_per_epoch
warmup_steps = int(0.1 * total_steps)

scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

# Training loop
global_step = 0  # micro-batches processed across all epochs
saved = 0        # checkpoints written so far
model.train()

for epoch in range(num_epochs):
    optimizer.zero_grad()  # Clear at start of epoch
    batch_idx = -1  # keeps the end-of-epoch check valid if the loader yields nothing

    for batch_idx, batch in enumerate(hermes_instruct):
        inputs = batch["input_ids"].to(device)
        targets = batch["labels"].to(device)

        # Forward pass
        outputs = model(inputs)
        # reshape() also handles non-contiguous label tensors, which view() rejects.
        loss = criterion(outputs.view(-1, vocab_size), targets.reshape(-1))

        # Scale loss so the accumulated gradient is the mean over the window
        loss = loss / accum_steps
        loss.backward()  # Accumulate gradients

        # Perform optimization step after accum_steps batches
        if (batch_idx + 1) % accum_steps == 0:
            # Clip gradients before optimizer step
            nn_utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            # Display the unscaled loss of the last micro-batch in the window
            actual_loss = loss.item() * accum_steps
            current_lr = scheduler.get_last_lr()[0]
            print(f"\rEpoch: {epoch+1}/{num_epochs} | "
                  f"Batch: {batch_idx+1}/{batches_per_epoch} | "
                  f"Loss: {actual_loss:.4f} | "
                  f"LR: {current_lr:.2e} | "
                  f"Saved: {saved}", end="")

        # Increment global step counter
        global_step += 1

        # Save checkpoint every save_every micro-batches.
        # No optimizer step is forced here: a forced step would apply a
        # partially accumulated (under-scaled) gradient and advance the
        # scheduler beyond the number of updates it was sized for. Gradients
        # already accumulated stay in memory and are applied at the next
        # regular update.
        if global_step % save_every == 0:
            actual_loss = loss.item() * accum_steps
            save_all(model, optimizer, f"{actual_loss:.4f}",save_path)
            saved += 1
            print(f"\n✓ Model saved at step {global_step}")

            if saved % 3 == 0:
                # upload()  # Optional cloud backup
                pass

    # Handle remaining gradients at end of epoch (already counted in updates_per_epoch)
    if (batch_idx + 1) % accum_steps != 0:
        nn_utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    print(f"\nEpoch {epoch+1} completed")

Loss: 6.279 epoch: 1 saved: 3 cycle: 937			

In [None]:
# from transformers import get_cosine_schedule_with_warmup
# import torch.nn.utils as nn_utils

# # Setup
# num_epochs = 10
# save_every = 3500
# total_steps = num_epochs * len(hermes_instruct)
# warmup_steps = int(0.1 * total_steps)

# scheduler = get_cosine_schedule_with_warmup(
#     optimizer,
#     num_warmup_steps=warmup_steps,
#     num_training_steps=total_steps
# )

# # Training
# global_step = 0
# saved = 0

# for epoch in range(num_epochs):
#     model.train()
#     epoch_loss = 0

#     for batch_idx, batch in enumerate(hermes_instruct):
#         inputs = batch["input_ids"].to(device)
#         targets = batch["labels"].to(device)

#         # Forward pass
#         optimizer.zero_grad()
#         outputs = model(inputs)
#         loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))

#         # Backward pass
#         loss.backward()
#         nn_utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
#         optimizer.step()
#         scheduler.step()

#         # Track metrics
#         global_step += 1
#         epoch_loss += loss.item()

#         # Display progress
#         print(f"\rEpoch: {epoch+1}/{num_epochs} | "
#               f"Batch: {batch_idx+1}/{len(hermes_instruct)} | "
#               f"Loss: {loss.item():.4f} | "
#               f"LR: {scheduler.get_last_lr()[0]:.2e} | "
#               f"Saved: {saved}", end="")

#         # Periodic saving
#         if global_step % save_every == 0:
#             save_all(model, optimizer, loss,save_path)
#             saved += 1
#             print(f"\n✓ Model saved at step {global_step}")

#             if saved % 3 == 0:
#                 # upload()  # Optional cloud upload
#                 pass

#     # Epoch summary
#     avg_loss = epoch_loss / len(hermes_instruct)
#     print(f"\nEpoch {epoch+1} completed | Avg Loss: {avg_loss:.4f}")

In [None]:
def top_k_sampling(logits, k):
    """
    Select the next token using top-k sampling.
    Args:
        logits (Tensor): Logits for the current token with shape [vocab_size].
        k (int): The number of top tokens to sample from. Values larger than
            the vocabulary size are clamped to the vocabulary size.
    Returns:
        int: The token id sampled from the top-k distribution.
    Raises:
        ValueError: If k is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    # torch.topk raises when k exceeds the number of candidates, so clamp it.
    k = min(k, logits.size(-1))
    # Keep only the k largest logits; a softmax over just those is the same
    # distribution as renormalizing the top-k probabilities of the full softmax.
    topk_logits, topk_indices = torch.topk(logits, k)
    topk_probs = F.softmax(topk_logits, dim=-1)
    # Sample a position within the top-k list, then map it back to a token id.
    choice = torch.multinomial(topk_probs, 1).item()
    return topk_indices[choice].item()

def generate_text_k(model, tokenizer, input_text,device, max_length=50, k=10):
    """
    Generate text with top-k sampling.

    The whole sequence generated so far is fed to the model at every step. The
    model keeps no state between calls, so feeding only the most recent token
    would make every prediction ignore the prompt and all earlier tokens.

    Args:
        model: Callable returning logits of shape [batch, seq_len, vocab_size].
            If it (or its ``module`` attribute) has a ``max_length`` attribute,
            the context is cropped to that many most recent tokens.
        tokenizer: Tokenizer providing encode/decode and ``eos_token_id``.
        input_text (str): Prompt text.
        device: Device on which the input tensor is created.
        max_length (int): Maximum number of new tokens to generate.
        k (int): Number of top tokens to sample from at each step.
    Returns:
        str: Decoded prompt plus generated tokens (special tokens skipped).
    """
    model.eval()
    # Tokenize the input text.
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    generated = input_ids.tolist()[0]
    # Longest context the model accepts, when it advertises one.
    context_window = getattr(getattr(model, "module", model), "max_length", None)

    with torch.no_grad():
        for _ in range(max_length):
            # Feed the full history, cropped to the most recent tokens if needed.
            context = generated[-context_window:] if context_window else generated
            context_tensor = torch.tensor([context], device=device)
            logits = model(context_tensor)
            # Logits for the position after the last context token: shape [vocab_size].
            next_token_logits = logits[0, -1, :]

            # Sample the next token using top-k sampling.
            next_token_id = top_k_sampling(next_token_logits, k)
            generated.append(next_token_id)

            # Stop generation if the end-of-sequence token is generated.
            if tokenizer.eos_token_id is not None and next_token_id == tokenizer.eos_token_id:
                break

    # Decode the complete generated token list.
    return tokenizer.decode(generated, skip_special_tokens=True)

def generate_text(model, tokenizer, input_text,device, max_length=50):
    """
    Generate text greedily: at every step append the single most likely token.

    Args:
        model: Callable returning logits of shape [batch, seq_len, vocab_size].
            If it (or its ``module`` attribute) has a ``max_length`` attribute,
            the context is cropped to that many most recent tokens so the
            sequence never outgrows the model's positional table.
        tokenizer: Tokenizer providing encode/decode and ``eos_token_id``.
        input_text (str): Prompt text.
        device: Device on which the input tensor is created.
        max_length (int): Maximum number of new tokens to generate.
    Returns:
        str: Decoded prompt plus generated tokens (special tokens skipped).
    """
    model.eval()
    # Tokenize the input text.
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    generated = input_ids.tolist()[0]
    # Longest context the model accepts, when it advertises one.
    context_window = getattr(getattr(model, "module", model), "max_length", None)

    with torch.no_grad():
        for _ in range(max_length):
            # Provide the whole history each time, cropped to the most recent tokens if needed.
            context = generated[-context_window:] if context_window else generated
            context_tensor = torch.tensor([context], device=device)

            logits = model(context_tensor)
            last_token_logits = logits[:, -1, :]  # shape (batch, vocab_size)
            # Softmax is monotonic, so the argmax of the logits is the most probable token.
            next_token_id = torch.argmax(last_token_logits, dim=-1).item()
            generated.append(next_token_id)

            # Stop generation if the EOS token is produced.
            if next_token_id == tokenizer.eos_token_id:
                break

    return tokenizer.decode(generated, skip_special_tokens=True)

class TopPTextGenerator:
    """
    A class to perform text generation using nucleus (top-p) sampling.
    """
    def __init__(self, model, tokenizer, top_p=0.9, temperature=1.0, device=None):
        """
        model: PyTorch module that returns logits of shape [batch_size, seq_length, vocab_size]
        tokenizer: A tokenizer with encode/decode methods and an eos_token_id attribute.
        top_p: The cumulative probability threshold for nucleus sampling.
        temperature: Divisor applied to the logits before sampling; higher values increase randomness.
        device: torch.device on which input tensors are created.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.top_p = top_p
        self.temperature = temperature
        self.device = device

    def nucleus_sampling(self, logits):
        """
        Applies nucleus (top-p) filtering to the logits.
        logits: Tensor of shape [vocab_size] representing logits for the next token.
        Returns the temperature-scaled logits, with every token outside the
        top-p nucleus set to -inf. The most probable token is always kept.
        """
        # Apply temperature scaling
        logits = logits / self.temperature

        # Sort token probabilities in descending order and accumulate them.
        probs = F.softmax(logits, dim=-1)
        sorted_probs, sorted_indices = torch.sort(probs, descending=True)
        cumulative_probs = torch.cumsum(sorted_probs, dim=-1)

        # Mark tokens whose cumulative probability exceeds the threshold.
        sorted_indices_to_remove = cumulative_probs > self.top_p

        # Shift the mask one position to the right so the first token that
        # crosses the threshold is kept, and never remove the top token.
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = False

        # Map the mask from sorted order back to vocabulary ids.
        filtered_logits = logits.clone()
        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        filtered_logits[indices_to_remove] = -float('Inf')
        return filtered_logits

    def generate(self, prompt, seq_len=50):
        """
        Generates text conditioned on a prompt.

        prompt: Starting text string.
        seq_len: Maximum number of tokens to generate.
        Returns the generated text string (prompt included, special tokens skipped).

        If the model (or its ``module`` attribute) has a ``max_length``
        attribute, only that many most recent tokens are fed to it at each step.
        """
        self.model.eval()
        # Encode the prompt.
        token_ids = self.tokenizer.encode(prompt)
        input_ids = torch.tensor(token_ids, dtype=torch.long, device=self.device).unsqueeze(0)  # shape: [1, seq_length]
        # Longest context the model accepts, when it advertises one.
        context_window = getattr(getattr(self.model, "module", self.model), "max_length", None)
        eos_token_id = self.tokenizer.eos_token_id

        with torch.no_grad():
            for _ in range(seq_len):
                # Feed the history, cropped to the most recent tokens if needed.
                context = input_ids[:, -context_window:] if context_window else input_ids
                logits = self.model(context)  # shape: [1, context_len, vocab_size]
                next_token_logits = logits[0, -1, :]  # shape: [vocab_size]

                # Apply nucleus sampling filtering to logits
                filtered_logits = self.nucleus_sampling(next_token_logits)

                # Convert filtered logits to probabilities and sample the next token
                probs = F.softmax(filtered_logits, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)

                # Append the sample to the sequence
                input_ids = torch.cat((input_ids, next_token.unsqueeze(0)), dim=1)

                # Stop at end-of-sequence. Compare against None explicitly: a
                # truthiness test would ignore a valid EOS id of 0.
                if eos_token_id is not None and next_token.item() == eos_token_id:
                    break

        # Index the batch row instead of squeeze(), which would collapse a
        # one-token sequence into a scalar.
        output_text = self.tokenizer.decode(input_ids[0].tolist(), skip_special_tokens=True)
        return output_text


In [None]:
# Nucleus-sampling generator around the current in-memory model and tokenizer.
generator = TopPTextGenerator(model, tokenizer, top_p=0.9, temperature=1.0, device=device)

In [None]:
# Build a prompt in the same layout encode_message() produces for training data:
# <|startheader|>role:<|endheader|>text<|endoftext|> for each turn. The final
# assistant header is left open so the model generates the assistant's reply.
prompt="where is United States"
system="<|startheader|>system:<|endheader|>You are an AI assistant. You will be given a task. You must generate a detailed and long answer.<|endoftext|>"
input_text = f"{system}<|startheader|>user:<|endheader|>{prompt}<|endoftext|><|startheader|>assistant:<|endheader|>"

In [None]:
# Sample up to 20 new tokens with nucleus (top-p) sampling. The decoded text
# omits special tokens, so the header markers do not appear in the output.
generated_text = generator.generate(input_text, seq_len=20)
print("Generated Text (Top-P):")
print(generated_text)

Generated Text (Top-P):
system:You are an AI assistant. You will be given a task. You must generate a detailed and long answer.user:where is United Statesassistant:useruserusersystemuserusersystemsystemsystemusersystemsystemuserusersystemuseruseruserusersystem


In [None]:
# Sample up to 10 new tokens, drawing each from the 10 most likely candidates.
generated = generate_text_k(model, tokenizer, input_text,device, max_length=10, k=10)
print("Generated text (Top-K):\n", generated)

Generated text (Top-K):
 system:You are an AI assistant. You will be given a task. You must generate a detailed and long answer.user:where is United Statesassistant:usersystemuseruseruseruserusersystemsystemuser


In [None]:
# Generate up to 10 new tokens greedily (always the most likely next token).
generated = generate_text(model, tokenizer, input_text,device, max_length=10)
print("Generated text (Greedy):\n", generated)

Generated text (Greedy):
 system:You are an AI assistant. You will be given a task. You must generate a detailed and long answer.user:where is United Statesassistant:useruseruseruseruseruseruseruseruseruser


In [None]:
# Debug helper: pull one (shuffled) batch from the DataLoader and print its
# serialized dialog text, then stop.
# NOTE(review): `c` is never used after this assignment.
c=0
for i in hermes_instruct:
    # To inspect the token tensors instead, decode i['input_ids'][0] or
    # i['labels'][0] with tokenizer.decode(..., skip_special_tokens=False).
    print("--------------------------------------------------")
    print(i["text"])
    break

--------------------------------------------------


In [None]:
# Round-trip the prompt through the tokenizer, keeping special tokens visible,
# to check how the header and end-of-text markers are encoded.
tokenizer.decode(tokenizer.encode(input_text),skip_special_tokens=False)

In [None]:
# Show which token a single id maps to.
# NOTE(review): 50258 is presumably one of the header tokens added in the tokenizer cell — verify.
tokenizer.decode(50258)