In [9]:
import json
import os
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm  

# Placeholder binding; this name is rebound further down to the list returned by process_all_files.
conversation = []

# Function to process a single file
def process_file(file):
    """
    Flattens every conversation of one ChatGPT export file into a single string.

    Args:
        file: Path to a UTF-8 JSON file containing a list of conversation
            objects, each with a "title" and a "mapping" of node id -> node.

    Returns:
        A list with one string per conversation: the title followed by
        " <Role>: <text>" for every non-system message whose first content
        part is a non-empty string.
    """
    # Parse first so the file handle is released before the flattening loop.
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)

    flattened = []
    for item in data:
        # A null title used to make every later concatenation fail silently and
        # left `None` in the result; treat a missing title as an empty string.
        title = item["title"]
        pieces = [title if isinstance(title, str) else ""]

        for node in item["mapping"].values():
            message = node["message"]
            if message is None or message["author"]["role"] == "system":
                continue
            try:
                speaker = message["author"]["role"].capitalize()
                first_part = message["content"]["parts"][0]
            except (AttributeError, KeyError, IndexError, TypeError):
                # Message without a usable role or without any content parts.
                continue
            # Keep non-empty text only. Non-string parts (dict-valued parts were
            # previously pasted in as their Python repr) are skipped.
            if isinstance(first_part, str) and first_part:
                pieces.append(f"{speaker}: {first_part}")

        flattened.append(" ".join(pieces))
    return flattened

# Function to combine results from all files
def process_all_files(files):
    """
    Runs process_file over every path on a thread pool and merges the outputs.

    Args:
        files: Sequence of file paths; each one is handed to process_file.

    Returns:
        One flat list containing the entries of every per-file result,
        concatenated in the order of `files`.
    """
    with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:
        # map() yields results in input order; tqdm only adds the progress bar.
        per_file = list(tqdm(pool.map(process_file, files), total=len(files)))

    # Flatten the per-file lists into one conversation list
    return [entry for file_result in per_file for entry in file_result]

# Get the list of all export files.
# os.listdir returns entries in arbitrary order and also lists non-data entries
# (sub-directories, checkpoints, ...), so keep only *.json files and sort them
# to make the resulting conversation order reproducible.
files = sorted(
    os.path.join("chatgpt_dataset", file)
    for file in os.listdir("chatgpt_dataset")
    if file.endswith(".json")
)

# Process the files and get the conversation data
conversation = process_all_files(files)

len(conversation)

100%|██████████| 3/3 [00:00<?, ?it/s]


1088

In [12]:
# from huggingface_hub import login
from transformers import AutoTokenizer

# Load the tokenizer published under this Hugging Face repo id
tokenizer = AutoTokenizer.from_pretrained("pt-sk/ll-3.2-1B_Instruct")

In [15]:
import concurrent.futures

# Tokenization helper, applied to one conversation string at a time
def tokenize_text(text):
    """Encodes `text` with the module-level `tokenizer` and returns what `tokenizer.encode` produces."""
    token_ids = tokenizer.encode(text)
    return token_ids

# Tokenize all conversations on a thread pool (default worker count)
with concurrent.futures.ThreadPoolExecutor() as executor:
    # executor.map applies tokenize_text to each entry of `conversation` and
    # yields the results in input order; list() drains it before the pool closes.
    tokenized_texts = list(executor.map(tokenize_text, conversation))

In [16]:
# Sanity check: one token sequence per conversation, so this should equal len(conversation).
len(tokenized_texts)

1088

In [17]:
from itertools import chain

# Concatenate the per-conversation id sequences, as returned by the tokenizer,
# into one continuous token stream
tokens = [token_id for ids in tokenized_texts for token_id in ids]

len(tokens)

1902932

In [8]:
import numpy as np

# Save the tokenized texts to a numpy file
# file = np.array(tokens)
# np.save("conversation_tokens.npy", file)

In [9]:
# The saved tokens form a plain integer array, so pickle support is not needed.
# Keeping allow_pickle off prevents a tampered .npy file from executing code on load.
file_loaded = np.load("conversation_tokens.npy", allow_pickle=False)
file_loaded[1:10]

array([ 4178,    44,  5075, 31754, 40283,  5468,    51, 22312,  2724])

In [10]:
# Should match the length of the flat token list computed before saving.
len(file_loaded)

1902932

In [None]:
# file_name = "pt-sk/chatgpt-dataset"

# from huggingface_hub import HfApi, login
# login(token)

# api = HfApi()
# api.upload_file(
#     path_or_fileobj="conversation_tokens.npy",
#     path_in_repo="conversation_tokens.npy",
#     repo_id=file_name,
#     repo_type="dataset",
# )

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\sathi\.cache\huggingface\token
Login successful


conversation_tokens.npy: 100%|██████████| 7.61M/7.61M [00:04<00:00, 1.89MB/s]


CommitInfo(commit_url='https://huggingface.co/datasets/pt-sk/chatgpt-dataset/commit/057c30e74f49ca0714bdc8cba83de6a42b058c0b', commit_message='Upload conversation_tokens.npy with huggingface_hub', commit_description='', oid='057c30e74f49ca0714bdc8cba83de6a42b058c0b', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
from huggingface_hub import hf_hub_download
# Fetch the token array from the dataset repo into the working directory
hf_hub_download(
    repo_id="pt-sk/chatgpt-dataset",
    filename="conversation_tokens.npy",
    repo_type="dataset",
    local_dir=".",
)

'conversation_tokens.npy'

In [31]:
# The file comes from a remote repo and holds a plain integer array, so load it
# without pickle support: a tampered .npy file then cannot execute code on load.
file_loaded = np.load("conversation_tokens.npy", allow_pickle=False)
file_loaded[1:10], len(file_loaded)

(array([ 4178,    44,  5075, 31754, 40283,  5468,    51, 22312,  2724]),
 1902932)

In [11]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader

class TokenDataset(Dataset):
    """
    Map-style dataset that cuts a flat token sequence into fixed-size blocks.

    Item `i` is the pair `(x, y)` where `x` covers tokens
    `[i * block_size, (i + 1) * block_size)` and `y` is the same window shifted
    one position to the right (the next-token targets).
    """

    def __init__(self, input_ids, block_size: int = 1024 * 8):
        """
        Args:
            input_ids: Sliceable sequence of integer token ids (for example a
                list or a 1-D NumPy array).
            block_size: Number of tokens per block. Defaults to the value that
                used to be hard-coded (8192).

        Raises:
            ValueError: If `block_size` is not positive.
        """
        if block_size < 1:
            raise ValueError(f"block_size must be positive, got {block_size}")
        self.input_ids = input_ids
        self.block_size = block_size

    def __len__(self):
        """Number of complete blocks; one trailing token is reserved for the last target."""
        # max() keeps len() valid for an empty sequence, where (0 - 1) // block_size is -1.
        return max(0, (len(self.input_ids) - 1) // self.block_size)

    def __getitem__(self, idx):
        """
        Returns block `idx` as two int64 tensors of length `block_size`.

        Raises:
            IndexError: If `idx` is outside `[-len(self), len(self))`. Without
                this check plain iteration (`for x, y in dataset`) would never
                stop and out-of-range indices would return short tensors.
        """
        num_blocks = len(self)
        position = idx + num_blocks if idx < 0 else idx
        if not 0 <= position < num_blocks:
            raise IndexError(f"block index {idx} out of range for {num_blocks} blocks")

        start_idx = position * self.block_size
        end_idx = start_idx + self.block_size
        x = self.input_ids[start_idx:end_idx]
        y = self.input_ids[start_idx + 1:end_idx + 1]

        # torch.tensor copies and converts to int64 whatever the source dtype is.
        return torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.long)

In [13]:
# Wrap the loaded token array in block-sized (input, target) samples
dataset = TokenDataset(file_loaded)
# One shuffled block per step
dataloader = DataLoader(dataset, batch_size=1, shuffle=True, drop_last=True)

In [15]:
# Number of batches per epoch; with batch_size=1 this equals the number of blocks in `dataset`.
len(dataloader)

232

In [16]:
from dataclasses import dataclass
import torch
import torch.nn as nn
import torch.nn.functional as F
import gin
from typing import Optional
from functions import precompute_freqs_cis, apply_rotary_emb, repeat_kv

# @gin.configurable
@dataclass
class ModelArgs:
    """Hyperparameters consumed by the model classes defined below."""
    # Width of the token embeddings and of every block's input/output.
    dim: int = 128
    # Number of TransformerBlock layers stacked in the model.
    n_layers: int = 2
    # Number of query heads; each head has dim // n_heads features.
    n_heads: int = 4
    # Number of key/value heads; n_heads // n_kv_heads is passed to repeat_kv as the repeat count.
    n_kv_heads: int = 2
    # Rows of the embedding table and output features of the final projection.
    vocab_size: int = 128256
    # The feed-forward hidden width is rounded up to a multiple of this value.
    multiple_of: int = 64
    # Extra scale applied to the feed-forward hidden width before rounding.
    ffn_dim_multiplier: float = 1.5
    # Epsilon added to the mean square inside RMSNorm.
    norm_eps: float = 1e-6
    # Forwarded to precompute_freqs_cis (presumably the rotary base frequency; confirm in functions.py).
    rope_theta: float = 500000.0
    # Transformer.forward asserts that the batch size does not exceed this.
    max_batch_size: int = 1
    # Transformer.forward asserts that the sequence length does not exceed this;
    # also the length passed to precompute_freqs_cis.
    max_seq_len: int = 1024 * 8
    # Passed as dropout_p to scaled_dot_product_attention.
    attn_dropout: float = 0.0


class RMSNorm(torch.nn.Module):
    """Root-mean-square normalisation over the last axis with a learnable gain."""

    def __init__(self, dim: int, eps: float = 1e-6):
        """
        Builds the layer.

        Args:
            dim: Size of the last (feature) axis of the inputs.
            eps: Small constant added to the mean square before taking the
                reciprocal square root, guarding against division by zero.
        """
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x):
        """
        Scales `x` by the reciprocal of its root mean square along the last axis.

        Args:
            x: The tensor to normalise.

        Returns:
            A tensor of the same shape as `x`.
        """
        mean_square = x.pow(2).mean(-1, keepdim=True)
        inv_rms = torch.rsqrt(mean_square + self.eps)
        return x * inv_rms

    def forward(self, x):
        """
        Normalises `x` and multiplies the result by the learnable weight.

        Args:
            x: The input tensor.

        Returns:
            The normalised tensor scaled by `self.weight`.
        """
        # Normalise in float32, cast back to the input dtype, then apply the gain.
        normed = self._norm(x.float())
        return normed.type_as(x) * self.weight
    

class Attention(nn.Module):
    def __init__(self, args: ModelArgs):
        """
        Initializes the Attention module.

        Args:
            args: An instance of ModelArgs containing configuration parameters such as
                dimensions, number of heads, and attention dropout.

        Attributes:
            n_heads: The number of query heads.
            n_kv_heads: The number of key/value heads (falls back to n_heads when
                args.n_kv_heads is None).
            n_rep: n_heads // n_kv_heads, the repeat count handed to repeat_kv.
            head_dim: The dimension of each attention head.
            wq, wk, wv, wo: Bias-free linear layers for queries, keys, values, and output.
        """
        super().__init__()
        self.args = args
        self.n_heads = args.n_heads
        self.n_kv_heads = args.n_heads if args.n_kv_heads is None else args.n_kv_heads
        self.n_rep = args.n_heads // self.n_kv_heads
        self.head_dim = args.dim // args.n_heads

        # linear layers for queries, keys, and values
        self.wq = nn.Linear(args.dim, args.n_heads * self.head_dim, bias=False)
        self.wk = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False)
        self.wv = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False)
        self.wo = nn.Linear(args.n_heads * self.head_dim, args.dim, bias=False)

    def forward(self, x: torch.Tensor, freqs_cis: torch.Tensor):
        """
        Computes causal self-attention over `x`.

        Args:
            x: The input tensor of shape (batch, sequence, dim).
            freqs_cis: The precomputed frequencies handed to apply_rotary_emb.

        Returns:
            The attention output, with the same shape as `x`.
        """
        bsz, seqlen, _ = x.shape

        # linear projections for queries, keys, and values
        xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)

        # split the feature axis into (heads, head_dim)
        xq = xq.view(bsz, seqlen, self.n_heads, self.head_dim)
        xk = xk.view(bsz, seqlen, self.n_kv_heads, self.head_dim)
        xv = xv.view(bsz, seqlen, self.n_kv_heads, self.head_dim)

        # apply rotary embeddings to queries and keys
        xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis)

        # move heads in front of the sequence axis; k/v go through repeat_kv first
        # (presumably expanding them to n_heads heads; see functions.py)
        xq = xq.transpose(1, 2)
        xk = repeat_kv(xk, self.n_rep).transpose(1, 2)
        xv = repeat_kv(xv, self.n_rep).transpose(1, 2)

        # scaled_dot_product_attention applies dropout whenever dropout_p > 0,
        # regardless of the module's mode, so switch it off outside training.
        dropout_p = self.args.attn_dropout if self.training else 0.0
        y = F.scaled_dot_product_attention(xq, xk, xv, is_causal=True, dropout_p=dropout_p)
        y = y.transpose(1, 2).contiguous().view(bsz, seqlen, self.n_heads * self.head_dim)

        # output projection
        return self.wo(y)


class FeedForward(nn.Module):
    """Gated feed-forward block computing w2(silu(w1(x)) * w3(x))."""

    def __init__(
        self,
        dim: int,
        hidden_dim: int,
        multiple_of: int,
        ffn_dim_multiplier: Optional[float],
    ):
        """
        Builds the three bias-free projections.

        Args:
            dim: Input and output feature size.
            hidden_dim: Requested hidden size; the width actually used is two
                thirds of it, optionally scaled, then rounded up.
            multiple_of: The hidden width is rounded up to a multiple of this.
            ffn_dim_multiplier: Optional factor applied to the hidden width
                before rounding; skipped when None.
        """
        super().__init__()

        width = int(2 * hidden_dim / 3)
        if ffn_dim_multiplier is not None:
            # custom dim factor multiplier
            width = int(ffn_dim_multiplier * width)
        # round up to the next multiple of `multiple_of`
        n_chunks = (width + multiple_of - 1) // multiple_of
        width = multiple_of * n_chunks

        self.w1 = nn.Linear(dim, width, bias=False)
        self.w2 = nn.Linear(width, dim, bias=False)
        self.w3 = nn.Linear(dim, width, bias=False)

    def forward(self, x):
        """
        Applies the gated feed-forward transformation.

        The SiLU-activated projection `w1(x)` gates the linear projection
        `w3(x)` element-wise, and `w2` maps the product back to `dim` features.

        Args:
            x: The input tensor.

        Returns:
            The transformed tensor.
        """
        gate = F.silu(self.w1(x))
        value = self.w3(x)
        return self.w2(gate * value)


class TransformerBlock(nn.Module):
    """One pre-norm layer: attention and feed-forward, each wrapped in a residual connection."""

    def __init__(self, args: ModelArgs):
        """
        Builds the sub-modules of the block.

        Args:
            args: ModelArgs instance supplying the width, head count,
                feed-forward sizing options and normalisation epsilon.

        Attributes:
            n_heads: The number of attention heads.
            dim: The input dimension.
            head_dim: The dimension of each attention head.
            attention: The attention module.
            feed_forward: The feed-forward module (requested hidden size 4 * dim).
            attention_norm: RMSNorm applied to the attention input.
            ffn_norm: RMSNorm applied to the feed-forward input.
        """
        super().__init__()
        self.n_heads = args.n_heads
        self.dim = args.dim
        self.head_dim = args.dim // args.n_heads

        self.attention = Attention(args)
        self.feed_forward = FeedForward(
            dim=args.dim,
            hidden_dim=4 * args.dim,
            multiple_of=args.multiple_of,
            ffn_dim_multiplier=args.ffn_dim_multiplier,
        )

        self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
        self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)

    def forward(self, x: torch.Tensor, freqs_cis: torch.Tensor):
        """
        Runs the block on `x`.

        Args:
            x: The input tensor.
            freqs_cis: The precomputed frequencies forwarded to the attention module.

        Returns:
            The block output: `x` plus the attention branch plus the
            feed-forward branch.
        """
        # attention branch on the normalised input, added back to the residual
        attn_in = self.attention_norm(x)
        h = x + self.attention(attn_in, freqs_cis)

        # feed-forward branch on the normalised intermediate, added back again
        ffn_in = self.ffn_norm(h)
        return h + self.feed_forward(ffn_in)



class Transformer(nn.Module):
    def __init__(self, params: ModelArgs) -> None:
        """
        Initializes the Transformer model.

        Args:
            params: An instance of ModelArgs containing configuration parameters such as
                dimensions, number of layers, number of heads, vocabulary size, and other
                hyperparameters.

        Attributes:
            params: Stores the configuration parameters.
            vocab_size: The size of the vocabulary.
            n_layers: The number of transformer layers.
            tok_embeddings: The token embedding layer.
            layers: A ModuleList of TransformerBlock layers.
            norm: An RMSNorm layer applied before the output projection.
            output: A bias-free linear layer producing the logits.
            freqs_cis: Result of precompute_freqs_cis, kept as a plain attribute
                (not a registered buffer).
        """
        super().__init__()
        self.params = params
        self.vocab_size = params.vocab_size
        self.n_layers = params.n_layers

        self.tok_embeddings = nn.Embedding(self.vocab_size, params.dim)

        self.layers = torch.nn.ModuleList()
        for _ in range(params.n_layers):
            self.layers.append(TransformerBlock(params))

        self.norm = RMSNorm(params.dim, eps=params.norm_eps)
        self.output = nn.Linear(params.dim, params.vocab_size, bias=False)

        self.freqs_cis = precompute_freqs_cis(
            params.dim // params.n_heads,
            params.max_seq_len,  # an earlier version passed max_seq_len * 2 here
            params.rope_theta,
        )

        # tie the weights: the embedding table reuses the output layer's parameter
        self.tok_embeddings.weight = self.output.weight

    def forward(self, tokens: torch.Tensor, target: Optional[torch.Tensor] = None):
        """Computes the output of the model.

        Given an input tensor `tokens` of shape `(B, T)`, where `B` is the batch size and
        `T` is the sequence length, applies the model to produce an output tensor of shape
        `(B, T, V)`, where `V` is the vocabulary size.

        If `target` is provided, computes the cross-entropy loss between the output and the
        target.

        Args:
            tokens: The input token ids.
            target: Optional target token ids with the same shape as `tokens`.

        Returns:
            A tuple `(output, loss)`. `loss` is `None` when `target` is not provided.

        Raises:
            AssertionError: If `T` exceeds `max_seq_len` or `B` exceeds `max_batch_size`.
        """
        B, T = tokens.shape
        assert T <= self.params.max_seq_len, f"Sequence length {T} exceeds maximum sequence length {self.params.max_seq_len}"
        assert B <= self.params.max_batch_size, f"Batch size {B} exceeds maximum batch size {self.params.max_batch_size}"

        h = self.tok_embeddings(tokens)
        # freqs_cis is not a buffer, so module.to(device) does not move it; move it
        # lazily and cache the moved copy.
        self.freqs_cis = self.freqs_cis.to(h.device)
        # Hand the layers only the first T entries so that inputs shorter than
        # max_seq_len get a table matching their length.
        # NOTE(review): assumes the leading axis of freqs_cis indexes positions;
        # confirm against precompute_freqs_cis / apply_rotary_emb.
        freqs_cis = self.freqs_cis[:T]

        for layer in self.layers:
            h = layer(h, freqs_cis)

        h = self.norm(h)
        output = self.output(h)

        loss = None
        if target is not None:
            # flatten (B, T, V) logits and (B, T) targets to per-token rows
            loss = F.cross_entropy(output.view(-1, output.size(-1)), target.view(-1))

        return output, loss

In [17]:
# Default hyperparameters (see ModelArgs)
params = ModelArgs()

# Instantiate the model from those defaults
model = Transformer(params)

In [18]:
# Pull a single (input, target) batch for a smoke test.
# Note: this rebinds `tokens`, which earlier in the notebook held the flat list of token ids.
item = next(iter(dataloader))
tokens, target = item
tokens.shape, target.shape

(torch.Size([1, 8192]), torch.Size([1, 8192]))

In [19]:
# Forward pass with targets, so the model returns the cross-entropy loss alongside the logits.
output, loss = model(tokens, target)
output.shape, loss

(torch.Size([1, 8192, 128256]), tensor(11.9098, grad_fn=<NllLossBackward0>))

In [23]:
import math
# Cross-entropy of a uniform prediction over 128256 classes (the default vocab_size);
# a reference point for the loss of the untrained model above.
-math.log(1/128256)

11.761783545564427

In [None]:
import torch
from torch.utils.data import IterableDataset
from model import ModelArgs

class TokenDataset(IterableDataset):
    """
    Iterable dataset that streams fixed-size (input, target) blocks from a text
    file of whitespace-separated integer token ids.
    """

    def __init__(self, model_args: "ModelArgs", input_file: str):
        """
        Initializes the TokenDataset.

        Args:
            model_args: An instance of ModelArgs; its `max_seq_len` is used as
                the block size.
            input_file: Path to the text file containing the token ids.

        Raises:
            ValueError: If `model_args.max_seq_len` is not positive.
        """
        self.model_args = model_args
        self.block_size = model_args.max_seq_len
        self.input_file = input_file
        if self.block_size < 1:
            raise ValueError(f"max_seq_len must be positive, got {self.block_size}")

    def __len__(self):
        """
        Returns the number of blocks `__iter__` yields.

        The token count is obtained by scanning the file once per call, without
        keeping the tokens in memory.
        """
        with open(self.input_file, 'r') as file:
            total_tokens = sum(len(line.split()) for line in file)
        # One extra token is needed for the last target; max() covers an empty file.
        return max(0, (total_tokens - 1) // self.block_size)

    def __iter__(self):
        """
        Reads the file line by line and yields token blocks as they fill up.

        Only the tokens not yet emitted are buffered, so memory use is bounded
        by the longest line plus one block rather than by the whole file.

        Yields:
            Tuple of tensors: `(x, y)`, each of length `block_size`, where `y`
            is `x` shifted by one token. Consecutive blocks start `block_size`
            tokens apart; a trailing partial block is dropped.
        """
        block = self.block_size
        pending = []
        # The file stays open while the generator is suspended between yields.
        with open(self.input_file, 'r') as file:
            for line in file:
                # Each line is a whitespace-separated sequence of integer ids
                pending.extend(map(int, line.split()))

                start = 0
                # A block needs block + 1 tokens: `block` inputs plus the final target.
                while len(pending) - start > block:
                    x = pending[start:start + block]
                    y = pending[start + 1:start + block + 1]
                    yield torch.LongTensor(x), torch.LongTensor(y)
                    start += block

                # Drop emitted tokens once per line instead of once per block.
                if start:
                    del pending[:start]


In [None]:
# import json

# # Load the JSON file
# with open("chatgpt_dataset/conversations_3.json", "r", encoding="utf-8") as file:
#     data = json.load(file)

# for x, y in data[3]["mapping"].items():
#     if y["message"] is not None:
#         print(y["message"]["content"]["parts"])

# for x, y in data[3]["mapping"].items():
#     if y["message"] is not None and y["message"]["author"]["role"] != "system":
#         print(data[3]["title"])
#         # make first letter of role uppercase
#         print(y["message"]["author"]["role"].capitalize())
#         print(y["message"]["content"]["parts"])


# import json
# import os
# import multiprocessing as mp
# from tqdm import tqdm  # Optional: to display progress bar

# # Function to process a single file
# def process_file(file):
#     local_conversation = []  # Local list to store conversation for this file
#     with open(file, "r", encoding="utf-8") as f:
#         data = json.load(f)
        
#         for item in data:
#             chat = item["title"]
#             for x, y in item["mapping"].items():
#                 if y["message"] is not None and y["message"]["author"]["role"] != "system":
#                     try:
#                         chat += f" {y['message']['author']['role'].capitalize()}: {y['message']['content']['parts'][0]}"
#                     except Exception as e:
#                         pass
#             local_conversation.append(chat)
#     return local_conversation

# # Function to combine results from all files
# def process_all_files(files):
#     # Use a Pool of workers to process files in parallel
#     with mp.Pool(processes=os.cpu_count()) as pool:
#         # Using `tqdm` to track progress
#         results = list(tqdm(pool.imap(process_file, files), total=len(files)))
    
#     # Combine results into a single conversation list
#     all_conversations = []
#     for result in results:
#         all_conversations.extend(result)
    
#     return all_conversations

# # List of files to process
# files = ["chatgpt_dataset/conversations_1.json", "chatgpt_dataset/conversations_2.json", "chatgpt_dataset/conversations_3.json"]

# # Process the files and get the conversation data
# conversation = process_all_files(files)