In [9]:
import json
import os
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm  

# Module-level placeholder; it is rebound further down to the list returned by
# process_all_files(files). process_file() builds its own local list of the
# same name and never touches this one.
conversation = []

# Function to process a single file
def process_file(file):
    """Flatten every conversation stored in one JSON export file into a string.

    The file must contain a JSON list of items, each with a "title" and a
    "mapping" of nodes. For every node that carries a non-system message whose
    first content part is a non-empty string, " <Role>: <text>" is appended to
    the conversation title (role capitalized).

    Nodes that cannot contribute text are skipped explicitly instead of being
    hidden behind a blanket ``except``: no message, no author/role, the
    "system" role, missing or empty "parts", or a first part that is not a
    non-empty string (e.g. a dict, whose repr would otherwise end up in the
    text).

    Args:
        file: Path of the JSON file to read (opened as UTF-8).

    Returns:
        list[str]: One flattened string per item, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If an item lacks "title" or "mapping".
    """
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)

    conversation = []
    for item in data:
        # A null title would break the string concatenation below; treat it as empty.
        chat = item["title"] or ""
        for node in item["mapping"].values():
            message = node.get("message")
            if not message:
                continue

            role = (message.get("author") or {}).get("role")
            if not isinstance(role, str) or role == "system":
                continue

            parts = (message.get("content") or {}).get("parts")
            if not isinstance(parts, list) or not parts:
                continue

            text = parts[0]
            # Only the first part is used, and only when it is real text.
            if not isinstance(text, str) or not text:
                continue

            chat += f" {role.capitalize()}: {text}"
        conversation.append(chat)
    return conversation

# Function to combine results from all files
def process_all_files(files):
    """Run process_file over every path in a thread pool and merge the results.

    Args:
        files: Sequence of file paths; its length sizes the progress bar.

    Returns:
        list: The per-file lists concatenated in the order of ``files``
        (executor.map yields results in input order).
    """
    worker_count = os.cpu_count()
    with ThreadPoolExecutor(max_workers=worker_count) as pool:
        pending = pool.map(process_file, files)
        # Draining the iterator inside the `with` drives the progress bar.
        per_file = list(tqdm(pending, total=len(files)))

    # Flatten the list of per-file lists into one list.
    return [chat for file_chats in per_file for chat in file_chats]

# Get the list of all JSON files. os.listdir returns entries in arbitrary order
# and includes everything in the directory, so filter to *.json and sort to keep
# the resulting conversation (and token) order reproducible across runs.
DATASET_DIR = "chatgpt_dataset"
files = sorted(
    os.path.join(DATASET_DIR, name)
    for name in os.listdir(DATASET_DIR)
    if name.lower().endswith(".json")
)

# Process the files and get the conversation data
conversation = process_all_files(files)

len(conversation)

100%|██████████| 3/3 [00:00<?, ?it/s]


1088

In [12]:
# from huggingface_hub import login
from transformers import AutoTokenizer

# Load the pretrained tokenizer identified by "pt-sk/ll-3.2-1B_Instruct".
# The resulting module-level `tokenizer` is what tokenize_text() calls.
tokenizer = AutoTokenizer.from_pretrained(
"pt-sk/ll-3.2-1B_Instruct",
)

In [15]:
import concurrent.futures

# Define the tokenization function
def tokenize_text(text):
    """Encode ``text`` with the notebook-level ``tokenizer`` and return the result."""
    token_ids = tokenizer.encode(text)
    return token_ids

# Tokenize all conversations on a thread pool. executor.map yields results in
# input order, so tokenized_texts[i] is the encoding of conversation[i].
with concurrent.futures.ThreadPoolExecutor() as executor:
    tokenized_texts = [
        token_ids for token_ids in executor.map(tokenize_text, conversation)
    ]

In [16]:
# Sanity check: there should be one token list per conversation string.
len(tokenized_texts)

1088

In [17]:
from itertools import chain

# Concatenate the per-conversation token lists into one flat token stream.
tokens = [token_id for token_ids in tokenized_texts for token_id in token_ids]

len(tokens)

1902932

In [20]:
import numpy as np

# Save the flat token stream to a numpy file.
# The dtype is pinned because np.array() on Python ints otherwise uses the
# platform default integer (int32 on Windows with NumPy < 2, int64 elsewhere),
# which would make the on-disk file differ between machines.
file = np.array(tokens, dtype=np.int32)
np.save("conversation_tokens.npy", file)

In [25]:
# The file holds a plain integer array, so pickle support is not needed;
# keeping allow_pickle=False avoids unpickling arbitrary objects on load.
file_loaded = np.load("conversation_tokens.npy", allow_pickle=False)
file_loaded[1:10]

array([ 4178,    44,  5075, 31754, 40283,  5468,    51, 22312,  2724])

In [26]:
# Check that the reloaded array has the same number of tokens as was saved.
len(file_loaded)

1902932

In [None]:
# file_name = "pt-sk/chatgpt-dataset"

# from huggingface_hub import HfApi, login
# login(token)

# api = HfApi()
# api.upload_file(
#     path_or_fileobj="conversation_tokens.npy",
#     path_in_repo="conversation_tokens.npy",
#     repo_id=file_name,
#     repo_type="dataset",
# )

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\sathi\.cache\huggingface\token
Login successful


conversation_tokens.npy: 100%|██████████| 7.61M/7.61M [00:04<00:00, 1.89MB/s]


CommitInfo(commit_url='https://huggingface.co/datasets/pt-sk/chatgpt-dataset/commit/057c30e74f49ca0714bdc8cba83de6a42b058c0b', commit_message='Upload conversation_tokens.npy with huggingface_hub', commit_description='', oid='057c30e74f49ca0714bdc8cba83de6a42b058c0b', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
from huggingface_hub import hf_hub_download
# Fetch the token file from the dataset repo into the current directory.
hf_hub_download(
    repo_id="pt-sk/chatgpt-dataset",
    filename="conversation_tokens.npy",
    repo_type="dataset",
    local_dir=".",
)

'conversation_tokens.npy'

In [31]:
# This file was downloaded from the network: never enable pickle for it.
# A plain integer array loads fine with allow_pickle=False, and that setting
# prevents arbitrary code execution from a tampered file.
file_loaded = np.load("conversation_tokens.npy", allow_pickle=False)
file_loaded[1:10], len(file_loaded)

(array([ 4178,    44,  5075, 31754, 40283,  5468,    51, 22312,  2724]),
 1902932)

In [32]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader

class TokenDataset(Dataset):
    """Map-style dataset that cuts a flat token sequence into fixed-size blocks.

    Item ``i`` is the pair ``(x, y)`` where ``x`` is tokens
    ``[i * block_size, (i + 1) * block_size)`` and ``y`` is the same window
    shifted one position to the right, i.e. the next-token targets for ``x``.

    Args:
        input_ids: Sliceable sequence of integer token ids (list, array, ...).
        block_size: Number of tokens per item. Defaults to 1024 * 8.

    Raises:
        ValueError: If ``block_size`` is not positive.
    """

    def __init__(self, input_ids, block_size=1024 * 8):
        if block_size <= 0:
            raise ValueError("block_size must be a positive integer")
        self.input_ids = input_ids
        self.block_size = block_size

    def __len__(self):
        # One token is held back so the shifted target window of the last block
        # is always full. max() keeps the length non-negative for empty input,
        # where (0 - 1) // block_size would otherwise be -1.
        return max(0, (len(self.input_ids) - 1) // self.block_size)

    def __getitem__(self, idx):
        n_blocks = len(self)
        if idx < 0:
            # Support negative indices like built-in sequences do.
            idx += n_blocks
        if not 0 <= idx < n_blocks:
            # Without this, out-of-range indices silently return empty or short
            # tensors and plain `for item in dataset` never terminates.
            raise IndexError("TokenDataset index out of range")

        start_idx = idx * self.block_size
        end_idx = start_idx + self.block_size
        x = self.input_ids[start_idx:end_idx]
        y = self.input_ids[start_idx + 1:end_idx + 1]

        return torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.long)

In [35]:
# Wrap the in-memory token stream and serve one shuffled block per batch.
dataset = TokenDataset(tokens)
dataloader = DataLoader(
    dataset,
    batch_size=1,
    shuffle=True,
    drop_last=True,
)

In [36]:
# Number of batches the loader yields per pass over the dataset.
len(dataloader)

232

In [None]:
# import json

# # Load the JSON file
# with open("chatgpt_dataset/conversations_3.json", "r", encoding="utf-8") as file:
#     data = json.load(file)

# for x, y in data[3]["mapping"].items():
#     if y["message"] is not None:
#         print(y["message"]["content"]["parts"])

# for x, y in data[3]["mapping"].items():
#     if y["message"] is not None and y["message"]["author"]["role"] != "system":
#         print(data[3]["title"])
#         # make first letter of role uppercase
#         print(y["message"]["author"]["role"].capitalize())
#         print(y["message"]["content"]["parts"])


# import json
# import os
# import multiprocessing as mp
# from tqdm import tqdm  # Optional: to display progress bar

# # Function to process a single file
# def process_file(file):
#     local_conversation = []  # Local list to store conversation for this file
#     with open(file, "r", encoding="utf-8") as f:
#         data = json.load(f)
        
#         for item in data:
#             chat = item["title"]
#             for x, y in item["mapping"].items():
#                 if y["message"] is not None and y["message"]["author"]["role"] != "system":
#                     try:
#                         chat += f" {y['message']['author']['role'].capitalize()}: {y['message']['content']['parts'][0]}"
#                     except Exception as e:
#                         pass
#             local_conversation.append(chat)
#     return local_conversation

# # Function to combine results from all files
# def process_all_files(files):
#     # Use a Pool of workers to process files in parallel
#     with mp.Pool(processes=os.cpu_count()) as pool:
#         # Using `tqdm` to track progress
#         results = list(tqdm(pool.imap(process_file, files), total=len(files)))
    
#     # Combine results into a single conversation list
#     all_conversations = []
#     for result in results:
#         all_conversations.extend(result)
    
#     return all_conversations

# # List of files to process
# files = ["chatgpt_dataset/conversations_1.json", "chatgpt_dataset/conversations_2.json", "chatgpt_dataset/conversations_3.json"]

# # Process the files and get the conversation data
# conversation = process_all_files(files)