In [10]:
import sys
import os

from DistributedSim.dataset.build_dataset import *
from DistributedSim.dataset.dataset import *

import numpy as np
import boto3
from tqdm import tqdm
import io

In [None]:
# Build the tokenized 'owt' dataset; build_dataset returns a (data, vocab size) pair.
# NOTE(review): the recorded log suggests end_pc is a fraction of all records
# (0.01 -> 80137 of 8013769) — confirm against build_dataset.
# NOTE(review): data_file is later passed to an uploader that reads `.shape`, so it
# is presumably an in-memory array rather than a file path — verify.
data_file, vocab_size = build_dataset('owt', end_pc=0.01)

Loading dataset: owt (GPT2 tokenization) start%: 0.0 end%: 0.01
Total records to import: 8013769
Using 80137 records: 0 to 80137
Dataset size: (54523, 1024)


In [6]:
def upload_numpy_array_in_chunks(data, bucket_name, object_name_prefix, chunk_size=1000):
    """
    Split a numpy array along its first axis and upload each piece to S3.

    Every chunk is serialized with ``np.save`` into an in-memory buffer and
    uploaded under the key ``{object_name_prefix}/chunk_{i}.npy``, where ``i``
    counts from 0. Only non-empty chunks are uploaded, so an empty array
    results in no uploads at all.

    :param data: Numpy array to upload; it is sliced along axis 0
    :param bucket_name: Name of the S3 bucket
    :param object_name_prefix: Key prefix for the uploaded objects
    :param chunk_size: Maximum number of rows (axis-0 entries) per chunk
    :return: List of uploaded object keys in upload order, or ``False`` as soon
        as any chunk fails to serialize or upload (chunks uploaded before the
        failure are not rolled back)
    :raises ValueError: If ``chunk_size`` is less than 1
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be at least 1, got {chunk_size}")

    # Initialize S3 client
    s3_client = boto3.client('s3')

    num_examples = len(data)

    # Ceiling division: a plain `num_examples // chunk_size + 1` would add an
    # empty trailing chunk whenever num_examples is a multiple of chunk_size.
    num_chunks = (num_examples + chunk_size - 1) // chunk_size

    uploaded_chunks = []

    for i in tqdm(range(num_chunks), desc="Uploading chunks", unit="chunk"):
        # Numpy slicing clamps the stop index, so the last chunk may be shorter.
        start_idx = i * chunk_size
        chunk = data[start_idx:start_idx + chunk_size]

        # Create a name for this chunk
        chunk_name = f"{object_name_prefix}/chunk_{i}.npy"

        try:
            # Serialize in memory so nothing is written to local disk.
            buffer = io.BytesIO()
            np.save(buffer, chunk)
            # Rewind so the upload reads from the start of the buffer.
            buffer.seek(0)

            # Upload to S3
            s3_client.upload_fileobj(buffer, bucket_name, chunk_name)
        except Exception as e:
            # Best-effort contract kept for existing callers: report and signal
            # failure with False instead of propagating the exception.
            print(f"Error uploading chunk {i}: {e}")
            return False

        uploaded_chunks.append(chunk_name)

    print(f"Successfully uploaded {len(uploaded_chunks)} chunks to {bucket_name} with prefix {object_name_prefix}")
    return uploaded_chunks

# Upload destination: change the bucket to one you can write to.
bucket_name = 'exo-datasets'
# Every chunk object is stored under this key prefix.
object_name_prefix = 'owt'

# Push the dataset built earlier in this notebook to S3, chunk by chunk.
uploaded_chunks = upload_numpy_array_in_chunks(
    data=data_file,
    bucket_name=bucket_name,
    object_name_prefix=object_name_prefix,
)

Uploading chunks:   0%|          | 0/55 [00:02<?, ?chunk/s]


KeyboardInterrupt: 

In [11]:
# Load the 'owt' dataset through the project's get_dataset helper.
# NOTE(review): the positional arguments 0 and 0.1 are presumably the start and end
# fractions of the dataset to load — confirm against get_dataset's signature.
dataset = get_dataset('owt', 0, 0.1)

[0 1 2 3 4]


In [14]:
# Show the shape of the first entry of `dataset`.
dataset[0].shape

(5000, 1024)

In [4]:
# Display the first entry of `dataset`.
# NOTE(review): the recorded output here is a pair of tensors, while the recorded
# `dataset[0].shape` output is a single shape tuple; the execution counts are also
# out of order, so these cells were likely run against different `dataset`
# objects — re-run the notebook top to bottom to confirm.
dataset[0]

(tensor([13924,    12,   559,  ...,  4327,   284,  3236]),
 tensor([  12,  559,   12,  ...,  284, 3236, 3146]))