In [None]:
import os
import lzma
import tarfile
import tqdm
import pyarrow as pa
import pyarrow.parquet as pq
import shutil
import pandas as pd

In [None]:


# Purge everything in the dataset directory that is not an .xz archive.
# NOTE: this is destructive -- any previously generated output living in
# "openwebtext" (files or whole sub-directories) is removed as well.
leftovers = [entry for entry in os.listdir("openwebtext") if not entry.endswith(".xz")]
for entry in leftovers:
    target = os.path.join("openwebtext", entry)
    # Directories need a recursive delete; anything else is unlinked directly.
    remove = shutil.rmtree if os.path.isdir(target) else os.remove
    remove(target)

In [None]:


# Directory that holds the downloaded .xz archives; the parquet output is
# written into this same directory.
root = "openwebtext"
# os.listdir() returns entries in arbitrary order, so the archive names are
# sorted to make the row order of the resulting parquet file reproducible.
xz_files = sorted(f for f in os.listdir(root) if f.endswith(".xz"))

# Parquet writer ----------------------------------------------------------------
parquet_path = os.path.join(root, "openwebtext.parquet")
# Two string columns per document.
schema = pa.schema([
    ("source", pa.string()),   # filename inside archive or any ID you like
    ("text",   pa.string())    # the document itself
])
# Opened here and kept open for the whole extraction; it must be closed once
# the last batch has been written.
writer = pq.ParquetWriter(parquet_path, schema, compression="zstd")

# Helper for batching -----------------------------------------------------------
# Number of documents accumulated in memory before they are written out.
BATCH_SIZE = 1_000
# Column-oriented staging area; its keys match the schema's field names.
buffer = {"source": [], "text": []}

def flush():
    """Persist the pending rows held in ``buffer`` and empty it.

    Builds a table from the module-level ``buffer`` using ``schema`` and hands
    it to ``writer``. Does nothing when no rows are pending. The buffer lists
    are cleared in place, so existing references to them stay valid.
    """
    if not buffer["text"]:
        # Nothing buffered -- avoid writing an empty table.
        return

    writer.write_table(pa.Table.from_pydict(buffer, schema=schema))
    for column in ("source", "text"):
        buffer[column].clear()

# Main extraction loop ----------------------------------------------------------
# The writer is closed in a ``finally`` block so the file handle is released and
# the parquet file is finalized even if an archive is corrupt or the run is
# interrupted. On failure the file only contains the batches written so far.
try:
    for xz_file in tqdm.tqdm(xz_files, desc="xz archives"):
        xz_path = os.path.join(root, xz_file)

        # 1. stream-decompress the .xz
        with lzma.open(xz_path) as lzma_file:
            # 2. open the tar in pure streaming mode ("r|") and walk it once.
            #    Calling getmembers() first would scan the whole archive and
            #    then seek backwards for every extractfile(); backward seeks on
            #    an LZMA stream are emulated by decompressing from the start,
            #    so each archive would be decompressed twice.
            with tarfile.open(fileobj=lzma_file, mode="r|") as tar:
                for member in tar:
                    # Only regular .txt files carry documents.
                    if not (member.isfile() and member.name.endswith(".txt")):
                        continue

                    # In streaming mode a member must be read while it is the
                    # current one, i.e. right here inside the iteration.
                    member_file = tar.extractfile(member)
                    if member_file is None:
                        continue

                    # 3. read bytes -> decode -> append to buffer
                    #    (undecodable bytes are dropped, not replaced)
                    txt = member_file.read().decode("utf-8", errors="ignore")
                    # Source id: archive name without ".xz" + member basename.
                    buffer["source"].append(
                        f"{xz_file[:-3]}_{os.path.basename(member.name)}"
                    )
                    buffer["text"].append(txt)

                    # 4. flush every BATCH_SIZE rows
                    if len(buffer["text"]) >= BATCH_SIZE:
                        flush()

    # final flush of the last, partially filled batch ---------------------------
    flush()
finally:
    # close parquet -------------------------------------------------------------
    writer.close()

print(f"Finished. All texts are inside: {parquet_path}")

xz archives: 100%|██████████| 1000/1000 [03:13<00:00,  5.17it/s]

Finished. All texts are inside: openwebtext\openwebtext.parquet





In [8]:
import pandas as pd

In [None]:
# import pandas as pd
# import os
# import math
# import psutil

# # Load your full dataset
# df = pd.read_parquet("openwebtext/openwebtext.parquet")

# # Estimate size in memory
# mem_bytes = df.memory_usage(deep=True).sum()
# mem_mb = mem_bytes / 1024**2
# print(f"Loaded size: {mem_mb:.2f} MB")

# # Define target size per split (adjust to taste)
# target_mb = 300
# n_splits = math.ceil(mem_mb / target_mb)

# # Split and save
# os.makedirs("openwebtext/splits", exist_ok=True)

# chunk_size = math.ceil(len(df) / n_splits)

# for i in range(n_splits):
#     start = i * chunk_size
#     end = min((i + 1) * chunk_size, len(df))
#     chunk = df.iloc[start:end]
#     out_path = f"openwebtext/splits/openwebtext_part_{i:02d}.parquet"
#     chunk.to_parquet(out_path, index=False)
#     print(f"Saved {out_path} with {len(chunk)} rows")


Loaded size: 3400.46 MB
Saved openwebtext/splits/openwebtext_part_00.parquet with 32352 rows
Saved openwebtext/splits/openwebtext_part_01.parquet with 32352 rows
Saved openwebtext/splits/openwebtext_part_02.parquet with 32352 rows
Saved openwebtext/splits/openwebtext_part_03.parquet with 32352 rows
Saved openwebtext/splits/openwebtext_part_04.parquet with 32352 rows
Saved openwebtext/splits/openwebtext_part_05.parquet with 32352 rows
Saved openwebtext/splits/openwebtext_part_06.parquet with 32352 rows
Saved openwebtext/splits/openwebtext_part_07.parquet with 32352 rows
Saved openwebtext/splits/openwebtext_part_08.parquet with 32352 rows
Saved openwebtext/splits/openwebtext_part_09.parquet with 32352 rows
Saved openwebtext/splits/openwebtext_part_10.parquet with 32352 rows
Saved openwebtext/splits/openwebtext_part_11.parquet with 32350 rows


In [16]:
# Load the first split for interactive inspection.
# NOTE(review): no visible cell writes files under "owt/" -- presumably the
# split files were renamed/moved there by hand; confirm before re-running.
first_split_path = "owt/owt_part_00.parquet"
df = pd.read_parquet(first_split_path)

In [20]:
import string

# Whitelist of characters kept by clean_text(): ASCII letters, digits and
# punctuation, plus space, newline and tab. Everything else (including "\r"
# and all non-ASCII characters) is dropped.
allowed_chars = set().union(
    string.ascii_letters,
    string.digits,
    string.punctuation,
    " \n\t",
)

def clean_text(text):
    """Return ``text`` with every character not in ``allowed_chars`` removed.

    The relative order of the remaining characters is preserved.
    """
    kept = [ch for ch in text if ch in allowed_chars]
    return "".join(kept)

In [24]:
# Total number of characters across all documents in the loaded split.
doc_lengths = df["text"].map(len)
doc_lengths.sum()

np.int64(159341890)

In [27]:
# Run clean_text over the "text" column of every parquet split under owt/ and
# write the result back over the same file (the originals are overwritten).
for path in os.listdir("owt"):
    if not path.endswith(".parquet"):
        continue
    split_file = f"owt/{path}"
    # `df` is rebound on every iteration, so after the loop it refers to the
    # last split that was processed, not the frame loaded earlier.
    df = pd.read_parquet(split_file)
    df = df.assign(text=df["text"].map(clean_text))
    df.to_parquet(split_file)