In [11]:
import tensorflow as tf
import tensorflow_datasets as tfds
import re
import os

In [12]:
# Create the output folder for the generated text files.
# exist_ok=True keeps this cell idempotent: re-running it (or doing
# Restart & Run All with the folder already on disk) does not raise
# FileExistsError.
os.makedirs("./data/wiki40b-txt/", exist_ok=True)

In [8]:
# Language edition of the dataset used by the rest of the notebook.
lang_code = "en"

# Structural markers embedded in the raw article text. Each pattern below is
# one alternative of the combined cleaning regex:
#   r1 - the article marker line plus everything up to the next underscore
#   r2 - the paragraph marker line
#   r3 - the section marker line plus everything up to the next underscore
#   r4 - the inline newline marker
r1 = "_START_ARTICLE_\n[^_]*"
r2 = "_START_PARAGRAPH_\n"
r3 = "_START_SECTION_\n[^_]*"
r4 = "_NEWLINE_"

# A single capturing group holding the four alternatives, tried in this order.
REGEX = re.compile("(" + "|".join((r1, r2, r3, r4)) + ")")

## Generating txt files from the Wiki-40b dataset

In [9]:
def process_tf_dataset(ds, num_tokens, output_file):
    """Append the cleaned ``text`` feature of every item in ``ds`` to a text file.

    Each item is decoded as UTF-8, has everything matched by the module-level
    ``REGEX`` replaced with a space, has runs of whitespace collapsed to a
    single space, and is then written as one line.

    Args:
        ds: Batched dataset exposing ``as_numpy_iterator()``; every batch must
            map ``"text"`` to an iterable of UTF-8 encoded ``bytes``.
        num_tokens: Budget of whitespace-separated tokens. Writing stops right
            after the item that pushes the running total above this value, so
            the output can overshoot by part of one item. Zero or a negative
            value (e.g. ``-1``) disables the limit.
        output_file: Path of the text file. It is opened in append mode, so
            calling this twice with the same path duplicates the content.

    Returns:
        None.
    """
    token_count = 0

    # Explicit UTF-8 so non-ASCII article text is written the same way
    # regardless of the platform's default locale encoding.
    with open(output_file, "a", encoding="utf-8") as f:
        for batch in ds.as_numpy_iterator():
            # "text" is the feature we want to extract.
            for item in batch["text"]:
                text = item.decode("UTF-8")
                text = REGEX.sub(" ", text)
                # Raw string: "\s" is not a valid escape in a normal literal.
                text = re.sub(r"\s+", " ", text).strip()
                f.write(text + "\n")
                token_count += len(text.split())
                if num_tokens > 0 and token_count > num_tokens:
                    # Leave the function, not just the current batch. A plain
                    # `break` here would let every following batch write one
                    # more item, so the limit would never end the export.
                    return

In [20]:
# Load the train split of the wiki-40b dataset in batches of 128.
# (The 163597-entry count noted in an earlier version of this comment was
# given for the validation split, not for train.)
# Don't re-run this cell once the dataset is loaded.
ds = tfds.load(
    f"wiki40b/{lang_code}",
    data_dir="./data/",
    split="train",
    batch_size=128,
    shuffle_files=True,
)

In [21]:
# Write the train split of wiki-40b to a plain-text file.
# Passing -1 as num_tokens means no token limit is applied.
process_tf_dataset(ds, -1, f"./data/wiki40b-txt/{lang_code}.train")

2023-07-18 23:14:18.518598: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [128]
	 [[{{node Placeholder/_0}}]]
2023-07-18 23:14:18.519921: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_2' with dtype string and shape [128]
	 [[{{node Placeholder/_2}}]]


In [18]:
# Load the test split of the wiki-40b dataset in batches of 128.
# TODO: it is still unclear why changing the split matters.
ds = tfds.load(
    f"wiki40b/{lang_code}",
    data_dir="./data/",
    split="test",
    batch_size=128,
    shuffle_files=True,
)

In [None]:
# Write the test split to a plain-text file; -1 means no token limit.
process_tf_dataset(ds, -1, f"./data/wiki40b-txt/{lang_code}.test")

In [None]:
# Load the validation split of the wiki-40b dataset in batches of 128.
ds = tfds.load(
    f"wiki40b/{lang_code}",
    data_dir="./data/",
    split="validation",
    batch_size=128,
    shuffle_files=True,
)

In [None]:
# Write the validation split to a plain-text file.
# -1 disables the token limit, so every entry of the split is written; that is
# why this run produces far more lines than a small positive limit does.
# Observation from earlier runs: limits of 3 and 300 produced the same output.
# (The limit is only checked after a complete entry has been written.)
process_tf_dataset(ds, -1, f"./data/wiki40b-txt/{lang_code}.validation")

In [22]:
# Head of the path split: with the trailing slash, this is the folder name
# itself without the slash.
dirname = os.path.split("wiki40b-txt/")[0]

In [23]:
# Show the value of `dirname` as this cell's output.
dirname

'wiki40b-txt'