In [7]:
from datasets import load_dataset

# Load the dataset.
# NOTE(review): trust_remote_code=True allows a loading script shipped with the
# dataset repository to run locally — confirm the repository is trusted.
dataset = load_dataset("mikasenghaas/wikitext-2", trust_remote_code=True)

# Split into training and validation datasets (90% train, 10% val).
# The split shuffles the rows; a fixed seed keeps the shuffle — and therefore
# both exported text files — identical on every Restart & Run All.
SPLIT_SEED = 42
split_dataset = dataset['train'].train_test_split(test_size=0.1, seed=SPLIT_SEED)
train_dataset = split_dataset['train']
val_dataset = split_dataset['test']  # the held-out part is returned under the 'test' key

# Print structure of a sample entry
print("Sample entry structure:", train_dataset[0])

# Output locations for the two text splits and the character vocabulary
output_file_train = "train_wiki.txt"
output_file_val = "val_wiki.txt"
vocab_file = "vocab_wiki.txt"

# Every distinct character seen in either split is collected in this set
vocab = set()


def _export_split(entries, path, charset):
    """Write the 'text' field of every entry to `path` and collect its characters.

    Args:
        entries: iterable of dict-like rows that each carry a 'text' string
            (adjust the field name here if the dataset uses a different one).
        path: file to create or overwrite; written as UTF-8.
        charset: set that is updated in place with every character written.
    """
    with open(path, "w", encoding="utf-8") as f:
        for entry in entries:
            text = entry['text']
            # Entries are written back to back, exactly as stored.
            # NOTE(review): nothing separates consecutive entries unless each
            # text already ends with a newline — confirm that is intended.
            f.write(text)
            # Iterating a str yields its characters, so no temporary set is needed
            charset.update(text)


# Both splits feed the vocabulary, so the validation text never contains a
# character that is missing from it.
_export_split(train_dataset, output_file_train, vocab)
_export_split(val_dataset, output_file_val, vocab)

# Write vocabulary to a file, one character per line.
# The characters are sorted so the file — and any index mapping derived from
# its line order — is identical on every run; iterating the raw set gives an
# order that can change between interpreter sessions because str hashing is
# randomized.
# NOTE(review): a character such as '\n' in the vocabulary shows up as blank
# lines in this format — confirm how the file is read back.
with open(vocab_file, "w", encoding="utf-8") as vfile:
    vfile.writelines(f"{char}\n" for char in sorted(vocab))

print(f"Training data saved to {output_file_train}")
print(f"Validation data saved to {output_file_val}")
print(f"Vocabulary saved to {vocab_file}")


Sample entry structure: {'text': "Chapman 's archipelago frigates provided better protection for their crew than the galleys they replaced , and up to three times the capacity for stores and provisions . They could operate in the narrow , shallow waters around skerries in all weathers and in open water in all but the worst storms . They had a deeper draft than galleys , but considerably shallower draft than traditional sailing warships . The new ship types also increased the archipelago fleet 's firepower , provided it with better defensive capabilities , and made possible more efficient fire support in amphibious operations ."}
Training data saved to train_wiki.txt
Validation data saved to val_wiki.txt
Vocabulary saved to vocab_wiki.txt
