In [3]:
import re
from tqdm import tqdm


def preprocess_vocab(input_vocab_file, output_vocab_file):
    """Write an ASCII-only, de-duplicated copy of a vocabulary file.

    Every line of ``input_vocab_file`` is stripped of surrounding whitespace
    and has each run of non-ASCII characters removed. Entries that end up
    empty are dropped, duplicates are collapsed, and the survivors are
    written to ``output_vocab_file`` one per line in sorted order.

    Args:
        input_vocab_file: Path of the UTF-8 vocabulary file to read.
        output_vocab_file: Path of the UTF-8 file to create or overwrite.
    """
    non_ascii_run = re.compile(r'[^\x00-\x7F]+')
    cleaned_vocab = set()

    # First pass: count lines so the progress bar can show a total.
    with open(input_vocab_file, "r", encoding="utf-8") as vfile:
        total_lines = sum(1 for _ in vfile)

    # Second pass: clean each entry, reporting progress as we go.
    with open(input_vocab_file, "r", encoding="utf-8") as vfile:
        for line in tqdm(vfile, total=total_lines, desc="Processing vocabulary", unit="lines"):
            # NOTE: strip() runs before the emptiness check, so an entry that
            # consists only of whitespace (e.g. a lone space) is dropped.
            entry = non_ascii_run.sub('', line.strip())
            if entry:
                cleaned_vocab.add(entry)

    # Iteration order of a set of strings differs between interpreter runs
    # (string hashing is randomized), so sort to make the output file
    # reproducible across kernel restarts.
    with open(output_vocab_file, "w", encoding="utf-8") as vfile:
        for entry in tqdm(sorted(cleaned_vocab), desc="Writing vocabulary", unit="chars"):
            vfile.write(entry + '\n')
# File paths for your existing dataset
vocab_file = "vocab_wiki.txt"
# Single source of truth for the output path, so the completion message
# below always names the file that was actually written.
cleaned_vocab_file = "vocab_wiki_cleaned.txt"


print("Starting preprocessing...")

# Preprocess files with progress bars
preprocess_vocab(vocab_file, cleaned_vocab_file)


print("\nPreprocessing completed!")
print(f"Cleaned vocabulary saved to {cleaned_vocab_file}")



Starting preprocessing...


Processing vocabulary: 100%|████████████████████████████████████████████████| 4822/4822 [00:00<00:00, 322695.40lines/s]
Writing vocabulary: 100%|████████████████████████████████████████████████████████| 94/94 [00:00<00:00, 94074.11chars/s]


Preprocessing completed!
Cleaned vocabulary saved to vocab_cleaned.txt





In [4]:
import re
from tqdm import tqdm



def preprocess_text_file(input_file, output_file):
    """Copy a text file line by line, keeping only a whitelist of ASCII characters.

    For each line of ``input_file``, runs of non-ASCII characters are removed
    first, then runs of anything that is not an ASCII letter, digit,
    whitespace, or one of ``. , ; : ! ? ' " ( )`` are removed. The cleaned
    line is written to ``output_file`` followed by exactly one newline, so
    the output has the same number of lines as the input.

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_file: Path of the UTF-8 file to create or overwrite.
    """
    non_ascii_run = re.compile(r'[^\x00-\x7F]+')
    # The curly apostrophe in this class can never match here: the ASCII-only
    # pass above has already removed it. The pattern is kept unchanged.
    disallowed_run = re.compile(r'[^a-zA-Z0-9\s.,;:!?\'"()’]+')

    # First pass: count lines so the progress bar can show a total.
    with open(input_file, "r", encoding="utf-8") as infile:
        total_lines = sum(1 for _ in infile)

    with open(input_file, "r", encoding="utf-8") as infile, open(output_file, "w", encoding="utf-8") as outfile:
        for line in tqdm(infile, total=total_lines, desc=f"Processing {input_file}", unit="lines"):
            # Lines read from the file keep their trailing '\n', and '\s' in
            # the whitelist preserves it. Remove it before appending our own
            # so the output is not double-spaced.
            cleaned_line = non_ascii_run.sub('', line.rstrip('\n'))
            cleaned_line = disallowed_run.sub('', cleaned_line)
            outfile.write(cleaned_line + '\n')


# Inputs to this cleaning step. The "output_file_*" names are kept as-is in
# case other cells refer to them (presumably they are outputs of an earlier
# split step; verify against the rest of the notebook).
output_file_val = "val_wiki.txt"
output_file_train = "train_wiki.txt"

# Destinations for the cleaned copies. Defined once so the completion
# messages below always name the files that were actually written.
cleaned_file_val = "val_wiki_cleaned.txt"
cleaned_file_train = "train_wiki_cleaned.txt"


print("Starting preprocessing...")

preprocess_text_file(output_file_val, cleaned_file_val)
preprocess_text_file(output_file_train, cleaned_file_train)

print("\nPreprocessing completed!")
print(f"Cleaned validation data saved to {cleaned_file_val}")

print(f"Cleaned training data saved to {cleaned_file_train}")


Starting preprocessing...


Processing val_wiki.txt: 100%|█████████████████████████████████████████████| 71731/71731 [00:04<00:00, 15664.86lines/s]
Processing train_wiki.txt: 100%|█████████████████████████████████████████| 645578/645578 [00:35<00:00, 18127.06lines/s]


Preprocessing completed!
Cleaned validation data saved to val_split_cleaned.txt
Cleaned training data saved to train_split_cleaned.txt



