In [1]:
import re
import unicodedata
from collections.abc import Iterable

import pandas as pd
from datasets import load_dataset, DatasetDict
from tqdm import tqdm

def preprocess_text(
    text: str,
    to_lower: bool = False,
    min_tokens: int = 5,
    noise_thresh: float = 0.5,
    mask_numbers: bool = True,
) -> str:
    """
    Clean a raw text string so it is friendlier to a tokenizer.

    Steps, in order:
      1. Unicode normalization (NFKC)
      2. Strip HTML tags and wiki-style markup
      3. Normalize curly quotes and en/em dashes
      4. Replace control chars with spaces & collapse whitespace
         (optionally lowercase)
      5. Surround punctuation with spaces (decimal points are left intact)
      6. Drop very short / noisy sentences
      7. Mask numbers with ``<NUM>``
      8. Deduplicate sentences

    Args:
        text: Raw input text.
        to_lower: If True, lowercase the text after whitespace cleanup.
        min_tokens: Sentences (pieces obtained by splitting on ``". "``) with
            fewer whitespace-separated tokens than this are dropped.
        noise_thresh: Sentences whose fraction of characters that are neither
            alphanumeric nor whitespace exceeds this value are dropped.
        mask_numbers: If True, replace standalone integers and decimals with
            the literal token ``<NUM>``.

    Returns:
        The cleaned text; may be the empty string if every sentence was
        filtered out.
    """
    # 1. Unicode normalize
    text = unicodedata.normalize("NFKC", text)

    # 2. Strip HTML tags
    text = re.sub(r"<[^>]+>", " ", text)
    #    Strip wiki headings (lines that start and end with two or more
    #    consecutive '=') and {{...}} templates
    text = re.sub(r"^==+.*==+$", " ", text, flags=re.MULTILINE)
    text = re.sub(r"\{\{.*?\}\}", " ", text, flags=re.DOTALL)

    # 3. Normalize quotes and dashes
    text = text.replace("“", '"').replace("”", '"')
    text = text.replace("‘", "'").replace("’", "'")
    text = re.sub(r"[–—]", "-", text)

    # 4. Replace every C0/C1 control character (newlines and tabs included)
    #    with a space, then collapse whitespace. Previously only \r, \n and \t
    #    were handled, so characters such as \x00 leaked into the output.
    text = re.sub(r"[\x00-\x1f\x7f-\x9f]+", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    if to_lower:
        text = text.lower()

    # 5. Surround punctuation so it's tokenized separately.
    #    A '.' sitting between two digits is a decimal point and is left
    #    untouched; padding it would turn "3.14" into "3 . 14", which both
    #    defeats the decimal branch of the number mask in step 7 and makes
    #    step 6 split the sentence in the middle of the number.
    text = re.sub(r'([,!?;:\(\)\[\]"\-]|(?<!\d)\.|\.(?!\d))', r' \1 ', text)
    text = re.sub(r"\s+", " ", text).strip()

    # 6. Remove very short or noisy sentences
    clean_lines = []
    for line in text.split(". "):
        if len(line.split()) < min_tokens:
            continue
        non_word = sum(1 for c in line if not (c.isalnum() or c.isspace()))
        # max(1, ...) guards against division by zero on an empty piece
        if non_word / max(1, len(line)) > noise_thresh:
            continue
        clean_lines.append(line)
    text = ". ".join(clean_lines)

    # 7. Mask all standalone numbers (integers and decimals)
    if mask_numbers:
        text = re.sub(r"\b\d+(?:\.\d+)?\b", "<NUM>", text)

    # 8. Deduplicate sentences (exact string match, first occurrence wins)
    seen, uniq = set(), []
    for s in re.split(r'(?<=[\.\!\?])\s+', text):
        if s and s not in seen:
            seen.add(s)
            uniq.append(s)
    text = " ".join(uniq)

    return text

def load_and_preprocess(
    dataset_name: str = "wikitext",
    dataset_config: str = "wikitext-103-raw-v1",
    splits: tuple[str, ...] = ("train", "validation", "test")
) -> DatasetDict:
    """
    Load a Hugging Face dataset and clean the "text" field of the requested splits.

    Args:
        dataset_name: Name passed to ``load_dataset``.
        dataset_config: Configuration name passed to ``load_dataset``.
        splits: Names of the splits whose rows are run through
            ``preprocess_text``.

    Returns:
        The object returned by ``load_dataset``, with each split named in
        ``splits`` replaced by its cleaned version.
    """
    def _clean_example(example):
        # Case is preserved and numbers are kept as-is for this corpus.
        cleaned = preprocess_text(example["text"], to_lower=False, mask_numbers=False)
        return {"text": cleaned}

    dataset = load_dataset(dataset_name, dataset_config)

    for split_name in splits:
        current = dataset[split_name]
        # Keep only the "text" column in the mapped result.
        extra_columns = [col for col in current.column_names if col != "text"]
        dataset[split_name] = current.map(
            _clean_example,
            remove_columns=extra_columns,
            batched=False,  # one example at a time
        )

    return dataset

def save_to_df(ds: Iterable[str]) -> pd.DataFrame:
    """
    Group text rows under their most recent heading and return a DataFrame.

    The argument is iterated as a sequence of strings (the callers in this
    file pass a split's "text" column), not as a DatasetDict.

    A row whose stripped form starts and ends with runs of '=' / spaces
    (e.g. "= Title =" or "= = Sub = =") is treated as a heading: it updates
    the current section and is not emitted. Empty rows are skipped. Every
    other row is emitted unchanged, paired with the current section.

    Args:
        ds: Iterable of text rows.

    Returns:
        DataFrame with columns "Section" and "Text". "Section" is None for
        rows that appear before the first heading.
    """
    heading_re = re.compile(r'^[= ]+(.+?)[= ]+$')
    section = None
    records = []

    for row in ds:
        if not row:
            continue  # skip empty / None rows
        m = heading_re.match(row.strip())
        if m:
            # Heading line: remember its title for the rows that follow.
            section = m.group(1)
            continue
        # Store the original (unstripped) row, as before.
        records.append((section, row))

    return pd.DataFrame(records, columns=["Section", "Text"])

if __name__ == "__main__":
    # Run the load + clean pipeline with its default arguments.
    processed = load_and_preprocess()

    # Optional: persist to disk for fast reload later.
    # processed.save_to_disk("data/processed_wikitext103")

    # Build one inspection DataFrame per split from that split's "text" column.
    dataset_train, dataset_valid, dataset_test = (
        save_to_df(processed[split_name]['text'])
        for split_name in ("train", "validation", "test")
    )

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Show the summary of the preprocessed training split (features and row count).
processed['train']

Dataset({
    features: ['text'],
    num_rows: 1801350
})

In [3]:
# Peek at the first ten cleaned rows. Slice the rows first so only ten
# examples are materialised instead of the entire "text" column.
processed['train'][:10]['text']

['',
 '= Valkyria Chronicles III =',
 '',
 'Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @ - @ playing video game developed by Sega and Media . Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @ - @ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven " .',
 "The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forg

In [4]:
# First ten rows of the training DataFrame (Section / Text pairs).
dataset_train.head(10)

Unnamed: 0,Section,Text
0,Valkyria Chronicles III,Senjō no Valkyria 3 : Unrecorded Chronicles ( ...
1,Valkyria Chronicles III,"The game began development in 2010 , carrying ..."
2,Valkyria Chronicles III,"It met with positive sales in Japan , and was ..."
3,Gameplay,"As with previous Valkyira Chronicles games , V..."
4,Gameplay,"The game 's battle system , the BliTZ system ,..."
5,Gameplay,Troops are divided into five classes : Scouts ...
6,Plot,The game takes place during the Second Europan...
7,Plot,"As the Nameless officially do not exist , the ..."
8,Plot,"Partly due to these events , and partly due to..."
9,Development,Concept work for Valkyria Chronicles III began...


In [5]:
# Summary statistics for the Section and Text columns of the training DataFrame.
dataset_train.describe()

Unnamed: 0,Section,Text
count,832788,832788
unique,111725,817953
top,History,Note : Flags indicate national team as defined...
freq,14511,92


In [6]:
def _export_text_column(texts: pd.Series, output_path: str, preview_chars: int = 500) -> None:
    """
    Write every non-blank string in ``texts`` to ``output_path`` and print a preview.

    Each kept entry is stripped and followed by a blank line. Entries that
    are not strings, or that are empty after stripping, are skipped.

    Args:
        texts: Column of text samples to write.
        output_path: Destination file; overwritten if it already exists.
        preview_chars: Number of leading characters of the written file to
            print back as a sanity check.
    """
    print(f"Saving {len(texts)} text samples to {output_path}...")

    with open(output_path, "w", encoding="utf-8") as f:
        for text in tqdm(texts):
            if isinstance(text, str) and text.strip():
                f.write(text.strip() + "\n\n")

    print(f"Data saved to {output_path}")

    # Read back just the start of the file to confirm what was written.
    with open(output_path, "r", encoding="utf-8") as f:
        preview = f.read(preview_chars)
    print("\nPreview of saved data:")
    print(preview, "...")


# Save only the Text column of each split to a single file for training.
output_train_file = "output_train.txt"
_export_text_column(dataset_train["Text"], output_train_file)

output_valid_file = "output_valid.txt"
_export_text_column(dataset_valid["Text"], output_valid_file)

Saving 832788 text samples to output_train.txt...


100%|██████████| 832788/832788 [00:01<00:00, 683921.99it/s]


Data saved to output_train.txt

Preview of saved data:
Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @ - @ playing video game developed by Sega and Media . Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @ - @ time gameplay as its predecessors , the story runs parallel to the first game  ...
Saving 1795 text samples to output_valid.txt...


100%|██████████| 1795/1795 [00:00<00:00, 565011.31it/s]

Data saved to output_valid.txt

Preview of saved data:
Homarus gammarus , known as the European lobster or common lobster , is a species of clawed lobster from the eastern Atlantic Ocean , Mediterranean Sea and parts of the Black Sea . It is closely related to the American lobster , H . It may grow to a length of 60 cm ( 24 in ) and a mass of 6 kilograms ( 13 lb ) , and bears a conspicuous pair of claws . In life , the lobsters are blue , only becoming " lobster red " on cooking . Mating occurs in the summer , producing eggs which are carried by the ...



