<a href="https://colab.research.google.com/github/sarthakbiswas97/design-llm-apps-exercises/blob/main/Checking_PII.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**EXERCISE: Use the search function in the dataset viewer for the [RefinedWeb](https://huggingface.co/datasets/tiiuae/falcon-refinedweb) pre-training dataset to assess presence of PII. For example, search for “gmail.com.” What do you find?**

In [None]:
# Install the `datasets` package (`-U` upgrades it if already present); a later cell imports `load_dataset` from it.
!pip install -U datasets

In [None]:
# Run this immediately in a fresh runtime
import shutil
import os

print("Attempting to clear Hugging Face datasets cache...")

# Candidate cache/hub locations, listed from most specific to most aggressive.
paths_to_clear = [
    # Cache entry for this specific dataset
    "/root/.cache/huggingface/datasets/tiiuae___falcon-refinedweb",
    # Downloads / repo info for this specific dataset
    "/root/.cache/huggingface/hub/datasets--tiiuae--falcon-refinedweb",
    # The entire datasets cache (more aggressive)
    "/root/.cache/huggingface/datasets",
]

for path_to_clear in paths_to_clear:
    # Guard clause: a missing path is expected and simply reported.
    if not os.path.exists(path_to_clear):
        print(f"Not found (this is okay): {path_to_clear}")
        continue

    print(f"Found: {path_to_clear}")
    try:
        if os.path.isdir(path_to_clear):
            # Directories are removed recursively.
            shutil.rmtree(path_to_clear)
            print(f"Successfully removed directory: {path_to_clear}")
        elif os.path.isfile(path_to_clear):
            # Plain files are unlikely for these paths, but handled just in case.
            os.remove(path_to_clear)
            print(f"Successfully removed file: {path_to_clear}")
    except Exception as err:
        # Best-effort cleanup: report the failure and move on to the next path.
        print(f"Error removing {path_to_clear}: {err}")

print("Cache clearing attempt finished. Please run your dataset loading script in the next cell.")


In [None]:

from datasets import load_dataset

import re

DATASET_NAME = "tiiuae/falcon-refinedweb"
DATASET_SPLIT = "train"
SEARCH_TERM = "gmail.com"  # Or whatever you're searching for
MAX_FINDS = 5
MAX_EXAMPLES_TO_SCAN = 10000  # Start small to test, then increase if needed
SNIPPET_LENGTH = 300  # Total characters of context printed around a match


def extract_snippet(text, term, snippet_length=SNIPPET_LENGTH):
    """Return the context around the first case-insensitive match of *term*.

    The match is located directly in *text* (not in a lowercased copy), so the
    returned slice is always aligned with the original string. Lowercasing can
    change a string's length for some Unicode characters, which would shift
    indices computed on the lowercased copy.

    Args:
        text: The string to search. ``None`` or an empty string yields ``None``.
        term: The literal search term (regex metacharacters are escaped).
        snippet_length: Total number of context characters; half is taken
            before the match and half after it, clamped to the text bounds.

    Returns:
        The slice of *text* containing the match and its surrounding context,
        or ``None`` when there is nothing to search or no match is found.
    """
    if not text:
        return None
    match = re.search(re.escape(term), text, flags=re.IGNORECASE)
    if match is None:
        return None
    half_window = snippet_length // 2
    start = max(0, match.start() - half_window)
    end = min(len(text), match.end() + half_window)
    return text[start:end]


print(f"Attempting to load dataset: {DATASET_NAME} (streaming)")
try:
    streaming_dataset = load_dataset(
        DATASET_NAME,
        split=DATASET_SPLIT,
        streaming=True,
    )
    print("Dataset loaded successfully in streaming mode.")

    found_count = 0
    examples_scanned = 0  # Stays 0 if the stream yields nothing
    print(f"Searching for '{SEARCH_TERM}'...")

    for examples_scanned, example in enumerate(streaming_dataset, start=1):
        # Prefer the "content" field; fall back to "text" when it is missing or None.
        text_to_search = example.get("content")
        if text_to_search is None:
            text_to_search = example.get("text")

        snippet = extract_snippet(text_to_search, SEARCH_TERM)
        if snippet is not None:
            found_count += 1
            print(f"\n--- Found Match #{found_count} (in example #{examples_scanned}) ---")
            print(f"...{snippet}...")

        # Stop conditions are checked before the progress message, so the final
        # iteration reports why the scan stopped instead of a progress line.
        if found_count >= MAX_FINDS:
            print(f"\nReached max finds ({MAX_FINDS}). Stopping search.")
            break
        if MAX_EXAMPLES_TO_SCAN is not None and examples_scanned >= MAX_EXAMPLES_TO_SCAN:
            print(f"\nReached max examples to scan ({MAX_EXAMPLES_TO_SCAN}). Stopping search.")
            break
        if examples_scanned % 1000 == 0:
            print(f"Scanned {examples_scanned} examples, found {found_count} matches so far...")

    print("\n--- Search Summary ---")
    print(f"Total examples scanned: {examples_scanned}")
    print(f"Total matches found for '{SEARCH_TERM}': {found_count}")

except Exception as e:
    # Top-level boundary of the notebook cell: report the failure instead of
    # dumping a traceback (covers both loading and streaming errors).
    print(f"Error loading or processing dataset: {e}")

