# Data Pipeline Visualization

This notebook displays a few sample rows from the datasets produced at each stage of the data pipeline: raw, cleaned, and parallel (English–Bengali).

**Prerequisites**:
Run the ingestion and cleaning scripts first. Any dataset that has not been generated yet is reported as "not found" instead of being displayed.

In [None]:
import os
from datasets import load_from_disk
import pandas as pd

# Do not truncate long text values when DataFrames are displayed.
pd.options.display.max_colwidth = None

def _show_split(ds, num_samples, split=None):
    """Print the row count of one dataset and display its first rows as a DataFrame."""
    prefix = f"[{split}] " if split is not None else ""
    print(f"{prefix}Total Samples: {len(ds)}")
    # Never ask for more rows than the dataset actually holds.
    df = pd.DataFrame(ds.select(range(min(len(ds), num_samples))))
    display(df)


def show_samples(path, name, num_samples=5):
    """Load a dataset saved on disk and display its first few rows.

    Args:
        path: Directory the dataset was saved to.
        name: Human-readable label printed as a header above the output.
        num_samples: Maximum number of rows to display (per split when the
            saved object contains several splits).

    Returns:
        None. Output is printed / displayed. If ``path`` does not exist, a
        hint to run the ingestion/cleaning script is printed and nothing else
        happens. Any error raised while loading or displaying is printed
        rather than propagated.
    """
    if not os.path.exists(path):
        print(f"Dataset not found at {path}. Please run the respective ingestion/cleaning script.")
        return

    print(f"--- {name} ---")
    try:
        ds = load_from_disk(path)
        if isinstance(ds, dict):
            # load_from_disk may return a DatasetDict (a dict of split name ->
            # dataset). len() of that is the number of splits and it has no
            # .select(), so each split is shown on its own.
            for split, split_ds in ds.items():
                _show_split(split_ds, num_samples, split)
        else:
            _show_split(ds, num_samples)
    except Exception as e:
        # Best-effort: report the failure instead of raising.
        print(f"Error loading dataset: {e}")
    print("\n")

## 1. English Data (Wikitext)

In [None]:
# English (Wikitext): raw dataset as written by the ingestion step
show_samples(
    path='../../data/wikitext_wikitext-2-raw-v1_train',
    name='Raw Wikitext (English)',
)

# English (Wikitext): the same data after cleaning
show_samples(
    path='../../data/cleaned_wikitext_train',
    name='Cleaned Wikitext (English)',
)

## 2. Bengali Data (Wikipedia)

In [None]:
# Bengali (Wikipedia): raw dataset as written by the ingestion step
show_samples(
    path='../../data/wikimedia_wikipedia_20231101.bn_train',
    name='Raw Wikipedia (Bengali)',
)

# Bengali (Wikipedia): the same data after cleaning
show_samples(
    path='../../data/cleaned_wikipedia_bn_train',
    name='Cleaned Wikipedia (Bengali)',
)

## 3. Parallel Data (English-Bengali)

In [None]:
# English-Bengali parallel corpus (only available if it has been ingested).
# The directory name is derived from the dataset name used by the ingestion
# script; the value below assumes 'csebuetnlp/banglanmt'. Adjust it if a
# different dataset was ingested.
parallel_path = '../../data/csebuetnlp_banglanmt_train_parallel'
show_samples(path=parallel_path, name='Parallel Data (En-Bn)')