In [None]:
# test version-cv dataloader
#!/usr/bin/env python
# coding: utf-8

"""
test_dataloader.py

Quick tests for the unified dataloader.py module.
"""

import numpy as np
import pandas as pd
from dataloader import load_data
from sklearn.feature_extraction.text import TfidfVectorizer

# === Example: Computer Vision ===
# Asks load_data for image loaders, draws a single batch from the training
# loader and prints the shapes of the images and labels in that batch.
data_dir = "data/images"  # Replace with your actual image directory path
print("\n--- Computer Vision Example ---")

try:
    # load_data is unpacked into exactly two objects here (train, validation).
    cv_loaders = load_data(
        data_path=data_dir, data_type="image", batch_size=8, augment=True
    )
    train_gen, val_gen = cv_loaders

    # next() is applied directly, so the training loader must be an iterator.
    first_batch = next(train_gen)
    batch_imgs, batch_labels = first_batch

    print("Image batch shape:", batch_imgs.shape)
    print("Label batch shape:", batch_labels.shape)
except Exception as exc:
    # Best-effort demo: any failure is reported (with its cause) and the
    # script carries on with the next example.
    print("CV example skipped (update path):", exc)

# === Example: Text ===
# Fits a demo TF-IDF vectorizer on the non-empty lines of the text file, then
# asks load_data for the texts, their tokenised form and optional vectors.
text_file = "data/descriptions.txt"  # Replace with your text file path
print("\n--- Text Example ---")

# Example vectorizer (can be ESA/SA in your real project)
example_vectorizer = TfidfVectorizer(max_features=1000)

try:
    # Pre-fit vectorizer for demo: strip every line and drop the ones that
    # end up empty.
    with open(text_file, encoding="utf-8") as f:
        text_lines = list(filter(None, map(str.strip, f)))
    example_vectorizer.fit(text_lines)

    text_result = load_data(
        data_path=text_file,
        data_type="text",
        tokenizer_name="t5-small",
        max_text_len=64,
        vectorizer=example_vectorizer,
    )
    # The result is unpacked into exactly three objects.
    texts, tokens, vectors = text_result

    print("Text sample:", texts[:2])
    print("Token shape (input_ids):", tokens["input_ids"].shape)
    # Vectors are optional; only report them when they were produced.
    if vectors is not None:
        print("Vector shape:", vectors.shape)
except Exception as exc:
    # Best-effort demo: any failure (missing file, fitting error, loader
    # error) is reported with its cause and the script moves on.
    print("Text example skipped (update path):", exc)

# === Example: Tabular ===
# Loads a table through load_data and prints its shape, plus the shape of the
# vectorised form when one is returned.
tab_file = "data/sample.csv"  # Replace with your CSV or Parquet file path
print("\n--- Tabular Example ---")

try:
    # Load and vectorize. NOTE: this reuses example_vectorizer from the text
    # example above, which is only fitted if that example ran successfully.
    tab_result = load_data(
        data_path=tab_file, data_type="tabular", vectorizer=example_vectorizer
    )
    # The result is unpacked into exactly two objects.
    df, tab_vectors = tab_result

    print("Tabular shape:", df.shape)
    # Vectors are optional; only report them when they were produced.
    if tab_vectors is not None:
        print("Tab vector shape:", tab_vectors.shape)
except Exception as exc:
    # Best-effort demo: any failure is reported with its cause and the script
    # moves on to the next example.
    print("Tabular example skipped (update path):", exc)

# === Example: InkML ===
# Loads InkML samples through load_data, prints how many were loaded and
# previews the first five entries of the first sample.
inkml_dir = "data/inkml_files"  # Replace with your .inkml directory path
print("\n--- InkML Example ---")

try:
    inkml_data = load_data(data_path=inkml_dir, data_type="inkml")
    print("Loaded InkML samples:", len(inkml_data))

    # Guard the preview so an empty result prints a placeholder instead of
    # failing on the [0] lookup.
    if inkml_data:
        first_trace = inkml_data[0][:5]
    else:
        first_trace = "No data"
    print("First trace example:", first_trace)
except Exception as exc:
    # Best-effort demo: any failure is reported with its cause.
    print("InkML example skipped (update path):", exc)