# Data Exploration

This notebook explores the raw data to understand its structure, distribution, and characteristics.


In [None]:
# Import standard libraries
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Set up paths
# Path() resolves against the kernel's current working directory, so this
# assumes the notebook is run from a directory three levels below the project
# root.
project_root = Path().resolve().parent.parent.parent

# Put the project root at the front of sys.path, but skip the insert when it is
# already the first entry so that re-running this cell does not stack up
# duplicate entries.
_project_root_entry = str(project_root)
if sys.path[:1] != [_project_root_entry]:
    sys.path.insert(0, _project_root_entry)

# Data paths (all located under <project_root>/science/data)
data_dir = project_root / "science" / "data"
raw_data_dir = data_dir / "raw"
processed_data_dir = data_dir / "processed"
interim_data_dir = data_dir / "interim"
external_data_dir = data_dir / "external"
output_data_dir = data_dir / "output"

# Models path
models_dir = project_root / "science" / "models"

# Import project package
from {{ cookiecutter.python_package }}.vision import load_image, preprocess_image, predict_simple
from {{ cookiecutter.python_package }}.utils import load_state, save_state

# Report the resolved locations, one per line, for a quick visual check.
print(
    f"Project root: {project_root}",
    f"Raw data directory: {raw_data_dir}",
    f"Models directory: {models_dir}",
    sep="\n",
)


## Load and Explore Raw Data


In [None]:
# Example: List files in raw data directory
# Note: glob("*") matches every entry directly inside raw_data_dir, so
# subdirectories are included in the count alongside regular files.
MAX_FILES_SHOWN = 10  # Number of entries listed in the preview

if raw_data_dir.exists():
    # Sort the entries so the preview is the same on every run; glob yields
    # them in directory-listing order, which is not guaranteed to be stable.
    raw_files = sorted(raw_data_dir.glob("*"))
    print(f"Found {len(raw_files)} files in raw data directory")
    for f in raw_files[:MAX_FILES_SHOWN]:
        print(f"  - {f.name}")
    # Make truncation explicit instead of silently dropping the remainder.
    hidden_count = len(raw_files) - MAX_FILES_SHOWN
    if hidden_count > 0:
        print(f"  ... and {hidden_count} more")
else:
    print(f"Raw data directory does not exist: {raw_data_dir}")
    print("Please add your raw data files here")


## Visualize Data Distribution


In [None]:
# Example: Load and visualize an image
# Uncomment and modify based on your data format
#
# Once uncommented, this cell takes the first *.jpg / *.png match found
# directly inside raw_data_dir (the glob patterns are not recursive), loads it
# with load_image, shows it with matplotlib, and prints its shape, dtype and
# min/max values. It needs raw_data_dir, plt and load_image from the setup cell.
# NOTE(review): plt.imshow and the .shape/.dtype/.min()/.max() calls assume
# load_image returns a NumPy-style array — confirm against the project package.

# if raw_data_dir.exists():
#     # Find image files
#     image_files = list(raw_data_dir.glob("*.jpg")) + list(raw_data_dir.glob("*.png"))
#     
#     if image_files:
#         # Load first image
#         sample_image_path = image_files[0]
#         image = load_image(sample_image_path)
#         
#         # Display image
#         plt.figure(figsize=(10, 6))
#         plt.imshow(image)
#         plt.title(f"Sample image: {sample_image_path.name}")
#         plt.axis('off')
#         plt.show()
#         
#         print(f"Image shape: {image.shape}")
#         print(f"Image dtype: {image.dtype}")
#         print(f"Image min/max: {image.min()}/{image.max()}")
#     else:
#         print("No image files found in raw data directory")
# else:
#     print("Raw data directory does not exist")


## Data Statistics


In [None]:
# Example: Compute basic statistics
# Modify based on your data format
#
# Once uncommented, this cell loads at most the first 100 *.jpg / *.png matches
# found directly inside raw_data_dir, collects each image's .shape, and prints
# the per-axis mean and standard deviation of those shapes. Files that fail to
# load are reported and skipped rather than stopping the loop.
# NOTE(review): np.array(shapes) needs every collected shape tuple to have the
# same length; a mix of e.g. 2-D and 3-D images would presumably make the
# mean/std step fail — verify against your data.
# NOTE(review): there is no else branch for a missing raw_data_dir, so nothing
# is printed in that case.

# if raw_data_dir.exists():
#     image_files = list(raw_data_dir.glob("*.jpg")) + list(raw_data_dir.glob("*.png"))
#     
#     if image_files:
#         shapes = []
#         for img_path in image_files[:100]:  # Sample first 100
#             try:
#                 img = load_image(img_path)
#                 shapes.append(img.shape)
#             except Exception as e:
#                 print(f"Error loading {img_path}: {e}")
#         
#         if shapes:
#             shapes_array = np.array(shapes)
#             print(f"Image shapes - Mean: {shapes_array.mean(axis=0)}, Std: {shapes_array.std(axis=0)}")
#             print(f"Total images analyzed: {len(shapes)}")
#     else:
#         print("No image files found")
