# 01 Data Exploration Notebook
This notebook performs an initial exploration of the Iris dataset: loading it, inspecting its structure,
checking data quality, and summarizing its features and target variable.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys

## Block 1: Data Loading

In [None]:
import pandas as pd
from data_preprocessing import DataLoader

# Instantiate the project's DataLoader with no arguments.
loader = DataLoader()

# Load the iris dataset through the loader. `save_raw=True` is forwarded to
# the loader -- presumably it persists a raw copy; confirm in DataLoader.
df = loader.load_sklearn_dataset("iris", save_raw=True)

# Plain string literal: nothing is interpolated, so no f-prefix is needed.
print("Dataset loaded successfully!")
print(f"Dataset shape: {df.shape}")

## Block 2: Initial Data Inspection

In [None]:
# Preview the first rows of the dataset.
print("First few rows of the dataset:")
print(df.head())

# DataFrame.info() writes its report directly to stdout and returns None, so
# it is called on its own; wrapping it in print() would emit a stray "None".
print("\nDataset info:")
df.info()

# Summary statistics as computed by DataFrame.describe().
print("\nBasic statistics:")
print(df.describe())

## Block 3: Missing Values and Data Quality

In [None]:
from data_preprocessing import get_dataset_info

# Build the dataset report once; `info` is kept as a notebook-level name.
info = get_dataset_info(df)

# Pull out the individual metrics that the report below prints.
missing_by_column = info['missing_values']
total_missing = sum(missing_by_column.values())
duplicate_count = info['duplicates']
memory_mb = info['memory_usage_mb']

# Headline figures.
print("Data Quality Report:")
print(f"  - Shape: {info['shape']}")
print(f"  - Missing values: {total_missing}")
print(f"  - Duplicates: {duplicate_count}")
print(f"  - Memory usage: {memory_mb:.2f} MB")

# Per-column breakdown of the missing-value counts.
print("\nMissing values by column:")
for column_name, missing_count in missing_by_column.items():
    print(f"  {column_name}: {missing_count}")

## Block 4: Feature Analysis

In [None]:
# List the column groups reported by `info`.
print("Feature Analysis:")
print(f"  - Numeric columns: {info['numeric_columns']}")
print(f"  - Categorical columns: {info['categorical_columns']}")

# Always define `numeric_df` so the name exists regardless of the guard below.
numeric_df = df[info['numeric_columns']]

# DataFrame.describe() raises ValueError on a frame with no columns, so the
# numeric summary is skipped when there are no numeric columns -- mirroring
# the guard already applied to the categorical summary.
if len(info['numeric_columns']) > 0:
    print("\nNumeric columns summary:")
    print(numeric_df.describe())

# Report the number of distinct values in each categorical column, if any.
if info['categorical_columns']:
    print("\nCategorical columns summary:")
    for col in info['categorical_columns']:
        print(f"  {col}: {df[col].nunique()} unique values")

## Block 5: Target Variable Distribution

In [None]:
# Name of the column whose value distribution is summarised.
target_col = 'target'
if target_col in df.columns:
    print(f"Target variable '{target_col}' distribution:")
    # Absolute count per distinct value, ordered by the value itself.
    print(df[target_col].value_counts().sort_index())
    # Plain string literal: nothing is interpolated, so no f-prefix is needed.
    print("\nClass distribution (%):")
    # Relative frequency per distinct value as a percentage (two decimals).
    print((df[target_col].value_counts(normalize=True).sort_index() * 100).round(2))
else:
    # Derive the message from `target_col` so it stays correct if the name changes.
    print(f"No '{target_col}' column found in dataset")

## Block 6: Data Exploration Summary

In [None]:
# Closing banner. The check mark was previously stored as mojibake
# (UTF-8 bytes decoded as cp1252); it is restored to the intended character.
print("\n" + "="*60)
print("✓ Dataset exploration completed!")
print("="*60)

# Headline numbers taken directly from the DataFrame.
print("\nSummary:")
print(f"  - Total samples: {df.shape[0]}")
# Counts every column of `df`, i.e. df.shape[1].
print(f"  - Total features: {df.shape[1]}")
print(f"  - Data types: {df.dtypes.nunique()} unique types")
# Rows remaining after dropna(), i.e. rows with no missing value.
print(f"  - Complete rows: {len(df.dropna())}")
print("="*60)