# 01 Data Exploration Notebook
 This notebook performs an initial exploration of a dataset, including loading, inspecting, and summarizing
 its features and target variable.


In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys

## Block 1: Data Loading

In [9]:
import pandas as pd
from data_preprocessing import DataLoader

# Initialize data loader
loader = DataLoader()

# Load iris dataset
df = loader.load_sklearn_dataset("iris", save_raw=True)
print(f"Dataset loaded successfully!")
print(f"Dataset shape: {df.shape}")

Dataset saved to C:\Users\nobu\PyCharmMiscProject\data\raw\iris.csv
Dataset loaded successfully!
Dataset shape: (150, 5)


## Block 2: Initial Data Inspection

In [10]:
# Display first few rows
print("First few rows of the dataset:")
print(df.head())

print("\nDataset info:")
print(df.info())

print("\nBasic statistics:")
print(df.describe())

First few rows of the dataset:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target  
0       0  
1       0  
2       0  
3       0  
4       0  

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   target    

## Block 3: Missing Values and Data Quality

In [11]:
from data_preprocessing import get_dataset_info

# Get comprehensive dataset information
info = get_dataset_info(df)

print("Data Quality Report:")
print(f"  - Shape: {info['shape']}")
print(f"  - Missing values: {sum(info['missing_values'].values())}")
print(f"  - Duplicates: {info['duplicates']}")
print(f"  - Memory usage: {info['memory_usage_mb']:.2f} MB")

print("\nMissing values by column:")
for col, count in info['missing_values'].items():
    print(f"  {col}: {count}")

Data Quality Report:
  - Shape: (150, 5)
  - Missing values: 0
  - Duplicates: 1
  - Memory usage: 0.01 MB

Missing values by column:
  sepal length (cm): 0
  sepal width (cm): 0
  petal length (cm): 0
  petal width (cm): 0
  target: 0


## Block 4: Feature Analysis

In [12]:
print("Feature Analysis:")
print(f"  - Numeric columns: {info['numeric_columns']}")
print(f"  - Categorical columns: {info['categorical_columns']}")

print("\nNumeric columns summary:")
numeric_df = df[info['numeric_columns']]
print(numeric_df.describe())

if info['categorical_columns']:
    print("\nCategorical columns summary:")
    for col in info['categorical_columns']:
        print(f"  {col}: {df[col].nunique()} unique values")

Feature Analysis:
  - Numeric columns: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)', 'target']
  - Categorical columns: []

Numeric columns summary:
       sepal length (cm)  sepal width (cm)  petal length (cm)  \
count         150.000000        150.000000         150.000000   
mean            5.843333          3.057333           3.758000   
std             0.828066          0.435866           1.765298   
min             4.300000          2.000000           1.000000   
25%             5.100000          2.800000           1.600000   
50%             5.800000          3.000000           4.350000   
75%             6.400000          3.300000           5.100000   
max             7.900000          4.400000           6.900000   

       petal width (cm)      target  
count        150.000000  150.000000  
mean           1.199333    1.000000  
std            0.762238    0.819232  
min            0.100000    0.000000  
25%            0.300000    0.000000  


Block 5: Target Variable Distribution

In [13]:
# Analyze target variable distribution
target_col = 'target'
if target_col in df.columns:
    print(f"Target variable '{target_col}' distribution:")
    print(df[target_col].value_counts().sort_index())
    print(f"\nClass distribution (%):")
    print((df[target_col].value_counts(normalize=True).sort_index() * 100).round(2))
else:
    print("No 'target' column found in dataset")

Target variable 'target' distribution:
target
0    50
1    50
2    50
Name: count, dtype: int64

Class distribution (%):
target
0    33.33
1    33.33
2    33.33
Name: proportion, dtype: float64


## Block 6: Data Exploration Summary

In [14]:
print("\n" + "="*60)
print("✓ Dataset exploration completed!")
print("="*60)
print(f"\nSummary:")
print(f"  - Total samples: {df.shape[0]}")
print(f"  - Total features: {df.shape[1]}")
print(f"  - Data types: {df.dtypes.nunique()} unique types")
print(f"  - Complete rows: {len(df.dropna())}")
print("="*60)


✓ Dataset exploration completed!

Summary:
  - Total samples: 150
  - Total features: 5
  - Data types: 2 unique types
  - Complete rows: 150
