# InsightfulPy Example

This notebook demonstrates the complete InsightfulPy workflow for exploratory data analysis.

## Table of Contents
1. [Setup and Data Loading](#setup)
2. [Initial Data Overview](#overview)
3. [Data Quality Assessment](#quality)
4. [Statistical Summary](#summary)
5. [Data Visualization](#visualization)
6. [Relationship Analysis](#relationships)
7. [Advanced Analysis](#advanced)
8. [Conclusions](#conclusions)

## 1. Setup and Data Loading <a id="setup"></a>

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import insightfulpy as ipy
import warnings
warnings.filterwarnings('ignore')

# List the help entry points InsightfulPy exposes (one line per argument,
# joined with newlines so the output matches line-by-line printing).
print(
    "InsightfulPy Help System:",
    "=" * 25,
    "Available help functions:",
    "- ipy.help()         # Comprehensive overview",
    "- ipy.quick_start()  # Quick start guide",
    "- ipy.examples()     # Usage examples",
    "- ipy.list_all()     # All functions",
    sep="\n",
)

In [None]:
# Create sample dataset for demonstration.
# The global NumPy seed is fixed so every run produces the same data.
np.random.seed(42)
n_samples = 1000

# Generate sample data: one NumPy array (or range) per column, each of
# length n_samples.
data = {
    'customer_id': range(1, n_samples + 1),
    'age': np.random.normal(35, 12, n_samples).astype(int),
    'income': np.random.exponential(50000, n_samples),
    'spending_score': np.random.beta(2, 5, n_samples) * 100,
    'region': np.random.choice(['North', 'South', 'East', 'West'], n_samples, p=[0.3, 0.3, 0.2, 0.2]),
    'category': np.random.choice(['Premium', 'Standard', 'Basic'], n_samples, p=[0.2, 0.5, 0.3]),
    'satisfaction': np.random.choice(['Very Low', 'Low', 'Medium', 'High', 'Very High'], n_samples),
    'purchase_amount': np.random.gamma(2, 1000, n_samples)
}

# Create DataFrame
df = pd.DataFrame(data)

# Introduce some missing values: 25 rows in 'income' and 25 different rows
# in 'satisfaction'.
# This is done on the DataFrame, not on the raw arrays in `data`: the
# 'satisfaction' array is a fixed-width NumPy string array, and assigning
# np.nan into it stores the literal text 'nan', which pandas does not treat
# as a missing value.
missing_indices = np.random.choice(n_samples, 50, replace=False)
df.loc[missing_indices[:25], 'income'] = np.nan
df.loc[missing_indices[25:], 'satisfaction'] = np.nan

# Add some outliers: 20 randomly chosen rows get an income of three times
# the 95th-percentile income (quantile() ignores the NaNs set above).
outlier_indices = np.random.choice(n_samples, 20, replace=False)
df.loc[outlier_indices, 'income'] = df['income'].quantile(0.95) * 3

print(f"Dataset created with {len(df)} rows and {len(df.columns)} columns")
print("\nFirst 5 rows:")
df.head()

## 2. Initial Data Overview <a id="overview"></a>

In [None]:
# Get comprehensive dataset overview.
# The string is passed first (presumably a display title for the report —
# confirm against the InsightfulPy docs), followed by the DataFrame.
ipy.columns_info('Customer Analysis Dataset', df)

In [None]:
# Quick help reminder: invoke the package's help entry point, the one the
# setup cell lists as the "Comprehensive overview".
ipy.help()

## 3. Data Quality Assessment <a id="quality"></a>

In [None]:
# Check for missing and infinite values.
# Both checks are requested explicitly through the `missing` and `inf`
# keyword flags.
ipy.missing_inf_values(df, missing=True, inf=True)

In [None]:
# Visualize missing data patterns for the whole DataFrame.
# NOTE(review): presumably renders a figure as a side effect; the return
# value is not used here.
ipy.show_missing(df)

In [None]:
# Check for mixed data types.
# NOTE(review): presumably flags columns that hold more than one value
# type — confirm against the InsightfulPy docs.
ipy.detect_mixed_data_types(df)

In [None]:
# Detect outliers and keep the result in `outliers`.
outliers = ipy.detect_outliers(df)
print("Outlier Analysis Results:")
# Bare expression on the last line so Jupyter displays the result.
outliers

## 4. Statistical Summary <a id="summary"></a>

In [None]:
# Comprehensive numerical summary.
# Prints a heading, stores the summary in `num_summary`, then displays it
# as the cell output.
print("NUMERICAL COLUMNS SUMMARY")
print("=" * 30)
num_summary = ipy.num_summary(df)
num_summary

In [None]:
# Categorical columns summary.
# Prints a heading, stores the summary in `cat_summary`, then displays it
# as the cell output.
print("CATEGORICAL COLUMNS SUMMARY")
print("=" * 32)
cat_summary = ipy.cat_summary(df)
cat_summary

In [None]:
# Distribution characteristics (skewness and kurtosis, per the function
# name).
# The result is stored in `dist_stats` and displayed as the cell output.
print("DISTRIBUTION CHARACTERISTICS")
print("=" * 30)
dist_stats = ipy.calculate_skewness_kurtosis(df)
dist_stats

## 5. Data Visualization <a id="visualization"></a>

In [None]:
# Box plots for outlier visualization.
# The whole DataFrame is passed; no column selection is made here.
print("Box Plots - Outlier Detection")
ipy.plot_boxplots(df)

In [None]:
# Distribution analysis in batches.
# Called without `batch_num`; the return value is stored and displayed.
# NOTE(review): presumably this call lists the available batches rather
# than plotting them. If it only prints the listing and returns None, the
# final line displays nothing — confirm.
print("Available KDE plot batches:")
kde_batches = ipy.kde_batches(df)
kde_batches

In [None]:
# Plot KDE distributions for batch number 1.
ipy.kde_batches(df, batch_num=1)

In [None]:
# Categorical data visualization.
# Called without `batch_num`; the return value is stored and displayed.
# NOTE(review): presumably this call lists the available batches rather
# than plotting them — confirm.
print("Available categorical plot batches:")
cat_batches = ipy.cat_bar_batches(df)
cat_batches

In [None]:
# Plot categorical distributions for batch number 1, with the
# `show_percentage` option switched on.
ipy.cat_bar_batches(df, batch_num=1, show_percentage=True)

In [None]:
# Q-Q plots for normality assessment, batch number 1.
ipy.qq_plot_batches(df, batch_num=1)

## 6. Relationship Analysis <a id="relationships"></a>

In [None]:
# Numerical vs numerical relationships.
# Called without `pair_num`/`batch_num`; the result is stored in `num_pairs`.
# NOTE(review): `.head(10)` assumes the call returns a DataFrame-like
# object. If it only prints the listing and returns None, this line
# raises AttributeError — confirm.
print("Available numerical relationship pairs:")
num_pairs = ipy.num_vs_num_scatterplot_pair_batch(df)
num_pairs.head(10)

In [None]:
# Plot numerical relationships for pair 1, batch 1, using the 'category'
# column as the hue.
ipy.num_vs_num_scatterplot_pair_batch(df, pair_num=1, batch_num=1, hue_column='category')

In [None]:
# Categorical vs categorical relationships.
# Called without `pair_num`/`batch_num`; the result is stored in `cat_pairs`.
# NOTE(review): `.head(5)` assumes the call returns a DataFrame-like
# object. If it only prints the listing and returns None, this line
# raises AttributeError — confirm.
print("Available categorical relationship pairs:")
cat_pairs = ipy.cat_vs_cat_pair_batch(df)
cat_pairs.head(5)

In [None]:
# Plot categorical relationships for pair 0, batch 1.
# NOTE(review): this cell uses pair_num=0 while the other relationship
# cells use pair_num=1 — confirm whether pair numbering is 0- or 1-based.
ipy.cat_vs_cat_pair_batch(df, pair_num=0, batch_num=1)

In [None]:
# Numerical vs categorical analysis.
# Called without `pair_num`/`batch_num`; the result is stored in `mixed_pairs`.
# NOTE(review): `.head(5)` assumes the call returns a DataFrame-like
# object. If it only prints the listing and returns None, this line
# raises AttributeError — confirm.
print("Available numerical vs categorical pairs:")
mixed_pairs = ipy.num_vs_cat_box_violin_pair_batch(df)
mixed_pairs.head(5)

In [None]:
# Plot numerical vs categorical relationships for pair 1, batch 1
# (box/violin plots, per the function name).
ipy.num_vs_cat_box_violin_pair_batch(df, pair_num=1, batch_num=1)

## 7. Advanced Analysis <a id="advanced"></a>

In [None]:
# Group analysis by category: summarise the dataset per value of the
# 'category' column and keep the result in `grouped_analysis`.
print("GROUP ANALYSIS BY CATEGORY")
print("=" * 30)
grouped_analysis = ipy.grouped_summary(df, groupby='category')
# Leave the result as the cell's last expression instead of print()-ing it,
# so Jupyter renders it with its rich representation, the same way the other
# summary cells in this notebook show their results.
grouped_analysis

In [None]:
# Individual column deep dive - Income analysis.
# Analyses the 'income' column with 'category' passed as the target and
# plotting switched on via `visualize=True`.
print("INCOME ANALYSIS BY CATEGORY")
print("=" * 30)
ipy.num_analysis_and_plot(df, 'income', target='category', visualize=True)

In [None]:
# Regional analysis.
# Analyses the 'region' column with 'category' passed as the target and
# plotting switched on via `visualize=True`.
print("REGIONAL ANALYSIS")
print("=" * 18)
ipy.cat_analyze_and_plot(df, 'region', target='category', visualize=True)

In [None]:
# Custom statistical analysis of the income column.
print("CUSTOM STATISTICAL ANALYSIS - INCOME")
print("=" * 38)
# Missing incomes are dropped before the statistics are computed.
income_stats = ipy.calc_stats(df['income'].dropna())
# The result is iterated as a mapping of statistic name -> value.
# NOTE(review): the `:,.2f` format assumes every value is a real number; a
# non-numeric entry (e.g. a string or list) would raise here — confirm.
for stat, value in income_stats.items():
    print(f"{stat:15}: {value:,.2f}")

In [None]:
# Interconnected outlier analysis.
print("INTERCONNECTED OUTLIER ANALYSIS")
print("=" * 33)
# Only these four numeric columns are passed in; 'customer_id' is left out.
numeric_cols = ['age', 'income', 'spending_score', 'purchase_amount']
interconnected = ipy.interconnected_outliers(df, numeric_cols)
# The result is used with len() and .head(), i.e. it is treated as a
# DataFrame-like collection of rows.
print(f"Found {len(interconnected)} rows with outliers in multiple columns")
# Show a preview only when at least one row was returned.
if len(interconnected) > 0:
    print("\nFirst 5 interconnected outlier rows:")
    print(interconnected.head())

## 8. Conclusions <a id="conclusions"></a>

In [None]:
# Headline figures, computed live from `df`: shape, total count of missing
# cells, and the number of numeric / object-dtype columns.
print(
    "ANALYSIS SUMMARY",
    "=" * 17,
    f"Dataset Size: {len(df)} rows, {len(df.columns)} columns",
    f"Missing Values: {df.isnull().sum().sum()} total",
    f"Numerical Columns: {len(df.select_dtypes(include='number').columns)}",
    f"Categorical Columns: {len(df.select_dtypes(include='object').columns)}",
    sep="\n",
)

# Key insights (fixed narrative text, not derived from the data above)
print(
    "\nKEY INSIGHTS:",
    "- Income shows right-skewed distribution with outliers",
    "- Spending scores follow beta distribution pattern",
    "- Regional distribution is fairly balanced",
    "- Premium customers show different spending patterns",
    "- Some customers are outliers across multiple metrics",
    sep="\n",
)

# Recommendations (fixed narrative text)
print(
    "\nRECOMMENDATIONS:",
    "- Investigate high-income outliers for data quality",
    "- Consider log transformation for income analysis",
    "- Develop targeted strategies for different customer categories",
    "- Monitor interconnected outliers for potential data issues",
    sep="\n",
)

In [None]:
# Closing pointer back to the built-in help functions (one line per
# argument, joined with newlines).
print(
    "Remember: InsightfulPy provides comprehensive help!",
    "\nUse these functions anytime:",
    "- ipy.help()         # Function overview",
    "- ipy.quick_start()  # Getting started",
    "- ipy.examples()     # Code examples",
    "- ipy.list_all()     # All functions",
    "\nHappy analyzing! 📊",
    sep="\n",
)