# Exploratory Data Analysis (EDA) Workflow

This notebook provides a complete EDA workflow including:
- Data loading and inspection
- Statistical summaries
- Data visualization
- Data quality checks
- Report generation

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Plot appearance defaults: seaborn's 'whitegrid' style and a
# 12x6 inch default figure size for matplotlib figures.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Load and Inspect Data

In [None]:
# Load data
# Uncomment the line below and point it at your dataset; the rest of this
# cell expects a pandas DataFrame bound to the name `df`.
# df = pd.read_csv('your_data.csv')

# Overview: dimensions, column labels and per-column dtypes.
print('Dataset Shape:', df.shape)
print('\nColumn Names:', df.columns.tolist(), sep='\n')
print('\nData Types:', df.dtypes, sep='\n')
print('\nFirst Few Rows:')
# Kept as the cell's final expression so the notebook renders the preview.
df.head()

## 2. Statistical Summary

In [None]:
# Numerical summary
# Printed explicitly: a notebook cell only auto-displays its final
# expression, so a bare `df.describe()` followed by more code in the same
# cell would be evaluated and silently discarded.
print(df.describe())

# Categorical summary
# Frequency table for every object-dtype column and every pandas
# 'category' column.
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
for col in categorical_cols:
    print(f'\n{col}:')
    print(df[col].value_counts())

## 3. Data Quality Checks

In [None]:
# Missing values
# Per-column null count and the share of rows it represents (in percent).
missing = df.isna().sum()
missing_pct = missing.div(len(df)).mul(100)
missing_df = pd.DataFrame(
    {'Missing Count': missing, 'Missing Percentage': missing_pct}
)
# Report only the columns that actually contain nulls.
has_missing = missing_df['Missing Count'] > 0
print(missing_df.loc[has_missing])

# Duplicates
# Counts rows that are exact repeats of an earlier row.
duplicate_total = df.duplicated().sum()
print(f'\nDuplicate Rows: {duplicate_total}')

## 4. Data Visualization

In [None]:
# Distribution plots for numerical columns
numerical_cols = df.select_dtypes(include=[np.number]).columns
for col in numerical_cols[:5]:  # First 5 numerical columns
    # One figure per column: histogram on the left, box plot on the right.
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(10, 4))
    df[col].hist(bins=30, ax=hist_ax)
    hist_ax.set_title(f'Distribution of {col}')
    # A pandas Series has no .boxplot() method (calling it raises
    # AttributeError); DataFrame.boxplot with `column=` draws the box plot
    # for just this column on the given axes.
    df.boxplot(column=col, ax=box_ax)
    box_ax.set_title(f'Box Plot of {col}')
    fig.tight_layout()
    plt.show()

## 5. Correlation Analysis

In [None]:
# Correlation matrix
# Uses `numerical_cols` computed in the visualization cell; a heatmap is
# only drawn when there are at least two numeric columns to compare.
if len(numerical_cols) >= 2:
    corr_matrix = df[numerical_cols].corr()
    fig, ax = plt.subplots(figsize=(10, 8))
    # center=0 puts the midpoint of the diverging colormap at zero correlation.
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
    ax.set_title('Correlation Matrix')
    fig.tight_layout()
    plt.show()