# Data Cleaning Workflow with edaflow
This notebook demonstrates common data cleaning steps using edaflow utilities.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import edaflow as eda

## 1. Load Sample Data
Synthetic dataset with missing values and outliers.

In [None]:
# Build a 10-row synthetic frame. The NaN entries are the missing values to
# impute later, and the 100 in feature1 is the deliberate outlier.
df = pd.DataFrame(
    dict(
        feature1=[2, np.nan, 1, 5, 100, 2, 3, 4, np.nan, 1],
        feature2=[7, 8, np.nan, 5, 7, 8, 6, 5, 7, 8],
        category=['A', 'B', 'A', 'C', 'A', 'B', 'C', 'A', 'B', 'C'],
    )
)
# Preview the first rows of the raw data.
df.head()

## 2. Visualize Missing Values
Use edaflow to highlight missing data.

In [None]:
# Render edaflow's anomaly highlighting for the raw frame. No `method` is passed,
# so this relies on the library default — presumably missing values, per this
# section's heading (TODO confirm against the edaflow documentation).
eda.highlight_anomalies(df)

## 3. Impute Missing Values
Fill the missing values in each numeric column with that column's mean.

In [None]:
# Mean-impute each numeric column; Series.mean() skips NaN by default, so the
# fill value is the average of the observed entries only.
# NOTE: feature1 still contains the 100 outlier at this point, so its fill value
# (14.75) is pulled well above the typical 1-5 range of the other observations.
for col in ('feature1', 'feature2'):
    df[col] = df[col].fillna(df[col].mean())

## 4. Detect and Handle Outliers
Use edaflow to visualize outliers, then cap `feature1` at 50.

In [None]:
# Highlight outliers first so the display reflects the values before capping.
eda.highlight_anomalies(df, method='outlier')
# Cap feature1 at 50: anything above the ceiling becomes 50, everything else
# (including values exactly equal to 50) is left untouched.
df['feature1'] = df['feature1'].clip(upper=50)

## 5. Group Rare Categories
Group rare categories in categorical columns.

In [None]:
# Overwrite the column with the result of edaflow's rare-category grouping.
# NOTE(review): threshold=0.2 is presumably a minimum relative frequency. In this
# sample A/B/C occur 4/3/3 times out of 10 — confirm whether any level is
# actually grouped at this threshold.
df['category'] = eda.group_rare_categories(df['category'], threshold=0.2)

## 6. Final Cleaned Data
Display the first rows of the cleaned DataFrame.

In [None]:
# Show the first five rows (the DataFrame.head default) after all cleaning steps.
df.head()