In [1]:
# Question: Advanced Data Profiling and Outlier Detection
# Description: Perform detailed data profiling of numeric columns, including outlier detection.




In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

def advanced_data_profiling(df):
    """
    Performs advanced data profiling and outlier detection for numeric columns
    in a Pandas DataFrame.

    Args:
        df: The Pandas DataFrame to profile.

    Returns:
        A dictionary containing profiling information and identified outliers for
        each numeric column.
    """

    profile_report = {}
    outlier_report = {}

    numeric_cols = df.select_dtypes(include=['number']).columns

    for col in numeric_cols:
        print(f"--- Profiling Column: '{col}' ---")

        # 1. Basic Descriptive Statistics (already covered by .describe())
        print(df[col].describe())
        print("\n")

        # 2. Missing Values (re-check for completeness)
        missing_count = df[col].isnull().sum()
        missing_percentage = (missing_count / len(df)) * 100
        print(f"Missing Values: {missing_count} ({missing_percentage:.2f}%)")
        print("\n")

        # 3. Distribution Visualization
        plt.figure(figsize=(12, 5))

        plt.subplot(1, 2, 1)
        sns.histplot(df[col], kde=True)
        plt.title(f"Distribution of '{col}'")
        plt.xlabel(col)
        plt.ylabel("Frequency")

        plt.subplot(1, 2, 2)
        sns.boxplot(x=df[col])
        plt.title(f"Boxplot of '{col}'")
        plt.xlabel(col)

        plt.tight_layout()
        plt.show()

        # 4. Outlier Detection (using IQR method)
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        outliers_iqr = df[(df[col] < lower_bound) | (df[col] > upper_bound)][col].tolist()