In [None]:
# =========================================
# 1. Import Libraries
# =========================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from ydata_profiling import ProfileReport

warnings.filterwarnings('ignore')
# Show plots inline in the Jupyter Notebook. The comment is kept on its own
# line because line magics treat trailing text as arguments.
%matplotlib inline

# =========================================
#  2. Load Dataset
# =========================================
def load_dataset(file_path):
    """Read the CSV at ``file_path`` into a DataFrame and return it.

    A confirmation line naming the file is printed after the read succeeds.
    """
    loaded = pd.read_csv(file_path)
    confirmation = f"✅ Dataset Loaded: {file_path}"
    print(confirmation)
    return loaded

# =========================================
#  3. Data Overview
# =========================================
def data_overview(df):
    """Print a structural summary of ``df``; nothing is returned.

    The report lists, in order: the shape, the dtype of every column, the
    number of missing values per column, the count of duplicated rows and
    the deep memory usage per column.
    """
    print("\n--- DATA OVERVIEW ---")

    shape = df.shape
    print(f"Shape: {shape}")  # e.g. (1000, 15)

    column_types = df.dtypes
    print("\nColumn Names and Data Types:\n", column_types)

    missing_per_column = df.isnull().sum()
    print("\nMissing Value Summary:\n", missing_per_column)

    duplicate_rows = df.duplicated().sum()
    print(f"\nNumber of Duplicate Rows: {duplicate_rows}")

    # deep=True also measures the contents of object columns.
    memory_per_column = df.memory_usage(deep=True)
    print("\nMemory Usage:")
    print(memory_per_column)

# =========================================
#  4. Univariate Analysis
# =========================================
def univariate_analysis(df):
    """Print summary statistics and plot each column of ``df`` on its own.

    Numeric columns are drawn together as a grid of 20-bin histograms; every
    non-numeric column gets its own count plot with the bars ordered by
    frequency. Nothing is returned.
    """
    num_cols = df.select_dtypes(include=np.number).columns
    cat_cols = df.select_dtypes(exclude=np.number).columns

    print("\n--- SUMMARY STATISTICS ---")
    print(df.describe(include='all'))

    # Histograms: DataFrame.hist raises ValueError when there is nothing
    # numeric to plot, so skip the grid for datasets without numeric columns.
    if len(num_cols) > 0:
        df[num_cols].hist(figsize=(15, 10), bins=20)
        plt.tight_layout()
        plt.show()

    # Count plots, one figure per non-numeric column.
    for col in cat_cols:
        plt.figure(figsize=(8, 4))
        # Order the bars from the most to the least frequent category.
        sns.countplot(x=df[col], order=df[col].value_counts().index)
        plt.xticks(rotation=45)
        plt.title(f"Count Plot of {col}")
        plt.tight_layout()
        plt.show()

# =========================================
#  5. Bivariate Analysis
# =========================================
def bivariate_analysis(df):
    """Plot pairwise relationships between the columns of ``df``.

    Three kinds of figures are produced:

    * a correlation heatmap, when there is more than one numeric column;
    * a pair plot, when the frame has at most 1000 rows and between one and
      five numeric columns;
    * a box plot for every (numeric column, non-numeric column) pair.

    Nothing is returned.
    """
    num_cols = df.select_dtypes(include=np.number).columns
    cat_cols = df.select_dtypes(exclude=np.number).columns

    if len(num_cols) > 1:
        plt.figure(figsize=(10, 6))
        sns.heatmap(df[num_cols].corr(), annot=True, cmap="coolwarm")
        plt.title("Correlation Matrix")
        plt.show()

    # The lower bound matters: sns.pairplot raises when it is handed a frame
    # without any numeric variable, which the upper bound alone would allow.
    if df.shape[0] <= 1000 and 0 < len(num_cols) <= 5:
        sns.pairplot(df[num_cols])
        plt.show()

    # One box plot per numeric/non-numeric combination.
    for col in num_cols:
        for cat in cat_cols:
            plt.figure(figsize=(8, 4))
            sns.boxplot(x=df[cat], y=df[col])
            plt.xticks(rotation=45)
            plt.title(f"{col} vs {cat}")
            plt.tight_layout()
            plt.show()

# =========================================
#  6. Data Quality Warnings
# =========================================
def data_quality_warnings(df):
    """Print data-quality flags for ``df`` and return its sparse columns.

    Four checks are reported:

    * non-numeric columns with more than 50 distinct values;
    * numeric columns whose absolute skewness exceeds 1;
    * columns in which more than 40% of the values are missing;
    * columns holding exactly one distinct value (missing values are not
      counted as a value).

    Returns:
        list: names of the columns with more than 40% missing values.
    """
    numeric = df.select_dtypes(include=np.number).columns
    categorical = df.select_dtypes(exclude=np.number).columns

    print("\n--- DATA QUALITY WARNINGS ---")

    many_levels = [name for name in categorical if df[name].nunique() > 50]
    print("High Cardinality Columns:", many_levels)

    skew_by_column = df[numeric].skew()
    strongly_skewed = skew_by_column[skew_by_column.abs() > 1]
    print("\nSkewed Columns:\n", strongly_skewed)

    mostly_missing = [
        name for name in df.columns if df[name].isna().mean() > 0.4
    ]
    print("\nColumns with too many missing values:", mostly_missing)

    single_valued = [name for name in df.columns if df[name].nunique() == 1]
    print("\nLow Variance Columns:", single_valued)

    return mostly_missing

# =========================================
#  7. Cleaning & Comparison
# =========================================
def clean_data(df, missing_cols):
    """Return ``df`` without duplicate rows and without ``missing_cols``.

    A two-row table comparing the row and column counts before and after
    cleaning is printed. ``df`` itself is not modified; a new frame is
    returned.
    """
    df_cleaned = df.drop_duplicates().drop(columns=missing_cols)

    rows_before, cols_before = df.shape
    rows_after, cols_after = df_cleaned.shape
    comparison = pd.DataFrame(
        {
            "Before Cleaning": [rows_before, cols_before],
            "After Cleaning": [rows_after, cols_after],
        },
        index=["Rows", "Columns"],
    )

    print("\n--- BEFORE vs AFTER CLEANING ---")
    print(comparison)

    return df_cleaned

# =========================================
#  8. Generate HTML Report
# =========================================
def generate_report(df, filename):
    """Build a profiling report for ``df`` and save it as ``<filename>.html``.

    ``filename`` is used without extension: it appears in the report title
    and ".html" is appended to form the output path. A confirmation line is
    printed after the file is written.
    """
    report_title = f"EDA Report - {filename}"
    output_path = f"{filename}.html"

    report = ProfileReport(df, title=report_title, explorative=True)
    report.to_file(output_path)

    print(f"✅ Report Generated: {filename}.html")

# =========================================
#  9. Run Full EDA
# =========================================
def run_eda(file_path):
    """Run every EDA step on the CSV at ``file_path``.

    Order of operations: load the data, print the overview, draw the
    univariate and bivariate plots, print the data-quality warnings, write
    the "before" report, clean the data, then write the "after" report.
    """
    raw = load_dataset(file_path)

    # Exploration of the data exactly as loaded.
    data_overview(raw)
    univariate_analysis(raw)
    bivariate_analysis(raw)
    sparse_columns = data_quality_warnings(raw)
    generate_report(raw, "EDA_Report_Before")

    # Drop duplicates and the sparse columns, then profile the result.
    cleaned = clean_data(raw, sparse_columns)
    generate_report(cleaned, "EDA_Report_After")

    print("\n✅ All EDA Steps Completed Successfully!")

# =========================================
# 📌 10. Run for Dataset
# =========================================
# NOTE: "your_dataset.csv" looks like a placeholder; point it at the CSV file
# to analyse before running this cell.
run_eda("your_dataset.csv")

# Example output (console)

"""
✅ Dataset Loaded: your_dataset.csv

--- DATA OVERVIEW ---
Shape: (1000, 15)

Column Names and Data Types:
id                 int64
age                float64
gender             object
salary             float64
...

Missing Value Summary:
id           0
age          5
gender       0
salary       12
...

Number of Duplicate Rows: 3

Memory Usage:
Index           128
id             8000
age            8000
gender         8000
salary         8000
...

--- SUMMARY STATISTICS ---
               id      age     salary  gender
count  1000.0000  995.000  988.000000   1000
mean   500.5000   35.200  55000.12345   NaN
...

--- DATA QUALITY WARNINGS ---
High Cardinality Columns: ['name']

Skewed Columns:
 age       1.45
salary    2.34
dtype: float64

Columns with too many missing values: ['address']

Low Variance Columns: ['country']
✅ Report Generated: EDA_Report_Before.html

--- BEFORE vs AFTER CLEANING ---
         Before Cleaning  After Cleaning
Rows                1000             997
Columns               15              14
✅ Report Generated: EDA_Report_After.html

✅ All EDA Steps Completed Successfully!

"""