<a href="https://colab.research.google.com/github/shrutimalik123/python-collab-1/blob/main/Simple_Data_Profiler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

# --- 1. Simulate Input Data ---
# Stand-in for data that would normally be read from a CSV, a SQL query, or cloud storage.
# The sample is deliberately messy: 'Age' has two missing entries and 'Diagnosis_Code' has one.
data = dict(
    Patient_ID=range(101, 116),
    Age=[35, 52, 78, 44, np.nan, 61, 29, 88, 55, 41, 67, 30, 50, 72, np.nan],
    Systolic_BP=[130, 145, 160, 125, 135, 150, 120, 170, 140, 128, 155, 118, 138, 158, 142],
    Diastolic_BP=[85, 95, 105, 80, 90, 98, 75, 110, 88, 82, 102, 72, 86, 100, 92],
    Diagnosis_Code=[
        'I10', 'E11', np.nan, 'I10', 'J45',
        'I10', 'E11', 'I10', 'J45', 'I10',
        'E11', 'J45', 'I10', 'E11', 'J45',
    ],
    Insurance_Status=[
        'Active', 'Active', 'Inactive', 'Active', 'Active',
        'Inactive', 'Active', 'Active', 'Inactive', 'Active',
        'Active', 'Active', 'Inactive', 'Active', 'Active',
    ],
)
# One row per patient; column order follows the insertion order of `data`.
df = pd.DataFrame(data)

# --- 2. Data Cleaning and Preprocessing ---

def clean_and_profile_data(dataframe):
    """
    Report missing values, impute them, and return the cleaned dataframe.

    Missing 'Age' values are replaced with the column mean (computed before
    filling) and missing 'Diagnosis_Code' values with the literal 'UNKNOWN'.
    A head snapshot and per-column missing-value counts are printed before
    the cleaning, and the counts are printed again afterwards.

    Args:
        dataframe: pandas DataFrame that contains at least the 'Age' and
            'Diagnosis_Code' columns. It is modified in place; pass a copy
            if the original must be preserved.

    Returns:
        The same DataFrame object that was passed in, with both columns
        imputed.
    """
    print("--- Initial Data Snapshot (Head) ---")
    print(dataframe.head())
    print("\n--- Initial Missing Value Count ---")
    print(dataframe.isnull().sum())

    # Strategy 1: Fill missing 'Age' with the mean (common for numerical features).
    # The filled column is assigned back rather than calling
    # `dataframe['Age'].fillna(..., inplace=True)`: that chained in-place form
    # operates on a temporary under pandas Copy-on-Write and would leave the
    # frame unchanged (pandas 2.x already warns about it).
    mean_age = dataframe['Age'].mean()
    dataframe['Age'] = dataframe['Age'].fillna(mean_age)

    # Strategy 2: Fill missing categorical 'Diagnosis_Code' with a placeholder.
    dataframe['Diagnosis_Code'] = dataframe['Diagnosis_Code'].fillna('UNKNOWN')

    print("\n--- Data Cleaning Complete ---")
    print("Age filled with mean: {:.2f}".format(mean_age))
    print("Diagnosis_Code filled with 'UNKNOWN'")
    print("\n--- Final Missing Value Check ---")
    print(dataframe.isnull().sum())

    return dataframe

# Clean a copy: the cleaning step edits its argument in place, and the raw `df` should stay as loaded.
cleaned_df = clean_and_profile_data(dataframe=df.copy())

# --- 3. Data Profiling and Feature Engineering ---

def generate_insights(dataframe):
    """
    Print summary statistics and add a derived 'MAP' column to the dataframe.

    Reads the 'Age', 'Systolic_BP', 'Diastolic_BP' and 'Diagnosis_Code'
    columns. The new 'MAP' (Mean Arterial Pressure) column is written onto
    the dataframe that was passed in; nothing is returned.
    """
    systolic = dataframe['Systolic_BP']
    diastolic = dataframe['Diastolic_BP']

    print("\n--- Descriptive Statistics for Key Features ---")

    # Summary statistics for Age, as produced by describe().
    print("\nAge Statistics:\n", dataframe['Age'].describe())

    # Headline blood pressure figures.
    print("\nBlood Pressure (BP) Statistics:")
    print(f"Mean Systolic BP: {systolic.mean():.2f}")
    print(f"Max Diastolic BP: {diastolic.max()}")

    # Derived feature: Mean Arterial Pressure, (systolic + 2 * diastolic) / 3.
    # Derived columns like this are a common input to predictive models.
    dataframe['MAP'] = (systolic + 2 * diastolic) / 3
    print("\nSuccessfully calculated new feature 'MAP' (Mean Arterial Pressure).")
    print("MAP for first 5 patients:\n", dataframe['MAP'].head())

    # Frequency of each diagnosis code.
    print("\n--- Diagnosis Code Distribution ---")
    code_counts = dataframe['Diagnosis_Code'].value_counts()
    print(code_counts)

# Profile the cleaned frame; this prints the statistics and adds the 'MAP' column to cleaned_df.
generate_insights(dataframe=cleaned_df)

# --- 4. Prepare for MLOps (Optional but good practice) ---
# Cleaned data is often persisted as a Parquet file, which keeps column dtypes and loads quickly in ML pipelines (to_parquet needs pyarrow or fastparquet installed).
# cleaned_df.to_parquet('cleaned_patient_data.parquet', index=False)
# print("\nCleaned data saved to 'cleaned_patient_data.parquet'")