In [1]:
from utils import base_configs, deps, tr_va_te_split
from utils.helpers import dir_helpers, rw_csv_helpers, feature_distr_helpers

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import sys, math
import import_ipynb
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

### 0 Data load

In [3]:
# Relative paths to the cleaned/encoded CSV inputs.
# NOTE: despite the `df_` prefix, these three names hold path strings, not DataFrames.
df_hints6 = "ip/3_cleanedEncoded/hints6_public_filtered_v1_cleaned_encoded.csv"
# NOTE(review): "....csv" looks like an unfilled placeholder filename — confirm before this path is used.
df_hints7 = "ip/3_cleanedEncoded/....csv"
df_hints6_7 = "ip/3_cleanedEncoded/hints6_7_cleaned_encoded.csv"
# Only the hints6 file is read in this cell; the other two paths are defined but not loaded here.
# `read_csv_file` is a project helper; with verbose = 1 it prints the resolved path, shape and column list.
df_orig = rw_csv_helpers.read_csv_file(df_hints6, verbose = 1)

Loaded: /home/ppanta/puru_proj/proj_v0/hints6_v0/ip/3_cleanedEncoded/hints6_public_filtered_v1_cleaned_encoded.csv
────────────────────────────────────────────────────────────────────────────────
Shape: (4865, 27)
────────────────────────────────────────────────────────────────────────────────
All columns: ['FreqGoProvider', 'Deaf', 'MedConditions_Diabetes', 'MedConditions_HighBP', 'MedConditions_HeartCondition', 'MedConditions_LungDisease', 'MedConditions_Depression', 'AverageSleepNight', 'AverageTimeSitting', 'EverHadCancer', 'Age', 'BirthGender', 'BMI', 'PHQ4', 'WeeklyMinutesModerateExercise', 'AvgDrinksPerWeek', 'GeneralHealth_Excellent', 'GeneralHealth_VeryGood', 'GeneralHealth_Good', 'GeneralHealth_Fair', 'GeneralHealth_Poor', 'smokeStat_Current', 'smokeStat_Former', 'smokeStat_Never', 'eCigUse_Current', 'eCigUse_Former', 'eCigUse_Never']


In [4]:
# Tally the 0/1 values of the heart-condition flag. The helper is project-local and is given
# a copy of the frame; with verbose=1 it prints the per-value counts and the 0/1 total.
# NOTE(review): presumably `counts` holds that per-value tally — confirm against count01's return value.
counts = feature_distr_helpers.count01(df_orig.copy(), "MedConditions_HeartCondition", verbose=1)

Counts for column 'MedConditions_HeartCondition' (only 0 and 1):
MedConditions_HeartCondition
0    4412
1     453
Name: count, dtype: int64
Total (0/1 only): 4865


### 1 Data summary

In [5]:
# Quick overview: column names, non-null counts and dtypes.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df_orig.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4865 entries, 0 to 4864
Data columns (total 27 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   FreqGoProvider                 4865 non-null   int64  
 1   Deaf                           4865 non-null   int64  
 2   MedConditions_Diabetes         4865 non-null   int64  
 3   MedConditions_HighBP           4865 non-null   int64  
 4   MedConditions_HeartCondition   4865 non-null   int64  
 5   MedConditions_LungDisease      4865 non-null   int64  
 6   MedConditions_Depression       4865 non-null   int64  
 7   AverageSleepNight              4865 non-null   int64  
 8   AverageTimeSitting             4865 non-null   int64  
 9   EverHadCancer                  4865 non-null   int64  
 10  Age                            4865 non-null   int64  
 11  BirthGender                    4865 non-null   int64  
 12  BMI                            4865 non-null   f

In [6]:
import pandas as pd
import numpy as np
from typing import Sequence, Optional, Iterable

# ---------- helpers ----------
def is_binary_series(s: pd.Series, valid: Sequence = (0, 1)) -> bool:
    """
    Tell whether every non-missing value of `s` belongs to `valid`.

    Parameters
    ----------
    s : pd.Series
        Column to inspect. Missing values are ignored.
    valid : Sequence
        The values that count as "binary" (default: 0 and 1).

    Returns
    -------
    bool
        True when `s` has at least one non-missing value and all of them are
        in `valid`; False otherwise (including an empty / all-missing series).
    """
    non_na = s.dropna()
    if non_na.empty:
        return False
    # Series.all() yields a numpy bool; coerce so the annotated `bool` contract holds
    # and identity checks such as `result is True` behave as expected.
    return bool(non_na.isin(valid).all())

def find_binary_columns(df: pd.DataFrame, valid: Sequence = (0, 1)) -> list[str]:
    """
    Collect the labels of the columns of `df` that `is_binary_series` accepts.

    Columns are tested one by one against `valid`, and the result keeps the
    column order of `df`.
    """
    matches = []
    for name in df.columns:
        if is_binary_series(df[name], valid=valid):
            matches.append(name)
    return matches

def series_has_decimals(s: pd.Series) -> bool:
    """
    True if any non-NA numeric value has a fractional part.
    Treats values like 3.0 as integers.

    Values that cannot be parsed as numbers are coerced to NaN and ignored,
    so a series with no usable numeric values returns False.

    Parameters
    ----------
    s : pd.Series
        Column to inspect (any dtype; converted with `pd.to_numeric`).

    Returns
    -------
    bool
        Builtin bool, True when at least one value differs from its floor.
    """
    s_num = pd.to_numeric(s, errors="coerce").dropna()
    if s_num.empty:
        return False
    # Work in float64: on NumPy versions where np.floor keeps a boolean input boolean,
    # the subtraction below would be bool - bool, which raises TypeError.
    s_num = s_num.astype("float64")
    # Fractional part is value minus floor; it is never negative, and the NaN
    # produced for infinities (inf - inf) compares False.
    frac = s_num - np.floor(s_num)
    # .any() yields a numpy bool; coerce to honour the annotated return type.
    return bool((frac > 0).any())

def binary_counts(df: pd.DataFrame, binary_cols: Optional[Iterable[str]] = None) -> pd.DataFrame:
    """
    Count the 1s and 0s in each binary column of `df`.

    Parameters
    ----------
    df : pd.DataFrame
        Source data.
    binary_cols : iterable of column labels, optional
        Columns to count. Any iterable is accepted (list, generator, pd.Index, ...).
        When None, the columns are detected with `find_binary_columns(df)`.

    Returns
    -------
    pd.DataFrame
        One row per requested column with the columns "Count of 1s",
        "Count of 0s" and "Count of 0s + 1s". An empty frame with those three
        columns is returned when there is nothing to count.
    """
    if binary_cols is None:
        binary_cols = find_binary_columns(df)
    # Materialise once: truth-testing a pd.Index / ndarray raises, and a generator
    # is always truthy, so the emptiness check below is only reliable on a list.
    binary_cols = list(binary_cols)
    if not binary_cols:
        return pd.DataFrame(columns=["Count of 1s", "Count of 0s", "Count of 0s + 1s"])
    sub = df[binary_cols]
    # Count by comparison rather than by summing: sum(numeric_only=True) silently
    # drops object-dtype columns (e.g. 0/1 mixed with None), which left their
    # "Count of 1s" and total as NaN. Missing values match neither 1 nor 0.
    ones = (sub == 1).sum().rename("Count of 1s")
    zeros = (sub == 0).sum().rename("Count of 0s")
    total = (ones + zeros).rename("Count of 0s + 1s")
    return pd.concat([ones, zeros, total], axis=1)

def combined_summary_table(
    df: pd.DataFrame,
    include: Optional[str | Sequence[str]] = "number",
    valid_binary_values: Sequence = (0, 1),
) -> pd.DataFrame:
    """
    Build a per-feature summary: `describe()` statistics plus 0/1 counts.

    The transposed `df.describe(include=include)` output (one row per feature)
    is left-joined with the `binary_counts` table for the columns that
    `find_binary_columns` detects using `valid_binary_values`. Features that
    are not binary therefore get missing values in the three count columns.
    """
    # Rows are feature names after the transpose.
    per_feature_stats = df.describe(include=include).transpose()
    binary_names = find_binary_columns(df, valid=valid_binary_values)
    zero_one_counts = binary_counts(df, binary_names)
    return per_feature_stats.join(zero_one_counts, how="left")

def _show_table(df: pd.DataFrame, max_rows: int = 200, max_cols: int = 50) -> None:
    """
    Display `df` with temporarily widened pandas display limits.

    Tries IPython's `display` inside a `pd.option_context`; if anything in that
    attempt raises (including IPython being unavailable), falls back to printing
    `df.to_string(...)` with the same row/column limits.
    """
    display_options = (
        "display.max_rows", max_rows,
        "display.max_columns", max_cols,
        "display.width", 0,
    )
    try:
        from IPython.display import display
        with pd.option_context(*display_options):
            display(df)
    except Exception:
        # Best-effort fallback: plain-text rendering.
        plain_text = df.to_string(max_rows=max_rows, max_cols=max_cols)
        print(plain_text)

# ---------- one-call API ----------
from pathlib import Path
import numpy as np
import pandas as pd
from typing import Optional, Sequence

def save_and_show_summary_csv(
    df: pd.DataFrame,
    path: str,
    include: Optional[str | Sequence[str]] = "number",
    valid_binary_values: Sequence = (0, 1),
    mean_std_decimals: int = 3,   # decimals only for mean/std and (conditionally) min/max
    na_blank: bool = True,
    show: bool = True,
    max_rows: int = 200,
    max_cols: int = 50,
) -> pd.DataFrame:
    """
    Build the combined summary table, render every cell as text, optionally
    display it, and write it to `path` as CSV.

    Cell rendering rules:
      - missing value       -> "" when `na_blank`, otherwise "NaN"
      - mean / std          -> fixed `mean_std_decimals` decimals
      - min / max           -> same fixed decimals, but only for features whose
                               original column contains fractional values
      - any other number    -> rounded and shown as an integer
      - anything else       -> `str(value)`

    The parent directory of `path` is created if needed. Returns the rendered
    (string-valued, object-dtype) DataFrame.
    """
    summary = combined_summary_table(df, include=include, valid_binary_values=valid_binary_values)

    # Features whose source column holds at least one fractional value.
    fractional_features = set()
    for name in df.columns:
        if series_has_decimals(df[name]):
            fractional_features.add(name)

    always_decimal = [stat for stat in ("mean", "std") if stat in summary.columns]
    range_stats = [stat for stat in ("min", "max") if stat in summary.columns]
    missing_text = "" if na_blank else "NaN"
    numeric_types = (float, np.floating, int, np.integer)

    def _render(value, stat: str, feature: str):
        if pd.isna(value):
            return missing_text

        use_decimals = (stat in always_decimal) or (
            stat in range_stats and feature in fractional_features
        )
        try:
            if use_decimals:
                return f"{float(value):.{mean_std_decimals}f}"
            if isinstance(value, numeric_types):
                return f"{int(round(float(value)))}"
        except Exception:
            # Conversion failed: fall through to the plain string form below.
            pass
        return str(value)

    # Object-dtype copy so text can be written into formerly numeric columns.
    rendered = summary.copy().astype("object")

    # Always read from the untouched numeric `summary`, write into `rendered`.
    for feature in rendered.index:
        for stat in rendered.columns:
            cell_text = _render(summary.at[feature, stat], stat, feature)
            rendered.at[feature, stat] = cell_text

    if show:
        _show_table(rendered, max_rows=max_rows, max_cols=max_cols)

    # Make sure the destination folder exists, then export.
    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    rendered.to_csv(destination, index=True, encoding="utf-8", na_rep="")

    return rendered


### 3 Display and save resulting output

In [7]:
# Relative destination for the summary table; save_and_show_summary_csv creates the parent folder if missing.
out_csv = "op/1_data_explore_2/summary_statistics.csv"
# Build, display and export the summary for a copy of the loaded frame.
# max_rows=300 raises the display cap above the function's default of 200.
# `combined` receives the formatted (string-valued) table the function returns.
combined = save_and_show_summary_csv(df_orig.copy(), out_csv, show=True, max_rows=300)


Unnamed: 0,count,mean,std,min,25%,50%,75%,max,Count of 1s,Count of 0s,Count of 0s + 1s
FreqGoProvider,4865,2.959,1.839,0.0,1,3,5,6.0,,,
Deaf,4865,0.087,0.282,0.0,0,0,0,1.0,425.0,4440.0,4865.0
MedConditions_Diabetes,4865,0.199,0.399,0.0,0,0,0,1.0,969.0,3896.0,4865.0
MedConditions_HighBP,4865,0.432,0.495,0.0,0,0,1,1.0,2101.0,2764.0,4865.0
MedConditions_HeartCondition,4865,0.093,0.291,0.0,0,0,0,1.0,453.0,4412.0,4865.0
MedConditions_LungDisease,4865,0.135,0.342,0.0,0,0,0,1.0,656.0,4209.0,4865.0
MedConditions_Depression,4865,0.27,0.444,0.0,0,0,1,1.0,1314.0,3551.0,4865.0
AverageSleepNight,4865,6.908,1.404,0.0,6,7,8,24.0,,,
AverageTimeSitting,4865,6.814,3.575,0.0,4,6,9,20.0,,,
EverHadCancer,4865,0.149,0.356,0.0,0,0,0,1.0,723.0,4142.0,4865.0
