In [None]:
import numpy as np
import pandas as pd
from scipy import stats

# Load the survey CSV, normalise text columns, and set up the two-group comparison.
# NOTE(review): this is an absolute, machine-specific path; the notebook only runs
# where the file exists at exactly this location.
path = "/home/alpaco/sryang/Lungca_prediction/survey lung cancer.csv"
df = pd.read_csv(path)

# Header names may carry stray leading/trailing blanks; strip them so the
# feature names listed below match.
df.columns = [c.strip() for c in df.columns]
for c in df.columns:
    if pd.api.types.is_object_dtype(df[c]) or pd.api.types.is_string_dtype(df[c]):
        # Strip whitespace from text values while keeping missing values missing.
        # A bare astype(str) would turn NaN into the literal string "nan", which
        # would then survive every later dropna() and show up as a real category.
        df[c] = df[c].astype(str).str.strip().where(df[c].notna())

group_var = "LUNG_CANCER" # or GENDER
if group_var not in df.columns:
    raise ValueError(f"{group_var} not in columns: {df.columns.tolist()}")
if pd.api.types.is_object_dtype(df[group_var]) or pd.api.types.is_string_dtype(df[group_var]):
    # Collapse the spellings handled here (Y/N, 1/0, mixed case) onto YES/NO.
    df[group_var] = (
        df[group_var]
        .str.upper().str.strip()
        .replace({"Y": "YES", "N": "NO", "1": "YES", "0": "NO"})
    )

# The comparison below is strictly two-group; g1/g2 are the sorted group labels.
grp_vals = sorted(pd.Series(df[group_var]).dropna().unique().tolist())
if len(grp_vals) != 2:
    raise ValueError(f"Group '{group_var}' must have exactly 2 values, got {grp_vals}")
g1, g2 = grp_vals[0], grp_vals[1]

features = [
    'GENDER', 'AGE', 'SMOKING', 'YELLOW_FINGERS', 'ANXIETY',
    'PEER_PRESSURE', 'CHRONIC DISEASE', 'FATIGUE', 'ALLERGY', 'WHEEZING',
    'ALCOHOL CONSUMING', 'COUGHING', 'SHORTNESS OF BREATH',
    'SWALLOWING DIFFICULTY', 'CHEST PAIN'
]
# Keep only features that exist in the file, and never test the grouping
# variable against itself.
features = [f for f in features if f in df.columns and f != group_var]

def is_small_discrete(series: pd.Series) -> bool:
    """Decide whether ``series`` should be treated as a low-cardinality discrete variable.

    A series qualifies when it has at most 10 distinct non-missing values and
    its dtype is integer, boolean, object, string, or categorical, or it is a
    float series whose distinct values are all whole numbers (e.g. 1.0 / 2.0
    codes).

    Parameters
    ----------
    series : pd.Series
        Values to inspect; missing entries are ignored when counting levels.

    Returns
    -------
    bool
        True if the series looks like a small discrete/categorical variable,
        False otherwise.
    """
    distinct = series.dropna().unique()
    if len(distinct) > 10:
        return False

    dtype = series.dtype
    if (
        pd.api.types.is_integer_dtype(dtype)
        or pd.api.types.is_bool_dtype(dtype)
        or pd.api.types.is_object_dtype(dtype)
        # Dedicated string and categorical dtypes are not object dtype, so they
        # need their own checks or text columns would be treated as continuous.
        or pd.api.types.is_string_dtype(dtype)
        or isinstance(dtype, pd.CategoricalDtype)
    ):
        return True

    if pd.api.types.is_float_dtype(dtype):
        # Floats count as discrete only when every distinct value is integral.
        return bool(np.all(np.equal(np.mod(distinct, 1), 0)))
    return False

# Compare every feature between the two groups and collect one result row each.
# Categorical features get a chi-square test (Fisher's exact test for 2x2 tables
# with any expected count below 5); everything else gets a two-sample t-test.
results = []

for col in features:
    # Per-group values of this feature with missing entries removed.
    a = df.loc[df[group_var] == g1, col].dropna()
    b = df.loc[df[group_var] == g2, col].dropna()
    if len(a) == 0 or len(b) == 0:
        results.append({'variable': col, 'test': 'NA', 'stat': np.nan, 'p': np.nan, 'note': 'one group empty'})
        continue

    pooled = pd.concat([a, b], ignore_index=True)
    # AGE is always handled as continuous, regardless of how many levels it has.
    treat_as_categorical = (col != "AGE") and is_small_discrete(pooled)

    if treat_as_categorical:
        cats = sorted(pd.unique(pooled))
        # Rows are the two groups, columns the observed categories; reindex
        # fixes the order and fills absent combinations with 0.
        table = pd.crosstab(df[group_var], df[col], dropna=False).reindex(index=[g1, g2], columns=cats, fill_value=0)
        chi2, p_chi, dof, expected = stats.chi2_contingency(table.values, correction=False)

        if table.shape == (2, 2) and (expected < 5).any():
            # Small expected counts: use Fisher's exact test instead.
            # Note that 'stat' holds the odds ratio for these rows, not chi-square.
            oddsratio, p_val = stats.fisher_exact(table.values)
            results.append({'variable': col, 'test': 'Fisher exact (2x2)', 'stat': oddsratio, 'p': p_val,
                            'note': f"table={table.values.tolist()} expected<5"})
        elif table.shape == (2, 2):
            results.append({'variable': col, 'test': 'Chi-square (2x2)', 'stat': chi2, 'p': p_chi,
                            'note': f"table={table.values.tolist()}"})
        else:
            results.append({'variable': col, 'test': f'Chi-square ({table.shape[1]} cats)', 'stat': chi2, 'p': p_chi,
                            'note': f"table_shape={table.shape}"})
    else:
        a = pd.to_numeric(a, errors='coerce').dropna()
        b = pd.to_numeric(b, errors='coerce').dropna()
        if len(a) == 0 or len(b) == 0:
            results.append({'variable': col, 'test': 'NA', 'stat': np.nan, 'p': np.nan,
                            'note': 'no numeric data after coercion'})
            continue
        # Levene's test (median-centred) picks between the pooled and Welch t-test.
        lev_stat, lev_p = stats.levene(a, b, center='median')
        equal_var = bool(lev_p >= 0.05)
        t_stat, p_val = stats.ttest_ind(a, b, equal_var=equal_var)
        results.append({
            'variable': col,
            'test': f"t-test ({'pooled' if equal_var else 'Welch'})",
            'stat': t_stat, 'p': p_val,
            'note': f"Levene p={lev_p:.3f} -> equal_var={equal_var}"
        })

# Explicit columns keep the frame well-formed even when no feature was tested
# (an empty `results` list would otherwise produce a frame without a 'p' column).
res_df = pd.DataFrame(results, columns=['variable', 'test', 'stat', 'p', 'note'])
# NOTE: 'p' values are raw; no multiple-comparison adjustment is applied here.
res_df['reject_H0'] = res_df['p'] < 0.05

res_df = res_df[['variable', 'test', 'stat', 'p', 'reject_H0', 'note']] \
         .sort_values(by=['p'], na_position='last')

print(res_df.to_string(index=False))

# Save
out_path = "ttest_baseline_results.csv"
res_df.to_csv(out_path, index=False)
print(f"\nSaved: {out_path}")

             variable             test      stat            p  reject_H0                             note
              ALLERGY Chi-square (2x2) 33.195969 8.332310e-09       True      table=[[34, 5], [103, 167]]
    ALCOHOL CONSUMING Chi-square (2x2) 25.724614 3.937726e-07       True      table=[[32, 7], [105, 165]]
SWALLOWING DIFFICULTY Chi-square (2x2) 20.845000 4.979905e-06       True      table=[[34, 5], [130, 140]]
             WHEEZING Chi-square (2x2) 19.204495 1.174365e-05       True      table=[[30, 9], [107, 163]]
             COUGHING Chi-square (2x2) 19.092209 1.245523e-05       True     table=[[29, 10], [101, 169]]
           CHEST PAIN Chi-square (2x2) 11.207883 8.145057e-04       True     table=[[27, 12], [110, 160]]
        PEER_PRESSURE Chi-square (2x2) 10.734768 1.051412e-03       True     table=[[29, 10], [125, 145]]
       YELLOW_FINGERS Chi-square (2x2) 10.161100 1.434350e-03       True     table=[[26, 13], [107, 163]]
              FATIGUE Chi-square (2x2)  7.0150

In [5]:
import numpy as np
import pandas as pd
from scipy import stats

# Load the survey CSV (path relative to the working directory), normalise text
# columns, and set up the two-group comparison.
path = "survey lung cancer.csv"
df = pd.read_csv(path)

# Header names may carry stray leading/trailing blanks; strip them so the
# feature names listed below match.
df.columns = [c.strip() for c in df.columns]
for c in df.columns:
    if pd.api.types.is_object_dtype(df[c]) or pd.api.types.is_string_dtype(df[c]):
        # Strip whitespace from text values while keeping missing values missing.
        # A bare astype(str) would turn NaN into the literal string "nan", which
        # would then survive every later dropna() and show up as a real category.
        df[c] = df[c].astype(str).str.strip().where(df[c].notna())

group_var = "LUNG_CANCER"  # or 'GENDER'
# Only text columns have a .str accessor; a numeric group column is left as is.
if pd.api.types.is_object_dtype(df[group_var]) or pd.api.types.is_string_dtype(df[group_var]):
    df[group_var] = df[group_var].str.upper().str.replace(" ", "", regex=False)
    df[group_var] = df[group_var].replace({"Y": "YES", "N": "NO", "1": "YES", "0": "NO"})

# The comparison below is strictly two-group; fail with a readable message
# instead of an opaque tuple-unpacking error.
grp_vals = sorted(df[group_var].dropna().unique())
if len(grp_vals) != 2:
    raise ValueError(f"Group '{group_var}' must have exactly 2 values, got {grp_vals}")
g1, g2 = grp_vals

features = [
    'GENDER', 'AGE', 'SMOKING', 'YELLOW_FINGERS', 'ANXIETY',
    'PEER_PRESSURE', 'CHRONIC DISEASE', 'FATIGUE', 'ALLERGY', 'WHEEZING',
    'ALCOHOL CONSUMING', 'COUGHING', 'SHORTNESS OF BREATH',
    'SWALLOWING DIFFICULTY', 'CHEST PAIN'
]
# Keep only features that exist in the file, and never test the grouping
# variable against itself.
features = [f for f in features if f in df.columns and f != group_var]

def is_small_discrete(series: pd.Series) -> bool:
    """Decide whether ``series`` should be treated as a low-cardinality discrete variable.

    A series qualifies when it has at most 10 distinct non-missing values and
    its dtype is integer, boolean, object, string, or categorical, or it is a
    float series whose distinct values are all whole numbers (e.g. 1.0 / 2.0
    codes).

    Parameters
    ----------
    series : pd.Series
        Values to inspect; missing entries are ignored when counting levels.

    Returns
    -------
    bool
        True if the series looks like a small discrete/categorical variable,
        False otherwise.
    """
    distinct = series.dropna().unique()
    if len(distinct) > 10:
        return False

    dtype = series.dtype
    if (
        pd.api.types.is_integer_dtype(dtype)
        or pd.api.types.is_bool_dtype(dtype)
        or pd.api.types.is_object_dtype(dtype)
        # Dedicated string and categorical dtypes are not object dtype, so they
        # need their own checks or text columns would be treated as continuous.
        or pd.api.types.is_string_dtype(dtype)
        or isinstance(dtype, pd.CategoricalDtype)
    ):
        return True

    if pd.api.types.is_float_dtype(dtype):
        # Floats count as discrete only when every distinct value is integral.
        return bool(np.all(np.equal(np.mod(distinct, 1), 0)))
    return False

# Compare every feature between the two groups and collect one result row each.
# Categorical features get a chi-square test (Fisher's exact test for 2x2 tables
# with any expected count below 5); everything else gets a two-sample t-test.
results = []

for col in features:
    # Per-group values of this feature with missing entries removed.
    a = df.loc[df[group_var] == g1, col].dropna()
    b = df.loc[df[group_var] == g2, col].dropna()
    if len(a) == 0 or len(b) == 0:
        results.append({'variable': col, 'test': 'NA', 'stat': np.nan, 'p': np.nan, 'note': 'one group empty'})
        continue

    pooled = pd.concat([a, b])
    # AGE is always handled as continuous, regardless of how many levels it has.
    treat_as_categorical = (col != "AGE") and is_small_discrete(pooled)

    if treat_as_categorical:
        cats = sorted(pd.unique(pooled))
        # Rows are the two groups, columns the observed categories; reindex
        # fixes the order and fills absent combinations with 0.
        table = pd.crosstab(df[group_var], df[col], dropna=False).reindex(index=[g1, g2], columns=cats, fill_value=0)
        chi2, p_chi, dof, expected = stats.chi2_contingency(table.values, correction=False)

        if table.shape == (2, 2) and (expected < 5).any():
            # Small expected counts: use Fisher's exact test instead.
            # Note that 'stat' holds the odds ratio for these rows, not chi-square.
            oddsratio, p_val = stats.fisher_exact(table.values)
            results.append({'variable': col, 'test': 'Fisher exact (2x2)', 'stat': oddsratio, 'p': p_val})
        elif table.shape == (2, 2):
            results.append({'variable': col, 'test': 'Chi-square (2x2)', 'stat': chi2, 'p': p_chi})
        else:
            results.append({'variable': col, 'test': f'Chi-square ({table.shape[1]} cats)', 'stat': chi2, 'p': p_chi})
    else:
        a = pd.to_numeric(a, errors='coerce').dropna()
        b = pd.to_numeric(b, errors='coerce').dropna()
        if len(a) == 0 or len(b) == 0:
            # Coercion can wipe out a group entirely (non-numeric content);
            # record that instead of handing empty samples to the tests below.
            results.append({'variable': col, 'test': 'NA', 'stat': np.nan, 'p': np.nan,
                            'note': 'no numeric data after coercion'})
            continue
        # Levene's test (median-centred) picks between the pooled and Welch t-test.
        lev_stat, lev_p = stats.levene(a, b, center='median')
        equal_var = bool(lev_p >= 0.05)
        t_stat, p_val = stats.ttest_ind(a, b, equal_var=equal_var)
        results.append({'variable': col, 'test': f"t-test ({'pooled' if equal_var else 'Welch'})",
                        'stat': t_stat, 'p': p_val})

res_df = pd.DataFrame(results)
if not res_df.empty:
    # An empty frame has no 'p' column to sort by; rows with NaN p sort last.
    res_df = res_df.sort_values(by="p")
print(res_df.to_string(index=False))

# Save
out_path = "t test_results.csv"
res_df.to_csv(out_path, index=False)
print(f"\nSaved: {out_path}")

             variable             test      stat            p
              ALLERGY Chi-square (2x2) 33.195969 8.332310e-09
    ALCOHOL CONSUMING Chi-square (2x2) 25.724614 3.937726e-07
SWALLOWING DIFFICULTY Chi-square (2x2) 20.845000 4.979905e-06
             WHEEZING Chi-square (2x2) 19.204495 1.174365e-05
             COUGHING Chi-square (2x2) 19.092209 1.245523e-05
           CHEST PAIN Chi-square (2x2) 11.207883 8.145057e-04
        PEER_PRESSURE Chi-square (2x2) 10.734768 1.051412e-03
       YELLOW_FINGERS Chi-square (2x2) 10.161100 1.434350e-03
              FATIGUE Chi-square (2x2)  7.015023 8.082859e-03
              ANXIETY Chi-square (2x2)  6.491988 1.083617e-02
      CHRONIC DISEASE Chi-square (2x2)  3.799722 5.126109e-02
                  AGE  t-test (pooled) -1.573857 1.165504e-01
               GENDER Chi-square (2x2)  1.397645 2.371182e-01
  SHORTNESS OF BREATH Chi-square (2x2)  1.139950 2.856628e-01
              SMOKING Chi-square (2x2)  1.045898 3.064537e-01

Saved: 

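	통계적으로 유의한 차이가 없는 변수들 (p ≥ 0.05)
		•	CHRONIC DISEASE (p = 0.051) → 만성질환 여부에 유의한 차이 없음 (유의수준 0.05에 매우 근접한 경계값)
		•	AGE (p = 0.116) → 평균 나이에 유의한 차이 없음
		•	GENDER (p = 0.237) → 성별 분포에 유의한 차이 없음
		•	SHORTNESS OF BREATH (p = 0.286) → 호흡곤란 여부에 유의한 차이 없음
		•	SMOKING (p = 0.306) → 흡연 여부에 유의한 차이 없음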

	유의한 차이가 있는 변수들 (p < 0.05)
		•	ALLERGY (p ≈ 8.3e-09) 
		•	ALCOHOL CONSUMING (p ≈ 3.9e-07) 
		•	SWALLOWING DIFFICULTY (p ≈ 5.0e-06) 
		•	WHEEZING (p ≈ 1.2e-05) 
		•	COUGHING (p ≈ 1.2e-05)
		•	CHEST PAIN (p ≈ 8.1e-04) 
		•	PEER_PRESSURE (p ≈ 0.0011) 
		•	YELLOW_FINGERS (p ≈ 0.0014) 
		•	FATIGUE (p ≈ 0.0081) 
		•	ANXIETY (p ≈ 0.0108) 