In [29]:
import pyreadstat
import pandas as pd
from lifelines import KaplanMeierFitter
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from psmpy import PsmPy
import numpy as np

df = pd.read_csv("2nd Preprocessing.csv")

In [30]:
df["Smoking"] = df["Smoking"].astype(int)

In [31]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285 entries, 0 to 284
Data columns (total 77 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ID                          285 non-null    int64  
 1   Sex                         285 non-null    int64  
 2   Age                         285 non-null    int64  
 3   Birth Date                  285 non-null    object 
 4   Steatosis Score             285 non-null    float64
 5   NASH                        285 non-null    object 
 6   Fibrosis Stage              285 non-null    object 
 7   NAFLD Type                  285 non-null    object 
 8   Death                       285 non-null    int64  
 9   Death Date                  10 non-null     object 
 10  Weight                      285 non-null    float64
 11  Height                      285 non-null    float64
 12  Waist to Height             285 non-null    float64
 13  Waist                       285 non

In [32]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from scipy.stats import mannwhitneyu, chi2_contingency, fisher_exact

def create_baseline_comparison(matched_control, matched_treated):
    
    # Define continuous and categorical variables
    continuous_vars = [
        'Age', 'Weight', 'Height', 'Waist to Height', 'Waist', 'BMI',
        'VFI', 'SFI', 'TFI', 'SMI','VFA (Volume)', 'SFA (Volume)', 'Spleen (Volume)', 
        'VFA (Area)', 'SFA (Area)', 'SMA (Area)', 
        'VFA (Attenuation)', 'SFA (Attenuation)', 'SMA (Attenuation)', 
        'Liver/Spleen Volume', 'Liver/Spleen HU', 'Liver (HU)', 'Spleen (HU)', 'Liver (PDFF)', 
        'LSM', 'CAP', 'AST', 'ALT', 'T.bil', 'PLT', 'PT INR', 
        'Alb', 'Glucose', 'HbA1c', 'eGFR', 'T.chol', 'HDL', 'LDL', 'TG', 'SBP', 'DBP', 'FIB-4'
    ]

    # Categorical variables (labels, categories, or binary indicators)
    categorical_vars = [
        'Sex',
         'Smoking', 'Death',
        '당뇨병/당뇨병 전단계 유무', '고혈압유무', '이상지질혈증 유무', 
        'Ischemic Heart Diseae 유무', 'Cerebrovascualr disease 유무', 
        'Nephropathy 유무'
    ]

    # Function to format continuous variables
    def format_continuous(series):
        median = series.median()
        q1, q3 = series.quantile([0.25, 0.75])
        mean = series.mean()
        std = series.std()
        return f'''{median:.1f} ({q1:.1f}-{q3:.1f})\n{mean:.1f} ({std:.1f})'''

    # Function to format categorical variables
    def format_categorical(series):
        count = series.value_counts()
        percentage = series.value_counts(normalize=True).mul(100).round(1)
        return [f"{count[idx]} ({percentage[idx]})" for idx in count.index]

    # Format continuous variables for control group
    cn_continuous = matched_control[continuous_vars].apply(format_continuous)
    cn_continuous_df = pd.DataFrame({'Variable': cn_continuous.index, 'CN': cn_continuous.values})

    # Format categorical variables for control group
    cn_categorical_list = []
    for var in categorical_vars:
        for label, formatted_value in zip(matched_control[var].dropna().value_counts().index, format_categorical(matched_control[var])):
            cn_categorical_list.append({'Variable': f"{var} ({label})", 'CN': formatted_value})

    cn_categorical_df = pd.DataFrame(cn_categorical_list)

    # Format continuous variables for treated group
    ic_continuous = matched_treated[continuous_vars].apply(format_continuous)
    ic_continuous_df = pd.DataFrame({'Variable': ic_continuous.index, 'IC': ic_continuous.values})

    # Format categorical variables for treated group
    ic_categorical_list = []
    for var in categorical_vars:
        for label, formatted_value in zip(matched_treated[var].dropna().value_counts().index, format_categorical(matched_treated[var])):
            ic_categorical_list.append({'Variable': f"{var} ({label})", 'IC': formatted_value})

    ic_categorical_df = pd.DataFrame(ic_categorical_list)

    # Combine continuous and categorical for control & treated groups
    cn_final_table = pd.concat([cn_continuous_df, cn_categorical_df], ignore_index=True)
    ic_final_table = pd.concat([ic_continuous_df, ic_categorical_df], ignore_index=True)

    # Merge control & treated dataframes
    combined_df = ic_final_table.merge(cn_final_table, on='Variable', how='outer').reset_index()

    # Function to calculate p-values and SMD
    def calculate_statistics(variable, var_type):
        smd = None
        p_value = None
        if var_type == 'continuous':
            ic_values = matched_treated[variable].dropna()
            cn_values = matched_control[variable].dropna()
            
            if len(ic_values) == 0 or len(cn_values) == 0:
                return None, None

            # Mann-Whitney U Test for non-parametric continuous data
            _, p_value = mannwhitneyu(ic_values, cn_values, alternative='two-sided')

            # Calculate Standardized Mean Difference (SMD)
            mean_ic, std_ic = ic_values.mean(), ic_values.std()
            mean_cn, std_cn = cn_values.mean(), cn_values.std()
            smd = (mean_ic - mean_cn) / np.sqrt((std_ic ** 2 + std_cn ** 2) / 2)

        elif var_type == 'categorical':
            if ' (' in variable:
                var_name, category = variable.split(' (')
                category = category.rstrip(')')
                category = int(category) if category.isdigit() else category
                
                ic_count = (matched_treated[var_name] == category).sum()
                cn_count = (matched_control[var_name] == category).sum()
                ic_total = matched_treated.shape[0]
                cn_total = matched_control.shape[0]

                contingency_table = [[ic_count, cn_count], [ic_total - ic_count, cn_total - cn_count]]

                if np.any(np.array(contingency_table) == 0):
                    _, p_value = fisher_exact(contingency_table, alternative='two-sided')
                else:
                    _, p_value, _, _ = chi2_contingency(contingency_table)

                ic_ratio = ic_count / ic_total
                cn_ratio = cn_count / cn_total
                smd = np.sqrt(((ic_ratio - cn_ratio) ** 2) / (ic_ratio * (1 - ic_ratio) + cn_ratio * (1 - cn_ratio)))

        return f"{p_value:.3f}" if p_value is not None else None, f"{smd:.3f}" if smd is not None else None

    # Calculate p-values and SMD for each row in combined_df
    for index, row in combined_df.iterrows():
        variable = row['Variable']
        if variable in continuous_vars:
            p_value, smd = calculate_statistics(variable, 'continuous')
            combined_df.at[index, 'p-value'] = p_value
            combined_df.at[index, 'SMD'] = smd
        else:
            if any(variable.startswith(cat) for cat in categorical_vars):
                p_value, smd = calculate_statistics(variable, 'categorical')
                combined_df.at[index, 'p-value'] = p_value
                combined_df.at[index, 'SMD'] = smd

    return combined_df

In [33]:
# ✅ Choose subgroup here
subgroup = ["None", "NAFL", "Probable NASH", "NASH", "Cirrhosis"]  # ← You can change this

# Step 1: Filter the data where NASH == 1
df = df[df["NAFLD Type"].isin(subgroup)]

In [34]:
comparison_results =  create_baseline_comparison(df, df)

In [35]:
pd.set_option('display.max_rows', None)
comparison_results.to_csv("baseline_comparison.csv", index=False)

comparison_results

Unnamed: 0,index,Variable,IC,CN,p-value,SMD
0,0,Age,56.0 (45.0-67.0)\n55.9 (14.8),56.0 (45.0-67.0)\n55.9 (14.8),1.0,0.0
1,1,Weight,72.0 (63.2-81.7)\n74.7 (17.2),72.0 (63.2-81.7)\n74.7 (17.2),1.0,0.0
2,2,Height,165.0 (158.0-173.0)\n165.4 (9.5),165.0 (158.0-173.0)\n165.4 (9.5),1.0,0.0
3,3,Waist to Height,5.5 (5.1-6.0)\n5.6 (0.7),5.5 (5.1-6.0)\n5.6 (0.7),1.0,0.0
4,4,Waist,910.2 (841.9-973.4)\n922.3 (120.4),910.2 (841.9-973.4)\n922.3 (120.4),1.0,0.0
5,5,BMI,26.4 (24.0-29.5)\n27.1 (4.7),26.4 (24.0-29.5)\n27.1 (4.7),1.0,0.0
6,6,VFI,375.6 (261.2-535.8)\n409.6 (218.1),375.6 (261.2-535.8)\n409.6 (218.1),1.0,0.0
7,7,SFI,515.1 (371.5-753.1)\n628.4 (426.9),515.1 (371.5-753.1)\n628.4 (426.9),1.0,0.0
8,8,TFI,890.1 (687.7-1302.8)\n1038.0 (590.8),890.1 (687.7-1302.8)\n1038.0 (590.8),1.0,0.0
9,9,SMI,48.0 (42.7-56.0)\n49.0 (9.1),48.0 (42.7-56.0)\n49.0 (9.1),1.0,0.0


In [25]:
# Divide into two groups based on SGLT2inh
control_group = df[df['Fibrosis'].isin([0, 1, 2])].copy()
treated_group = df[df['Fibrosis'].isin([3, 4])].copy()

control_group = control_group.reset_index(drop = True)
treated_group = treated_group.reset_index(drop = True)

comparison_results =  create_baseline_comparison(control_group, treated_group)

In [28]:
pd.set_option('display.max_rows', None)
comparison_results.to_csv("baseline_comparison_Fibrosis.csv", index=False)

comparison_results

Unnamed: 0,index,Variable,IC,CN,p-value,SMD
0,0,Age,64.0 (48.8-70.2)\n60.7 (12.9),54.0 (44.0-65.0)\n54.3 (15.1),0.002,0.459
1,1,Weight,67.2 (59.5-75.3)\n69.1 (13.9),73.8 (64.3-82.4)\n76.5 (17.8),0.004,-0.458
2,2,Height,160.0 (153.7-167.2)\n161.5 (8.9),167.0 (160.0-174.0)\n166.6 (9.4),0.0,-0.559
3,3,Waist to Height,5.6 (5.3-6.0)\n5.6 (0.6),5.4 (5.1-6.0)\n5.6 (0.7),0.189,0.057
4,4,Waist,910.8 (838.1-963.4)\n905.2 (96.6),910.2 (844.9-983.8)\n927.9 (126.8),0.63,-0.201
5,5,BMI,26.3 (23.8-29.0)\n26.4 (3.9),26.4 (24.0-29.5)\n27.4 (4.9),0.589,-0.22
6,6,VFI,354.6 (188.8-545.6)\n371.3 (204.8),381.2 (264.7-530.7)\n422.0 (221.4),0.193,-0.237
7,7,SFI,500.3 (306.7-720.5)\n551.1 (289.2),517.3 (384.9-754.0)\n653.5 (460.7),0.39,-0.266
8,8,TFI,901.3 (622.4-1267.1)\n922.4 (461.3),890.1 (703.1-1306.1)\n1075.4 (623.6),0.173,-0.279
9,9,SMI,45.7 (41.0-51.3)\n47.2 (8.3),48.8 (43.0-56.8)\n49.6 (9.3),0.049,-0.28


# NAFLD Type, Fibrosis (0~4)

# Steatosis Score (0~3)

# BMI_C, FIB-4_C, LSM_C

# LRE 들은 여러 개 있기에 따로 해야됨

In [45]:
from scipy.stats import chi2_contingency

df1 = treated_group
df2 = control_group
# Get all unique NAFLD Type values from both dataframes
all_types = pd.Index(df1['LRE'].dropna().unique()).union(df2['LRE'].dropna().unique()).sort_values()

# Count values for each NAFLD Type in both dataframes
df1_counts = df1['LRE'].value_counts().reindex(all_types, fill_value=0)
df2_counts = df2['LRE'].value_counts().reindex(all_types, fill_value=0)

# Create comparison DataFrame
comparison_df = pd.DataFrame({
    'NAFLD Type': all_types,
    'DF1 Count': df1_counts.values,
    'DF2 Count': df2_counts.values
})

# Calculate percentages
comparison_df['DF1 %'] = (comparison_df['DF1 Count'] / df1_counts.sum() * 100).round(1).astype(str) + '%'
comparison_df['DF2 %'] = (comparison_df['DF2 Count'] / df2_counts.sum() * 100).round(1).astype(str) + '%'

# Chi-square test
contingency = comparison_df[['DF1 Count', 'DF2 Count']].T.values
chi2, p, _, _ = chi2_contingency(contingency)

# Display results
print(comparison_df)
print(f"\nChi-square test p-value: {p:.3f}")


   NAFLD Type  DF1 Count  DF2 Count  DF1 %  DF2 %
0           0         44        160  78.6%  92.5%
1           1          6          1  10.7%   0.6%
2           3          6         12  10.7%   6.9%

Chi-square test p-value: 0.000


In [43]:
df["LRE"]

1      0
2      0
3      0
4      0
5      0
6      0
7      0
9      0
11     0
12     0
14     0
15     0
16     3
17     0
19     3
20     0
21     0
22     0
23     1
24     0
25     0
26     0
29     0
31     0
32     0
33     3
35     0
36     0
37     0
41     0
42     0
43     0
44     3
45     0
46     3
48     0
49     0
50     1
52     0
53     0
54     0
55     3
56     0
57     0
58     0
59     0
60     0
62     0
65     0
67     0
69     0
70     0
71     0
72     0
73     3
74     0
75     0
77     0
80     0
81     0
86     0
89     0
90     0
91     0
92     0
93     0
94     0
95     0
96     0
97     0
98     0
99     0
100    0
101    0
102    0
103    0
104    0
105    0
107    0
108    0
109    0
110    0
112    0
113    0
114    3
116    0
118    3
119    0
120    0
121    0
123    0
124    3
125    0
126    0
127    0
128    0
129    0
130    0
131    1
132    0
133    0
134    1
135    0
136    0
137    0
138    3
139    0
140    0
141    0
142    0
145    0
1