In [1]:
### import libraries ###

import numpy as np
import pandas as pd
from scipy.stats import kruskal, pearsonr, chi2_contingency, kruskal, pearsonr, chi2_contingency
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import MinMaxScaler, KBinsDiscretizer

# from sklearn.preprocessing import 
# from scipy.stats import 
# from sklearn.preprocessing import KBinsDiscretizer


# from sklearn.preprocessing import KBinsDiscretizer
# from scipy.stats import chi2_contingency
# from scipy.stats import pearsonr

In [2]:
### Mutual Information Technique ###
def mutual_information(X, y):
    """Rank features by their estimated mutual information with the target.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its column names are reported in the ``Feature`` column.
    y : array-like, pandas.Series or single-column pandas.DataFrame
        Class labels, one per row of ``X``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature``, ``MI_Score`` and ``Rank`` (1 = highest score),
        sorted by descending ``MI_Score`` with a fresh 0..n-1 index.
    """
    # A single-column DataFrame is an (n, 1) column vector; scikit-learn only
    # accepts that with a DataConversionWarning, so pass the 1-D column instead.
    # Any other shape is forwarded untouched so scikit-learn can validate it.
    if isinstance(y, pd.DataFrame) and y.shape[1] == 1:
        y = y.iloc[:, 0]

    # random_state is fixed so repeated runs produce the same scores.
    mi = mutual_info_classif(X, y, random_state=42)

    result = pd.DataFrame({"Feature": X.columns, "MI_Score": mi})
    result = result.sort_values("MI_Score", ascending=False).reset_index(drop=True)
    # After the index reset the row position is the rank minus one.
    result["Rank"] = result.index + 1
    return result

In [3]:
### Kruskal Wallis Technique ###
def kruskal_wallis(X, y):
    """Rank features with a Kruskal-Wallis H-test across the target classes.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; every column is tested separately.
    y : array-like, pandas.Series or pandas.DataFrame
        Class labels. A DataFrame is squeezed before use.

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature``, ``H_statistic``, ``p_value`` and ``Rank``
        (1 = smallest p-value), sorted by ascending ``p_value`` with a fresh
        0..n-1 index.
    """
    if isinstance(y, pd.DataFrame):
        y = y.squeeze()
    class_labels = np.unique(y)

    def _test_one(name):
        # One sample per class label; NaNs inside a sample are ignored by scipy.
        samples = [X[name][y == label] for label in class_labels]
        statistic, p_val = kruskal(*samples, nan_policy="omit")
        return [name, statistic, p_val]

    records = [_test_one(name) for name in X.columns]

    table = pd.DataFrame(records, columns=["Feature", "H_statistic", "p_value"])
    table = table.sort_values("p_value", ascending=True)
    table = table.reset_index(drop=True)
    # Row position after the reset is the rank minus one.
    table["Rank"] = table.index + 1
    return table

In [4]:
### Chi squared Technique ###
def chi_squared(X, y, n_bins=10):
    """Rank features by a chi-squared test of independence against the target.

    Each feature is coerced to numeric, cut into ``n_bins`` equal-width bins,
    cross-tabulated against the target and tested with
    ``scipy.stats.chi2_contingency``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. Values that cannot be parsed as numbers are treated as
        missing for that feature only.
    y : array-like, pandas.Series or pandas.DataFrame
        Target values; for a DataFrame the first column is used. Rows whose
        target cannot be parsed as a number are dropped for every feature.
    n_bins : int, default 10
        Number of equal-width bins requested for each feature.

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature``, ``Chi2_stat`` and ``p_value``, sorted by ascending
        ``p_value``. The index keeps each feature's column position in ``X``.
        A feature with no usable value gets NaN for both statistics.
    """
    if isinstance(y, pd.DataFrame):
        y_numeric = pd.to_numeric(y.iloc[:, 0], errors='coerce')
    else:
        y_numeric = pd.to_numeric(np.ravel(y), errors='coerce')

    valid_idx = ~np.isnan(y_numeric)
    # Plain array so the per-feature boolean masks below index it by position.
    y_values = np.asarray(y_numeric[valid_idx])
    X_valid = X.loc[valid_idx]

    rows = []
    for i, feature in enumerate(X_valid.columns):
        values = pd.to_numeric(X_valid.iloc[:, i], errors='coerce').to_numpy(
            dtype=float, na_value=np.nan
        )
        mask = ~np.isnan(values)
        if mask.sum() == 0:
            rows.append([feature, np.nan, np.nan])
            continue

        # KBinsDiscretizer rejects NaN input, so each feature is binned on its
        # own non-missing rows. Equal-width bin edges are derived per column,
        # so this matches binning the whole frame at once when nothing is
        # missing.
        kb = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')
        binned = kb.fit_transform(values[mask].reshape(-1, 1))[:, 0]

        table = pd.crosstab(binned, y_values[mask])
        chi2, p, _, _ = chi2_contingency(table)
        rows.append([feature, chi2, p])

    chi_df = pd.DataFrame(rows, columns=['Feature', 'Chi2_stat', 'p_value'])
    chi_df = chi_df.sort_values(by='p_value', ascending=True)
    return chi_df

In [5]:
### Pearson's r Technique ###
def pearson_r(X, y):
    """Rank features by the significance of their Pearson correlation with the target.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. Values that cannot be parsed as numbers are treated as
        missing for that feature.
    y : array-like, pandas.Series or pandas.DataFrame
        Target values; for a DataFrame the first column is used. Values that
        cannot be parsed as numbers are treated as missing.

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature``, ``Correlation`` and ``p_value``, sorted by
        ascending ``p_value``. ``Correlation`` holds the absolute value of r,
        so the sign of the relationship is not reported. The index keeps each
        feature's column position in ``X``. A feature with fewer than two
        complete (feature, target) pairs gets NaN for both statistics.
    """
    if isinstance(y, pd.DataFrame):
        y_numeric = pd.to_numeric(y.iloc[:, 0], errors='coerce')
    else:
        y_numeric = pd.to_numeric(np.ravel(y), errors='coerce')
    # The target does not change between features, so find its gaps once.
    y_missing = np.isnan(y_numeric)

    rows = []
    for feature in X.columns:
        X_numeric = pd.to_numeric(X[feature], errors='coerce')
        mask = ~np.isnan(X_numeric) & ~y_missing
        # pearsonr raises on fewer than two observations, so a single usable
        # pair is reported as "no result" instead of aborting the whole ranking.
        if mask.sum() < 2:
            rows.append([feature, np.nan, np.nan])
            continue
        corr, p = pearsonr(X_numeric[mask], y_numeric[mask])
        rows.append([feature, abs(corr), p])

    pearson_df = pd.DataFrame(rows, columns=['Feature', 'Correlation', 'p_value'])
    pearson_df = pearson_df.sort_values(by='p_value', ascending=True)
    return pearson_df

In [6]:
# Read the study spreadsheet, then split it into the predictors (X) and the
# '2-yr' outcome (y, kept as a single-column DataFrame).
# NOTE(review): the path is absolute and machine-specific; it must point at a
# local copy of the workbook before this cell can be re-run elsewhere.
df = pd.read_excel(
    r'C:\Users\HalderK\OneDrive - SUNY Upstate Medical University\Documents\Journal-1 (July 2024)\March 2025\Data Set\Final File\March12/CWU_216PT.xlsx'
)
y = df[['2-yr']]
X = df.drop(columns='2-yr')

In [None]:
### Load Dataset ###
# Template loader: the string below is a placeholder to be replaced with the
# real workbook path. The last column is taken as the target (a Series) and
# every column before it as a feature. Running this cell rebinds X and y.
DF_internal = pd.read_excel("Input the path of Internal Dataset")

y = DF_internal.iloc[:, -1]
X = DF_internal.iloc[:, :-1]

In [7]:
# Print a header, then the mutual-information ranking of X's columns against y.
print("*** Mutual Information ***")
print(mutual_information(X, y))

*** Mutual Information ***
                                       Feature  MI_Score  Rank
0                           Mean lung dose BED  0.052968     1
1                            Total lung volume  0.026826     2
2                             Conformity index  0.016054     3
3   Pack-year smoking at the time of diagnosis  0.014737     4
4                                          PTV  0.012783     5
5                          Min dose to GTV BED  0.012450     6
6                                    Histology  0.008799     7
7                            Homogeneity index  0.004564     8
8                          Min dose to PTV BED  0.003029     9
9                                     Location  0.002417    10
10                          Performance status  0.000949    11
11                                         Sex  0.000000    12
12                                         Age  0.000000    13
13                                     T-stage  0.000000    14
14                          

  y = column_or_1d(y, warn=True)


In [8]:
# Print a header, then the Kruskal-Wallis ranking of X's columns against y.
print("*** Kruskal Wallis ***")
print(kruskal_wallis(X, y))

*** Kruskal Wallis ***
                                       Feature  H_statistic   p_value  Rank
0                           Mean lung dose BED     9.047983  0.002630     1
1                                          PTV     8.980827  0.002728     2
2                                    Histology     5.136624  0.023426     3
3                                         aCCI     4.622652  0.031552     4
4                                          GTV     4.566376  0.032605     5
5                           Performance status     3.719819  0.053770     6
6                                      T-stage     3.484965  0.061929     7
7                                  PET Max SUV     3.424112  0.064251     8
8                             Conformity index     2.037451  0.153467     9
9                          Min dose to PTV BED     1.843743  0.174513    10
10                              Total dose BED     1.555778  0.212284    11
11                           Dose per fraction     1.325634  0.24

In [9]:
# Print a header, then the chi-squared ranking of X's columns against y
# (default n_bins=10).
print("*** Chi squared ***")
print(chi_squared(X, y))

*** Chi squared ***
                                       Feature  Chi2_stat   p_value
6                                          PTV  31.468737  0.000246
15                          Mean lung dose BED  20.640908  0.000373
4                            Total lung volume  20.675475  0.014172
5                                          GTV  19.995807  0.017938
8                          Min dose to GTV BED  15.493339  0.050233
20                                   Histology   6.496086  0.089817
19                                     T-stage   9.351666  0.154745
22                          Performance status   6.275949  0.179466
17                                        aCCI  12.202550  0.202130
10                         Min dose to PTV BED  11.331915  0.253642
18                                 PET Max SUV   8.639730  0.373593
1                                     Location   1.319879  0.516883
0                                          Sex   0.352286  0.552822
3                           



In [10]:
# Print a header, then the Pearson-correlation ranking of X's columns against y
# (the Correlation column is the absolute value of r).
print("*** Pearson Correlation ***")
print(pearson_r(X, y))

*** Pearson Correlation ***
                                       Feature  Correlation   p_value
6                                          PTV     0.274975  0.000042
5                                          GTV     0.232741  0.000564
15                          Mean lung dose BED     0.173007  0.010860
17                                        aCCI     0.158399  0.019849
20                                   Histology     0.134899  0.047685
10                         Min dose to PTV BED     0.126093  0.064339
22                          Performance status     0.123991  0.068952
19                                     T-stage     0.123167  0.070831
2                               Total dose BED     0.097750  0.152223
3                            Dose per fraction     0.093469  0.171083
8                          Min dose to GTV BED     0.092611  0.175060
18                                 PET Max SUV     0.085047  0.213157
9                          Max dose to PTV BED     0.080062  0