In [2]:
# Import required libraries
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset into a DataFrame. The path is relative, so
# 'Final_data.csv' is resolved against the current working directory
# (adjust the file name/path as needed).
data = pd.read_csv('Final_data.csv')

def normality_tests(data):
    """Screen every numeric column of *data* for normality.

    For each numeric column the missing values are dropped and two tests
    are run on what remains: a one-sample Kolmogorov-Smirnov test against
    a normal distribution parameterised by the sample mean and standard
    deviation, and the Shapiro-Wilk test. A column is labelled normal only
    when both p-values exceed 0.05.

    Args:
        data: DataFrame to screen; non-numeric columns are ignored.

    Returns:
        DataFrame with one row per numeric column and the columns
        "Variable", "KS p-value", "SW p-value" and "Normality". Columns
        with fewer than 3 non-missing values get missing p-values and the
        label "Insufficient Data".
    """
    results = []

    for column in data.select_dtypes(include=[np.number]).columns:
        values = data[column].dropna()

        # Shapiro-Wilk needs at least 3 observations, so exactly 3 is
        # enough; only smaller samples are rejected.
        if len(values) < 3:
            results.append([column, None, None, "Insufficient Data"])
            continue

        # NOTE: the reference distribution's mean/std are estimated from
        # the same sample, so the KS p-value is only approximate (it tends
        # to be conservative); Shapiro-Wilk acts as the second opinion.
        _, ks_p = stats.kstest(values, 'norm', args=(values.mean(), values.std()))
        _, sw_p = stats.shapiro(values)

        normality = "Normal (p > 0.05)" if ks_p > 0.05 and sw_p > 0.05 else "Not Normal (p ≤ 0.05)"
        results.append([column, ks_p, sw_p, normality])

    results_df = pd.DataFrame(results, columns=["Variable", "KS p-value", "SW p-value", "Normality"])
    return results_df


# Run the normality screen on the loaded dataset and print the resulting
# table (one row per numeric column).
normality_results = normality_tests(data)
print(normality_results)

def non_parametric_tests(data):
    """Run a battery of non-parametric tests over *data*.

    Three families of tests are produced, in this order:

    * Mann-Whitney U: every numeric column split by every categorical
      column that has exactly two non-missing levels.
    * Kruskal-Wallis H: every numeric column split by every categorical
      column that has more than two non-missing levels.
    * Spearman rank correlation: 'QoL_Score' against every other numeric
      column (only when 'QoL_Score' is a numeric column).

    Args:
        data: DataFrame holding the numeric and object/category columns
            to analyse.

    Returns:
        DataFrame with the columns "Test", "Statistic", "p-value" and
        "Interpretation"; statistics and p-values are rounded to 4
        decimals. Comparisons with no usable data are omitted.
    """
    results = []

    numeric_cols = data.select_dtypes(include=[np.number]).columns
    categorical_cols = data.select_dtypes(include=['object', 'category']).columns

    # Mann-Whitney U Test (for two independent groups)
    for cat_col in categorical_cols:
        # nunique() ignores NaN but unique() does not, so drop missing
        # labels first; otherwise NaN can be picked as a "level" and the
        # comparison against it yields an empty group.
        levels = data[cat_col].dropna().unique()
        if len(levels) != 2:
            continue

        first_mask = data[cat_col] == levels[0]
        second_mask = data[cat_col] == levels[1]
        for num_col in numeric_cols:
            group1 = data.loc[first_mask, num_col].dropna()
            group2 = data.loc[second_mask, num_col].dropna()

            if len(group1) > 0 and len(group2) > 0:
                mwu_stat, mwu_p = stats.mannwhitneyu(group1, group2)
                interpretation = "Significant" if mwu_p < 0.05 else "Not Significant"
                results.append([f"Mann-Whitney U Test ({cat_col} vs {num_col})", round(mwu_stat, 4), round(mwu_p, 4), interpretation])

    # Kruskal-Wallis H Test (for multiple independent groups)
    for cat_col in categorical_cols:
        if data[cat_col].nunique() > 2:  # Ensure multiple groups
            for num_col in numeric_cols:
                samples = [group[num_col].dropna().values for _, group in data.groupby(cat_col)]
                # Filter AFTER dropping NaN: a level whose numeric values
                # are all missing must not reach the test as an empty sample.
                groups = [sample for sample in samples if len(sample) > 0]

                if len(groups) > 1:
                    kw_stat, kw_p = stats.kruskal(*groups)
                    interpretation = "Significant" if kw_p < 0.05 else "Not Significant"
                    results.append([f"Kruskal-Wallis H Test ({cat_col} vs {num_col})", round(kw_stat, 4), round(kw_p, 4), interpretation])

    # Spearman’s Rank Correlation (only for QoL_Score vs other numeric variables)
    if 'QoL_Score' in numeric_cols:
        for num_col in numeric_cols:
            if num_col == 'QoL_Score':
                continue

            # Drop rows pairwise so both series stay aligned row-by-row;
            # dropping NaN from each column separately would shift the
            # pairs (or make the lengths differ and raise).
            paired = data[['QoL_Score', num_col]].dropna()
            if len(paired) < 2:
                continue  # a correlation needs at least two complete pairs

            spearman_corr, spearman_p = stats.spearmanr(paired['QoL_Score'], paired[num_col])
            if abs(spearman_corr) > 0.5:
                interpretation = "Strong Correlation"
            elif abs(spearman_corr) > 0.3:
                interpretation = "Moderate Correlation"
            else:
                interpretation = "Weak Correlation"
            results.append([f"Spearman’s Rank Correlation (QoL_Score vs {num_col})", round(spearman_corr, 4), round(spearman_p, 4), interpretation])

    # Convert results to DataFrame
    results_df = pd.DataFrame(results, columns=["Test", "Statistic", "p-value", "Interpretation"])
    return results_df

# Run the non-parametric battery on the loaded dataset and print the
# resulting table (one row per test performed).
non_parametric_results = non_parametric_tests(data)
print(non_parametric_results)



def parametric_tests(data):
    """Run t-tests, chi-square tests and Pearson correlations over *data*.

    Three families of tests are produced, in this order:

    * Independent-samples t-test: every numeric column split by every
      categorical column that has exactly two non-missing levels.
    * Chi-square test of association: every ordered pair of distinct
      categorical columns (so each pair appears once per ordering).
    * Pearson correlation: 'QoL_Score' against every other numeric
      column (only when 'QoL_Score' is a numeric column).

    Args:
        data: DataFrame holding the numeric and object/category columns
            to analyse.

    Returns:
        DataFrame with the columns "Test", "Statistic", "p-value" and
        "Interpretation"; statistics and p-values are rounded to 4
        decimals. Comparisons with no usable data are omitted.
    """
    results = []

    numeric_cols = data.select_dtypes(include=[np.number]).columns
    categorical_cols = data.select_dtypes(include=['object', 'category']).columns

    # T-test (Independent Samples T-test)
    for cat_col in categorical_cols:
        # nunique() ignores NaN but unique() does not, so drop missing
        # labels first; otherwise NaN can be picked as a "level" and the
        # comparison against it yields an empty group.
        levels = data[cat_col].dropna().unique()
        if len(levels) != 2:
            continue

        first_mask = data[cat_col] == levels[0]
        second_mask = data[cat_col] == levels[1]
        for num_col in numeric_cols:
            group1 = data.loc[first_mask, num_col].dropna()
            group2 = data.loc[second_mask, num_col].dropna()

            if len(group1) > 0 and len(group2) > 0:
                t_stat, t_p = stats.ttest_ind(group1, group2)
                interpretation = "Significant" if t_p < 0.05 else "Not Significant"
                results.append([f"T-test ({cat_col} vs {num_col})", round(t_stat, 4), round(t_p, 4), interpretation])

    # Chi-Square Test (for association between two categorical variables)
    for cat_col1 in categorical_cols:
        for cat_col2 in categorical_cols:
            if cat_col1 == cat_col2:
                continue

            contingency_table = pd.crosstab(data[cat_col1], data[cat_col2])
            # crosstab drops rows with a missing label; if no complete
            # rows remain the table is empty and the test cannot be run.
            if contingency_table.size == 0:
                continue

            chi2_stat, chi2_p, _, _ = stats.chi2_contingency(contingency_table)
            interpretation = "Associated" if chi2_p < 0.05 else "Not Associated"
            results.append([f"Chi-Square Test ({cat_col1} vs {cat_col2})", round(chi2_stat, 4), round(chi2_p, 4), interpretation])

    # Pearson Correlation (only for QoL_Score vs other numeric variables)
    if 'QoL_Score' in numeric_cols:
        for num_col in numeric_cols:
            if num_col == 'QoL_Score':
                continue

            # Drop rows pairwise so both series stay aligned row-by-row;
            # dropping NaN from each column separately would shift the
            # pairs (or make the lengths differ and raise).
            paired = data[['QoL_Score', num_col]].dropna()
            if len(paired) < 2:
                continue  # pearsonr needs at least two complete pairs

            pearson_corr, pearson_p = stats.pearsonr(paired['QoL_Score'], paired[num_col])
            interpretation = "Strong" if abs(pearson_corr) > 0.5 else "Weak"
            results.append([f"Pearson Correlation (QoL_Score vs {num_col})", round(pearson_corr, 4), round(pearson_p, 4), interpretation])

    # Convert results to DataFrame
    results_df = pd.DataFrame(results, columns=["Test", "Statistic", "p-value", "Interpretation"])
    return results_df

# Run the parametric battery on the loaded dataset and show the table.
parametric_results = parametric_tests(data)
print(parametric_results)

# Persist every results table as its own tab-separated text file,
# written in the same order the tables were produced.
for table, out_path in (
    (normality_results, 'normality_results1.txt'),
    (non_parametric_results, 'non_parametric_results1.txt'),
    (parametric_results, 'parametric_results1.txt'),
):
    table.to_csv(out_path, index=False, sep='\t')

print("Tables have been saved to text files.")



                   Variable  KS p-value    SW p-value              Normality
0                       Age    0.772930  1.547713e-01      Normal (p > 0.05)
1               TBSA_Burned    0.000024  1.515539e-14  Not Normal (p ≤ 0.05)
2             Itch_Severity    0.000005  2.279278e-11  Not Normal (p ≤ 0.05)
3             Itch_Duration    0.000152  3.426098e-09  Not Normal (p ≤ 0.05)
4                 QoL_Score    0.048300  1.187934e-07  Not Normal (p ≤ 0.05)
5  Hospitalization_Duration    0.000042  3.925278e-09  Not Normal (p ≤ 0.05)
                                                 Test   Statistic  p-value  \
0                 Mann-Whitney U Test (Gender vs Age)   8317.0000   0.2976   
1         Mann-Whitney U Test (Gender vs TBSA_Burned)   8963.5000   0.9824   
2       Mann-Whitney U Test (Gender vs Itch_Severity)   9156.5000   0.7768   
3       Mann-Whitney U Test (Gender vs Itch_Duration)   9448.5000   0.4559   
4           Mann-Whitney U Test (Gender vs QoL_Score)   8809.0000   0.7