Global

In [1]:
import numpy as np
from scipy.stats import entropy
import pandas as pd
from scipy.stats import ks_2samp
from scipy.spatial.distance import cosine
from statsmodels.stats.proportion import proportions_ztest

# KL

In [2]:
# Function to calculate KL divergence between two distributions
def calculate_kl_divergence(dist1, dist2):
      return entropy(dist1, dist2)

def KL(df):
  # Drop the id column for aggregation
  pattern_data = df.drop(columns=['id', 'class_name'])

  kl_results = []
  for i in range(10):
    # Split the data randomly into two groups (half and half)
    random_indices = np.random.permutation(len(pattern_data))
    mid_index = len(pattern_data) // 2
    group1_indices = random_indices[:mid_index]
    group2_indices = random_indices[mid_index:]

    group1 = pattern_data.iloc[group1_indices]
    group2 = pattern_data.iloc[group2_indices]

    # Aggregate pattern counts within each group
    group1_aggregated = group1.sum(axis=0)
    group2_aggregated = group2.sum(axis=0)
    # Normalize the aggregated pattern counts to create valid probability distributions
    group1_distribution = group1_aggregated / group1_aggregated.sum()
    group2_distribution = group2_aggregated / group2_aggregated.sum()


    # Compute KL divergence between the two group-level distributions
    kl_results.append(calculate_kl_divergence(group1_distribution, group2_distribution))

  return np.round(np.mean(kl_results), 3)

In [3]:
def KL_diversity(group1, group2):
  group1 = group1.drop(columns=['id', 'class_name'])
  group2 = group2.drop(columns=['id', 'class_name'])
  # Aggregate pattern counts within each group
  group1_aggregated = group1.sum(axis=0)
  group2_aggregated = group2.sum(axis=0)

  # Normalize the aggregated pattern counts to create valid probability distributions
  group1_distribution = group1_aggregated / group1_aggregated.sum()
  group2_distribution = group2_aggregated / group2_aggregated.sum()


  # Compute KL divergence between the two group-level distributions
  return calculate_kl_divergence(group1_distribution, group2_distribution)

# KS

In [4]:
def KS(df):
  # Drop the id column for aggregation
  pattern_data = df.drop(columns=['id', 'class_name'])

  ks_results = []
  p_val = []
  results = {'Fold': [], 'KS Statistic': [], 'P-Value': [], 'Result': []}
  for i in range(10):
    # Split the data randomly into two groups (half and half)
    random_indices = np.random.permutation(len(pattern_data))
    mid_index = len(pattern_data) // 2
    group1_indices = random_indices[:mid_index]
    group2_indices = random_indices[mid_index:]

    group1 = pattern_data.iloc[group1_indices]
    group2 = pattern_data.iloc[group2_indices]

    # Aggregate pattern counts within each group
    group1_aggregated = group1.sum(axis=0)
    group2_aggregated = group2.sum(axis=0)

    ks_statistic, p_value = ks_2samp(group1_aggregated, group2_aggregated)

    if p_value > 0.05:
      r = 'similar'
    else:
      r = 'not similar'

    results['Fold'].append(i)
    results['KS Statistic'].append(ks_statistic)
    results['P-Value'].append(p_value)
    results['Result'].append(r)

  return pd.DataFrame(results)



In [5]:
def KS_test(group1, group2):
  group1 = group1.drop(columns=['id', 'class_name'])
  group2 = group2.drop(columns=['id', 'class_name'])
  # Aggregate pattern counts within each group
  group1_aggregated = group1.sum(axis=0)
  group2_aggregated = group2.sum(axis=0)

  ks_statistic, p_value = ks_2samp(group1_aggregated, group2_aggregated)

  # Interpretation
  if p_value > 0.05:
    r = "similar"
  else:
    r = "not similar"

  result = {"KS Statistic": [ks_statistic], "P-Value": [p_value], "Result": [r]}
  return pd.DataFrame(result)


# COSINE

In [6]:
def COSINE(df):
  # Drop the id column for aggregation
  pattern_data = df.drop(columns=['id', 'class_name'])

  results = {"Fold": [], "Cosine Similarity": [], "Result": []}
  for i in range(10):
    # Split the data randomly into two groups (half and half)
    random_indices = np.random.permutation(len(pattern_data))
    mid_index = len(pattern_data) // 2
    group1_indices = random_indices[:mid_index]
    group2_indices = random_indices[mid_index:]

    group1 = pattern_data.iloc[group1_indices]
    group2 = pattern_data.iloc[group2_indices]

    # Aggregate pattern counts within each group
    group1_aggregated = group1.sum(axis=0)
    group2_aggregated = group2.sum(axis=0)
    # Normalize the aggregated pattern counts to create valid probability distributions
    group1_distribution = group1_aggregated / group1_aggregated.sum()
    group2_distribution = group2_aggregated / group2_aggregated.sum()

    cosine_similarity = 1 - cosine(group1_distribution, group2_distribution)

    # Interpretation
    if cosine_similarity > 0.95:
      r = "very similar"
    elif cosine_similarity > 0.85:
        r = "similar"
    else:
        r = "not similar"

    results["Fold"].append(i)
    results["Cosine Similarity"].append(cosine_similarity)
    results["Result"].append(r)

  return pd.DataFrame(results)

In [7]:
def Cosine_diversity(group1, group2):
  group1 = group1.drop(columns=['id', 'class_name'])
  group2 = group2.drop(columns=['id', 'class_name'])
  # Aggregate pattern counts within each group
  group1_aggregated = group1.sum(axis=0)
  group2_aggregated = group2.sum(axis=0)

  # Normalize the aggregated pattern counts to create valid probability distributions
  group1_distribution = group1_aggregated / group1_aggregated.sum()
  group2_distribution = group2_aggregated / group2_aggregated.sum()

  cosine_similarity = 1 - cosine(group1_distribution, group2_distribution)

  # Interpretation
  if cosine_similarity > 0.95:
      r = "very similar"
  elif cosine_similarity > 0.85:
      r = "similar"
  else:
      r = "not similar"

  result = {"Cosine Similarity": [cosine_similarity], "Result": [r]}
  return pd.DataFrame(result)



In [8]:
def in_groups(dfs):
  for name, df in dfs:
    kl_result = KL(df) # one number
    ks_result = KS(df) # return df
    cosine_result = COSINE(df) # return df
    print(f"KL stability {name}: {kl_result}")
    ks_result.to_csv(f"{name}_ks_stability.csv", index=False)
    cosine_result.to_csv(f"{name}_cosine_stability.csv", index=False)

In [9]:
def between_groups(groups):
  for name, group1, group2 in groups:
    print(name)
    kl_result = KL_diversity(group1, group2) # number
    ks_result = KS_test(group1, group2)
    cosine_result = Cosine_diversity(group1, group2) # return df
    print(f"KL diversity {name}: {kl_result}")
    ks_result.to_csv(f"{name}_ks_diversity.csv", index=False)
    cosine_result.to_csv(f"{name}_cosine_diversity.csv", index=False)

In [18]:
# load data
cancer_df = pd.read_csv('/content/gout_cancer_level_4_patterns_2per.csv')
cardiac_df = pd.read_csv('/content/gout_cardiac_level_4_patterns_2per.csv')
neither_df = pd.read_csv('/content/gout_neither_level_4_patterns_2per.csv')

# add label
cancer_df['class_name'] = cancer_df['class_name'].replace(True, 'cancer')
cardiac_df['class_name'] = cardiac_df['class_name'].replace(True, 'cardiac')
neither_df['class_name'] = neither_df['class_name'].replace(True, 'neither')

# fill null as 0
cancer_df.fillna(0, inplace=True)
cardiac_df.fillna(0, inplace=True)
neither_df.fillna(0, inplace=True)

combined_df = pd.concat([cancer_df, cardiac_df, neither_df], ignore_index=True)
combined_df.fillna(0, inplace=True)

numeric_cols = combined_df.select_dtypes(include=['number']).columns
numeric_cols = list(numeric_cols)[1:] # remove 'id' col

# combined_df[numeric_cols] = (combined_df[numeric_cols] - combined_df[numeric_cols].min()) / (combined_df[numeric_cols].max() - combined_df[numeric_cols].min())

for col in numeric_cols:
    min_col = combined_df[col].min()
    max_col = combined_df[col].max()
    if max_col - min_col == 0:
        combined_df[col] = 0  # or some other appropriate default value
    else:
        combined_df[col] = (combined_df[col] - min_col) / (max_col - min_col)


# Apply Laplace smoothing to numeric columns
alpha = 1e-6  # Laplace smoothing factor

combined_df[numeric_cols] = combined_df[numeric_cols].applymap(lambda x: x + alpha)

# cancer
cancer_df = combined_df[combined_df['class_name'] == 'cancer']
# cardiac
cardiac_df = combined_df[combined_df['class_name'] == 'cardiac']
# nither
neither_df = combined_df[combined_df['class_name'] == 'neither']

dfs = [("cancer", cancer_df), ("cardiac", cardiac_df), ("neither", neither_df)]
in_groups(dfs)


# between groups
# Cancer vs not cancer
not_cancer_df = combined_df[combined_df['class_name'] != 'cancer']
# Cardiac vs not Cardiac
not_cardiac_df = combined_df[combined_df['class_name'] != 'cardiac']
# Neither vs not (cancer or Cardiac)
not_neither_df = combined_df[combined_df['class_name'] != 'neither']

groups = [("cancer", cancer_df, not_cancer_df), ("cardiac", cardiac_df, not_cardiac_df), ("neither", neither_df, not_neither_df)]
between_groups(groups)

  combined_df[numeric_cols] = combined_df[numeric_cols].applymap(lambda x: x + alpha)


KL stability cancer: 0.091
KL stability cardiac: 0.024
KL stability neither: 0.078
cancer
KL diversity cancer: 5.085901463902687
cardiac
KL diversity cardiac: 2.280291510878791
neither
KL diversity neither: 2.9432702960052888


# Local


In [16]:
for name, group1, group2 in groups:
  test_results = []
  for column in numeric_cols:
      # Summing up the counts for the column in each dataframe
      count1 = group1[column].sum()
      count2 = group2[column].sum()

      # Summing up the total counts (nobs) - using the number of non-null as the observation count
      nobs1 = len(cancer_df)
      nobs2 = len(not_cancer_df)

      # Calculate the Z-test for two proportions
      stat, pval = proportions_ztest([count1, count2], [nobs1, nobs2])

      # Check if similar or not
      similar = 'similar' if pval > 0.05 else 'not similar'

      # Append results
      test_results.append((column, stat, pval, similar))

  # Convert test results to DataFrame
  results_df = pd.DataFrame(test_results, columns=['Column', 'Z-Statistic', 'P-Value', 'Similarity'])

  # Calculate the percentage of similar columns
  not_similar_count = results_df[results_df['Similarity'] == 'not similar'].shape[0]
  total_count = results_df.shape[0]
  percentage_not_similar = (not_similar_count / total_count) * 100
  print(f"{name}: Similar Percentage {100 - percentage_not_similar}")

cancer: Similar Percentage 33.33333333333334
cardiac: Similar Percentage 41.880341880341874
neither: Similar Percentage 39.31623931623932


In [17]:
results = {"name": [], "Fold": [], "similar": []}
for name, df in dfs:
  test_results = []

  for i in range(10):
    random_indices = np.random.permutation(len(df))
    mid_index = len(df) // 2
    group1_indices = random_indices[:mid_index]
    group2_indices = random_indices[mid_index:]

    group1 = df.iloc[group1_indices]
    group2 = df.iloc[group2_indices]

    for column in numeric_cols:
        # Summing up the counts for the column in each dataframe
        count1 = group1[column].sum()
        count2 = group2[column].sum()

        # Summing up the total counts (nobs) - using the number of non-null as the observation count
        nobs1 = len(cancer_df)
        nobs2 = len(not_cancer_df)

        # Calculate the Z-test for two proportions
        stat, pval = proportions_ztest([count1, count2], [nobs1, nobs2])

        # Check if similar or not
        similar = 'similar' if pval > 0.05 else 'not similar'

        # Append results
        test_results.append((column, stat, pval, similar))

    # Convert test results to DataFrame
    results_df = pd.DataFrame(test_results, columns=['Column', 'Z-Statistic', 'P-Value', 'Similarity'])

    # Calculate the percentage of similar columns
    similar_count = results_df[results_df['Similarity'] == 'similar'].shape[0]
    total_count = results_df.shape[0]
    similar = (similar_count / total_count) * 100
    results['name'].append(name)
    results['Fold'].append(i)
    results['similar'].append(similar)
results_df = pd.DataFrame(results)
results_df.to_csv("Local_stability.csv", index=False)

In [13]:
# combined_df.to_csv("Temporal_Features_Data.csv", index=False)

In [14]:
# from google.colab import drive
# drive.mount('/content/drive')