In [1]:
import json
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import seaborn as sns

def expand_with_min_value(arr, target_length):
  """Return a list holding *arr* padded with its minimum up to *target_length*.

  The input is never modified; a new list is always returned. Inputs that
  already have ``target_length`` or more items are returned as an unpadded copy
  (they are not truncated).

  Args:
    arr: Iterable of mutually comparable values.
    target_length: Minimum number of items the result must contain.

  Returns:
    A new list starting with the items of ``arr`` in their original order,
    followed by as many copies of ``min(arr)`` as needed.

  Raises:
    ValueError: If ``arr`` is empty but padding is required, because an empty
      sequence has no minimum to pad with.
  """
  padded = list(arr)  # copy so the caller's sequence is left untouched
  missing = target_length - len(padded)
  if missing <= 0:
    return padded
  if not padded:
    raise ValueError("cannot pad an empty sequence: it has no minimum value")
  return padded + [min(padded)] * missing

def top_ten_values(values):
  """Return the ten largest items of *values*, largest first.

  The input is left untouched; when it holds fewer than ten items, all of them
  are returned in descending order.
  """
  ranked = list(values)
  ranked.sort(reverse=True)
  return ranked[:10]

def calculate_boxplot_stats(data):
  """Print and return the box-plot statistics of *data*.

  Quartiles come from ``np.percentile``. The fences sit 1.5 * IQR beyond the
  quartiles; values outside the fences are reported as outliers. Each whisker
  ends at the most extreme observation that still lies inside its fence, and
  never crosses back over its quartile.

  Args:
    data: Non-empty sequence of numbers.

  Returns:
    dict with the keys ``"q1"``, ``"q3"``, ``"iqr"``, ``"lower_whisker"``,
    ``"upper_whisker"`` and ``"outliers"`` (a NumPy array).

  Raises:
    ValueError: If ``data`` is empty.
  """
  data = np.array(data)
  if data.size == 0:
    raise ValueError("calculate_boxplot_stats() requires at least one value")

  Q1 = np.percentile(data, 25)
  Q3 = np.percentile(data, 75)
  IQR = Q3 - Q1

  # Fences: the furthest a whisker may reach from the box.
  lower_fence = Q1 - 1.5 * IQR
  upper_fence = Q3 + 1.5 * IQR

  # Outliers are the points lying outside the fences.
  outliers = data[(data < lower_fence) | (data > upper_fence)]

  # A whisker ends on an actual observation inside the fence, not on the fence
  # itself. Because Q1 and Q3 are interpolated, the nearest in-fence point can
  # fall inside the box; clamp to the quartile in that case.
  lower_whisker = min(data[data >= lower_fence].min(), Q1)
  upper_whisker = max(data[data <= upper_fence].max(), Q3)

  print("Q1:", Q1)
  print("Q3:", Q3)
  print("IQR:", IQR)
  print("Lower Whisker:", lower_whisker)
  print("Upper Whisker:", upper_whisker)
  print("Outliers:", outliers)

  return {
    "q1": Q1,
    "q3": Q3,
    "iqr": IQR,
    "lower_whisker": lower_whisker,
    "upper_whisker": upper_whisker,
    "outliers": outliers,
  }

# Raw measurements for every comparison case, stored as a JSON string and parsed
# into `data` below. Each entry carries a "title" (also used as the file name of
# the saved figure) plus two comma-separated number series, "group1" and "group2".
# NOTE(review): case9 ("SO-M-vs-SO-S") reuses case7's "group1" as its "group1" and
# case8's "group1" as its "group2", whereas case7/case8 ("SO-L-vs-...") share the
# same "group2". If titles read "<group1>-vs-<group2>", the series in case7 and
# case8 look swapped -- confirm against the source measurements.
json_data = '''
[
  {
    "title": "case1-ORG-vs-SO",
    "group1": "20.91, 21.15, 20.75, 20.53, 20.45, 20.45, 20.45, 20.45, 20.45, 20.45",
    "group2": "29.22, 29.22, 29.21, 29.13, 29.11, 29.11, 29.08, 28.77, 28.71, 28.52"
  },
  {
    "title": "case2-ORG-Gen-vs-SO-Gen",
    "group1": "21.02, 20.96, 20.8, 20.55, 20.55, 20.55, 20.55, 20.55, 20.55, 20.55",
    "group2": "28.03, 28.02, 27.89, 27.72, 27.71, 27.66, 27.44, 27.33, 26.91, 26.5"
  },
  {
    "title": "case3-ORG-GenV2-vs-SO-GenV2",
    "group1": "20.79, 20.83, 20.87, 20.67, 19.89, 20.17, 19.89, 19.89, 19.89, 19.89",
    "group2": "28.36, 28.3, 28.26, 27.95, 23.39, 28.08, 25.85, 24.8, 25.32, 24.59"
  },
  {
    "title": "case4-ORG-L-vs-ORG-M",
    "group1": "20.86, 21.06, 21.02, 20.82, 20.4, 20.4, 20.4, 20.4, 20.4, 20.4",
    "group2": "20.64, 20.96, 20.47, 20.93, 20.88, 20.47, 20.47, 20.47, 20.47, 20.47"
  },
  {
    "title": "case5-ORG-L-vs-ORG-S",
    "group1": "20.86, 21.06, 21.02, 20.82, 20.4, 20.4, 20.4, 20.4, 20.4, 20.4",
    "group2": "20.92, 20.94, 21.1, 20.44, 20.61, 20.5, 20.44, 20.44, 20.44, 20.44"
  },
  {
    "title": "case6-ORG-M-vs-ORG-S",
    "group1": "20.64, 20.96, 20.47, 20.93, 20.88, 20.47, 20.47, 20.47, 20.47, 20.47",
    "group2": "20.92, 20.94, 21.1, 20.44, 20.61, 20.5, 20.44, 20.44, 20.44, 20.44"
  },
  {
    "title": "case7-SO-L-vs-SO-M",
    "group1": "19.84, 20.65, 19.98, 20.16, 19.81, 19.81, 19.81, 19.81, 19.81, 19.81",
    "group2": "28.48, 28.28, 28.21, 28.17, 28.05, 27.8, 27.79, 27.64, 27.16, 26.86"
  },
  {
    "title": "case8-SO-L-vs-SO-S",
    "group1": "19.84, 20.21, 20.49, 20, 19.91, 19.81, 19.81, 19.81, 19.81, 19.81",
    "group2": "28.48, 28.28, 28.21, 28.17, 28.05, 27.8, 27.79, 27.64, 27.16, 26.86"
  },
  {
    "title": "case9-SO-M-vs-SO-S",
    "group1": "19.84, 20.65, 19.98, 20.16, 19.81, 19.81, 19.81, 19.81, 19.81, 19.81",
    "group2": "19.84, 20.21, 20.49, 20, 19.91, 19.81, 19.81, 19.81, 19.81, 19.81"
  }
]
'''

# Parse the JSON text into a list of dicts, one per comparison case.
data = json.loads(json_data)

# Settings for the per-case comparison below.
SIGNIFICANCE_LEVEL = 0.05  # p-value threshold for rejecting the null hypothesis
SAMPLE_SIZE = 10  # observations per group; top_ten_values() is fixed at ten as well
FIGURE_DIR = Path("../figure")

# savefig() does not create missing directories, so make sure the target exists.
FIGURE_DIR.mkdir(parents=True, exist_ok=True)

# For every case: run a Wilcoxon signed-rank test on the two groups, print the
# verdict, and save a box plot of both groups as "<title>.png".
for entry in data:
  title = entry["title"]
  # float() ignores the blanks that split(",") leaves around each number.
  group1 = [float(value) for value in entry["group1"].split(",")]
  group2 = [float(value) for value in entry["group2"].split(",")]

  # group1 is padded with its own minimum up to SAMPLE_SIZE values; group2 is
  # reduced to its ten largest values, sorted in descending order.
  # NOTE(review): the signed-rank test is a paired test and compares the groups
  # position by position, yet only group2 is re-ordered here -- confirm that the
  # resulting pairing is intended.
  group1 = expand_with_min_value(group1, SAMPLE_SIZE)
  group2 = top_ten_values(group2)

  print(group1)
  print(group2)

  statistic, p_value = stats.wilcoxon(group1, group2, correction=False)
  print("Wilcoxon signed-rank test. Statistic:", statistic)
  print("Wilcoxon signed-rank test. P-value:", p_value)

  if p_value < SIGNIFICANCE_LEVEL:
    print("[Okay] Reject the null hypothesis. There is a significant difference between the two samples.")
  else:
    print("[Not Okay] Fail to reject the null hypothesis. There is no significant difference between the two samples.")

  print('-'*100)

  plt.figure(figsize=(8, 6))

  sns.boxplot(
    data=[group1, group2],
    # One grey fill for both boxes; `color` replaces the `palette`-without-`hue`
    # form that seaborn 0.13 deprecates.
    color="grey",
    linewidth=2,  # Overall line thickness
    flierprops=dict(marker='o', color='black', markersize=10, linestyle='none', linewidth=2),  # Outlier markers
    medianprops=dict(color='black', linewidth=3),  # Median line
  )

  plt.xticks([0, 1], ['G1', 'G2'], fontsize=18)
  plt.grid(True, linestyle='--', alpha=0.7)
  plt.tick_params(axis='y', labelsize=18)
  plt.tight_layout(pad=0.1)
  # Tight bounding box keeps the saved image almost free of outer margins.
  plt.savefig(FIGURE_DIR / f"{title}.png", bbox_inches='tight', pad_inches=0.01)
  plt.close()  # release the figure so figures do not pile up across cases

# Alternatively, call
# plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1)
# to set the figure's margins manually.




[20.91, 21.15, 20.75, 20.53, 20.45, 20.45, 20.45, 20.45, 20.45, 20.45]
[29.22, 29.22, 29.21, 29.13, 29.11, 29.11, 29.08, 28.77, 28.71, 28.52]
Wilcoxon signed-rank test. Statistic: 0.0
Wilcoxon signed-rank test. P-value: 0.001953125
[Okay] Reject the null hypothesis. There is a significant difference between the two samples.
----------------------------------------------------------------------------------------------------
[21.02, 20.96, 20.8, 20.55, 20.55, 20.55, 20.55, 20.55, 20.55, 20.55]
[28.03, 28.02, 27.89, 27.72, 27.71, 27.66, 27.44, 27.33, 26.91, 26.5]
Wilcoxon signed-rank test. Statistic: 0.0
Wilcoxon signed-rank test. P-value: 0.001953125
[Okay] Reject the null hypothesis. There is a significant difference between the two samples.
----------------------------------------------------------------------------------------------------
[20.79, 20.83, 20.87, 20.67, 19.89, 20.17, 19.89, 19.89, 19.89, 19.89]
[28.36, 28.3, 28.26, 28.08, 27.95, 25.85, 25.32, 24.8, 24.59, 23.39]
Wilcoxon

