In [35]:
def get_current_dir():
    """Return the directory containing this code, or the working directory.

    If ``__file__`` is defined, the absolute path of its parent directory is
    returned.  If referencing ``__file__`` raises ``NameError`` (the name is
    not defined in the executing namespace), the current working directory is
    returned instead.
    """
    import os
    from pathlib import Path

    try:
        source_file = Path(__file__)
    except NameError:
        # No __file__ available here; fall back to the process's cwd.
        return Path(os.getcwd())
    return source_file.parent.absolute()

import pandas as pd

# Both CSV files are looked up under <parent of get_current_dir()>/data.
results = get_current_dir().parent.joinpath("data", "results.csv")
results_authors = get_current_dir().parent.joinpath("data", "results-authors.csv")

# `actual` comes from results.csv, `expected` from results-authors.csv.
actual = pd.read_csv(results)
expected = pd.read_csv(results_authors)

# Walk the full hyperparameter grid and look up the score pair of every
# configuration in `expected` and `actual`.
# NOTE: the looked-up scores are only bound to names here; nothing is
# accumulated or compared in this loop.
# hyperparameters from paper
for alpha_ent in [0.30, 0.40]:
    for alpha_word in [0.2, 0.3, 0.4, 0.5]:
        for k in [20, 50, 100, 200]:
            expected_row = expected[(expected["alpha_ent"] == alpha_ent) & (expected["alpha_word"] == alpha_word) & (expected["k"] == k)]

            # A configuration with no matching row would make `.values[0]`
            # raise IndexError; skip missing or all-null configurations.
            if expected_row.empty or expected_row["score_deezer"].isnull().all() or expected_row["score_itunes"].isnull().all():
                continue

            expected_score_deezer = expected_row["score_deezer"].values[0]
            expected_score_itunes = expected_row["score_itunes"].values[0]

            # hyperparameters guessed by me
            for n_topics in [10, 20, 50, 100]:
                for n_neighbours in [5, 10, 20, 500]:
                    actual_row = actual[(actual["alpha_ent"] == alpha_ent) & (actual["alpha_word"] == alpha_word) & (actual["k"] == k) & (actual["n_topics"] == n_topics) & (actual["n_neighbours"] == n_neighbours)]

                    # Same guard for the reproduced results: not every grid
                    # point is guaranteed to have a usable row.
                    if actual_row.empty or actual_row["score_deezer"].isnull().all() or actual_row["score_itunes"].isnull().all():
                        continue

                    actual_score_deezer = actual_row["score_deezer"].values[0]
                    actual_score_itunes = actual_row["score_itunes"].values[0]

In [44]:
import scipy.stats as stats
import numpy as np

def perform_reproducibility_tests(actual_scores, expected_scores, metric_name, confidence_level=0.95, equal_var=True):
    """Compare two score samples, print a summary and return the statistics.

    Runs an independent two-sample t-test and a two-sample Kolmogorov-Smirnov
    test, and computes, for each sample, a Student-t confidence interval for
    the mean and the sample variance (``ddof=1``).

    Parameters
    ----------
    actual_scores, expected_scores : sequence of float
        The two samples to compare.  Neither may be empty.
    metric_name : str
        Label used in the printed header.
    confidence_level : float, default 0.95
        Confidence level of both intervals, expressed as a fraction.
    equal_var : bool, default True
        Forwarded to ``scipy.stats.ttest_ind``.  The default keeps the
        pooled-variance test; pass ``False`` for Welch's t-test when the two
        sample variances differ.

    Returns
    -------
    dict
        Keys ``t_stat``, ``p_value``, ``ci_actual``, ``ci_expected``,
        ``var_actual``, ``var_expected``, ``ks_stat`` and ``ks_p``.  The two
        ``ci_*`` entries are ``(low, high)`` tuples; everything else is a float.

    Raises
    ------
    ValueError
        If either sample is empty.
    """
    actual_arr = np.asarray(actual_scores, dtype=float)
    expected_arr = np.asarray(expected_scores, dtype=float)
    # Fail early with a clear message instead of deep inside scipy.
    if actual_arr.size == 0 or expected_arr.size == 0:
        raise ValueError(
            f"{metric_name}: both samples must be non-empty "
            f"(got {actual_arr.size} actual, {expected_arr.size} expected)"
        )

    t_stat, p_value = stats.ttest_ind(actual_arr, expected_arr, equal_var=equal_var)

    def _mean_interval(sample):
        # Student-t interval around the sample mean with n-1 degrees of freedom.
        return stats.t.interval(
            confidence_level,
            len(sample) - 1,
            loc=np.mean(sample),
            scale=stats.sem(sample),
        )

    ci_actual = _mean_interval(actual_arr)
    ci_expected = _mean_interval(expected_arr)

    var_actual = np.var(actual_arr, ddof=1)
    var_expected = np.var(expected_arr, ddof=1)

    ks_stat, ks_p = stats.ks_2samp(actual_arr, expected_arr)

    print(f"\nResults for {metric_name}:")
    print(f"T-test: statistic={t_stat}, p-value={p_value}")
    print(f"Confidence intervals ({confidence_level*100}%):")
    print(f"\tActual: ({ci_actual[0]}, {ci_actual[1]})")
    print(f"\tExpected: ({ci_expected[0]}, {ci_expected[1]})")
    print("Variances:")
    print(f"\tActual: {var_actual}")
    print(f"\tExpected: {var_expected}")
    print(f"KS-test: statistic={ks_stat}, p-value={ks_p}")

    return {
        "t_stat": float(t_stat),
        "p_value": float(p_value),
        "ci_actual": (float(ci_actual[0]), float(ci_actual[1])),
        "ci_expected": (float(ci_expected[0]), float(ci_expected[1])),
        "var_actual": float(var_actual),
        "var_expected": float(var_expected),
        "ks_stat": float(ks_stat),
        "ks_p": float(ks_p),
    }

# Score samples for the reproducibility tests.  The four lists are filled in
# lockstep, so index i of each list belongs to the same grid combination.
actual_deezer_scores, actual_itunes_scores = [], []
expected_deezer_scores, expected_itunes_scores = [], []

# NOTE(review): each expected score is appended once per (n_topics,
# n_neighbours) combination, i.e. repeated up to 16 times, and the resulting
# list is later passed to an independent two-sample test -- confirm this
# replication is intended.
for alpha_ent in [0.30, 0.40]:
    for alpha_word in [0.2, 0.3, 0.4, 0.5]:
        for k in [20, 50, 100, 200]:
            expected_row = expected.loc[
                (expected["alpha_ent"] == alpha_ent)
                & (expected["alpha_word"] == alpha_word)
                & (expected["k"] == k)
            ]

            # Skip configurations with no row, or whose score column is entirely null.
            if expected_row.empty:
                continue
            if expected_row[["score_deezer", "score_itunes"]].isna().all().any():
                continue

            expected_score_deezer = expected_row["score_deezer"].iloc[0]
            expected_score_itunes = expected_row["score_itunes"].iloc[0]

            for n_topics in [10, 20, 50, 100]:
                for n_neighbours in [5, 10, 20, 500]:
                    actual_row = actual.loc[
                        (actual["alpha_ent"] == alpha_ent)
                        & (actual["alpha_word"] == alpha_word)
                        & (actual["k"] == k)
                        & (actual["n_topics"] == n_topics)
                        & (actual["n_neighbours"] == n_neighbours)
                    ]

                    # Same skip rule for the reproduced results.
                    if actual_row.empty:
                        continue
                    if actual_row[["score_deezer", "score_itunes"]].isna().all().any():
                        continue

                    actual_score_deezer = actual_row["score_deezer"].iloc[0]
                    actual_score_itunes = actual_row["score_itunes"].iloc[0]

                    # One entry per list for every usable combination.
                    actual_deezer_scores.append(actual_score_deezer)
                    actual_itunes_scores.append(actual_score_itunes)
                    expected_deezer_scores.append(expected_score_deezer)
                    expected_itunes_scores.append(expected_score_itunes)

# Run the statistical comparison once per score column.
perform_reproducibility_tests(
    actual_scores=actual_deezer_scores,
    expected_scores=expected_deezer_scores,
    metric_name="Deezer Scores",
)
perform_reproducibility_tests(
    actual_scores=actual_itunes_scores,
    expected_scores=expected_itunes_scores,
    metric_name="iTunes Scores",
)

# Sanity check on the iTunes sample: size, number of non-NaN entries, first values.
print(
    "iTunes Scores Statistics:",
    f"Actual scores count: {len(actual_itunes_scores)}",
    f"Non-null actual scores: {np.count_nonzero(~np.isnan(actual_itunes_scores))}",
    f"Sample of actual scores: {actual_itunes_scores[:5]}",
    sep="\n",
)


Results for Deezer Scores:
T-test: statistic=0.22831692071150542, p-value=0.8194456338261875
Confidence intervals (95.0%):
	Actual: (51.994690848108995, 52.35325787987535)
	Expected: (51.89433108979605, 52.383164017836044)
Variances:
	Actual: 4.255412686851318
	Expected: 7.909005487126354
KS-test: statistic=0.2035225048923679, p-value=1.1304443657367778e-09

Results for iTunes Scores:
T-test: statistic=14.412682128811188, p-value=5.31778508283801e-43
Confidence intervals (95.0%):
	Actual: (51.1179474752266, 51.406272094245026)
	Expected: (49.34592616094821, 49.71904448484436)
Variances:
	Actual: 2.7514677416295537
	Expected: 4.607805379686121
KS-test: statistic=0.4520547945205479, p-value=2.055166401724473e-47
iTunes Scores Statistics:
Actual scores count: 511
Non-null actual scores: 511
Sample of actual scores: [51.6682, 51.369600000000005, 51.4062, 51.432599999999994, 52.913999999999994]
