In [25]:
import pandas as pd
import numpy as np
import statistics

## Loading metrics

In [44]:
# Collect the distinct "<framework>/<path>" identifiers listed in the
# sample-questions CSV.
sampleQuestions = pd.read_csv("../1-GettingQuestions/sampleQuestions.csv")
samples = (sampleQuestions["framework"] + "/" + sampleQuestions["path"]).unique()

In [45]:
def loadMetrics(listSamples):
    """Read the metrics CSV of every sample into a dictionary.

    For each name in ``listSamples`` the file ``../3-DataMerge/<name>.csv``
    is read with its first two columns as the index and with NA detection
    switched off, so no field is converted to NaN while parsing.

    Parameters
    ----------
    listSamples : iterable of str
        Sample names; each one is interpolated into the CSV path.

    Returns
    -------
    dict
        Maps every sample name to the DataFrame loaded from its CSV.
    """
    return {
        name: pd.read_csv(
            f"../3-DataMerge/{name}.csv",
            index_col=[0, 1],
            na_filter=False,
        )
        for name in listSamples
    }

In [46]:
# Read each sample's CSV from ../3-DataMerge into a dict keyed by sample name.
normalized_metrics_questions = loadMetrics(samples)

## Mann Kendall Test

Null Hypothesis: There is no monotonic trend.

Alternative Hypothesis: There is a trend.

If p-value < 0.05 then null Hypothesis is rejected.

In [47]:
pip install pymannkendall

Note: you may need to restart the kernel to use updated packages.


In [48]:
import pymannkendall as mk

In [74]:
def extractMannKendallTest(metrics, listSamples):
    """Run the Mann-Kendall trend test on every column of every sample.

    Parameters
    ----------
    metrics : mapping
        Maps a sample name to a DataFrame; each column is tested on its own.
    listSamples : iterable of str
        Sample names to process. They form the "full_name" index of the
        result, and the text before the first "/" is stored as "framework".

    Returns
    -------
    pandas.DataFrame
        One row per sample with "<column> trend" and "<column> pvalue" for
        each tested column, plus a "framework" column.
    """
    summary = pd.DataFrame(index=listSamples)
    summary.index.name = "full_name"
    for name in listSamples:
        frame = metrics[name]
        for column in frame.columns:
            outcome = mk.original_test(frame[column])
            summary.loc[name, column + " trend"] = outcome.trend
            summary.loc[name, column + " pvalue"] = outcome.p
            # Written alongside the metric values, so a sample whose frame
            # has no columns receives no "framework" entry either.
            summary.loc[name, "framework"] = name.partition("/")[0]
    return summary

In [75]:
# Apply the Mann-Kendall test to every loaded sample (all keys of the metrics dict).
mannKendallResult = extractMannKendallTest(normalized_metrics_questions, normalized_metrics_questions.keys())

In [76]:
# Save the Mann-Kendall results table as CSV.
# NOTE: to_csv does not create the mannKendallTest/ directory; it must already exist.
mannKendallResult.to_csv("mannKendallTest/mann_kendall_results.csv")

## Normality Test (Shapiro-Wilk)

Null Hypothesis: The population has a normal distribution.

Alternative Hypothesis: The population does not have a normal distribution.

If p-value < 0.05 then the null hypothesis is rejected.

## Correlation Test

Null Hypothesis: There is no correlation between the `questions` series and the metric being tested.

Alternative Hypothesis: there is a correlation

If p-value < 0.05 then null Hypothesis is rejected.

If the metric is normally distributed (Shapiro-Wilk p-value >= 0.05), we apply the Pearson test; otherwise we apply the Spearman test.

In [77]:
from scipy import stats

In [83]:
def extractCorrelationTest(metrics, listSamples, alpha=0.05):
    """Correlate the "questions" column with every column of each sample.

    For each sample and each column of its DataFrame (the "questions" column
    itself included), rows where either value is missing are discarded, the
    column is checked for normality with Shapiro-Wilk, and then:

    * Shapiro p-value < ``alpha``  -> Spearman rank correlation
    * otherwise                    -> Pearson correlation

    Parameters
    ----------
    metrics : mapping
        Maps a sample name to a DataFrame that has a "questions" column.
    listSamples : iterable of str
        Sample names to process; they become the "full_name" index of the
        result. The text before the first "/" is stored as "framework".
    alpha : float, optional
        Significance level of the normality check. Defaults to 0.05.

    Returns
    -------
    pandas.DataFrame
        One row per sample holding "<column> correlation" and
        "<column> pvalue" for every column, plus "framework". Both values are
        NaN when fewer than three complete observations are available.
    """
    # scipy.stats.shapiro refuses input shorter than this.
    minObservations = 3

    result = pd.DataFrame(index=listSamples)
    result.index.rename("full_name", inplace=True)
    for sample in listSamples:
        sampleMetric = metrics[sample]
        for metric in sampleMetric.columns.values:
            questions = sampleMetric["questions"]
            values = sampleMetric[metric]
            # Keep only rows where both series are present. With a NaN in the
            # data the Shapiro p-value can come back NaN; `nan < alpha` is
            # False, so Pearson would be chosen and return NaN as well.
            complete = questions.notna() & values.notna()
            questions = questions[complete]
            values = values[complete]

            if len(values) < minObservations:
                # Too few points for the normality test: record "no result"
                # instead of aborting the whole run with a ValueError.
                coefficient = pvalue = float("nan")
            else:
                # test if the set has normal distribution
                if stats.shapiro(values).pvalue < alpha:
                    correlationResult = stats.spearmanr(questions, values)
                else:
                    correlationResult = stats.pearsonr(questions, values)
                coefficient, pvalue = correlationResult[0], correlationResult[1]

            result.loc[sample, metric+" correlation"] = coefficient
            result.loc[sample, metric+" pvalue"] = pvalue
            # Assigned here rather than before the loop so the column keeps
            # its existing position in the output.
            result.loc[sample, "framework"] = sample.split("/")[0]

    return result

In [84]:
# Correlate "questions" with every metric for each loaded sample (all keys of the metrics dict).
correlationResult = extractCorrelationTest(normalized_metrics_questions, normalized_metrics_questions.keys())

In [85]:
# Save the correlation results table as CSV.
# NOTE: to_csv does not create the correlationTest/ directory; it must already exist.
correlationResult.to_csv("correlationTest/correlation_results.csv")