In [1]:
import os
# Display the kernel's current working directory (shown as this cell's output).
os.getcwd()

'/home/jovyan/work/ICSME2021/4-StatisticalTests/1-CodeSample'

In [5]:
# Every later cell reads and writes through paths relative to the project root,
# so switch the working directory there first.
# The root can be overridden with the ICSME2021_ROOT environment variable; when
# it is unset, the original hard-coded location is used unchanged.
os.chdir(os.environ.get("ICSME2021_ROOT", "/home/jovyan/work/ICSME2021"))

In [6]:
import pandas as pd

## Loading metrics

In [7]:
# One sample list per ecosystem, read in a fixed order; the first CSV column
# becomes the index of each frame.
androidListSamples, awsListSamples, azureListSamples, springListSamples = (
    pd.read_csv(
        f"1-GettingQuestions/samplesWithQuestions/{ecosystem}Samples.csv",
        index_col=0,
        skiprows=0,
    )
    for ecosystem in ("android", "aws", "azure", "spring")
)

In [8]:
def loadMetrics(listSamples):
    """Load the aggregated metrics CSV of every sample in ``listSamples``.

    Parameters
    ----------
    listSamples : pandas.DataFrame
        Frame whose index values are the sample names; each name selects the
        file ``3-DataMerge/1-CodeSampleAgregation/<name>.csv`` (relative to the
        current working directory).

    Returns
    -------
    dict
        Maps each sample name to its DataFrame. The first two CSV columns form
        the (two-level) index, and NA detection is disabled, so empty cells are
        kept as read rather than converted to NaN.
    """
    return {
        sampleName: pd.read_csv(
            f"3-DataMerge/1-CodeSampleAgregation/{sampleName}.csv",
            index_col=[0, 1],
            na_filter=False,
        )
        for sampleName in listSamples.index.values
    }

In [9]:
# Load the per-sample metric frames of each ecosystem (android, aws, azure, spring).
androidMetrics, awsMetrics, azureMetrics, springMetrics = (
    loadMetrics(samples)
    for samples in (androidListSamples, awsListSamples, azureListSamples, springListSamples)
)

## Mann Kendall Test

Null Hypothesis: There is no monotonic trend.

Alternative Hypothesis: There is a trend.

If p-value < 0.05 then null Hypothesis is rejected.

In [10]:
pip install pymannkendall

Note: you may need to restart the kernel to use updated packages.


In [11]:
import pymannkendall as mk

In [12]:
def extractMannKendallTest(metrics, listSamples):
    """Run the Mann-Kendall trend test on every metric column of every sample.

    Parameters
    ----------
    metrics : dict
        Maps a sample name to the DataFrame holding that sample's metrics.
    listSamples : pandas.DataFrame
        Frame whose index lists the samples to process; the result reuses
        this index.

    Returns
    -------
    pandas.DataFrame
        One row per sample and three columns per metric column, labelled
        ``"<metric> has a trend?"``, ``"<metric> trend is "`` and
        ``"<metric> trend pvalue "`` (the trailing spaces are part of the
        labels). They hold the ``h``, ``trend`` and ``p`` attributes of the
        ``mk.original_test`` result.
    """
    summary = pd.DataFrame(index=listSamples.index)
    for sampleName in listSamples.index.values:
        frame = metrics[sampleName]
        for column in frame.columns.values:
            outcome = mk.original_test(frame[column])
            # Label suffix -> value, written in the order the columns appear.
            for suffix, value in (
                (" has a trend?", outcome.h),
                (" trend is ", outcome.trend),
                (" trend pvalue ", outcome.p),
            ):
                summary.loc[sampleName, column + suffix] = value
    return summary

In [13]:
# Mann-Kendall summary per ecosystem; each metrics dict is paired with the
# sample list it was loaded from.
androidMannKendallResult, awsMannKendallResult, azureMannKendallResult, springMannKendallResult = (
    extractMannKendallTest(ecosystemMetrics, ecosystemSamples)
    for ecosystemMetrics, ecosystemSamples in (
        (androidMetrics, androidListSamples),
        (awsMetrics, awsListSamples),
        (azureMetrics, azureListSamples),
        (springMetrics, springListSamples),
    )
)

In [14]:
# Persist one CSV per ecosystem. DataFrame.to_csv does not create missing
# parent directories, so make sure the output folder exists before writing.
mannKendallOutputDir = "4-StatisticalTests/1-CodeSample/mannKendallTest"
os.makedirs(mannKendallOutputDir, exist_ok=True)
for ecosystem, mannKendallFrame in (
    ("android", androidMannKendallResult),
    ("aws", awsMannKendallResult),
    ("azure", azureMannKendallResult),
    ("spring", springMannKendallResult),
):
    mannKendallFrame.to_csv(f"{mannKendallOutputDir}/{ecosystem}_mann_kendall_results.csv")

## Normality Test (Shapiro-Wilk)

Null Hypothesis: The population is normally distributed.

Alternative Hypothesis: The population is not normally distributed.

If p-value < 0.05 then the null hypothesis is rejected.

## Correlation Test

Null Hypothesis: there is no correlation between data

Alternative Hypothesis: there is a correlation

If p-value < 0.05 then null Hypothesis is rejected.

If the metric is normally distributed (Shapiro-Wilk p-value ≥ 0.05) we apply the Pearson test; otherwise we apply the Spearman test.

In [15]:
from scipy import stats

In [16]:
def extractCorrelationTest(metrics, listSamples):
    """Correlate every metric column of every sample with its "questions" column.

    For each column a Shapiro-Wilk test is run on that column only. When its
    p-value is below 0.05 the Spearman coefficient is used (NaNs omitted);
    otherwise the Pearson coefficient is used. The loop does not skip any
    column, so "questions" itself is also processed when it is among the
    frame's columns.

    Parameters
    ----------
    metrics : dict
        Maps a sample name to the DataFrame holding that sample's metrics;
        each frame must expose a "questions" column.
    listSamples : pandas.DataFrame
        Frame whose index lists the samples to process; the result reuses
        this index.

    Returns
    -------
    pandas.DataFrame
        One row per sample and two columns per metric column:
        ``"<metric> correlation"`` and ``"<metric> correlation pvalue"``.
    """
    summary = pd.DataFrame(index=listSamples.index)
    for sampleName in listSamples.index.values:
        frame = metrics[sampleName]
        for column in frame.columns.values:
            values = frame[column]
            # Normality of the metric decides which coefficient is computed.
            rejectsNormality = stats.shapiro(values).pvalue < 0.05
            if not rejectsNormality:
                outcome = stats.pearsonr(frame["questions"], values)
            else:
                outcome = stats.spearmanr(frame["questions"], values, nan_policy="omit")
            # Both scipy results are indexable as (coefficient, p-value).
            summary.loc[sampleName, column + " correlation"] = outcome[0]
            summary.loc[sampleName, column + " correlation pvalue"] = outcome[1]
    return summary

In [17]:
# Correlation summary per ecosystem; each metrics dict is paired with the
# sample list it was loaded from.
androidCorrelationResult, awsCorrelationResult, azureCorrelationResult, springCorrelationResult = (
    extractCorrelationTest(ecosystemMetrics, ecosystemSamples)
    for ecosystemMetrics, ecosystemSamples in (
        (androidMetrics, androidListSamples),
        (awsMetrics, awsListSamples),
        (azureMetrics, azureListSamples),
        (springMetrics, springListSamples),
    )
)



In [18]:
# Persist one CSV per ecosystem. DataFrame.to_csv does not create missing
# parent directories, so make sure the output folder exists before writing.
correlationOutputDir = "4-StatisticalTests/1-CodeSample/correlationTest"
os.makedirs(correlationOutputDir, exist_ok=True)
for ecosystem, correlationFrame in (
    ("android", androidCorrelationResult),
    ("aws", awsCorrelationResult),
    ("azure", azureCorrelationResult),
    ("spring", springCorrelationResult),
):
    correlationFrame.to_csv(f"{correlationOutputDir}/{ecosystem}_correlation_results.csv")