In [91]:
import os
# Display the kernel's current working directory.
os.getcwd()

'/home/jovyan/work/SANER2021'

In [92]:
# Switch to the project root so the relative paths used by the following cells resolve.
# NOTE(review): absolute, machine-specific path — adjust when running in another environment.
os.chdir("/home/jovyan/work/SANER2021")

In [93]:
import pandas as pd

## Loading metrics

In [94]:
# One sample list per ecosystem. The first CSV column becomes the index; its
# values are used later as the sample identifiers when loading metric files.
androidListSamples = pd.read_csv("1-GettingQuestions/samplesWithQuestions/androidSamples.csv", index_col=0, skiprows=0)
awsListSamples = pd.read_csv("1-GettingQuestions/samplesWithQuestions/awsSamples.csv", index_col=0, skiprows=0)
azureListSamples = pd.read_csv("1-GettingQuestions/samplesWithQuestions/azureSamples.csv", index_col=0, skiprows=0)
springListSamples = pd.read_csv("1-GettingQuestions/samplesWithQuestions/springSamples.csv", index_col=0, skiprows=0)

In [95]:
def loadMetrics(listSamples):
    """Read the aggregated metrics CSV of every sample named in ``listSamples``.

    Parameters
    ----------
    listSamples : pandas.DataFrame
        Frame whose index values are the sample identifiers; each identifier
        is also the base name of the CSV file to read.

    Returns
    -------
    dict
        Maps each sample identifier to the DataFrame read from
        ``3-DataMerge/1-CodeSampleAgregation/<sample>.csv``, with the first
        two CSV columns used as a two-level index.
    """
    return {
        sampleName: pd.read_csv(
            f"3-DataMerge/1-CodeSampleAgregation/{sampleName}.csv",
            index_col=[0,1],
        )
        for sampleName in listSamples.index.values
    }

In [96]:
# Load the per-sample metric tables of each ecosystem (one dict per ecosystem, keyed by sample).
androidMetrics = loadMetrics(androidListSamples)
awsMetrics = loadMetrics(awsListSamples)
azureMetrics = loadMetrics(azureListSamples)
springMetrics = loadMetrics(springListSamples)

## Mann Kendall Test

Null Hypothesis: There is no monotonic trend.

Alternative Hypothesis: There is a trend.

If p-value < 0.05 then null Hypothesis is rejected.

In [97]:
pip install pymannkendall

Note: you may need to restart the kernel to use updated packages.


In [98]:
import pymannkendall as mk

## Normalize metrics

In [99]:
def normalize(metrics):
    """Min-max scale ``metrics`` so each column spans the range [0, 1].

    Parameters
    ----------
    metrics : pandas.DataFrame or pandas.Series
        Numeric data to rescale.

    Returns
    -------
    Same type as ``metrics``
        ``(metrics - min) / (max - min)``. A column whose minimum equals its
        maximum has a zero span, so its values come out as NaN.
    """
    lowest = metrics.min()
    highest = metrics.max()
    span = highest - lowest
    shifted = metrics - lowest
    return shifted / span

In [100]:
def normalizeMetrics(metrics, listSamples):
    """Return a copy of ``metrics`` in which every listed sample is normalized.

    The input dictionary is not modified, so the raw metrics remain available
    to the caller after normalization.

    Parameters
    ----------
    metrics : dict
        Maps sample identifier to that sample's metrics table.
    listSamples : pandas.DataFrame
        Frame whose index values name the samples to normalize.

    Returns
    -------
    dict
        New dictionary. Entries for the samples in ``listSamples`` hold the
        output of ``normalize``; any other entries are carried over unchanged.

    Raises
    ------
    KeyError
        If a sample in ``listSamples`` has no entry in ``metrics``.
    """
    # Shallow copy: rebinding keys below must not overwrite the caller's raw tables.
    normalized = dict(metrics)
    for sample in listSamples.index.values:
        normalized[sample] = normalize(metrics[sample])
    return normalized

In [101]:
# Min-max normalize the metric tables of every ecosystem.
androidNormalizedMetrics = normalizeMetrics(androidMetrics, androidListSamples)
awsNormalizedMetrics = normalizeMetrics(awsMetrics, awsListSamples)
azureNormalizedMetrics = normalizeMetrics(azureMetrics, azureListSamples)
springNormalizedMetrics = normalizeMetrics(springMetrics, springListSamples)

In [122]:
def extractMannKendallTest(metrics, listSamples):
    """Run the Mann-Kendall trend test on every metric of every sample.

    Parameters
    ----------
    metrics : dict
        Maps sample identifier to a DataFrame with one column per metric.
    listSamples : pandas.DataFrame
        Frame whose index names the samples to test; it also becomes the
        index of the returned frame.

    Returns
    -------
    pandas.DataFrame
        One row per sample and, for each metric that could be tested, three
        columns: ``"<metric> has a trend?"`` (the test's ``h`` flag),
        ``"<metric> trend is "`` (the trend label) and
        ``"<metric> trend pvalue "`` (p-value formatted with 30 decimals).
        Metrics on which the test raises are left out for that sample.
    """
    result = pd.DataFrame(index=listSamples.index)
    for sample in listSamples.index.values:
        sampleMetric = metrics[sample]
        for metric in sampleMetric.columns.values:
            try:
                mkResult = mk.original_test(sampleMetric[metric])
            except Exception:
                # Best effort: skip metrics the test cannot process, but let
                # KeyboardInterrupt/SystemExit propagate (a bare except would
                # swallow them too).
                continue
            result.loc[sample, metric+" has a trend?"] = mkResult.h
            result.loc[sample, metric+" trend is "] = mkResult.trend
            result.loc[sample, metric+" trend pvalue "] = format(mkResult.p, '.30f')
    return result

In [123]:
# Run the Mann-Kendall trend test on the normalized metrics of every ecosystem.
androidMannKendallResult = extractMannKendallTest(androidNormalizedMetrics, androidListSamples)
awsMannKendallResult = extractMannKendallTest(awsNormalizedMetrics, awsListSamples)
azureMannKendallResult = extractMannKendallTest(azureNormalizedMetrics, azureListSamples)
springMannKendallResult = extractMannKendallTest(springNormalizedMetrics, springListSamples)

In [128]:
# exist_ok=True keeps this cell re-runnable; without it a second run raises FileExistsError.
os.makedirs("4-StatisticalTests/1-CodeSample/mannKendallTest", exist_ok=True)

In [129]:
# Persist the Mann-Kendall results, one CSV per ecosystem.
androidMannKendallResult.to_csv("4-StatisticalTests/1-CodeSample/mannKendallTest/android_mann_kendall_results.csv")
awsMannKendallResult.to_csv("4-StatisticalTests/1-CodeSample/mannKendallTest/aws_mann_kendall_results.csv")
azureMannKendallResult.to_csv("4-StatisticalTests/1-CodeSample/mannKendallTest/azure_mann_kendall_results.csv")
springMannKendallResult.to_csv("4-StatisticalTests/1-CodeSample/mannKendallTest/spring_mann_kendall_results.csv")

## Normality Test (Shapiro-Wilk)

Null Hypothesis: The population is normally distributed.

Alternative Hypothesis: The population is not normally distributed.

If p-value < 0.05 then the null hypothesis is rejected.

## Correlation Test

Null Hypothesis: There is no correlation between the number of questions and the metric.

Alternative Hypothesis: there is a correlation

If p-value < 0.05 then null Hypothesis is rejected.

If the metric is normally distributed (Shapiro-Wilk p-value >= 0.05) we apply the Pearson test; otherwise we apply the Spearman test.

In [132]:
from scipy import stats

In [150]:
def extractCorrelationTest(metrics, listSamples):
    """Correlate every metric of every sample with its ``"questions"`` column.

    For each metric a Shapiro-Wilk test picks the coefficient: if normality is
    rejected (p < 0.05) Spearman's rank correlation is used (NaNs omitted),
    otherwise Pearson's correlation.

    Parameters
    ----------
    metrics : dict
        Maps sample identifier to a DataFrame that has a ``"questions"``
        column plus one column per metric.
    listSamples : pandas.DataFrame
        Frame whose index names the samples to test; it also becomes the
        index of the returned frame.

    Returns
    -------
    pandas.DataFrame
        One row per sample and, for each metric, the columns
        ``"<metric> correlation"`` and ``"<metric> correlation pvalue"``.
        Metrics whose data Pearson rejects with ``ValueError`` are left out
        for that sample.
    """
    result = pd.DataFrame(index=listSamples.index)
    for sample in listSamples.index.values:
        sampleMetric = metrics[sample]
        questions = sampleMetric["questions"]
        for metric in sampleMetric.columns.values:
            values = sampleMetric[metric]
            # Shapiro-Wilk p < 0.05 rejects normality -> use the rank-based test.
            if stats.shapiro(values).pvalue < 0.05:
                correlationResult = stats.spearmanr(questions, values, nan_policy="omit")
            else:
                try:
                    correlationResult = stats.pearsonr(questions, values)
                except ValueError:
                    # Best effort: skip metrics whose data Pearson rejects.
                    continue
            # Both scipy results unpack as (coefficient, p-value). Index by
            # position: the Pearson result has no `.r` attribute, and
            # `.p-value` parses as a subtraction.
            result.loc[sample, metric+" correlation"] = correlationResult[0]
            result.loc[sample, metric+" correlation pvalue"] = correlationResult[1]
    return result

In [153]:
# Run the correlation test against the "questions" column for every ecosystem.
androidCorrelationResult = extractCorrelationTest(androidNormalizedMetrics, androidListSamples)
awsCorrelationResult = extractCorrelationTest(awsNormalizedMetrics, awsListSamples)
azureCorrelationResult = extractCorrelationTest(azureNormalizedMetrics, azureListSamples)
springCorrelationResult = extractCorrelationTest(springNormalizedMetrics, springListSamples)

In [154]:
# exist_ok=True keeps this cell re-runnable; without it a second run raises FileExistsError.
os.makedirs("4-StatisticalTests/1-CodeSample/correlationTest", exist_ok=True)

In [155]:
# Persist the correlation results, one CSV per ecosystem.
androidCorrelationResult.to_csv("4-StatisticalTests/1-CodeSample/correlationTest/android_correlation_results.csv")
awsCorrelationResult.to_csv("4-StatisticalTests/1-CodeSample/correlationTest/aws_correlation_results.csv")
azureCorrelationResult.to_csv("4-StatisticalTests/1-CodeSample/correlationTest/azure_correlation_results.csv")
springCorrelationResult.to_csv("4-StatisticalTests/1-CodeSample/correlationTest/spring_correlation_results.csv")