In [5]:
import os
import random

import numpy as np
import pandas as pd
import scipy
from scipy import stats


In [6]:
# Generic config

# Directory that receives the computed correlation files.
store_location = "./SurfCoefficientsDask/"

# Root of the dataset; it contains one sub-folder per metric.
root_dataset = "path/to/surfdataset"

# Ensure the output directory is present (a no-op when it already exists).
os.makedirs(name=store_location, exist_ok=True)

In [62]:
def correlate_two_dfs(df, df2, coef_file, sample_size=None, random_state=None):
    """Correlate two long-form metric frames and store the result in ``coef_file``.

    The frames are inner-joined on ``index`` and ``node``, rows containing a
    NaN are dropped, and the Pearson, Spearman and Kendall correlations (with
    their p-values) between the ``df1`` and ``df2`` columns are computed.

    The output file holds two lines of ``|``-separated values: the three
    coefficients first, then the three p-values. ``coef_file`` is printed when
    any coefficient has an absolute value of at least 0.8.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``index``, ``node`` and ``df1``.
    df2 : pandas.DataFrame
        Frame with the columns ``index``, ``node`` and ``df2``.
    coef_file : str
        Path of the file to write.
    sample_size : int, optional
        When given and smaller than the number of joined rows, the
        correlations are computed on a random sample of that many rows.
        The default (``None``) uses every row. An earlier note in this
        function cited https://link.springer.com/article/10.1007/BF02294183
        to argue that about 1k samples are sufficient; that claim has not
        been re-verified here, so sampling is opt-in.
    random_state : optional
        Forwarded to ``DataFrame.sample`` to make the sampling reproducible.

    Returns
    -------
    None
        Any ``Exception`` raised while computing or writing is logged and
        swallowed, so one bad pair does not stop the caller. In that case
        ``coef_file`` is not created.
    """
    import logging
    import uuid
    logger = logging.getLogger(__name__)
    try:
        # Join on timestamp and node, keep only the value columns, and discard
        # every row that has a NaN in either of them.
        merged = (
            df.merge(df2, on=['index', 'node'], how='inner')
            .drop(columns=['index', 'node'])
            .dropna()
        )

        # The correlation routines need at least two observations. Bail out
        # explicitly rather than relying on scipy raising.
        if len(merged) < 2:
            logger.warning("Skipping %s: only %d overlapping rows", coef_file, len(merged))
            return

        if sample_size is not None and len(merged) > sample_size:
            merged = merged.sample(n=sample_size, random_state=random_state)

        # Compute the Pearson, Spearman, and Kendall correlations + pvalues
        first, second = merged['df1'], merged['df2']
        values = [scipy.stats.pearsonr(first, second),
                  scipy.stats.spearmanr(first, second),
                  scipy.stats.kendalltau(first, second),
                 ]

        correlations = [val[0] for val in values]
        pvalues = [val[1] for val in values]

        if any(abs(val) >= 0.8 for val in correlations):
            print(coef_file)

        # Write to a uniquely named temporary file next to the target and
        # rename it into place, so a reader (or a resumed run that checks for
        # the file's existence) never observes a half-written result.
        tmp_path = "{}.{}.tmp".format(coef_file, uuid.uuid4().hex)
        try:
            with open(tmp_path, "w") as tmp_file:
                tmp_file.write("|".join(str(x) for x in correlations))
                tmp_file.write("\n")
                tmp_file.write("|".join(str(x) for x in pvalues))
            os.replace(tmp_path, coef_file)
        finally:
            # Only still present when writing or renaming failed.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

    except Exception:
        logger.exception("Fatal error while computing correlations")

In [None]:
# Name of the parquet file that is read from each metric folder.
PARQUET_NAME = "1579474800_1579561185.parquet"


def load_metric_long(folder, value_name):
    """Load one metric folder as a long-form frame.

    Reads ``PARQUET_NAME`` from ``root_dataset/folder`` and returns a frame
    with the columns ``index`` (timestamp), ``node`` and ``value_name``.
    Returns ``None`` when the parquet file does not exist.
    """
    parquet_path = os.path.join(root_dataset, folder, PARQUET_NAME)
    if not os.path.isfile(parquet_path):
        return None

    # Make sure you run this with pyarrow 1.0 or higher, 0.17 gives an error for directories which was fixed in 1.0!
    wide = pd.read_parquet(parquet_path)

    # Replace invalid values with nan so they can be filtered later. This is
    # done BEFORE averaging so that an invalid reading of one card does not
    # distort the mean of the remaining cards (mean() skips NaN).
    wide = wide.replace(-1, np.nan)

    if isinstance(wide.columns, pd.MultiIndex):
        # For GPU nodes that have a multi-index, average the value of the cards.
        # Grouping the transposed frame is equivalent to groupby(axis=1, level=0),
        # whose axis argument is deprecated in recent pandas releases.
        wide = wide.T.groupby(level=0).mean().T

    # Make the index a date index
    wide.index = pd.to_datetime(wide.index, unit="s")

    # Convert the dataframe to long form using melt
    long_df = wide.reset_index().melt(id_vars=['index'], var_name="node", value_name=value_name)

    # To enable comparisons, convert values to doubles
    # Not sure if this is needed - Exception: ArrowTypeError('fields had matching names but differing types. From: r1123n7: int64 To: r1123n7: double')
    long_df[value_name] = long_df[value_name].astype(np.double)
    return long_df


# os.walk yields nothing for a missing directory, which would surface as a bare
# StopIteration from next(); fail with a clear message instead.
if not os.path.isdir(root_dataset):
    raise FileNotFoundError("root_dataset is not an existing directory: {}".format(root_dataset))

# Get all folder names in the root_dataset
folders = next(os.walk(root_dataset))[1]
random.shuffle(folders)  # Shuffle the folders so that if we run another node in parallel, the chance on collisions is very small

for i, folder_i in enumerate(folders):
    # Loaded lazily: when every pair for this folder already has a result
    # file, the (potentially large) parquet file is never read.
    df = None

    for j in range(i + 1, len(folders)):
        coef_file = os.path.join(store_location, "{}+{}_correlations.csv".format(folder_i, folders[j]))

        # Skip pairs that were already computed (by an earlier or parallel run).
        if os.path.isfile(coef_file):
            continue

        if df is None:
            df = load_metric_long(folder_i, "df1")
            if df is None:
                # No parquet file for the outer folder: none of its pairs can be computed.
                break

        # Same for dir 2
        df2 = load_metric_long(folders[j], "df2")
        if df2 is None:
            continue

        correlate_two_dfs(df, df2, coef_file)
