In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from collections import defaultdict

from joblib import Parallel,delayed

import pandas as pd

from joblib import Parallel,delayed

from pathlib import Path

from itertools import chain

import pickle

In [2]:
# Names of the CSV columns that are extracted and summarised below:
# five base columns followed by their "diff_"-prefixed counterparts.
keys = [
    "HL",
    "IL",
    "PL",
    "RL",
    "CL",
] + [
    "diff_HL",
    "diff_IL",
    "diff_PL",
    "diff_RL",
    "diff_CL",
]

In [3]:
def extract_a_dataset(data_path):
    """Load every feature CSV and pool each column of interest into one array.

    All ``*.csv`` files found recursively under ``data_path/"MIT-CS1PD"`` and
    ``data_path/"MIT-CS2PD"`` are read in parallel. For each name in the
    module-level ``keys`` list, that column is taken from every file, cast to
    ``float32`` and concatenated.

    Parameters
    ----------
    data_path : pathlib.Path or str
        Directory containing the ``MIT-CS1PD`` and ``MIT-CS2PD`` sub-folders.

    Returns
    -------
    collections.defaultdict
        Maps each name in ``keys`` to a 1-D ``float32`` NumPy array holding
        the values of that column from all files.

    Raises
    ------
    ValueError
        If no CSV file is found under either sub-folder.
    KeyError
        If a CSV file lacks one of the columns listed in ``keys``.
    """
    data_path = Path(data_path)

    # Sort the paths so the concatenation order does not depend on the
    # order in which the filesystem happens to list the files.
    csv_paths = sorted(
        chain(
            (data_path/"MIT-CS1PD").rglob("*.csv"),
            (data_path/"MIT-CS2PD").rglob("*.csv"),
        )
    )
    if not csv_paths:
        raise ValueError(
            f"no CSV files found under {data_path/'MIT-CS1PD'} "
            f"or {data_path/'MIT-CS2PD'}"
        )

    # Keep the frames in a list rather than a dict keyed by file name:
    # two files sharing a name (e.g. one per cohort folder) would otherwise
    # overwrite each other and their rows would be silently dropped.
    frames = Parallel(n_jobs=-2)(
        delayed(pd.read_csv)(csv_path) for csv_path in csv_paths
    )

    dataset = defaultdict(list)
    for key in keys:
        dataset[key] = np.concatenate(
            [frame[key].values.astype("float32") for frame in frames]
        )

    return dataset

In [4]:
# Pool the feature columns from every CSV under ./processed_features.
dataset = extract_a_dataset(data_path=Path("processed_features"))

In [5]:
def statistics(dataset):
    """Compute summary statistics for every column named in ``keys``.

    NaN entries are removed from each column before anything is computed.

    Parameters
    ----------
    dataset : mapping
        Maps each name in the module-level ``keys`` list to a NumPy array.

    Returns
    -------
    dict
        ``{key: {"mean", "std", "median", "q1", "q2", "IQR"}}``.
        ``"q1"`` is the 25th percentile; ``"q2"`` holds the 75th percentile
        (not the median); ``"IQR"`` is their difference.
    """
    def _summarise(column):
        # Drop missing values so they do not propagate into the statistics.
        finite = column[np.logical_not(np.isnan(column))]
        lower_quartile, upper_quartile = np.percentile(finite, [25, 75])
        return {
            "mean": np.mean(finite),
            "std": np.std(finite),
            "median": np.median(finite),
            "q1": lower_quartile,
            "q2": upper_quartile,
            "IQR": upper_quartile - lower_quartile,
        }

    return {key: _summarise(dataset[key]) for key in keys}

In [6]:
# Summarise each pooled feature column (NaNs are ignored by statistics()).
results = statistics(dataset=dataset)

In [7]:
# Persist the summary statistics to statistics.pickle in the working directory.
with Path("statistics.pickle").open("wb") as pickle_file:
    pickle.dump(results, pickle_file)

In [8]:
# Show the computed statistics as the cell output.
results

{'HL': {'mean': 0.117848106,
  'std': 0.056892063,
  'median': 0.1039,
  'q1': 0.07819999754428864,
  'q2': 0.1437000036239624,
  'IQR': 0.06550000607967377},
 'IL': {'mean': 0.3548907,
  'std': 0.5548439,
  'median': 0.1771,
  'q1': 0.08299999684095383,
  'q2': 0.3898000121116638,
  'IQR': 0.30680001527071},
 'PL': {'mean': 0.47720197,
  'std': 0.5722912,
  'median': 0.2847,
  'q1': 0.18960000574588776,
  'q2': 0.5163999795913696,
  'IQR': 0.3267999738454819},
 'RL': {'mean': 0.4722305,
  'std': 0.56139344,
  'median': 0.2864,
  'q1': 0.1890999972820282,
  'q2': 0.5149000287055969,
  'IQR': 0.3258000314235687},
 'CL': {'mean': 0.5945419,
  'std': 0.5807334,
  'median': 0.3993,
  'q1': 0.29249998927116394,
  'q2': 0.6453999876976013,
  'IQR': 0.3528999984264374},
 'diff_HL': {'mean': -7.904568e-06,
  'std': 0.06403487,
  'median': 0.00019999966,
  'q1': -0.025699995458126068,
  'q2': 0.025600001215934753,
  'IQR': 0.05129999667406082},
 'diff_IL': {'mean': 0.0019486244,
  'std': 0.6787