## Benchmark MMTF Hadoop Sequence File fingerprinting

In [1]:
import pandas as pd
import papermill as pm
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import psutil

# Note on naming: `n_cores` is the number of *logical* CPUs (logical=True),
# while `n_cpus` is the number of *physical* cores (logical=False).
n_cores = psutil.cpu_count(logical=True)
n_cpus = psutil.cpu_count(logical=False)

# Report both counts so the benchmark output records the hardware it ran on.
print(f"cores: {n_cores}")
print(f"cpus:  {n_cpus}")

cores: 4
cpus:  2


In [3]:
# Name of the dataset to benchmark; used to build the input data path and the
# output notebook file names.
dataset = "full"

# Core counts to benchmark: 1, 2 and 3 individually, then every multiple of 4
# up to the number of logical CPUs detected above.
# The small counts are capped at n_cores so that a machine with fewer than
# three logical CPUs is not benchmarked with more cores than it actually has
# (previously [1, 2, 3] was always included, regardless of n_cores).
# For n_cores >= 3 the resulting list is unchanged.
coresets = [c for c in (1, 2, 3) if c <= n_cores] + list(range(4, n_cores + 1, 4))
print(coresets)

[1, 2, 3, 4]


In [4]:
# Execute the Fingerprint task notebook once per core count. Each run writes
# its own executed copy to ../output/, named after the core count and dataset,
# and receives the core count and the dataset path as notebook parameters.
for cores in coresets:
    output_path = '../output/Fingerprint_' + str(cores) + "_" + dataset + '.ipynb'
    run_parameters = {
        'cores': cores,
        'path': '../data/' + dataset,
    }
    pm.execute_notebook(
        'tasks/Fingerprint.ipynb',
        output_path,
        parameters=run_parameters,
    )

Input Notebook:  tasks/Fingerprint.ipynb
Output Notebook: ../output/Fingerprint_1_full.ipynb
100%|██████████| 7/7 [02:54<00:00, 24.98s/it]
Input Notebook:  tasks/Fingerprint.ipynb
Output Notebook: ../output/Fingerprint_2_full.ipynb
100%|██████████| 7/7 [01:46<00:00, 15.16s/it]
Input Notebook:  tasks/Fingerprint.ipynb
Output Notebook: ../output/Fingerprint_3_full.ipynb
100%|██████████| 7/7 [01:48<00:00, 15.56s/it]
Input Notebook:  tasks/Fingerprint.ipynb
Output Notebook: ../output/Fingerprint_4_full.ipynb
100%|██████████| 7/7 [01:50<00:00, 15.81s/it]


In [5]:
# Load every executed notebook found in the output directory into a single
# papermill notebook collection.
output_dir = '../output/'
nbs = pm.read_notebooks(output_dir)

# Long-format table of the collected entries, with the columns
# name / value / type / filename / key.
nbs.dataframe

Unnamed: 0,name,value,type,filename,key
0,cores,1,parameter,Filter_1_full.ipynb,Filter_1_full.ipynb
1,path,../data/full,parameter,Filter_1_full.ipynb,Filter_1_full.ipynb
2,filter,62.5429,record,Filter_1_full.ipynb,Filter_1_full.ipynb
3,filter_total,80.7885,record,Filter_1_full.ipynb,Filter_1_full.ipynb
4,cores,2,parameter,Filter_2_full.ipynb,Filter_2_full.ipynb
5,path,../data/full,parameter,Filter_2_full.ipynb,Filter_2_full.ipynb
6,filter,35.8053,record,Filter_2_full.ipynb,Filter_2_full.ipynb
7,filter_total,54.0278,record,Filter_2_full.ipynb,Filter_2_full.ipynb
8,cores,3,parameter,Filter_3_full.ipynb,Filter_3_full.ipynb
9,path,../data/full,parameter,Filter_3_full.ipynb,Filter_3_full.ipynb


In [6]:
# Reshape the long-format table into one row per notebook ('key') and one
# column per recorded name, then convert the 'cores' and 'fingerprint'
# columns to numeric dtypes. All other columns are left exactly as pivoted.
df = (
    nbs.dataframe
    .pivot(index='key', columns='name', values='value')
    .assign(
        cores=lambda wide: pd.to_numeric(wide["cores"]),
        fingerprint=lambda wide: pd.to_numeric(wide["fingerprint"]),
    )
)
df

name,cores,filter,filter_total,fingerprint,fingerprint_total,flatmap,flatmap_total,path,read,read_total
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Filter_1_full.ipynb,1,62.5429,80.7885,,,,,../data/full,,
Filter_2_full.ipynb,2,35.8053,54.0278,,,,,../data/full,,
Filter_3_full.ipynb,3,36.6299,54.9289,,,,,../data/full,,
Filter_4_full.ipynb,4,37.406,55.6387,,,,,../data/full,,
Fingerprint_1_full.ipynb,1,,,154.365622,172.562,,,../data/full,,
Fingerprint_2_full.ipynb,2,,,85.638164,103.817,,,../data/full,,
Fingerprint_3_full.ipynb,3,,,88.336952,106.637,,,../data/full,,
Fingerprint_4_full.ipynb,4,,,90.052061,108.383,,,../data/full,,
Flatmap_1_full.ipynb,1,,,,,226.961,245.348,../data/full,,
Flatmap_2_full.ipynb,2,,,,,122.753,141.234,../data/full,,


In [7]:
# Keep only the rows whose index label (the notebook key) contains
# 'Fingerprint', i.e. the runs produced by this benchmark.
ds = df.filter(like='Fingerprint', axis='index')

# Persist the selected rows, including the index, as a CSV file.
ds.to_csv(path_or_buf="../results/Fingerprint.csv")