In [2]:
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import tabulate
from IPython.display import HTML, display

import common


DATASET = Path('../experiments')

def d(p):
    x, y = common.load_text_distribution(p)
    return common.Distribution(x, y, '', '', '')

distributions = {
    'In-degrees': [
        ("Full", d(DATASET / 'inout/full_in.txt')),
        ("Filesystem", d(DATASET / 'inout/dir+cnt_in.txt')),
        ("Commit", d(DATASET / 'inout/rev_in.txt')),
        ("History", d(DATASET / 'inout/rel+rev_in.txt')),
        ("Hosting", d(DATASET / 'inout/ori+snp_in.txt')),
    ],
    'Out-degrees': [
        ("Full", d(DATASET / 'inout/full_out.txt')),
        ("Filesystem", d(DATASET / 'inout/dir+cnt_out.txt')),
        ("Commit", d(DATASET / 'inout/rev_out.txt')),
        ("History", d(DATASET / 'inout/rel+rev_out.txt')),
        ("Hosting", d(DATASET / 'inout/ori+snp_out.txt')),
    ],
    'Connected components': [
        ("Full", d(DATASET / 'connectedcomponents/full/distribution.txt')),
        ("Filesystem", d(DATASET / 'connectedcomponents/dir+cnt/distribution.txt')),
        ("Commit", d(DATASET / 'connectedcomponents/rev/distribution.txt')),
        ("History", d(DATASET / 'connectedcomponents/rel+rev/distribution.txt')),
        ("Hosting", d(DATASET / 'connectedcomponents/ori+snp/distribution.txt')),
    ],
    'Clustering coefficient': [
        ("Full", d(DATASET / 'clusteringcoeff/distribution-full.txt')),
        ("Filesystem", d(DATASET / 'clusteringcoeff/distribution-dircnt.txt')),
        ("Commit", d(DATASET / 'clusteringcoeff/distribution-rev.txt')),
        ("History", d(DATASET / 'clusteringcoeff/distribution-relrev.txt')),
        # ("Hosting", d(DATASET / 'clusteringcoeff/distribution-orisnp.txt')),
    ],
    'Shortest path': [
        ("Filesystem", d(DATASET / 'shortestpath/dir+cnt/distribution.txt')),
        ("Commit", d(DATASET / 'shortestpath/rev/distribution.txt')),
    ]
}



['In-degrees', 'Full', 19330739526.0, 1.86533254168527, 8.476185347059282, 10.145289846721042]
['In-degrees', 'Filesystem', 17050437427.0, 1.8629536354077088, 8.476185347059282, 10.027260026665726]
['In-degrees', 'Commit', 1976476233.0, 2.2045714713990536, 5.840033915243118, 9.232992228645648]
['In-degrees', 'History', 1993015770.0, 2.147624534699223, 5.840033915243118, 9.231551335105111]
['In-degrees', 'Hosting', 287286329.0, 2.762557978532109, 7.033493196839657, 8.168813493069143]
['Out-degrees', 'Full', 19330739526.0, 1.9475222180971221, 6.014187339865851, 9.962911293268462]
['Out-degrees', 'Filesystem', 17050437427.0, 1.9468287770883057, 6.014187339865851, 9.961687475988702]
['Out-degrees', 'Commit', 1976476233.0, 5.80822454592631, 5.0, 9.243940579995702]
['Out-degrees', 'History', 1993015770.0, 5.80822454592631, 5.0, 9.24801606017141]
['Out-degrees', 'Hosting', 287286329.0, 2.2061366957388877, 4.9867135259505, 8.223868817220255]
['Connected components', 'Full', 33104255.0, 2.37898

Algorithm,Layer,Number of objects,Scaling parameter,X decades,Y decades
In-degrees,Full,19330700000.0,1.86533,8.47619,10.1453
In-degrees,Filesystem,17050400000.0,1.86295,8.47619,10.0273
In-degrees,Commit,1976480000.0,2.20457,5.84003,9.23299
In-degrees,History,1993020000.0,2.14762,5.84003,9.23155
In-degrees,Hosting,287286000.0,2.76256,7.03349,8.16881
Out-degrees,Full,19330700000.0,1.94752,6.01419,9.96291
Out-degrees,Filesystem,17050400000.0,1.94683,6.01419,9.96169
Out-degrees,Commit,1976480000.0,5.80822,5.0,9.24394
Out-degrees,History,1993020000.0,5.80822,5.0,9.24802
Out-degrees,Hosting,287286000.0,2.20614,4.98671,8.22387


## Data integrity: object statistics

In [6]:
headers = ["Algorithm", "Layer", "Number of objects", "Scaling parameter", "X decades", "Y decades"]
table = []
for algo_name, algo_distributions in distributions.items():
    # table.append([f'{algo_name}'])
    for name, distribution in algo_distributions:
        row = [
            algo_name,
            name,
            np.sum(distribution.y),
            distribution.fitted_power(),
            np.log10(np.max(distribution.x)),
            np.log10(np.max(distribution.y)),
        ]
        table.append(row)

display(HTML(tabulate.tabulate(table, headers=headers, tablefmt='html')))

Algorithm,Layer,Number of objects,Scaling parameter,X decades,Y decades
In-degrees,Full,19330700000.0,1.86533,8.47619,10.1453
In-degrees,Filesystem,17050400000.0,1.86295,8.47619,10.0273
In-degrees,Commit,1976480000.0,2.20457,5.84003,9.23299
In-degrees,History,1993020000.0,2.14762,5.84003,9.23155
In-degrees,Hosting,287286000.0,2.76256,7.03349,8.16881
Out-degrees,Full,19330700000.0,1.94752,6.01419,9.96291
Out-degrees,Filesystem,17050400000.0,1.94683,6.01419,9.96169
Out-degrees,Commit,1976480000.0,5.80822,5.0,9.24394
Out-degrees,History,1993020000.0,5.80822,5.0,9.24802
Out-degrees,Hosting,287286000.0,2.20614,4.98671,8.22387


## Data integrity: in and out degrees

This data helps getting an overview of the graph properties and check whether it is consistent to our expectations as a way to perform data integrity checks.

Here are a few examples of criteria that can be checked on the following table:

- The number of objects computed from the distributions (= the sum of the second column) is always the same in all distributions starting from the same object type. For instance, dir_in_* and dir_out_* all have the same number of objects.
- The average in/outdegree of a given object type is consistent when each neighbor type is looked independently and when they are all aggregated together (e.g. the average degree of dir_out_all is a weighted average of the average degrees of the dir_out_{cnt,dir,rev} distributions).
- The number of objects with a total indegree of 0 should be small in all types of objects that are supposed to be reachable from the upper layers of the graph.
- Some specific per-layer indegrees are expected to be relatively small compared to the total number of objects (e.g. most revisions do not have an associated release)

In [27]:
inout_per_type = [
    'cnt_in_dir',
    'dir_in_all',
    'dir_in_dir',
    'dir_in_rev',
    'dir_out_all',
    'dir_out_cnt',
    'dir_out_dir',
    'dir_out_rev',
    'ori_out_snp',
    'rel_in_snp',
    'rev_in_all',
    'rev_in_dir',
    'rev_in_rel',
    'rev_in_rev',
    'rev_in_snp',
    'rev_out_rev',
    'snp_in_ori',
    'snp_out_all',
    'snp_out_rel',
    'snp_out_rev',
]

headers = ["Node type", "Direction", "Neighbor type", "# Nodes", "# Edges", "Avg degree", "# Lowest degree", "# Second-lowest"]
table = []
for name in inout_per_type:
    dist = d(DATASET / f'inout/per_type/{name}.txt')
    src, direction, dst = name.split('_')
    row = [
        # ("Indegrees" if direction == 'in' else "Outdegrees"),
        common.types_verbose[src],
        ("← in ←" if direction == 'in' else "→ out →"),
        common.types_verbose[dst],
        int(np.sum(dist.y)),
        int(np.sum(dist.x * dist.y)),
        np.sum(dist.x * dist.y) / np.sum(dist.y),
        f'{int(dist.y[0])} ({int(dist.x[0])})',
        f'{int(dist.y[1])} ({int(dist.x[1])})',
        # }   :  ratio= {int(100*np.sum(x*y)/np.sum(y))/100:<6} :    {int(x[0])} =>{int(y[0]):16,}   {int(x[1])} =>{int(y[1]):16,}')
    ]
    table.append(row)

display(HTML(tabulate.tabulate(table, headers=headers, tablefmt='html')))

Node type,Direction,Neighbor type,# Nodes,# Edges,Avg degree,# Lowest degree,# Second-lowest
contents,← in ←,directories,9152847293,143786784566,15.7095,5978249005 (1),1098223970 (2)
directories,← in ←,everything,7897590134,65200402547,8.25573,1343830 (0),6134767929 (1)
directories,← in ←,directories,7897590134,63229213027,8.00614,1607262793 (0),4669554466 (1)
directories,← in ←,revisions,7897590134,1971187167,0.249594,6261880169 (0),1504272429 (1)
directories,→ out →,everything,7897590134,207805470722,26.3125,557087 (0),1713055834 (1)
directories,→ out →,contents,7897590134,143786781408,18.2064,1787869540 (0),1421143792 (1)
directories,→ out →,directories,7897590134,63229213027,8.00614,2753589255 (0),1734567306 (1)
directories,→ out →,revisions,7897590134,789473873,0.0999639,7860017187 (0),23267141 (1)
origins,→ out →,snapshots,147453557,189314705,1.28389,22710546 (0),77244971 (1)
releases,← in ←,snapshots,16539537,700135072,42.331,427531 (0),4408973 (1)
