# Information about the current dataset

This worksheet analyses some basic properties of the dataset we use.
We highlight numbers referred to in the paper by quoting the corresponding sentences from Section 5.1, which explains our experimental setup.

## Setup

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import utils

In [2]:
selected_builds = utils.load_selected_builds()

## Number of Builds

The number of crates we managed to successfully build:

In [3]:
len(selected_builds)

10

## Crate Names

The most popular crate names are:

In [4]:
selected_builds.crate.value_counts()[:10]

crate
hashbrown       1
proc_macro2     1
base64          1
cfg_if          1
rand_core       1
syn             1
quote           1
libc            1
bitflags        1
regex_syntax    1
Name: count, dtype: int64

The crate names `build_script_<file>` indicate that the crate is a build script; `<file>` is the name of the actual build script without the `.rs` suffix. We have omited the build scripts from our analysis; that is why they are not shown in `selected_builds`.

The crate names that often repeat such as `main`, `example`, `test`, and `demo` typically belong to binaries as can be seen from the following frequence table:

In [5]:
selected_builds.query("crate_types=='bin'").crate.value_counts()

Series([], Name: count, dtype: int64)

## Crate Types

We first check what crate types exist:

In [6]:
pd.unique(selected_builds.crate_types)

array(['rlib'], dtype=object)

Count how many crates of each type we have:

In [7]:
import utils
crate_type_counts = utils.count_builds_per_type(selected_builds)
print(crate_type_counts)

{'rlib': 10}


The three builds with missing type are Rust files [generated](https://gitlab.com/tspiteri/rug/-/blob/834cbefbce178a67bd97cc93ab1f88f262bd6126/build.rs#L27-29) by the `rug` package build.rs file to check what features are supported by the Rust compiler.

In [8]:
selected_builds.query("crate_types == ''")

Unnamed: 0,build,package,version,crate,crate_hash,edition,crate_types


## Editions

In [9]:
selected_builds.edition.value_counts()

edition
2021    6
2018    3
2015    1
Name: count, dtype: int64

## `-sys` packages

Check what portion of `-sys` crates have matching non`-sys` crates:

In [10]:
# Validation: check that for each `-sys` crate there is one without the suffix.
def check_sys_crates():
    packages = utils.load_original_crates_list()
    assert len(packages[packages.isnull().any(axis=1)]) == 0
    all_crate_names = set(packages['package'])
    not_found = []
    found = []
    for crate_name in all_crate_names:
        if crate_name.endswith('-sys'):
            if crate_name[:-4] not in all_crate_names:
                not_found.append(crate_name)
            else:
                found.append(crate_name)
    print("Percent of -sys crates that have matching non-sys crates: {:.2f}%".format(
        100*len(found) / (len(found)+len(not_found))))
check_sys_crates()

ZeroDivisionError: division by zero

In [None]:
# Validation: check that for each `-sys` crate there is one wit the same prefix.
def check_sys_crates_relaxed():
    packages = utils.load_original_crates_list()
    assert len(packages[packages.isnull().any(axis=1)]) == 0
    all_crate_names = set(packages['package'])
    not_found = []
    found = []
    for crate_name in all_crate_names:
        if crate_name.endswith('-sys'):
            base_name = crate_name[:-4]
            # Quadratic... well
            if any(
                map(
                    lambda x: (
                        x != crate_name
                        and not x.endswith('-sys')
                        and x.startswith(base_name)
                    ),
                    all_crate_names
                )
            ):
                found.append(crate_name)
            else:
                not_found.append(crate_name)
    print("Percent of -sys crates that have relaxed-matching non-sys crates: {:.2f}%".format(
        100*len(found) / (len(found)+len(not_found))))
check_sys_crates_relaxed()

# Basics about Packages

> We evaluated our queries on a dataset that comprises the most-recent version (as of 2020-01-14) of all 34445 packages published on central Rust repository crates.io. The implementation of a package can be composed of multiple crates, one of which is usually primary and determines the name of the package.
> We excluded 5459 packages (16%) whose most recent version did not successfully compile.
> For packages with conditional compilation features, we used the default flags specified in the manifest.
> In cases when a package failed to compile with the default flags, but succeeded with different ones (when compiled as a dependency of another package) we selected a random build for analysis.
> As a result, our dataset consists of 31867 crates.

In [11]:
print("Number of original packages:", len(utils.load_original_crates_list()))
print("Number of compiling packages:", len(selected_builds.package.unique()),
      len(selected_builds.package.unique()) / len(utils.load_original_crates_list()))
print("Number of non-compiling packages:",
      len(utils.load_original_crates_list()) - len(selected_builds.package.unique()),
     (len(utils.load_original_crates_list()) - len(selected_builds.package.unique())) / len(utils.load_original_crates_list()))
print("Number of crates:", len(selected_builds))
print("Number of unique crate names (don't use this number):", len(selected_builds.crate.unique()))

Number of original packages: 10
Number of compiling packages: 10 1.0
Number of non-compiling packages: 0 0.0
Number of crates: 10
Number of unique crate names (don't use this number): 10


> Most of these crates are compiled to Rust libraries, namely 76%, or binaries, namely 20%.
The other crates are procedural macros (4%).

In [12]:
print("Crates")
print(selected_builds.crate_types.value_counts())
print()
print("Libs:",
      sum(selected_builds.crate_types.str.contains("lib")),
      sum(selected_builds.crate_types.str.contains("lib")) / len(selected_builds)
)
print("Bin:",
      sum(selected_builds.crate_types.str.contains("bin")),
      sum(selected_builds.crate_types.str.contains("bin")) / len(selected_builds)
)
print("proc-macro:",
      sum(selected_builds.crate_types.str.contains("proc-macro")),
      sum(selected_builds.crate_types.str.contains("proc-macro")) / len(selected_builds)
)
others = (
    ~selected_builds.crate_types.str.contains("bin")
    & ~selected_builds.crate_types.str.contains("proc-macro")
    & ~selected_builds.crate_types.str.contains("lib")
)
print("Others:",
      sum(others),
      sum(others) / len(selected_builds)
)

Crates
crate_types
rlib    10
Name: count, dtype: int64

Libs: 10 1.0
Bin: 0 0.0
proc-macro: 0 0.0
Others: 0 0.0


## Search for specific crates

In [13]:
from whitelists import *
print("sys crates:", len([
    x
    for x in selected_builds.crate
    if x.lower().replace("-", "_").endswith("_sys")
]))
print("sys crates (with manual):", len(get_sys_crate_names(selected_builds.crate.unique())))

sys crates: 0
sys crates (with manual): 12


In [14]:
print("hardware crates", len(hardware_crate_names))

hardware crates 10


In [16]:
from top_crates import *
print("top_500_crates:", len(set(top_500_crates)))

def norm(x):
    return x.lower().replace("-", "_")

top_500_compiling_crates = set(map(norm, top_500_crates)) & set(map(norm, selected_builds.package.unique()))
top_500_crates_set = set(top_500_crates)
top_500_mask = selected_builds.package.map(lambda x: x.replace("-", "_") in top_500_crates_set)
top_500_builds = selected_builds[top_500_mask][
    ~selected_builds[top_500_mask]["package"].duplicated()
]

print("top_500_packages compiling:", len(top_500_compiling_crates))

print("top_500_crates compiling:", len(top_500_builds))

print("Crates")
print(selected_builds.crate_types.value_counts())
print()
print("Libs:",
      sum(top_500_builds.crate_types.str.contains("rlib")),
      sum(top_500_builds.crate_types.str.contains("rlib")) / len(selected_builds)
)
print("Bin:",
      sum(top_500_builds.crate_types.str.contains("bin")),
      sum(top_500_builds.crate_types.str.contains("bin")) / len(selected_builds)
)
print("proc-macro:",
      sum(top_500_builds.crate_types.str.contains("proc-macro")),
      sum(top_500_builds.crate_types.str.contains("proc-macro")) / len(selected_builds)
)
others = (
    ~top_500_builds.crate_types.str.contains("bin")
    & ~top_500_builds.crate_types.str.contains("proc-macro")
    & ~top_500_builds.crate_types.str.contains("rlib")
)
print("Others:",
      sum(others),
      sum(others) / len(top_500_builds)
)

top_500_crates: 500
top_500_packages compiling: 10
top_500_crates compiling: 10
Crates
crate_types
rlib    10
Name: count, dtype: int64

Libs: 10 1.0
Bin: 0 0.0
proc-macro: 0 0.0
Others: 0 0.0
