# Exploring .so sizes in Pyodide for scikit-learn

As an example of a package with lots of Cython extensions

Setup
```bash
pip install auditwheel-emscripten
wget https://cdn.jsdelivr.net/pyodide/v0.24.1/debug/scikit_learn-1.3.1-cp311-cp311-emscripten_3_1_45_wasm32.whl
mkdir -p data/wasm
mv scikit_learn*wasm32.whl data/wasm/
cd data/wasm
unzip scikit_learn
```


## Looking at a single _random.so

In [9]:
from auditwheel_emscripten.emscripten_tools.webassembly import Module, SecType

In [2]:
m = Module("data/wasm/sklearn/utils/_random.cpython-311-wasm32-emscripten.so")

In [3]:
list(m.sections())

[Section(type=<SecType.CUSTOM: 0>, size=17, offset=10, name='dylink.0'),
 Section(type=<SecType.TYPE: 1>, size=134, offset=30, name=None),
 Section(type=<SecType.IMPORT: 2>, size=4326, offset=167, name=None),
 Section(type=<SecType.FUNCTION: 3>, size=128, offset=4496, name=None),
 Section(type=<SecType.GLOBAL: 6>, size=8, offset=4626, name=None),
 Section(type=<SecType.EXPORT: 7>, size=115, offset=4636, name=None),
 Section(type=<SecType.ELEM: 9>, size=156, offset=4754, name=None),
 Section(type=<SecType.CODE: 10>, size=97474, offset=4914, name=None),
 Section(type=<SecType.DATA: 11>, size=21920, offset=102392, name=None),
 Section(type=<SecType.CUSTOM: 0>, size=44, offset=124314, name='target_features')]

In [4]:
m.parse_features_section()

[('+', 'mutable-globals'), ('+', 'sign-ext')]

In [5]:
m.parse_dylink_section()

Dylink(mem_size=21912, mem_align=4, table_size=75, table_align=0, needed=[], export_info={}, import_info={})

In [7]:
m.get_exports()

[Export(name='__wasm_call_ctors', kind=<ExternType.FUNC: 0>, index=136),
 Export(name='__wasm_apply_data_relocs', kind=<ExternType.FUNC: 0>, index=137),
 Export(name='PyInit__random', kind=<ExternType.FUNC: 0>, index=138),
 Export(name='__pyx_module_is_main_sklearn__utils___random', kind=<ExternType.GLOBAL: 3>, index=38)]

In [8]:
m.get_imports()

[Import(kind=<ExternType.FUNC: 0>, module='env', field='PyModuleDef_Init', type=2),
 Import(kind=<ExternType.FUNC: 0>, module='env', field='PyDict_Size', type=2),
 Import(kind=<ExternType.FUNC: 0>, module='env', field='_PyDict_GetItem_KnownHash', type=1),
 Import(kind=<ExternType.FUNC: 0>, module='env', field='PyErr_Occurred', type=4),
 Import(kind=<ExternType.FUNC: 0>, module='env', field='PyErr_Format', type=1),
 Import(kind=<ExternType.FUNC: 0>, module='env', field='PyDict_Next', type=5),
 Import(kind=<ExternType.FUNC: 0>, module='env', field='PyUnicode_Compare', type=0),
 Import(kind=<ExternType.FUNC: 0>, module='env', field='PyLong_AsLong', type=2),
 Import(kind=<ExternType.FUNC: 0>, module='env', field='_Py_Dealloc', type=3),
 Import(kind=<ExternType.FUNC: 0>, module='env', field='PyThreadState_Get', type=4),
 Import(kind=<ExternType.FUNC: 0>, module='env', field='PyErr_Fetch', type=6),
 Import(kind=<ExternType.FUNC: 0>, module='env', field='_PyObject_GetDictPtr', type=2),
 Impor

In [11]:
code_sec = m.get_section(SecType.CODE)

In [12]:
code_sec

Section(type=<SecType.CODE: 10>, size=97474, offset=4914, name=None)

In [14]:
functions = m.get_functions()

In [19]:
functions

[FunctionBody(offset=4916, size=2),
 FunctionBody(offset=4920, size=6509),
 FunctionBody(offset=11430, size=11),
 FunctionBody(offset=11443, size=565),
 FunctionBody(offset=12010, size=524),
 FunctionBody(offset=12535, size=112),
 FunctionBody(offset=12649, size=963),
 FunctionBody(offset=13614, size=1582),
 FunctionBody(offset=15198, size=728),
 FunctionBody(offset=15928, size=3667),
 FunctionBody(offset=19597, size=727),
 FunctionBody(offset=20326, size=3849),
 FunctionBody(offset=24177, size=732),
 FunctionBody(offset=24911, size=3091),
 FunctionBody(offset=28004, size=868),
 FunctionBody(offset=28874, size=2399),
 FunctionBody(offset=31275, size=290),
 FunctionBody(offset=31567, size=143),
 FunctionBody(offset=31712, size=222),
 FunctionBody(offset=31935, size=39),
 FunctionBody(offset=31976, size=328),
 FunctionBody(offset=32306, size=146),
 FunctionBody(offset=32453, size=85),
 FunctionBody(offset=32539, size=94),
 FunctionBody(offset=32634, size=81),
 FunctionBody(offset=32717, 

## Analyzing all .so in scikit-learn

In [70]:
## Doing the work
from typing import Any
import hashlib
import dataclasses
import hashlib
from pathlib import Path
import re
from io import BytesIO
import pandas as pd
import functools

@dataclasses.dataclass
class WasmFunction:
    """A container to handle metadata and bytecode for a WASM function

    The hash of this object is the hash of the bytecode.
    """
    name: str
    offset: str
    size: int
    body: bytes

    @property
    def hash(self):
        s = BytesIO(self.body)
        digest = hashlib.file_digest(s, "sha1")
        return digest.hexdigest()

    def __hash__(self):
        return int(self.hash, 16)

    def __repr__(self):
        return f'WasmFunction({self.name}, size={self.size})'
        
db : dict[str, FunctionHash] = {}

def get_name_from_path(path: Path, base_name: str):
    """Return a nicer name from the path"""
    parts = list(path.parts)
    idx = parts.index(base_name)
    if idx is not None:
        parts = parts[idx:]
    name = '.'.join(parts)
    name = re.sub(r'\.cpython-3\d\d-wasm32.*', '', name)
    return name


def parse_functions(mod: Module, mod_name: str) -> list[WasmFunction]:
    """Parse all functions in a module"""
    res = []
    for idx, func in enumerate(mod.get_functions()):
        body = mod.read_at(func.offset, func.size)
        res.append(WasmFunction(name=f'{mod_name}:func{idx}',
                                body=body,
                                offset=func.offset,
                                size=func.size))
    return res
        
    



def process_module(path: Path, db: dict[str, FunctionHash], base_name: str="sklearn") -> dict[str, Any]:
    with Module(path) as mod:
        mod_name = get_name_from_path(path, base_name)

        funcs = parse_functions(mod, mod_name)
        res = { "module_name": mod_name,
                'module_size': mod.size/1000,
                'code_section_size': mod.get_section(SecType.CODE).size / 1000,
                'data_section_size': mod.get_section(SecType.DATA).size / 1000,
                'n_funcs': len(mod.get_functions()), 
                'funcs': funcs}

    
    return res


res = []
for path in Path('data/wasm/sklearn').glob('**/*.so'):
    
    res_el = process_module(path, db)
    res.append(res_el)
    


df = pd.DataFrame(res).sort_values('module_size')
del df['funcs']
df

Unnamed: 0,module_name,module_size,code_section_size,data_section_size,n_funcs
55,sklearn.utils._heap,7.212,4.363,1.199,8
1,sklearn.__check_build._check_build,7.610,4.681,1.215,6
61,sklearn.utils._sorting,8.382,5.504,1.191,8
41,sklearn.neighbors._partition_nodes,8.623,5.742,1.199,7
48,sklearn.svm._newrand,12.723,6.518,4.183,11
...,...,...,...,...,...
12,sklearn.datasets._svmlight_format_fast,315.733,279.798,29.760,249
51,sklearn.tree._tree,321.393,273.824,41.528,241
30,sklearn.metrics._dist_metrics,407.142,333.929,67.200,367
67,sklearn.utils.sparsefuncs_fast,456.191,418.526,32.152,201


In [82]:
total_whl_size = sum(path.stat().st_size for path in Path('data/wasm/sklearn').glob('**/*'))
print(total_whl_size/1000)

18997.906


In [83]:
df['module_size'].sum()

12041.390000000001

See if there any duplicate function content by hashing and counting the number of occurences

In [71]:
from collections import Counter
import itertools

global_db = Counter(list(itertools.chain.from_iterable([row['funcs'] for row in res])))

In [79]:
max(global_db.values())

1

there are no duplicates