In [6]:
import json

import pandas as pd
import requests
from tqdm.notebook import tqdm
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

In [17]:
# Build the package space: the set of every dependency name declared by any
# project in the GitHub repository data.
# The file is opened in a `with` block so the handle is closed deterministically
# (the previous `json.load(open(...))` left it to the garbage collector).
with open("project_packages.json") as project_file:
    project_data = json.load(project_file)

package_space = set()
# Only the dependency mappings are needed; project names are ignored. Iterating
# `.values()` also avoids binding `_`, which IPython uses for the last output.
for deps in project_data.values():
    for pkg in deps:
        # Strip the "npm:" prefix some keys carry so the bare name is stored.
        if pkg.startswith("npm:"):
            pkg = pkg[4:]
        package_space.add(pkg)

print("Number of packages:", len(package_space))
packages = package_space

Number of packages: 10089


In [18]:
from datetime import datetime, timedelta

def convert_time_to_epoch(timestamp):
    """Convert an ISO-8601 timestamp string to Unix epoch seconds.

    Args:
        timestamp: ISO-8601 date-time string. A trailing ``Z`` is accepted and
            treated as UTC (``+00:00``).

    Returns:
        str: whole seconds since the Unix epoch, as a decimal string (the
        return type is kept as ``str`` for compatibility with existing callers).

    Notes:
        The previous implementation used ``strftime('%s')``, which is a
        platform-specific extension: it is unavailable on some platforms and
        ignores the parsed UTC offset, interpreting the fields in the machine's
        local time zone. ``datetime.timestamp()`` honours the offset.
    """
    parsed = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
    # Drop sub-second precision first so the result is always whole seconds.
    return str(int(parsed.replace(microsecond=0).timestamp()))
    

In [35]:
# How many dependency hops beyond the seed packages the crawl follows.
MAX_DEPTH = 1

# --- Shared crawl state (written to by the worker threads) ---
package_set = set()        # names already claimed by a worker
package_features = {}      # package name -> metadata dict
dep_graph = {}             # package name -> dependencies of its latest version
failed_packages = []       # messages for packages the registry did not return

# One bucket of package names per crawl depth; depth 0 holds the seed packages.
layers = {
    0: list(package_space),
    **{depth: [] for depth in range(1, MAX_DEPTH + 1)},
}

# --- One lock per shared structure ---
set_lock = threading.Lock()
features_lock = threading.Lock()
graph_lock = threading.Lock()
layers_lock = threading.Lock()

def get_package_details(package, base_url="https://registry.npmjs.org/", depth=0, timeout=30):
    """Fetch registry metadata for one package and record it in the shared crawl state.

    On success the package's latest-version dependencies are stored in
    ``dep_graph``, its metadata in ``package_features``, and (when
    ``depth < MAX_DEPTH``) its not-yet-claimed dependencies are queued in
    ``layers[depth + 1]``. Nothing is written to those structures unless all of
    the metadata could be extracted, so ``dep_graph`` and ``package_features``
    always hold the same set of packages.

    Args:
        package: package name, appended verbatim to ``base_url``.
        base_url: registry root URL.
        depth: crawl depth at which this package was discovered.
        timeout: seconds to wait for the registry before giving up on this
            package (new, optional; previously requests could block forever).

    Returns:
        None. Results are recorded in the module-level structures; a package
        the registry could not serve is noted in ``failed_packages``.
    """
    # Claim the package atomically so no two workers fetch it.
    with set_lock:
        if package in package_set:
            return
        package_set.add(package)

    try:
        res = requests.get(base_url + package, timeout=timeout)
    except requests.exceptions.RequestException as e:
        # A network failure for one package must not abort the whole crawl.
        print(f"[ERROR] for {package}: {e}")
        failed_packages.append(f"Package unavailable: {package}\n")
        return

    if res.status_code != 200:
        failed_packages.append(f"Package unavailable: {package}\n")
        return

    try:
        # Parse the body only for successful responses, and inside the try:
        # an error page that is not JSON would otherwise raise out of the worker.
        jsonData = res.json()

        latestVersion = jsonData["dist-tags"]["latest"]

        # "dependencies" maps name -> version range; default to an empty mapping
        # (not a list) so every dep_graph value has the same type.
        deps = jsonData["versions"][latestVersion].get("dependencies") or {}

        package_data = {
            "latestVersion": latestVersion,
            "keywords": jsonData.get("keywords", []),
            "publishTime": convert_time_to_epoch(jsonData["time"][latestVersion]),
            "description": jsonData.get("description", ""),
            "creationDate": jsonData["time"]["created"],
            "numberOfVersions": len(jsonData["versions"]),
        }
    except Exception as e:
        print(f"[ERROR] for {package}: {e}")
        return

    # Everything was extracted successfully: commit to the shared structures.
    with graph_lock:
        dep_graph[package] = deps

    with features_lock:
        package_features[package] = package_data

    # If within the depth limit, queue dependencies for the next layer.
    if depth < MAX_DEPTH:
        # package_set is guarded by set_lock, so read it under that lock.
        with set_lock:
            pending = [dep for dep in deps if dep not in package_set]
        with layers_lock:
            layers[depth + 1].extend(pending)
    

In [37]:
# Crawl the dependency graph one depth level at a time.
for i in range(MAX_DEPTH + 1):
    # Drop every name an earlier step has already claimed before scheduling.
    layers[i] = list(filter(lambda name: name not in package_set, layers[i]))

    with ThreadPoolExecutor(max_workers=50) as executor:
        # Map each submitted task back to the package it fetches.
        futures = {}
        for package in layers[i]:
            futures[executor.submit(get_package_details, package, depth=i)] = package

        progress = tqdm(as_completed(futures), total=len(futures), desc=f"Layer {i}")
        for future in progress:
            # Surface any exception raised inside the worker.
            future.result()
        

Layer 0:   0%|          | 0/10089 [00:00<?, ?it/s]

Layer 1:   0%|          | 0/1391 [00:00<?, ?it/s]

In [39]:
# divide into scoped and unscoped packages
scoped = []
unscoped = []
for package in package_features.keys():
    if package.find("@") != -1:
        scoped.append((package, package_features[package]["creationDate"]))
    else:
        unscoped.append(package)
print(f'scoped: {len(scoped)}\tunscoped: {len(unscoped)}')

scoped: 4431	unscoped: 6803


In [40]:
# Slice the unscoped names into batches of at most 128 for the bulk queries
# (presumably the bulk endpoint's per-request cap — confirm against the API docs).
chunks = [unscoped[start:start + 128] for start in range(0, len(unscoped), 128)]
print(len(chunks))

54


In [41]:
# CAN GET DOWNLOAD COUNTS FOR UNSCOPED PACKAGES IN BULK
from collections import defaultdict

def get_all_time_download_counts_bulk(packages, format = "%Y-%m-%d"):
    """Sum download counts for several packages using bulk point queries.

    The period from 2015-01-10 up to today is split into consecutive windows of
    at most 365 days, and one request per window is issued for all packages
    together.

    Args:
        packages: iterable of package names sent in one comma-separated query.
        format: ``strftime`` pattern used for the interval bounds in the URL.

    Returns:
        defaultdict(int): package name -> summed downloads. Every requested
        package gets an entry once any window was answered with HTTP 200, even
        if the API reported no data for it. Windows answered with another
        status are skipped (best effort), so totals may then be an undercount.
    """
    packages = list(packages)
    download_counts = defaultdict(int)
    if not packages:
        # Nothing to ask for; avoid issuing requests with an empty package list.
        return download_counts

    start_date = datetime(day=10, month=1, year=2015).date()
    curr_date = datetime.now().date()

    intervals = []
    while start_date < curr_date:
        interval_end = min(start_date + timedelta(days=365), curr_date)
        intervals.append(f'{start_date.strftime(format)}:{interval_end.strftime(format)}')
        # Interval bounds are sent inclusively, so the next window starts a day later.
        start_date = interval_end + timedelta(days=1)

    joined_names = ','.join(packages)
    for interval in intervals:
        url = f"https://api.npmjs.org/downloads/point/{interval}/{joined_names}"
        res = requests.get(url)
        if res.status_code != 200:
            continue

        download_data = res.json()
        # A query for a single name is not a bulk query: the response is the
        # flat {"downloads": ..., "package": ...} object rather than a mapping
        # keyed by package name. Normalise it to the bulk shape.
        if len(packages) == 1 and isinstance(download_data.get("downloads"), int):
            download_data = {packages[0]: download_data}

        for package in packages:
            # The API may report null for a package it has no data for.
            entry = download_data.get(package)
            download_counts[package] += entry["downloads"] if entry else 0

    return download_counts
    

In [42]:
# Fan the chunks out over a thread pool and fold each returned count into
# package_features as the tasks complete.
with ThreadPoolExecutor() as executor:
    futures = {}
    for chunk in chunks:
        futures[executor.submit(get_all_time_download_counts_bulk, chunk)] = chunk

    for future in tqdm(as_completed(futures), total=len(futures), desc="Getting download counts for unscoped packages"):
        counts = future.result()
        for package in counts:
            package_features[package]["download_count"] = counts[package]

Getting download counts for unscoped packages:   0%|          | 0/54 [00:00<?, ?it/s]

In [64]:
# FOR SCOPED packages (queried one package at a time rather than in bulk): fetch each package's intervals concurrently
def get_year_downloads(package, interval):
    """Return the download count of one package for one date interval.

    Args:
        package: package name placed at the end of the request URL.
        interval: interval string placed in the URL before the package name.

    Returns:
        The ``"downloads"`` field of the JSON response when the API answers
        with HTTP 200, otherwise 0.
    """
    response = requests.get(f"https://api.npmjs.org/downloads/point/{interval}/{package}")
    if response.status_code != 200:
        return 0
    return response.json()["downloads"]


def get_all_time_download_count_concurrent(package, creation_date, format="%Y-%m-%d", max_workers=3):
    """Sum a single package's downloads over its lifetime, fetching intervals in parallel.

    Args:
        package: package name handed to ``get_year_downloads``.
        creation_date: ISO-8601 creation timestamp; a trailing ``Z`` is accepted.
        format: ``strftime`` pattern used for the interval bounds.
        max_workers: size of the thread pool that fetches the intervals.

    Returns:
        The sum of ``get_year_downloads`` over consecutive windows of at most
        547 days, starting at the later of 2015-01-10 and the creation date and
        ending today. Returns 0 when that start is not before today.
    """
    created_on = datetime.fromisoformat(creation_date.replace('Z', '+00:00')).date()

    window_start = max(datetime(day=10, month=1, year=2015).date(), created_on)
    today = datetime.now().date()

    # Build the "start:end" interval strings, each window beginning the day
    # after the previous one ended.
    intervals = []
    while window_start < today:
        window_end = min(window_start + timedelta(days=547), today)
        intervals.append(f'{window_start.strftime(format)}:{window_end.strftime(format)}')
        window_start = window_end + timedelta(days=1)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        pending = [executor.submit(get_year_downloads, package, interval) for interval in intervals]
        return sum(task.result() for task in pending)

# Smoke test: fetch the all-time total for one package and print it with
# thousands separators.
count = get_all_time_download_count_concurrent("semver", "2011-02-12T00:20:25.690Z")
print(format(count, ','))
    

49,999,748,223


In [52]:
def get_package_download_count(package, date, max_retries=3, max_workers=3):
    """Fetch a package's all-time download count, retrying on connection errors.

    Args:
        package: package name.
        date: creation timestamp passed to ``get_all_time_download_count_concurrent``.
        max_retries: maximum number of attempts.
        max_workers: thread-pool size forwarded to the underlying fetch.

    Returns:
        The download count of the first successful attempt, or -1 when every
        attempt raised ``requests.exceptions.ConnectionError`` (or when
        ``max_retries`` allows no attempt at all).
    """
    attempts_made = 0
    while attempts_made < max_retries:
        try:
            return get_all_time_download_count_concurrent(package, date, max_workers=max_workers)
        except requests.exceptions.ConnectionError:
            attempts_made += 1
            if attempts_made < max_retries:
                print(f'Failed getting count for {package}: {attempts_made}')
            else:
                print(f'Failed getting count for {package}: max retries exceeded')
    # Sentinel value for "could not be fetched".
    return -1

def get_download_counts_scoped_concurrent(packages, max_retries=3, max_workers=2, max_sub_workers=3):
    """Fetch download counts for many packages in parallel and store them.

    Args:
        packages: iterable of ``(package, creation_date)`` pairs.
        max_retries: attempts allowed per package; forwarded to
            ``get_package_download_count`` (it was previously accepted here but
            silently ignored, so the callee's own default was always used).
        max_workers: number of packages processed concurrently.
        max_sub_workers: thread-pool size used inside each per-package fetch.

    Returns:
        None. Each result is written to
        ``package_features[package]["download_count"]``.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Map each task back to its package so the result can be attributed.
        futures = {
            executor.submit(
                get_package_download_count,
                package,
                date,
                max_retries=max_retries,
                max_workers=max_sub_workers,
            ): package
            for package, date in packages
        }

        for future in tqdm(as_completed(futures), total=len(futures), desc="Scoped Packages"):
            package = futures[future]
            package_features[package]["download_count"] = future.result()
            

In [54]:
# Fetch download counts for every scoped package: three packages at a time,
# three interval requests per package.
get_download_counts_scoped_concurrent(
    scoped,
    max_workers=3,
    max_sub_workers=3,
)

Scoped Packages:   0%|          | 0/4431 [00:00<?, ?it/s]

In [55]:
# Persist the dependency graph as indented JSON.
with open("./dataset/dep-graph.json", "w") as dep_graph_file:
    json.dump(dep_graph, fp=dep_graph_file, indent=4)

In [56]:
# Persist the per-package metadata as indented JSON.
with open("./dataset/package-features.json", "w") as features_file:
    json.dump(package_features, fp=features_file, indent=4)

In [57]:
# Sanity check: both structures should describe the same number of packages.
print(len(package_features))
print(len(dep_graph))

11234
11234


In [58]:
# Flatten the dependency graph into (package, dependency, version) edges.
# Entries with no dependencies are skipped before `.items()` is called on them.
edges = [
    (source, target, version_range)
    for source, targets in dep_graph.items()
    if len(targets) > 0
    for target, version_range in targets.items()
]

headers = ['package', 'dependency', 'version']
df = pd.DataFrame(edges, columns=headers)
df.head()

Unnamed: 0,package,dependency,version
0,chromium-bidi,mitt,3.0.1
1,chromium-bidi,urlpattern-polyfill,10.0.0
2,chromium-bidi,zod,3.23.8
3,get-intrinsic,es-errors,^1.3.0
4,get-intrinsic,function-bind,^1.1.2


In [59]:
# Number of dependency edges.
df.shape[0]

32281

In [60]:
# Write the edge list to CSV without the positional index column.
df.to_csv(path_or_buf="./dataset/dep-graph.csv", index=False)

In [61]:
# Flatten package_features into one row per package.
#
# Values are looked up by key instead of being taken positionally from
# `feats.values()`: "download_count" is attached to each dict by later cells,
# so the positional form depended on dict insertion order and on every package
# having received a download count. A missing key now yields None in its own
# column instead of shifting or shortening the row.
headers = ['package', 'latest_version', 'keywords', 'publish_time', 'description', 'creation_date', 'number_of_versions', 'download_count']
# Keys of each feature dict, in the order of the columns after 'package'.
feature_keys = ['latestVersion', 'keywords', 'publishTime', 'description', 'creationDate', 'numberOfVersions', 'download_count']

feat_data = []
for package, feats in package_features.items():
    feat_data.append([package] + [feats.get(key) for key in feature_keys])

feat_df = pd.DataFrame(feat_data, columns=headers)
feat_df.head()

Unnamed: 0,package,latest_version,keywords,publish_time,description,creation_date,number_of_versions,download_count
0,chromium-bidi,0.9.1,[],1730859566,An implementation of the WebDriver BiDi protoc...,2022-04-28T08:05:03.963Z,72,206302772
1,get-intrinsic,1.2.4,"[javascript, ecmascript, es, js, intrinsic, ge...",1707168726,Get and robustly cache all JS language-level i...,2020-10-30T15:03:39.549Z,12,6992609115
2,inherits,2.0.4,"[inheritance, class, klass, oop, object-orient...",1560946732,Browser-friendly inheritance fully compatible ...,2011-04-07T00:35:14.848Z,7,18362723250
3,map-age-cleaner,0.2.0,"[map, age, cleaner, maxage, expire, expiration...",1629721812,Automatically cleanup expired items in a Map,2018-08-21T18:21:55.611Z,5,1780602259
4,magic-string,0.30.12,"[string, string manipulation, sourcemap, templ...",1728603240,"Modify strings, generate sourcemaps",2014-11-04T16:24:39.084Z,116,3948486326


In [62]:
# Write the feature table to CSV without the positional index column.
feat_df.to_csv(path_or_buf="./dataset/package-feats.csv", index=False)

In [63]:
# Number of packages in the feature table.
feat_df.shape[0]

11234