In [20]:
import json
import pandas as pd
import requests
from bs4 import BeautifulSoup
from time import sleep

In [2]:
# Download the gist page that hosts the npm ranking lists scraped further down.
url = "https://gist.github.com/anvaka/8e8fa57c7ee1350e3491"
res = requests.get(url)
# Fail loudly on an HTTP error instead of silently parsing an error page,
# which would later yield an empty or garbage package set.
res.raise_for_status()
data = res.content
# Debug aid: uncomment to inspect the raw page structure.
# print(BeautifulSoup(data, 'html.parser').prettify())

In [3]:
# Parse the downloaded page bytes with the stdlib-backed "html.parser" so it can be queried with CSS selectors.
parsedData = BeautifulSoup(data, "html.parser")

In [11]:
# Fetches from github rank page. No need to run since we are generating our own package space
# One CSS selector per ranking list rendered in the gist page.
selectors = [
    '#file-01-most-dependent-upon-md-readme > article > ol > li',
    '#file-02-with-most-dependencies-md-readme > article > ol > li',
    '#file-03-pagerank-md-readme > article > ol > li'
]
packages = set()
top_k = 25  # how many entries to take from the top of each list
for selector in selectors:
    topPackages = parsedData.select(selector)
    for html in topPackages[:top_k]:
        link = html.find('a')
        # Skip list items without an anchor instead of crashing on None.text.
        if link is not None:
            packages.add(link.text)
packages


{'3.0.0',
 '@ericmcornelius/ease',
 '@ngxvoice/ngx-voicelistner',
 '@rstacruz/pnpm',
 'async',
 'axios',
 'axios-yet-another-proxy-fix',
 'babel-runtime',
 'bloater',
 'bluebird',
 'chalk',
 'classnames',
 'colors',
 'commander',
 'debug',
 'digital-keyboard-demos',
 'express',
 'fhir2',
 'fs-extra',
 'glob',
 'inquirer',
 'js-tokens',
 'lodash',
 'loose-envify',
 'merino',
 'miguelcostero-ng2-toasty',
 'minimist',
 'mkdirp',
 'moment',
 'ms',
 'no-one-left-behind',
 'object-assign',
 'pb-schema',
 'primeng-custom',
 'prop-types',
 'react',
 'react-angular-component',
 'react-dom',
 'react-misc-toolbox',
 'react-native-handcheque-engine',
 'react-native-ok-sdk',
 'react-native-template-vife',
 'react-native-version-manager',
 'react-redux-demo1',
 'readable-stream',
 'regenerator-runtime',
 'request',
 'safe-buffer',
 'search-list-react',
 'sindresorhus.js',
 'tslib',
 'underscore',
 'uuid',
 'vue',
 'vue-compment',
 'wc-starterkit',
 'webche',
 'webpack',
 'yargs'}

In [8]:
# constructing package space from repository data
package_space = set()
# Use a context manager so the file handle is closed deterministically.
with open("project_packages.json") as project_file:
    project_data = json.load(project_file)

# project_data maps a project key to a dict whose keys are package names;
# only the package names are needed here.
for deps in project_data.values():
    for pkg in deps.keys():
        # Drop the "npm:" prefix so prefixed and bare names collapse to one entry.
        # NOTE(review): presumably an alias/registry prefix -- confirm against
        # how project_packages.json was generated.
        if pkg.startswith("npm:"):
            pkg = pkg[4:]
        package_space.add(pkg)

print("Number of packages:", len(package_space))
# From here on the crawl uses this package space instead of the scraped ranking lists.
packages = package_space

Number of packages: 10089


In [6]:
from datetime import datetime, timedelta

def convert_time_to_epoch(timestamp):
    """Convert an ISO-8601 timestamp string to Unix epoch seconds.

    Args:
        timestamp: ISO-8601 string; a trailing 'Z' (UTC designator) is accepted,
            e.g. "2020-07-02T00:22:58.370Z".

    Returns:
        The epoch time in whole seconds, as a string (fractional seconds are
        truncated).
    """
    # datetime.fromisoformat() only accepts 'Z' on newer Python versions,
    # so spell the UTC offset out explicitly.
    parsed = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
    # strftime('%s') is an undocumented, platform-specific code that ignores
    # tzinfo; timestamp() honours the parsed UTC offset on every platform.
    return str(int(parsed.timestamp()))
    

In [11]:
# THIS IS CONCURRENT. MUCH FASTER
from tqdm.notebook import tqdm
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

# Shared crawl state, filled in by the worker threads.
package_set = set()        # names already claimed by some worker
package_features = {}      # package name -> metadata dict
dep_graph = {}             # package name -> dependencies of its latest version
failed_packages = list()   # one message line per package the registry did not return
MAX_DEPTH = 1              # how many dependency levels below the seed packages to follow

# One lock per shared container (the failed-package list has none).
set_lock, features_lock, graph_lock = (threading.Lock() for _ in range(3))

# Function to fetch package data
def get_package_features(package, base_url="https://registry.npmjs.org/", depth=0):
    """Fetch registry metadata for one package and, up to MAX_DEPTH, its dependencies.

    Results are written to the shared containers:
      * dep_graph[package]        -- dependencies of the latest version
      * package_features[package] -- latest version, keywords, publish time
                                     (epoch seconds), creation date, version count
      * failed_packages           -- a message line when the registry does not
                                     answer with HTTP 200

    Args:
        package: Package name, appended to base_url to build the request URL.
        base_url: Registry base URL.
        depth: Current recursion depth; dependencies are followed only while
            depth < MAX_DEPTH.

    Returns:
        None. A package already claimed by another call is skipped.
    """
    # Claim the package under the lock so concurrent workers never fetch it twice.
    with set_lock:
        if package in package_set:
            return
        package_set.add(package)

    res = requests.get(base_url + package)

    # Check the status before touching the body: error responses are not
    # guaranteed to be JSON, and decoding them would raise instead of recording
    # the package as unavailable.
    if res.status_code != 200:
        failed_packages.append(f"Package unavailable: {package}\n")
        return

    deps = {}
    try:
        jsonData = res.json()

        # Obtain latest version
        latestVersion = jsonData["dist-tags"]["latest"]

        # Handle dependencies
        deps = jsonData["versions"][latestVersion].get("dependencies", {})
        with graph_lock:
            dep_graph[package] = deps

        # Collect metadata before recursing, so a failing dependency cannot
        # cost this package its own feature record.
        package_data = {
            "latestVersion": latestVersion,
            "keywords": jsonData.get("keywords", []),
            "publishTime": convert_time_to_epoch(jsonData["time"][latestVersion]),
            "creationDate": jsonData["time"]["created"],
            "numberOfVersions": len(jsonData["versions"]),
        }
        with features_lock:
            package_features[package] = package_data
    except Exception as e:
        print(f"[ERROR] for {package}: {e}")

    # If within depth limit, fetch dependencies in parallel
    if depth < MAX_DEPTH and deps:
        with ThreadPoolExecutor() as executor:
            futures = {
                executor.submit(get_package_features, dep, base_url, depth + 1): dep
                for dep in deps
            }
            for future in as_completed(futures):
                try:
                    future.result()  # Surface exceptions raised in the worker
                except Exception as e:
                    # Report the failure against the dependency that caused it.
                    print(f"[ERROR] for {futures[future]}: {e}")

# Entry point of the crawl: one task per seed package, executed in parallel
def fetch_all_packages(packages):
    """Run get_package_features for every name in `packages` on a thread pool.

    Progress is shown with a tqdm bar as tasks complete. An exception raised
    by a task is printed and does not stop the remaining tasks.
    """
    with ThreadPoolExecutor() as pool:
        pending = []
        for name in packages:
            pending.append(pool.submit(get_package_features, name))

        progress = tqdm(as_completed(pending), total=len(pending), desc="Processing packages")
        for finished in progress:
            try:
                finished.result()  # re-raises whatever the worker raised
            except Exception as e:
                print(f"Failed to process package: {e}")
            
# Crawl every package in `packages` (network-bound; dependencies are followed up to MAX_DEPTH).
fetch_all_packages(packages)

Processing packages:   0%|          | 0/10089 [00:00<?, ?it/s]

In [14]:
# Persist the "Package unavailable" messages collected during the crawl
# (each entry already ends with a newline).
with open("failed_packages.txt", "w") as report:
    for entry in failed_packages:
        report.write(entry)

In [36]:
def _download_intervals(start_date, end_date, format="%Y-%m-%d"):
    """Split the span from start_date up to end_date into 'start:end' range strings.

    Each range covers at most 365 days past its start; the next range begins
    the day after the previous one ends, so ranges never overlap.
    """
    intervals = []
    while start_date < end_date:
        interval_end = min(start_date + timedelta(days=365), end_date)
        intervals.append(f'{start_date.strftime(format)}:{interval_end.strftime(format)}')
        start_date = interval_end + timedelta(days=1)
    return intervals


def get_all_time_download_count(package, creation_date, format = "%Y-%m-%d", max_retries=3, delay=2):
    """Sum a package's npm downloads from its creation date until today.

    The span is queried in roughly one-year ranges against the
    api.npmjs.org "downloads/point" endpoint and the per-range counts are added.

    Args:
        package: Package name as used in the downloads API URL.
        creation_date: ISO-8601 creation timestamp; a trailing 'Z' is accepted.
        format: strftime pattern used to render the range boundaries.
        max_retries: Attempts per range when the API answers with a transient
            status (429 or 5xx). At least one attempt is always made.
        delay: Seconds to sleep between those attempts.

    Returns:
        Total download count as an int. Ranges the API has no data for
        (non-transient non-200 answers) contribute 0, as do ranges that still
        fail after all retries (a warning is printed for the latter).

    Raises:
        requests.exceptions.RequestException: on connection-level failures;
            these are left to the caller to retry.
    """
    create_date = datetime.fromisoformat(creation_date.replace('Z', '+00:00'))

    # Never query before 2015-01-10.
    # NOTE(review): npm's download-counts docs give this as the earliest date
    # with data -- confirm if the API changes.
    start_date = max(datetime(day=10, month=1, year=2015).date(), create_date.date())
    curr_date = datetime.now().date()

    attempts = max(1, max_retries)
    download_count = 0
    for interval in _download_intervals(start_date, curr_date, format):
        url = f"https://api.npmjs.org/downloads/point/{interval}/{package}"
        for attempt in range(attempts):
            res = requests.get(url)
            if res.status_code == 200:
                download_count += res.json()["downloads"]
                break
            # Only rate limiting and server errors are worth retrying; anything
            # else (e.g. 404) is skipped exactly as before.
            transient = res.status_code == 429 or res.status_code >= 500
            if not transient:
                break
            if attempt < attempts - 1:
                sleep(delay)
        else:
            # Retries exhausted: keep best-effort behaviour but make the
            # undercount visible instead of silent.
            print(f"[WARN] {package}: giving up on {interval} after {attempts} attempts (HTTP {res.status_code})")

    return download_count

# Smoke test on a well-known package (performs live HTTP requests).
count = get_all_time_download_count("semver", "2011-02-12T00:20:25.690Z")
print("{:,}".format(count))
# An earlier run printed: 49,942,378,647

50,006,212,360


In [40]:
def update_downloads(packages_dict):
    """Add a "download_count" entry to every package's feature dict, in parallel.

    Args:
        packages_dict: Mapping of package name -> feature dict. Each feature
            dict must contain "creationDate" and is mutated in place.

    Packages whose lookup keeps failing are reported on stdout and left
    without a "download_count" key.
    """
    def process_package(package, details, max_retries=3, delay=1):
        """Look up one package's downloads, retrying on request errors."""
        attempt = 0
        while attempt < max_retries:
            try:
                total = get_all_time_download_count(package, details["creationDate"], format = "%Y-%m-%d")
                details["download_count"] = total
                return
            except requests.exceptions.RequestException:
                print(f"Failed to get download count for {package}: Attempt {attempt + 1}")
                is_last_attempt = attempt >= max_retries - 1
                if is_last_attempt:
                    print(f"Max retries exceeded for {package}")
                else:
                    sleep(delay)
            attempt += 1

    with ThreadPoolExecutor() as pool:
        # Remember which package each future belongs to, for error reporting.
        owner_of = {}
        for name, info in packages_dict.items():
            owner_of[pool.submit(process_package, name, info)] = name

        # Monitor progress
        progress = tqdm(as_completed(owner_of), total=len(owner_of), desc="Processing packages")
        for finished in progress:
            package = owner_of[finished]
            try:
                finished.result()
            except Exception as e:
                print(f"Failed to get download count for {package}: {e}")

# Attach a "download_count" to every crawled package (network-bound; mutates package_features in place).
update_downloads(package_features)

Processing packages:   0%|          | 0/11124 [00:00<?, ?it/s]

In [57]:
# Retry for packages that were not completed in first round due to HTTPConnectionPool error
for package, feats in package_features.items():
    if "download_count" not in feats:
        print(package)
        # Store under the same "download_count" key that the check above and the
        # parallel pass use, so a retried package is no longer seen as missing.
        feats["download_count"] = get_all_time_download_count(package, feats["creationDate"])

In [47]:
# Save the dependency graph (package -> dependencies of its latest version) as pretty-printed JSON.
with open("./dataset/dep-graph.json", "w") as graph_file:
    json.dump(dep_graph, graph_file, indent=4)

In [48]:
# Save the per-package metadata as pretty-printed JSON.
with open("./dataset/package-features.json", "w") as features_file:
    json.dump(package_features, features_file, indent=4)

In [49]:
# Sanity check: compare how many packages have features vs. a dependency entry.
print(len(package_features))
print(len(dep_graph))

11124
11124


In [50]:
# Flatten {package: {dependency: version_range}} into one (package, dependency, version) row per edge.
# Packages with no dependencies contribute no rows.
edges = [
    (package, dep, version)
    for package, deps in dep_graph.items()
    for dep, version in deps.items()
]

headers = ['package', 'dependency', 'version']
df = pd.DataFrame(edges, columns=headers)
df.head()

Unnamed: 0,package,dependency,version
0,minimist-options,arrify,^1.0.1
1,minimist-options,is-plain-obj,^1.1.0
2,minimist-options,kind-of,^6.0.3
3,atomically,stubborn-fs,^1.2.5
4,atomically,when-exit,^2.1.1


In [55]:
# Number of dependency edges (rows of the edge table).
len(df)

31983

In [52]:
# Export the edge table; with to_csv's defaults the row index is written as an extra first column.
df.to_csv("./dataset/dep-graph.csv")

In [51]:
feat_data = []
headers = ['package', 'latest_version', 'keywords', 'publish_time', 'creation_date', 'number_of_versions', 'downloads']
for package, feats in package_features.items():
    # Pick every field by key rather than relying on dict insertion order and
    # arity, so each value always lands in the matching column; a missing
    # field becomes None.
    feat_data.append([
        package,
        feats.get("latestVersion"),
        feats.get("keywords"),
        feats.get("publishTime"),
        feats.get("creationDate"),
        feats.get("numberOfVersions"),
        # Also accept the misspelled legacy key "download_counts".
        feats.get("download_count", feats.get("download_counts")),
    ])

feat_df = pd.DataFrame(feat_data, columns=headers)
feat_df.head()

Unnamed: 0,package,latest_version,keywords,publish_time,creation_date,number_of_versions,downloads
0,atomically,2.0.3,"[atomic, read, write, file, reliable]",1712238627,2020-07-02T00:22:58.370Z,14,69537822
1,@fontsource/monofett,5.1.0,"[fontsource, font, font family, google fonts, ...",1726116813,2020-12-23T22:21:26.182Z,42,68891
2,debounce-fn,6.0.0,"[debounce, function, debouncer, fn, func, thro...",1699187981,2014-08-04T22:46:13.182Z,13,77911966
3,kind-of,6.0.3,"[arguments, array, boolean, check, date, funct...",1579165159,2014-09-26T08:08:38.913Z,26,30221557144
4,ajv-formats,3.0.1,"[Ajv, JSON-Schema, format, validation]",1711769426,2020-01-14T17:57:34.978Z,36,2035572702


In [53]:
# Export the feature table; with to_csv's defaults the row index is written as an extra first column.
feat_df.to_csv("./dataset/package-feats.csv")

In [56]:
# Number of packages (rows) in the feature table.
len(feat_df)

11124

#### Versioning Info

- `~version` — **“Approximately equivalent to version”**: automatically accepts all future backwards-compatible patch releases, without incrementing the minor version. For example, `~1.2.3` matches releases from `1.2.3` up to, but not including, `1.3.0`.


- `^version` — **“Compatible with version”**: automatically accepts all future backwards-compatible minor and patch releases, without incrementing the major version. For example, `^1.2.3` matches releases from `1.2.3` up to, but not including, `2.0.0`.