In [1]:
import json

import pandas as pd
import requests
from tqdm.notebook import tqdm
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

In [2]:
def create_package_space_from_project_json(filepath):
    """Collect the distinct dependency names listed in a project JSON file.

    The file must contain a JSON object whose values are mappings keyed by
    package name (only the keys are used). A leading ``"npm:"`` prefix on a
    package name is stripped before the name is recorded.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        set[str]: Every distinct package name found across all entries.
    """
    # Context manager so the handle is closed deterministically instead of
    # being left for the garbage collector.
    with open(filepath) as file:
        project_data = json.load(file)

    package_space = set()
    for deps in project_data.values():
        for pkg in deps:
            if pkg.startswith("npm:"):
                pkg = pkg[4:]
            package_space.add(pkg)

    return package_space

Number of packages: 16784


In [None]:
from bs4 import BeautifulSoup
import os

def create_package_list():
    """Download a package list from a GitHub gist page and cache it on disk.

    Fetches the gist page, selects one specific comment paragraph via a CSS
    selector and writes its text to ``./playground/package_list.txt``.
    When the request does not return 200 or the selector matches nothing, a
    message is printed and no file is written.
    """
    res = requests.get("https://gist.github.com/anvaka/8e8fa57c7ee1350e3491")
    if res.status_code != 200:
        print(f"Request for package list failed with status code {res.status_code}")
        return

    parsed_html = BeautifulSoup(res.content, "html.parser")
    selector = "#gistcomment-4447488 > div.edit-comment-hide > task-lists > div > p:nth-child(3)"
    package_list = parsed_html.select(selector)
    if not package_list:
        # The selector is tied to the page's markup; guard against it no
        # longer matching instead of failing with an IndexError below.
        print("Package list not found in gist page")
        return

    # Make sure the cache directory exists before writing into it.
    os.makedirs("./playground", exist_ok=True)
    with open("./playground/package_list.txt", "w") as f:
        f.write(package_list[0].text)
            
def load_package_list():
    """Return the cached package list, downloading it first when it is missing.

    Returns:
        list[str]: The lines of ``./playground/package_list.txt``.

    Raises:
        FileNotFoundError: If the cache file is still absent after
            ``create_package_list()`` has run (e.g. the download failed).
    """
    if not os.path.exists("./playground/package_list.txt"):
        create_package_list()

    # Always read the file back: returning only from the "already cached"
    # branch would yield None on the very first call.
    with open("./playground/package_list.txt", "r") as f:
        return f.read().splitlines()
    
def create_package_space_from_github_gist():
    """Return the gist-derived package list as a set of unique names."""
    package_names = load_package_list()
    return set(package_names)
    
                
# Sanity check: number of distinct package names obtained from the gist list.
print(len(create_package_space_from_github_gist()))

In [None]:
# Build the package space from the scraped project-dependency file and report
# how many distinct packages it contains.
pkg_space = create_package_space_from_project_json("./GithubScrape/Data/40_Projects/append_dependencies_40_1.json")
print("Number of packages:", len(pkg_space))

In [83]:
from datetime import datetime, timedelta

def convert_time_to_epoch(timestamp):
    """Convert an ISO-8601 timestamp string to Unix epoch seconds.

    Args:
        timestamp: ISO-8601 date-time string; a ``Z`` suffix is treated as UTC.

    Returns:
        str: Whole seconds since the Unix epoch, as a decimal string.
    """
    # datetime.fromisoformat() before Python 3.11 rejects the "Z" suffix, so
    # spell the UTC offset out explicitly.
    parsed = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
    # strftime('%s') is a platform-specific extension (unavailable on some
    # systems) and ignores tzinfo, which shifts the result by the machine's
    # local UTC offset. timestamp() honours the parsed offset everywhere.
    return str(int(parsed.timestamp()))
    

Number of packages: 11243


In [4]:
def fetch_package_data(package, base_url="https://registry.npmjs.org/", max_retries=3):
    """Fetch registry metadata for one package and summarise its latest version.

    Args:
        package: Package name, appended verbatim to ``base_url``.
        base_url: Registry root URL (must end with ``/``).
        max_retries: Maximum number of request attempts.

    Returns:
        dict: Keys ``latestVersion``, ``keywords``, ``publishTime``,
        ``description``, ``creationDate``, ``numberOfVersions``,
        ``dependencies``, ``devDependencies`` and ``peerDependencies``.
        An empty dict is returned when the registry answers with a non-200
        status, the document has no ``dist-tags`` (reported as unpublished),
        or every attempt fails.
    """
    for attempt in range(max_retries):
        try:
            res = requests.get(base_url + package)

            if res.status_code != 200:
                print(f"{package} failed with status code {res.status_code}")
                return {}

            # Decode the body only after the status check, so an error page
            # that is not JSON is reported by status instead of raising.
            data = res.json()

            if 'dist-tags' not in data:
                print(f"{package} is unpublished")
                return {}

            latest_version = data["dist-tags"]["latest"]
            latest_manifest = data["versions"][latest_version]

            return {
                "latestVersion": latest_version,
                "keywords": data.get("keywords", []),
                "publishTime": convert_time_to_epoch(data["time"][latest_version]),
                "description": data.get("description", ""),
                "creationDate": data["time"]["created"],
                "numberOfVersions": len(data["versions"]),
                "dependencies": latest_manifest.get("dependencies", {}),
                "devDependencies": latest_manifest.get("devDependencies", {}),
                "peerDependencies": latest_manifest.get("peerDependencies", {}),
            }

        except requests.exceptions.ConnectionError:
            if attempt < max_retries - 1:
                print(f"Failed to fetch {package} due to connection error: attempt {attempt + 1}")
            else:
                print(f"Failed to fetch {package}: max attempts exceeded")
                return {}
        except Exception as e:
            # Any other failure (malformed document, decode error, ...) is
            # logged and the next attempt is made.
            print("EXCEPTION OCCURED", package, e)

    # Every attempt ended in a non-connection exception (or max_retries <= 0).
    # Return the same "no data" value as the other failure paths rather than
    # an implicit None, which callers comparing against {} would mistake for data.
    return {}

In [5]:
# Manual smoke test of the registry fetch on a scoped package name.
fetch_package_data("@types/recordrtc")

Layer 0:   0%|          | 0/16784 [00:00<?, ?it/s]

[ERROR] for install-test: 'dist-tags'. Package Unpublished
[ERROR] for vuex-vue3: 'dist-tags'. Package Unpublished


Layer 1:   0%|          | 0/1497 [00:00<?, ?it/s]

Layer 2:   0%|          | 0/809 [00:00<?, ?it/s]

Failed to get data for @tsparticles/interaction-particles-collisions: attempt 1
Failed to get data for @sanity/preview-kit-compat: attempt 1
Failed to get data for @sanity/mutate: attempt 1
Failed to get data for @vercel/stega: attempt 1
Failed to get data for @sanity/comlink: attempt 1
Failed to get data for use-effect-event: attempt 1
Failed to get data for @stdlib/complex-float32-base-add: attempt 1Failed to get data for @stdlib/complex-float32-base-assert: attempt 1

Failed to get data for @stdlib/complex-float32-base-mul: attempt 1
Failed to get data for json-alexander: attempt 1
Failed to get data for sync-message-port: attempt 1
Failed to get data for babel-plugin-react-require: attempt 1
Failed to get data for @babel/plugin-proposal-pipeline-operator: attempt 1


In [None]:
def limit_package_space(full_space, n):
    """Return a set holding at most ``n`` packages taken from ``full_space``.

    Which elements are picked is unspecified (iteration order of the input).
    The input collection is left unmodified, and asking for more elements
    than are available simply returns all of them.

    Args:
        full_space: Iterable of package names.
        n: Maximum number of packages to keep; values <= 0 give an empty set.

    Returns:
        set: Up to ``n`` elements of ``full_space``.
    """
    # Local import keeps this cell self-contained.
    from itertools import islice

    # islice stops early when the input runs out, so there is no KeyError from
    # popping an exhausted set, and the caller's collection is not drained.
    return set(islice(full_space, max(n, 0)))

In [None]:
def build_package_space(max_workers=20, max_iters=2, test_mode=False, src='project'):
    """Crawl the registry breadth-first from a seed package set.

    Each iteration fetches every package of the current frontier concurrently
    via ``fetch_package_data``; the runtime dependencies of the fetched
    packages that have not been fetched yet form the next frontier.

    Args:
        max_workers: Thread-pool size used for the registry requests.
        max_iters: Maximum number of iterations; ``-1`` means "until the
            frontier is empty".
        test_mode: When true, the seed set is cut down to at most 20 packages.
        src: Seed source, ``'project'`` (project JSON file) or ``'github'``
            (gist list). Any other value yields an empty seed set.

    Returns:
        tuple[dict, dict]: ``(package_features, dep_graph)`` where
        ``package_features[pkg]`` holds the metadata fields listed in
        ``data_fields`` and ``dep_graph[pkg]`` maps each runtime dependency
        of ``pkg`` to its version spec.
    """
    package_space = []

    if src == 'github':
        package_space = create_package_space_from_github_gist()
    elif src == 'project':
        package_space = create_package_space_from_project_json("./GithubScrape/Data/40_Projects/append_dependencies_40_1.json")

    if test_mode:
        # Never request more packages than the seed set actually holds.
        package_space = limit_package_space(package_space, min(20, len(package_space)))

    fetched = set()
    package_features = dict()
    dep_graph = dict()

    # These must match the keys produced by fetch_package_data().
    data_fields = [
        "latestVersion",
        "keywords",
        "publishTime",
        "description",
        "creationDate",
        "numberOfVersions",
    ]

    # NOTE: only "dependencies" are followed. fetch_package_data() also
    # returns "devDependencies" and "peerDependencies" should the graph ever
    # need to include those edge types.

    next_iter = set(package_space)
    curr_iter = 1

    # Stop as soon as the frontier is empty, even when max_iters would allow
    # further (no-op) iterations.
    while next_iter and (max_iters == -1 or curr_iter <= max_iters):
        package_set, next_iter = next_iter, set()

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = { executor.submit(fetch_package_data, package): package for package in package_set }

            # Results are consumed on this thread only, so the shared
            # containers below need no locking.
            for future in tqdm(as_completed(futures), total=len(futures), desc=f'Iteration {curr_iter}'):
                pkg = futures[future]
                fetched.add(pkg)

                # Handle failures per package so that one bad result does not
                # discard the rest of the iteration.
                try:
                    package_data = future.result()
                    if not package_data:
                        continue
                    features = {field: package_data[field] for field in data_fields}
                    dependencies = package_data["dependencies"]
                except Exception as e:
                    print(f"Skipping {pkg}: {e!r}")
                    continue

                package_features[pkg] = features
                dep_graph[pkg] = dependencies
                next_iter.update(dependencies.keys())

        next_iter.difference_update(fetched)

        print(f"Next Iteration - {curr_iter + 1}: {len(next_iter)} packages")
        curr_iter += 1

    return package_features, dep_graph
            

In [None]:
# Full crawl: max_iters=-1 keeps expanding until no unfetched dependencies remain.
feats, graph = build_package_space(max_iters=-1)

In [6]:
# divide into scoped and unscoped packages
scoped = []
unscoped = []
for package in feats.keys():
    if package.find("@") != -1:
        scoped.append((package, feats[package]["creationDate"]))
    else:
        unscoped.append(package)
print(f'scoped: {len(scoped)}\tunscoped: {len(unscoped)}')

scoped: 7558	unscoped: 10724


In [8]:
# Split the unscoped package names into consecutive batches of up to 128.
chunks = [unscoped[start:start + 128] for start in range(0, len(unscoped), 128)]
print(len(chunks))

59


In [9]:
# CAN GET DOWNLOAD COUNTS FOR UNSCOPED PACKAGES IN BULK
from collections import defaultdict

def get_all_time_download_counts_bulk(packages, format="%Y-%m-%d"):
    """Sum download counts for several packages with bulk point queries.

    The period from 2015-01-10 up to today is cut into windows of at most
    365 days; one request per window is sent for all ``packages`` at once and
    the per-package counts are accumulated.

    Args:
        packages: Iterable of package names to query together.
        format: ``strftime`` pattern used for the dates in the request URL.

    Returns:
        collections.defaultdict: ``{package: total_downloads}`` with an entry
        (possibly 0) for every requested package.
    """
    packages = list(packages)

    start_date = datetime(day=10, month=1, year=2015).date()
    curr_date = datetime.now().date()

    intervals = []
    while start_date < curr_date:
        interval_end = min(start_date + timedelta(days=365), curr_date)
        intervals.append(f'{start_date.strftime(format)}:{interval_end.strftime(format)}')
        start_date = interval_end + timedelta(days=1)

    # Seed every package with 0 so each one gets an entry even when all of
    # its requests fail or report no data.
    download_counts = defaultdict(int, {package: 0 for package in packages})
    if not packages:
        return download_counts

    joined_names = ','.join(packages)
    for interval in intervals:
        url = f"https://api.npmjs.org/downloads/point/{interval}/{joined_names}"
        res = requests.get(url)
        if res.status_code != 200:
            # Surface dropped windows: they make the totals an undercount.
            print(f"Bulk download request for {interval} failed with status code {res.status_code}")
            continue

        download_data = res.json()
        if len(packages) == 1 and packages[0] not in download_data:
            # A single-package query is answered with the bare result object
            # instead of a mapping keyed by package name.
            download_data = {packages[0]: download_data}

        for package in packages:
            # Entries may be missing or null for packages without data.
            entry = download_data.get(package)
            if entry:
                download_counts[package] += entry.get("downloads", 0)

    return download_counts
    

In [10]:
# Fetch the bulk download totals for every chunk in parallel and store each
# package's total under feats[package]["download_count"].
# Every package name returned by the bulk helper must already be a key of `feats`.
with ThreadPoolExecutor() as executor:
    futures = { executor.submit(get_all_time_download_counts_bulk, chunk): chunk for chunk in chunks }
    
    for future in tqdm(as_completed(futures), total=len(futures), desc="Getting download counts for unscoped packages"):
        for package, count in future.result().items():
            feats[package]["download_count"] = count

Getting download counts for unscoped packages:   0%|          | 0/59 [00:00<?, ?it/s]

In [11]:
# Scoped packages are queried one package at a time (the bulk helper is used
# for unscoped ones); this helper fetches one package for one date interval.
def get_year_downloads(package, interval):
    """Return the download count of ``package`` for one ``start:end`` interval.

    Any response other than HTTP 200 is counted as 0 downloads.
    """
    response = requests.get(f"https://api.npmjs.org/downloads/point/{interval}/{package}")
    return response.json()["downloads"] if response.status_code == 200 else 0


def get_all_time_download_count_concurrent(package, creation_date, format="%Y-%m-%d", max_workers=3):
    """Total the downloads of one package by querying date windows in parallel.

    The queried period starts at the later of 2015-01-10 and the package's
    creation date and runs up to today, cut into windows of at most 547 days.

    Args:
        package: Package name.
        creation_date: ISO-8601 creation timestamp (a ``Z`` suffix is accepted).
        format: ``strftime`` pattern used for the dates of each window.
        max_workers: Thread-pool size for the per-window requests.

    Returns:
        int: Sum of the per-window download counts (0 when there is no window).
    """
    created_on = datetime.fromisoformat(creation_date.replace('Z', '+00:00')).date()
    window_start = max(datetime(day=10, month=1, year=2015).date(), created_on)
    today = datetime.now().date()

    windows = []
    while window_start < today:
        window_end = min(window_start + timedelta(days=547), today)
        windows.append(f'{window_start.strftime(format)}:{window_end.strftime(format)}')
        window_start = window_end + timedelta(days=1)

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = [pool.submit(get_year_downloads, package, window) for window in windows]
        # Collect in submission order; result() blocks until each task is done.
        return sum(task.result() for task in pending)

# test function
# Smoke test on one package; prints the total with thousands separators.
count = get_all_time_download_count_concurrent("semver", "2011-02-12T00:20:25.690Z")
print(f'{count:,}')
    

49,999,748,223


In [12]:
def get_package_download_count(package, date, max_retries=3, max_workers=3):
    """Fetch one package's all-time download count, retrying on connection errors.

    Args:
        package: Package name.
        date: Creation timestamp passed through to the download helper.
        max_retries: Maximum number of attempts.
        max_workers: Thread-pool size passed through to the download helper.

    Returns:
        int: The download count, or ``-1`` when every attempt hit a
        connection error.
    """
    attempts_made = 0
    while attempts_made < max_retries:
        try:
            return get_all_time_download_count_concurrent(package, date, max_workers=max_workers)
        except requests.exceptions.ConnectionError:
            attempts_made += 1
            if attempts_made < max_retries:
                print(f'Failed getting count for {package}: attempt {attempts_made}')
            else:
                print(f'Failed getting count for {package}: max retries exceeded')
    return -1

def get_download_counts_scoped_concurrent(packages, max_retries=3, max_workers=2, max_sub_workers=3):
    """Fetch download counts for many packages in parallel and store them in ``feats``.

    Args:
        packages: Iterable of ``(package, creation_date)`` pairs.
        max_retries: Attempts per package, forwarded to
            ``get_package_download_count``.
        max_workers: Number of packages processed concurrently.
        max_sub_workers: Thread-pool size used inside each package's own
            download query.

    Side effects:
        Sets ``feats[package]["download_count"]`` on the module-level
        ``feats`` dict for every package (``-1`` marks a failed fetch).
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            # max_retries is forwarded explicitly; otherwise the callee would
            # silently fall back to its own default.
            executor.submit(
                get_package_download_count,
                package,
                date,
                max_retries=max_retries,
                max_workers=max_sub_workers,
            ): package
            for package, date in packages
        }

        for future in tqdm(as_completed(futures), total=len(futures), desc="Scoped Packages"):
            package = futures[future]
            feats[package]["download_count"] = future.result()
            

In [13]:
# Fill in download counts for the scoped packages: 3 packages at a time, each
# using up to 3 threads for its own date windows.
get_download_counts_scoped_concurrent(scoped, max_workers=3, max_sub_workers=3)

Scoped Packages:   0%|          | 0/4923 [00:00<?, ?it/s]

Failed getting count for @visx/shape: 1


In [9]:
# Persist the dependency graph ({package: {dependency: version spec}}) as
# indented JSON. The ./dataset directory must already exist.
with open("./dataset/dep-graph.json", mode="w") as file:
    json.dump(graph, file, indent=4)

In [10]:
# Persist the per-package feature dicts as indented JSON.
# The ./dataset directory must already exist.
with open("./dataset/package-features.json", mode="w") as file:
    json.dump(feats, file, indent=4)

In [8]:
# Both dicts are filled together per package, so the two sizes should match.
print(len(feats.keys()))
print(len(graph.keys()))

18282
18282


In [11]:
# Flatten the dependency graph into one (package, dependency, version) row
# per edge; packages without dependencies contribute no rows.
edges = [
    (package, dep, version)
    for package, deps in graph.items()
    for dep, version in deps.items()
]

headers = ['package', 'dependency', 'version']
df = pd.DataFrame(edges, columns=headers)
df.head()

Unnamed: 0,package,dependency,version
0,read-pkg,@types/normalize-package-data,^2.4.3
1,read-pkg,normalize-package-data,^6.0.0
2,read-pkg,parse-json,^8.0.0
3,read-pkg,type-fest,^4.6.0
4,read-pkg,unicorn-magic,^0.1.0


In [12]:
# Number of dependency edges.
len(df)

52738

In [13]:
# Write the edge list without the DataFrame index column.
df.to_csv("./dataset/dep-graph.csv", index=False)

In [None]:
# Build one row per package: the package name followed by its feature values
# in insertion order, which must line up with `headers` (every package needs
# all seven features, including the download count).
feat_data = []
headers = ['package', 'latest_version', 'keywords', 'publish_time', 'description', 'creation_date', 'number_of_versions', 'download_count']
# The loop variable is deliberately not called `feats`: rebinding that name
# would replace the global feature dict with the last package's entry and
# break every later use (and any re-run of this cell).
for package, package_feats in feats.items():
    feat_data.append([package] + list(package_feats.values()))

feat_df = pd.DataFrame(feat_data, columns=headers)
feat_df.head()

In [21]:
# Write the feature table without the DataFrame index column.
feat_df.to_csv("./dataset/package-feats.csv", index=False)

In [73]:
# Number of packages in the feature table.
len(feat_df)

12370

## Pagerank

In [15]:
import pandas as pd
import networkx as nx
from scipy.sparse import csr_matrix

# Reload the edge list (columns: package, dependency, version) saved earlier in this notebook.
dep_df = pd.read_csv("./dataset/dep-graph.csv")

In [25]:
# Distinct names on each side of the edge list and their union.
# NOTE: this rebinds `pkg_space`, which an earlier cell uses for the seed set.
sources = set(dep_df['package'].unique())
deps = set(dep_df['dependency'].unique())
pkg_space = sources | deps

print(f"Unique packages in packages column {len(sources)}")
print(f"Unique packages in dependency column {len(deps)}")
print(f"Total unique {len(pkg_space)}")

Unique packages in packages column 11553
Unique packages in dependency column 13159
Total unique 16508


In [17]:
# Degree statistics on the in-memory `graph` dict (not on dep_df).
# Packages that declare no dependencies (out-degree 0).
no_out = {p for p, d in graph.items() if len(d) == 0}
print("Nodes with 0 out-degree:", len(no_out))

# Every package that appears as a dependency of some package (in-degree > 0).
with_in = set()
for dependencies in graph.values():
    with_in.update(dependencies)

# Despite its name, `no_in` holds the isolated nodes: no outgoing edges and
# no incoming edges.
no_in = no_out - with_in

print("Nodes with 0 in-degree and 0 out-degree:", len(no_in))

# NOTE(review): this reads ./playground/dep-graph.json, whereas the graph was
# saved above to ./dataset/dep-graph.json — confirm which file is intended.
# A context manager closes the handle instead of leaking it.
with open("./playground/dep-graph.json") as file:
    json_str = json.dumps(json.load(file))
# NOTE(review): this is a substring test on the serialized JSON, so a short
# name also matches when it merely occurs inside a longer string.
from_original = [p for p in no_in if p in json_str]
print("Nodes with 0 in- and out-degree from source:", len(from_original))

In [None]:
# build adjacency matrix
G = nx.from_pandas_edgelist(dep_df, source='package', target='dependency', create_using=nx.DiGraph)
adj_matrix_sparse = csr_matrix(nx.adjacency_matrix(G))
print("Adjacency Matrix shape:", adj_matrix_sparse.shape)
print("G:", G)

In [None]:
# Perform Pagerank
ranks = nx.pagerank(G, alpha=0.25)
ranks = list(ranks.items())
ranks.sort(key=lambda x: x[1], reverse=True)
ranks[:10]

In [None]:
from bs4 import BeautifulSoup

# Scrape the reference PageRank list from the gist page into
# full_pagerank as (package name, score) tuples.
res = requests.get("https://gist.github.com/anvaka/8e8fa57c7ee1350e3491")

# Stays empty when the request fails.
full_pagerank = []

if res.status_code != 200:
    print("Request failed")
else:
    parsed_html = BeautifulSoup(res.content, "html.parser")
    # One list item per ranked package inside the rendered pagerank file.
    github_pagerank = parsed_html.select("#file-03-pagerank-md-readme > article > ol > li")
    # Assumes every item contains a link with the package name and text of
    # the form "name - score"; an item without either would raise here.
    full_pagerank = [(html.find('a').text, float(html.text.split(" - ")[1])) for html in github_pagerank]
    
full_pagerank[:10]

In [None]:
# Compare how many packages appear in both
rank_set = set([p for p,c in ranks[:1000]])
full_set = set([p for p,c in full_pagerank])
print(f"Number of common packages: {len(rank_set.intersection(full_set))}")

In [None]:
# List the reference-ranking packages that are absent from our package space,
# in ranking order, then report how many there are.
missing = [pkg for pkg, _ in full_pagerank if pkg not in pkg_space]
for pkg in missing:
    print(pkg)
count = len(missing)

print(f"-------------\nCount = {count}")