In [3]:
import json
import math

import pandas as pd
import requests
from tqdm.notebook import tqdm
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

In [97]:
def filter_valid_package_names(packages):
    packages = set(packages)
    remove = set()
    for package in packages:
        if (not package.startswith('@')) and (not package[0].islower()):
            remove.add(package)
                
    return packages - remove

# test function
print(filter_valid_package_names({'../cli', 'axios', '@blah/blah', '345'}))

{'axios', '@blah/blah'}


In [98]:
def create_package_space_from_project_json(filepath):
    package_space = set()
    project_data = json.load(open(filepath))
    for _, deps in project_data.items():
        for pkg in deps.keys():
            if pkg.startswith("npm:"):
                pkg = pkg[4:]
            package_space.add(pkg)
            
    return package_space

In [28]:
from bs4 import BeautifulSoup
import os

def create_package_list():
    res = requests.get("https://gist.github.com/anvaka/8e8fa57c7ee1350e3491")
    if res.status_code == 200:
        parsed_html = BeautifulSoup(res.content, "html.parser")
        selector = "#gistcomment-4447488 > div.edit-comment-hide > task-lists > div > p:nth-child(3)"
        package_list = parsed_html.select(selector)
        with open("./playground/package_list.txt", "w") as f:
            f.write(package_list[0].text)
            
def load_package_list():
    if not os.path.exists("./playground/package_list.txt"):
        create_package_list()
    else:
        with open("./playground/package_list.txt", "r") as f:
            return [text for text in f.read().splitlines()]
    
def create_package_space_from_github_gist():
    # packages = load_package_list()
    # print(len(packages))
    return set(load_package_list())
    
                
print(len(create_package_space_from_github_gist()))

5248


In [29]:
pkg_space = create_package_space_from_project_json("./GithubScrape/Data/40_Projects/append_dependencies_40_1.json")
print("Number of packages:", len(pkg_space))

Number of packages: 16784


In [31]:
from datetime import datetime, timedelta

def convert_time_to_epoch(timestamp):
    timestamp = timestamp.replace('Z', '+00:00')
    timestamp = datetime.fromisoformat(timestamp)
    return datetime.strftime(timestamp, '%s')
    

In [42]:
def fetch_package_data(package, base_url="https://registry.npmjs.org/", max_retries=3):
    for attempt in range(max_retries):
        try:
            res = requests.get(base_url + package)
            data = res.json()
            
            if res.status_code == 200:
                if 'dist-tags' not in data:
                    print(f"{package} is unpublished")
                    return {}
            
                latest_version = data["dist-tags"]["latest"]
                
                package_details = {
                    "latest_version": latest_version,
                    "keywords": data.get("keywords", []),
                    "publish_time": convert_time_to_epoch(data["time"][latest_version]),
                    "description": data.get("description", ""),
                    "creation_date": data["time"]["created"],
                    "number_of_versions": len(data["versions"]),
                    "dependencies": data["versions"][latest_version].get("dependencies", {}), 
                    "devDependencies": data["versions"][latest_version].get("devDependencies", {}), 
                    "peerDependencies": data["versions"][latest_version].get("peerDependencies", {}), 
                }
                
                return package_details
            
            else:
                print(f"{package} failed with status code {res.status_code}")
                return {}
                
        except requests.exceptions.ConnectionError as ce:
            if attempt < max_retries - 1:
                print(f"Failed to fetch {package} due to connection error: attempt {attempt + 1}")
            else:
                print("Failed to fetch {package}: max attempts exceeded")
                return {}
        except Exception as e:
            print("EXCEPTION OCCURED", package, e)

In [99]:
fetch_package_data("@types/recordrtc")

{'latest_version': '5.6.14',
 'keywords': [],
 'publish_time': '1699340797',
 'description': 'TypeScript definitions for recordrtc',
 'creation_date': '2020-08-06T00:41:07.911Z',
 'number_of_versions': 15,
 'dependencies': {},
 'devDependencies': {},
 'peerDependencies': {}}

In [44]:
def limit_package_space(full_space, n):
    limit_space = set()
    for _ in range(n):
        limit_space.add(full_space.pop())
    return limit_space

In [100]:
def build_package_space(max_workers=20, max_iters=2, test_mode=False, src='project'):
    
    package_space = []
    
    if src == 'github':
        package_space = create_package_space_from_github_gist()
    elif src == 'project':
        package_space = create_package_space_from_project_json("./GithubScrape/Data/40_Projects/append_dependencies_40_1.json")

    
    if test_mode:
        package_space = limit_package_space(package_space, 20)
        
    package_space = filter_valid_package_names(package_space)
    
    fetched = set()
    package_features = dict()
    dep_graph = dict()
    
    feat_lock = threading.Lock()
    graph_lock = threading.Lock()
    fetched_lock = threading.Lock()
    
    data_fields = [
        "latest_version",
        "keywords",
        "publish_time",
        "description",
        "creation_date", 
        "number_of_versions"
    ]
    
    # THIS CODE IS FOR IF WE WANT TO INCLUDE MULTIPLE DEPENDENCY TYPES
    # ----------------------------------------------------------------
    # dep_types = [
    #     ("dependencies", "PROD"), 
    #     ("devDependencies", "DEV"), 
    #     ("peerDependencies", "PEER")
    # ]
    
    next_iter = set()
    next_iter.update(package_space)
    # print(next_iter)
    
    curr_iter = 1
    
    while (max_iters == -1 and len(next_iter) > 0) or curr_iter <= max_iters:
        package_set, next_iter = next_iter, set()
        
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = { executor.submit(fetch_package_data, package): package for package in package_set }
            
            try: 
                for future in tqdm(as_completed(futures), total=len(futures), desc=f'Iteration {curr_iter}'):
                    pkg = futures[future]
                    package_data = future.result()
                    
                    with fetched_lock:
                        fetched.add(pkg)
                    
                    if package_data != {}:
                        with feat_lock, graph_lock:
                            package_features[pkg] = {field: package_data[field] for field in data_fields}
                            deps = {d: package_data["dependencies"][d] for d in filter_valid_package_names(package_data["dependencies"])}
                            dep_graph[pkg] = deps
                            next_iter.update(deps.keys())
    
                            # THIS CODE IS FOR IF WE WANT TO INCLUDE MULTIPLE DEPENDENCY TYPES
                            # ----------------------------------------------------------------
                            # dep = dict()
                            # for d_type, code in dep_types:
                            #     for d,v in package_data[d_type].items():
                            #         dep[d] = (v, code)
                            #         
                            # dep_graph[pkg] = dep
                            # 
                            # next_iter.update(package_data["dependencies"].keys())
                            # next_iter.update(package_data["devDependencies"].keys())
                            # next_iter.update(package_data["peerDependencies"].keys())
                            
                            
            except Exception as e:
                print(e)
                
        next_iter.difference_update(fetched)
        
        print(f"Next Iteration - {curr_iter + 1}: {len(next_iter)} packages")
        curr_iter += 1
    
    return package_features, dep_graph
            

In [101]:
feats, graph = build_package_space(max_iters=-1)

Iteration 1:   0%|          | 0/16773 [00:00<?, ?it/s]

@hcengineering/task failed with status code 404
@vue/test-utils-vue3 failed with status code 404
@dataspherestudio/shared failed with status code 404
@hcengineering/model-chunter failed with status code 404
@hcengineering/print-assets failed with status code 404
@hcengineering/highlight failed with status code 404
@hcengineering/presentation failed with status code 404
@hcengineering/desktop-preferences failed with status code 404
@hcengineering/login-resources failed with status code 404
@tryghost/staff-service failed with status code 404
pdfjs-dist-v4 failed with status code 404
@hcengineering/request-resources failed with status code 404
@hcengineering/server-recruit failed with status code 404
@hcengineering/model-inventory failed with status code 404
@tryghost/adapter-cache-redis failed with status code 404
@hcengineering/model-tracker failed with status code 404
@hcengineering/training failed with status code 404
@hcengineering/notification failed with status code 404
@tryghost/l

Iteration 2:   0%|          | 0/1326 [00:00<?, ?it/s]

cbw-sdk failed with status code 404
@azure/functions-old failed with status code 404
@babel/traverse--for-generate-function-map failed with status code 404
@vue/vue-loader-v15 failed with status code 404
Next Iteration - 3: 604 packages


Iteration 3:   0%|          | 0/604 [00:00<?, ?it/s]

@stdlib/complex-float64-base-assert failed with status code 404
typescript-5.7 failed with status code 404
typescript-5.5 failed with status code 404
typescript-5.8 failed with status code 404
typescript-5.4 failed with status code 404
typescript-5.0 failed with status code 404
typescript-5.1 failed with status code 404
typescript-5.3 failed with status code 404
typescript-4.9 failed with status code 404
typescript-5.6 failed with status code 404
typescript-5.2 failed with status code 404
Next Iteration - 4: 208 packages


Iteration 4:   0%|          | 0/208 [00:00<?, ?it/s]

Next Iteration - 5: 35 packages


Iteration 5:   0%|          | 0/35 [00:00<?, ?it/s]

Next Iteration - 6: 21 packages


Iteration 6:   0%|          | 0/21 [00:00<?, ?it/s]

Next Iteration - 7: 3 packages


Iteration 7:   0%|          | 0/3 [00:00<?, ?it/s]

Next Iteration - 8: 1 packages


Iteration 8:   0%|          | 0/1 [00:00<?, ?it/s]

Next Iteration - 9: 0 packages


In [102]:
# divide into scoped and unscoped packages
scoped = []
unscoped = []
for package in feats.keys():
    if package.find("@") != -1:
        scoped.append((package, feats[package]["creation_date"]))
    else:
        unscoped.append(package)
print(f'scoped: {len(scoped)}\tunscoped: {len(unscoped)}')

scoped: 7672	unscoped: 10867


In [103]:
chunks = []
for i in range(0, len(unscoped), 128):
    chunks.append(unscoped[i:i + 128])
print(len(chunks))

85


In [105]:
# CAN GET DOWNLOAD COUNTS FOR UNSCOPED PACKAGES IN BULK

def get_all_time_download_counts_bulk(packages, format="%Y-%m-%d"):
    start_date = datetime(day=10, month=1, year=2015).date()
    curr_date = datetime.now().date()
    
    intervals = []
    while start_date < curr_date:
        interval_end = min(start_date + timedelta(days=365), curr_date)
        intervals.append(f'{start_date.strftime(format)}:{interval_end.strftime(format)}')
        start_date = interval_end + timedelta(days=1)
        
    download_counts = {package: 0 for package in packages}
    for interval in intervals:
        url = f"https://api.npmjs.org/downloads/point/{interval}/{','.join(packages)}"
        res = requests.get(url)
        if res.status_code == 200:
            download_data = res.json()
            for package in packages:
                try:
                    download_counts[package] += download_data[package]["downloads"]
                except Exception as e:
                    print(f"[ERROR] getting download count for {package}")
    
    return download_counts
    

In [106]:
with ThreadPoolExecutor(max_workers=5) as executor:
    futures = { executor.submit(get_all_time_download_counts_bulk, chunk): chunk for chunk in chunks }
    
    for future in tqdm(as_completed(futures), total=len(futures), desc="Getting download counts for unscoped packages"):
        for package, count in future.result().items():
            feats[package]["download_count"] = count

Getting download counts for unscoped packages:   0%|          | 0/85 [00:00<?, ?it/s]

In [107]:
print(f'{feats["semver"]["download_count"]:,}')

50,318,547,289


In [108]:
# FOR UNSCOPED, update download function to process one package concurrently
def get_year_downloads(package, interval):
    url = f"https://api.npmjs.org/downloads/point/{interval}/{package}"
    res = requests.get(url)
    if res.status_code == 200:
        return res.json()["downloads"]
    
    return 0


def get_all_time_download_count_concurrent(package, creation_date, format="%Y-%m-%d", max_workers=3):
    create_date = creation_date.replace('Z', '+00:00')
    create_date = datetime.fromisoformat(create_date)
    
    start_date = max(datetime(day=10, month=1, year=2015).date(), create_date.date())
    curr_date = datetime.now().date()
    
    intervals = []
    while start_date < curr_date:
        interval_end = min(start_date + timedelta(days=547), curr_date)
        intervals.append(f'{start_date.strftime(format)}:{interval_end.strftime(format)}')
        start_date = interval_end + timedelta(days=1)
            
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = { executor.submit(get_year_downloads, package, interval): interval for interval in intervals }
        download_count = 0

        for future in futures:
            download_count += future.result()

        return download_count

# test function
count = get_all_time_download_count_concurrent("semver", "2011-02-12T00:20:25.690Z")
print(f'{count:,}')
    

50,312,083,152


In [109]:
def get_package_download_count(package, date, max_retries=3, max_workers=3):
    for attempt in range(max_retries):
        try:
            download_count = get_all_time_download_count_concurrent(package, date, max_workers=max_workers)
            return download_count
        except requests.exceptions.ConnectionError as e:
            if attempt < max_retries - 1:
                print(f'Failed getting count for {package}: attempt {attempt + 1}')
            else:
                print(f'Failed getting count for {package}: max retries exceeded')
    return -1

def get_download_counts_scoped_concurrent(packages, max_retries=3, max_workers=2, max_sub_workers=3):
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = { executor.submit(get_package_download_count, package, date, max_workers=max_sub_workers): package for package, date in packages }
        
        for future in tqdm(as_completed(futures), total=len(futures), desc="Scoped Packages"):
            package = futures[future]
            download_count = future.result()
            feats[package]["download_count"] = download_count
            

In [112]:
get_download_counts_scoped_concurrent(scoped, max_workers=4, max_sub_workers=3)

Scoped Packages:   0%|          | 0/7672 [00:00<?, ?it/s]

In [113]:
with open("./dataset/dep-graph.json", mode="w") as file:
    json.dump(graph, file, indent=4)

In [114]:
with open("./dataset/package-features.json", mode="w") as file:
    json.dump(feats, file, indent=4)

In [None]:
def save_dataset(features, edges, feature_filename="./dataset/package-features.json", edges_filename="./dataset/dep-graph.json"):
    with open(feature_filename, "w") as feat_file:
        print(f"Saving features to {feature_filename}")
        json.dump(features, feat_file, indent=4)
    with open(edges_filename, "w") as edge_file:
        print(f"Saving edges to {edges_filename}")
        json.dump(edges, edge_file, indent=4)
    

In [115]:
print(len(feats.keys()))
print(len(graph.keys()))

18539
18539


In [116]:
edges = []

for package, deps in graph.items():
    if len(deps) > 0:
        for dep, version in deps.items():
            edges.append((package, dep, version))

headers = ['package', 'dependency', 'version']
df = pd.DataFrame(edges, columns=headers)
df.head()

Unnamed: 0,package,dependency,version
0,@floating-ui/react,@floating-ui/utils,^0.2.8
1,@floating-ui/react,tabbable,^6.0.0
2,@floating-ui/react,@floating-ui/react-dom,^2.1.2
3,from2-string,from2,^2.0.3
4,github-url-to-object,is-url,^1.1.0


In [117]:
len(df)

53208

In [59]:
df.to_csv("./dataset/dep-graph.csv", index=False)

In [None]:
feat_data = []
headers = ['package', 'latest_version', 'keywords', 'publish_time', 'description', 'creation_date', 'number_of_versions', 'download_count']
for package, feats in feats.items():
    feat_data.append([package] + list(feats.values()))

feat_df = pd.DataFrame(feat_data, columns=headers)
feat_df.head()

In [21]:
feat_df.to_csv("./dataset/package-feats.csv", index=False)

In [73]:
len(feat_df)

12370

## Pagerank

In [4]:
import pandas as pd
import networkx as nx
import json
from scipy.sparse import csr_matrix

dep_df = pd.read_csv("./dataset/dep-graph.csv")
graph = json.load(open("./dataset/dep-graph.json"))
feats = json.load(open("./dataset/package-features.json"))

In [3]:
sources = set(dep_df['package'].unique())
deps = set(dep_df['dependency'].unique())
pkg_space = sources.union(deps)

print("Unique packages in packages column", len(sources))
print("Unique packages in dependency column", len(deps))
print("Total unique", len(pkg_space))

Unique packages in packages column 11717
Unique packages in dependency column 13220
Total unique 16568


In [2]:
no_out = {p for p, d in graph.items() if len(d) == 0}
print("Nodes with 0 out-degree:", len(no_out))

with_in = set()
for dependencies in graph.values():
    with_in.update(dependencies)

no_in = no_out - with_in

print("Nodes with 0 in-degree and 0 out-degree:", len(no_in))

json_str = json.dumps(json.load(open("./playground/dep-graph.json")))
from_original = [p for p in no_in if json_str.find(p) != -1]
print("Nodes with 0 in- and out-degree from source:", len(from_original))

Nodes with 0 out-degree: 6829
Nodes with 0 in-degree and 0 out-degree: 2000
Nodes with 0 in- and out-degree from source: 296


In [26]:
# build adjacency matrix
G = nx.from_pandas_edgelist(dep_df, source='package', target='dependency', create_using=nx.Graph)
adj_matrix_sparse = csr_matrix(nx.adjacency_matrix(G))
print("Adjacency Matrix shape:", adj_matrix_sparse.shape)
print("G:", G)

Adjacency Matrix shape: (16568, 16568)
G: Graph with 16568 nodes and 53258 edges


In [10]:
# build adjacency matrix using project data
from itertools import combinations
import math

def filter_top_by_download_count(packages, top=500):
    packages.sort(key=lambda x: feats[x]['download_count'], reverse=True)
    return packages[:top]

def create_co_dependency_graph(projects):
    edges = []
    
    for project, data in tqdm(projects.items()):
        deps = [p for p in data.keys() if p in feats]
        deps = filter_top_by_download_count(deps)
        print(project)
        
        for dep1, dep2 in combinations(deps, 2):
            edges.append((dep1, dep2)) 
    
    # Convert the set of edges to a list
    return list(edges)

blacklist = ['pinpoint']
project_data = json.load(open("./GithubScrape/Data/40_Projects/append_dependencies_40_1.json"))

for p in blacklist:
    del project_data[p]

edges = create_co_dependency_graph(project_data)
edge_df = pd.DataFrame(edges, columns=['from', 'to'])
edge_df

  0%|          | 0/138 [00:00<?, ?it/s]

vue-element-admin
grafana
strapi
AdminLTE
metabase
tabler
ant-design-pro
tesseract.js
hugo
Ghost
halo
directus
payload
highlight.js
filament
ccxt
medusa
arangodb
spree
reaction
vue-storefront
nopCommerce
simplefolio
minimal-mistakes
purescript
hugo-blox-builder
next.js
gatsby
astro
hexo
gpt4free
dify
autogen
Langchain-Chatchat
fuels-ts
Alamofire
nsq
LibreChat
commitlint
discourse
NodeBB
lemmy
incubator-answer
freeCodeCamp
mastodon
freecodecamp.cn
awesome-react-native
AppFlowy
AFFiNE
outline
trilium
wiki
OI-wiki
platform
nvm
bun
Motrix
pm2
AnotherRedisDesktopManager
pnpm
superset
spark
ClickHouse
coolify
redash
redux
immich
zustand
lobe-chat
query
react-hook-form
fullcalendar
react-dates
tui.calendar
react-native-calendars
CS-Notes
joplin
wtfjs
memos
carbon-lang
plane
wekan
openproject
odoo
erpnext
yudao-cloud
monica
twenty
posthog
oh-my-posh
postal
GoJS
curlconverter
jss
super-expressive
Open-Assistant
You-Dont-Need-jQuery
WordPress
dom-to-image
webtorrent
mediapipe
srs
video.js
YesPla

Unnamed: 0,from,to
0,chalk,chokidar
1,chalk,core-js
2,chalk,path-to-regexp
3,chalk,axios
4,chalk,eslint
...,...,...
11435739,@babel/plugin-transform-modules-amd,@babel/plugin-syntax-optional-chaining
11435740,@babel/plugin-transform-modules-amd,@jest/environment
11435741,@babel/plugin-transform-unicode-regex,@babel/plugin-syntax-optional-chaining
11435742,@babel/plugin-transform-unicode-regex,@jest/environment


In [11]:
edge_df = edge_df.drop_duplicates()
edge_df.to_csv("./dataset/edges.csv", index=False)

In [12]:
print(len(edge_df))
sources = set(edge_df['from'].unique())
deps = set(edge_df['to'].unique())
pkg_space = sources.union(deps)

print("Unique packages in packages column", len(sources))
print("Unique packages in dependency column", len(deps))
print("Total unique", len(pkg_space))

1507491
Unique packages in packages column 3945
Unique packages in dependency column 3990
Total unique 3992


In [13]:
G = nx.from_edgelist(edges, create_using=nx.Graph)
adj_matrix_sparse = csr_matrix(nx.adjacency_matrix(G))
print("Adjacency Matrix shape:", adj_matrix_sparse.shape)
print("G:", G)

Adjacency Matrix shape: (3992, 3992)
G: Graph with 3992 nodes and 1507491 edges


In [14]:
# Perform Pagerank
ranks = nx.pagerank(G, alpha=0.85)
ranks = list(ranks.items())
ranks.sort(key=lambda x: x[1], reverse=True)
ranks[:10]

[('semver', 0.0012549531686317577),
 ('debug', 0.0011545357658411233),
 ('eslint', 0.0011046437111414242),
 ('chalk', 0.001093829590955619),
 ('glob', 0.0010598184336379599),
 ('lodash', 0.001050234619218721),
 ('once', 0.0010305969157313757),
 ('inherits', 0.0009913011304626192),
 ('escape-string-regexp', 0.0009862946819898044),
 ('wrappy', 0.0009831095378090037)]

In [15]:
from bs4 import BeautifulSoup

res = requests.get("https://gist.github.com/anvaka/8e8fa57c7ee1350e3491")

full_pagerank = []

if res.status_code != 200:
    print("Request failed")
else:
    parsed_html = BeautifulSoup(res.content, "html.parser")
    github_pagerank = parsed_html.select("#file-03-pagerank-md-readme > article > ol > li")
    full_pagerank = [(html.find('a').text, float(html.text.split(" - ")[1])) for html in github_pagerank]
    
full_pagerank[:10]

[('lodash', 0.010834060106227733),
 ('tslib', 0.0073019517213535595),
 ('request', 0.0057722307215751405),
 ('debug', 0.0054921316717398815),
 ('prop-types', 0.005007180072168596),
 ('ms', 0.004915039436489446),
 ('chalk', 0.004908854475195843),
 ('object-assign', 0.004052459060484455),
 ('minimist', 0.003987477138102914),
 ('loose-envify', 0.003959896618027232)]

In [16]:
# Compare how many packages appear in both
rank_set = set([p for p,c in ranks[:1000]])
full_set = set([p for p,c in full_pagerank])
print(f"Number of common packages: {len(rank_set.intersection(full_set))}")

Number of common packages: 437


In [17]:
count = 0
for pkg, score in full_pagerank:
    if pkg not in pkg_space:
        count += 1
        print(pkg)
        
print(f"-------------\nCount = {count}")

yeoman-generator
@alifd/next
ember-cli-babel
superagent
ramda
coffee-script
aws-sdk
optimist
hoek
redis
yosay
angular
request-promise
datafire
@polymer/polymer
mysql
reflect-metadata
unique-random-array
styled-components
joi
@webcomponents/shadycss
path
isomorphic-fetch
fs
unique-random
readline-sync
ember-cli-htmlbars
@angular/core
node-sass
bignumber.js
@babel/polyfill
@babel/runtime-corejs2
fbjs
node-pre-gyp
passport-strategy
lit-html
@types/cordova
@angular/common
zone.js
xmldom
log4js
request-promise-native
md5
underscore.string
mustache
iterall
bunyan
cli-color
passport-oauth2
serialport
redis-commands
global
when
windows.foundation
@angular/platform-browser
boom
koa
event-stream
double-ended-queue
@angular/compiler
@angular/forms
requireindex
ncp
stealthy-require
oauth
user
request-promise-core
amqplib
formidable
lit-element
react-native
merge
react-transition-group
pkginfo
mqtt
@types/jquery
@angular/platform-browser-dynamic
stylus
babel-preset-env
@angular/http
@angular/router

## Pagerank with Weighted Nodes

In [18]:
weights = {p: data["download_count"] for p, data in feats.items()}
weights

{'@tsd/typescript': 15419169,
 'lodash._reinterpolate': 2003593317,
 '@floating-ui/react': 125057024,
 'from2-string': 20357660,
 'github-url-to-object': 9792148,
 'read-pkg': 10467844982,
 'indent-string': 8618054229,
 'generate-function': 1051448799,
 'eslint-plugin-mocha': 210783105,
 '@parcel/watcher-android-arm64': 25739596,
 'libnpmconfig': 144157515,
 '@lerna/conventional-commits': 253403730,
 'core-js': 10350460029,
 '@firebase/storage-compat': 147863312,
 'object-copy': 4188652811,
 'ajv-keywords': 8028408615,
 'retry-request': 969492258,
 'postgres-interval': 948557416,
 'dataloader': 711270037,
 'bundle-name': 309097249,
 'big-integer': 1593360749,
 '@vue/cli-overlay': 203765039,
 '@types/babel-template': 36535029,
 'cwd': 146960811,
 'dictionary-en': 1875230,
 'spawn-wrap': 620736264,
 'tempusdominus-core': 3925573,
 'jetifier': 141895707,
 'multiformats': 72185950,
 '@types/resolve-path': 650194,
 'window-size': 1985213042,
 'vite-plugin-vue-inspector': 25280040,
 '@stdlib

In [19]:
ranks = nx.pagerank(G, alpha=0.85, personalization=weights)
ranks = list(ranks.items())
ranks.sort(key=lambda x: x[1], reverse=True)
ranks[:20]

[('semver', 0.0018063074291368429),
 ('supports-color', 0.0017473821754180109),
 ('debug', 0.0017352304437429828),
 ('chalk', 0.001689278685079659),
 ('ansi-styles', 0.0016658981382414782),
 ('ms', 0.001548897043932515),
 ('has-flag', 0.0014587175239970727),
 ('color-convert', 0.0014162979558529159),
 ('color-name', 0.0014132859360030778),
 ('strip-ansi', 0.0014027288844484305),
 ('ansi-regex', 0.0013832343495757708),
 ('source-map', 0.0013536595777740034),
 ('glob', 0.0013033270357530845),
 ('commander', 0.00127877309599457),
 ('string-width', 0.0012665456644546944),
 ('readable-stream', 0.0012639541035785724),
 ('tslib', 0.0012579973941473238),
 ('minimatch', 0.001251172862593064),
 ('lru-cache', 0.001242788918877323),
 ('escape-string-regexp', 0.0012320014878189967)]

## Pagerank with Keywords

In [23]:
# define a function to calculate similarity scores based on keywords
def jaccard_similarity(a, b):
    a = set(a)
    b = set(b)
    return len(a & b) / len(a | b)

def pagerank_keywords(keyword_list: list, features, top=100, nstart_pkgs=[]):
    weights = {pkg: jaccard_similarity(data["keywords"], keyword_list)  for pkg, data in features.items()}

    scores = []
    nstart = []
    if nstart_pkgs != []:
        nstart = {pkg: 5 if pkg in nstart_pkgs else 0 for pkg in features.keys() }
        scores = nx.pagerank(G, alpha=0.85, personalization=weights, nstart=nstart)
    else:
        scores = nx.pagerank(G, alpha=0.0, personalization=weights)
    ranks = list(scores.items())
    # print(scores["react"])
    ranks.sort(key=lambda x: x[1], reverse=True)
    return ranks[:top]

pagerank_keywords([
        "admin",
        "admin-dashboard",
        "admin-template",
        "axios",
        "dashboard",
        "desktop",
        "element",
        "element-ui",
        "i18n",
        "management-system",
        "mock",
        "tinymce",
        "vue",
        "vue-admin",
        "vue-cli",
        "vuex",
        "webpack",
        "webpack4",
        "xlsx"
    ], feats)

[('@vue/cli-plugin-unit-mocha', 0.025227790131106965),
 ('vue-i18n', 0.015356046166760763),
 ('admin-lte', 0.013584194685980676),
 ('quasar', 0.012613895065553483),
 ('vue', 0.009294448995670987),
 ('schema-utils', 0.009294448995670987),
 ('file-loader', 0.009294448995670987),
 ('webpack-command', 0.009294448995670987),
 ('style-loader', 0.009294448995670987),
 ('@vue/compiler-sfc', 0.009294448995670987),
 ('@vue/shared', 0.009294448995670987),
 ('@vue/compiler-core', 0.009294448995670987),
 ('@vue/compiler-dom', 0.009294448995670987),
 ('@vue/compiler-ssr', 0.009294448995670987),
 ('@vue/reactivity', 0.009294448995670987),
 ('@vue/runtime-core', 0.009294448995670987),
 ('@vue/runtime-dom', 0.009294448995670987),
 ('@vue/server-renderer', 0.009294448995670987),
 ('raw-loader', 0.009294448995670987),
 ('imports-loader', 0.009294448995670987),
 ('karma-webpack', 0.009294448995670987),
 ('source-map-loader', 0.009294448995670987),
 ('exports-loader', 0.009294448995670987),
 ('@vue/reactiv

#### Augment graph with edges taken from project dependencies so that packages which appear together have an edge between them

In [27]:
def test_pagerank(project, search_terms):
    project_data = json.load(open("./GithubScrape/Data/40_Projects/append_dependencies_40_1.json"))
    project_deps = list(project_data[project].keys())
    
    suggested_deps = [p for p,s in pagerank_keywords(project_deps, feats, nstart_pkgs=project_deps)]
    count = 0
    for d in suggested_deps:
        if d in project_deps:
            count += 1
    print(count)
    # print(suggested_deps)
    
test_pagerank("vue-element-admin", [
        "admin",
        "admin-dashboard",
        "admin-template",
        "axios",
        "dashboard",
        "desktop",
        "element",
        "element-ui",
        "i18n",
        "management-system",
        "mock",
        "tinymce",
        "vue",
        "vue-admin",
        "vue-cli",
        "vuex",
        "webpack",
        "webpack4",
        "xlsx"
    ])

8


## Pagerank with Description

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def compute_similarity(string1, string2):
    vectorizer = TfidfVectorizer()
    
    tfidf_matrix = vectorizer.fit_transform([string1, string2])
    
    similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
    return similarity

def pagerank_description(desc: str, features, top=100):
    weights = {pkg: compute_similarity(data["description"], desc)  for pkg, data in features.items()}

    scores = nx.pagerank(G, alpha=0.05, personalization=weights)
    ranks = list(scores.items())
    print(scores["react"])
    ranks.sort(key=lambda x: x[1], reverse=True)
    return ranks[:top]

pagerank_description(':tada: A magical vue admin', feats)

1.9282602533617736e-07


[('@vue/compat', 0.010448450041348263),
 ('@vue/reactivity', 0.009768515569990324),
 ('@strapi/admin', 0.00954382495935774),
 ('vue-router', 0.00891760348747064),
 ('@vue/compiler-sfc', 0.007782016829624084),
 ('@vue/compiler-dom', 0.0076883764970911974),
 ('@vue/runtime-dom', 0.007641998160956878),
 ('@vue/compiler-core', 0.007608993594030151),
 ('@vue/compiler-ssr', 0.007594015172613479),
 ('@vue/runtime-core', 0.007589247069952104),
 ('@intlify/vue-i18n-extensions', 0.007519486266899021),
 ('@vue/server-renderer', 0.007512828244070423),
 ('vue-jest', 0.007491931495056499),
 ('@vitejs/plugin-vue', 0.007482651004727015),
 ('vue-codemod', 0.007470379090831667),
 ('@vue/reactivity-transform', 0.007450890534216568),
 ('@storybook/vue3', 0.007435337929129417),
 ('docsify', 0.007404747733468548),
 ('@storybook/vue', 0.00740076346111362),
 ('@loki/integration-vue', 0.007399667783852435),
 ('vue-cli-plugin-mockjs', 0.00739861911358024),
 ('mavon-editor', 0.00739825402980637),
 ('@vue/vue3-je