In [26]:
import json

import pandas as pd
import requests
from tqdm.notebook import tqdm
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

In [97]:
def filter_valid_package_names(packages):
    packages = set(packages)
    remove = set()
    for package in packages:
        if (not package.startswith('@')) and (not package[0].islower()):
            remove.add(package)
                
    return packages - remove

# test function
print(filter_valid_package_names({'../cli', 'axios', '@blah/blah', '345'}))

{'axios', '@blah/blah'}


In [98]:
def create_package_space_from_project_json(filepath):
    package_space = set()
    project_data = json.load(open(filepath))
    for _, deps in project_data.items():
        for pkg in deps.keys():
            if pkg.startswith("npm:"):
                pkg = pkg[4:]
            package_space.add(pkg)
            
    return package_space

In [28]:
from bs4 import BeautifulSoup
import os

def create_package_list():
    res = requests.get("https://gist.github.com/anvaka/8e8fa57c7ee1350e3491")
    if res.status_code == 200:
        parsed_html = BeautifulSoup(res.content, "html.parser")
        selector = "#gistcomment-4447488 > div.edit-comment-hide > task-lists > div > p:nth-child(3)"
        package_list = parsed_html.select(selector)
        with open("./playground/package_list.txt", "w") as f:
            f.write(package_list[0].text)
            
def load_package_list():
    if not os.path.exists("./playground/package_list.txt"):
        create_package_list()
    else:
        with open("./playground/package_list.txt", "r") as f:
            return [text for text in f.read().splitlines()]
    
def create_package_space_from_github_gist():
    # packages = load_package_list()
    # print(len(packages))
    return set(load_package_list())
    
                
print(len(create_package_space_from_github_gist()))

5248


In [29]:
pkg_space = create_package_space_from_project_json("./GithubScrape/Data/40_Projects/append_dependencies_40_1.json")
print("Number of packages:", len(pkg_space))

Number of packages: 16784


In [31]:
from datetime import datetime, timedelta

def convert_time_to_epoch(timestamp):
    timestamp = timestamp.replace('Z', '+00:00')
    timestamp = datetime.fromisoformat(timestamp)
    return datetime.strftime(timestamp, '%s')
    

In [42]:
def fetch_package_data(package, base_url="https://registry.npmjs.org/", max_retries=3):
    for attempt in range(max_retries):
        try:
            res = requests.get(base_url + package)
            data = res.json()
            
            if res.status_code == 200:
                if 'dist-tags' not in data:
                    print(f"{package} is unpublished")
                    return {}
            
                latest_version = data["dist-tags"]["latest"]
                
                package_details = {
                    "latest_version": latest_version,
                    "keywords": data.get("keywords", []),
                    "publish_time": convert_time_to_epoch(data["time"][latest_version]),
                    "description": data.get("description", ""),
                    "creation_date": data["time"]["created"],
                    "number_of_versions": len(data["versions"]),
                    "dependencies": data["versions"][latest_version].get("dependencies", {}), 
                    "devDependencies": data["versions"][latest_version].get("devDependencies", {}), 
                    "peerDependencies": data["versions"][latest_version].get("peerDependencies", {}), 
                }
                
                return package_details
            
            else:
                print(f"{package} failed with status code {res.status_code}")
                return {}
                
        except requests.exceptions.ConnectionError as ce:
            if attempt < max_retries - 1:
                print(f"Failed to fetch {package} due to connection error: attempt {attempt + 1}")
            else:
                print("Failed to fetch {package}: max attempts exceeded")
                return {}
        except Exception as e:
            print("EXCEPTION OCCURED", package, e)

In [99]:
fetch_package_data("@types/recordrtc")

{'latest_version': '5.6.14',
 'keywords': [],
 'publish_time': '1699340797',
 'description': 'TypeScript definitions for recordrtc',
 'creation_date': '2020-08-06T00:41:07.911Z',
 'number_of_versions': 15,
 'dependencies': {},
 'devDependencies': {},
 'peerDependencies': {}}

In [44]:
def limit_package_space(full_space, n):
    limit_space = set()
    for _ in range(n):
        limit_space.add(full_space.pop())
    return limit_space

In [100]:
def build_package_space(max_workers=20, max_iters=2, test_mode=False, src='project'):
    
    package_space = []
    
    if src == 'github':
        package_space = create_package_space_from_github_gist()
    elif src == 'project':
        package_space = create_package_space_from_project_json("./GithubScrape/Data/40_Projects/append_dependencies_40_1.json")

    
    if test_mode:
        package_space = limit_package_space(package_space, 20)
        
    package_space = filter_valid_package_names(package_space)
    
    fetched = set()
    package_features = dict()
    dep_graph = dict()
    
    feat_lock = threading.Lock()
    graph_lock = threading.Lock()
    fetched_lock = threading.Lock()
    
    data_fields = [
        "latest_version",
        "keywords",
        "publish_time",
        "description",
        "creation_date", 
        "number_of_versions"
    ]
    
    # THIS CODE IS FOR IF WE WANT TO INCLUDE MULTIPLE DEPENDENCY TYPES
    # ----------------------------------------------------------------
    # dep_types = [
    #     ("dependencies", "PROD"), 
    #     ("devDependencies", "DEV"), 
    #     ("peerDependencies", "PEER")
    # ]
    
    next_iter = set()
    next_iter.update(package_space)
    # print(next_iter)
    
    curr_iter = 1
    
    while (max_iters == -1 and len(next_iter) > 0) or curr_iter <= max_iters:
        package_set, next_iter = next_iter, set()
        
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = { executor.submit(fetch_package_data, package): package for package in package_set }
            
            try: 
                for future in tqdm(as_completed(futures), total=len(futures), desc=f'Iteration {curr_iter}'):
                    pkg = futures[future]
                    package_data = future.result()
                    
                    with fetched_lock:
                        fetched.add(pkg)
                    
                    if package_data != {}:
                        with feat_lock, graph_lock:
                            package_features[pkg] = {field: package_data[field] for field in data_fields}
                            deps = {d: package_data["dependencies"][d] for d in filter_valid_package_names(package_data["dependencies"])}
                            dep_graph[pkg] = deps
                            next_iter.update(deps.keys())
    
                            # THIS CODE IS FOR IF WE WANT TO INCLUDE MULTIPLE DEPENDENCY TYPES
                            # ----------------------------------------------------------------
                            # dep = dict()
                            # for d_type, code in dep_types:
                            #     for d,v in package_data[d_type].items():
                            #         dep[d] = (v, code)
                            #         
                            # dep_graph[pkg] = dep
                            # 
                            # next_iter.update(package_data["dependencies"].keys())
                            # next_iter.update(package_data["devDependencies"].keys())
                            # next_iter.update(package_data["peerDependencies"].keys())
                            
                            
            except Exception as e:
                print(e)
                
        next_iter.difference_update(fetched)
        
        print(f"Next Iteration - {curr_iter + 1}: {len(next_iter)} packages")
        curr_iter += 1
    
    return package_features, dep_graph
            

In [101]:
feats, graph = build_package_space(max_iters=-1)

Iteration 1:   0%|          | 0/16773 [00:00<?, ?it/s]

@hcengineering/task failed with status code 404
@vue/test-utils-vue3 failed with status code 404
@dataspherestudio/shared failed with status code 404
@hcengineering/model-chunter failed with status code 404
@hcengineering/print-assets failed with status code 404
@hcengineering/highlight failed with status code 404
@hcengineering/presentation failed with status code 404
@hcengineering/desktop-preferences failed with status code 404
@hcengineering/login-resources failed with status code 404
@tryghost/staff-service failed with status code 404
pdfjs-dist-v4 failed with status code 404
@hcengineering/request-resources failed with status code 404
@hcengineering/server-recruit failed with status code 404
@hcengineering/model-inventory failed with status code 404
@tryghost/adapter-cache-redis failed with status code 404
@hcengineering/model-tracker failed with status code 404
@hcengineering/training failed with status code 404
@hcengineering/notification failed with status code 404
@tryghost/l

Iteration 2:   0%|          | 0/1326 [00:00<?, ?it/s]

cbw-sdk failed with status code 404
@azure/functions-old failed with status code 404
@babel/traverse--for-generate-function-map failed with status code 404
@vue/vue-loader-v15 failed with status code 404
Next Iteration - 3: 604 packages


Iteration 3:   0%|          | 0/604 [00:00<?, ?it/s]

@stdlib/complex-float64-base-assert failed with status code 404
typescript-5.7 failed with status code 404
typescript-5.5 failed with status code 404
typescript-5.8 failed with status code 404
typescript-5.4 failed with status code 404
typescript-5.0 failed with status code 404
typescript-5.1 failed with status code 404
typescript-5.3 failed with status code 404
typescript-4.9 failed with status code 404
typescript-5.6 failed with status code 404
typescript-5.2 failed with status code 404
Next Iteration - 4: 208 packages


Iteration 4:   0%|          | 0/208 [00:00<?, ?it/s]

Next Iteration - 5: 35 packages


Iteration 5:   0%|          | 0/35 [00:00<?, ?it/s]

Next Iteration - 6: 21 packages


Iteration 6:   0%|          | 0/21 [00:00<?, ?it/s]

Next Iteration - 7: 3 packages


Iteration 7:   0%|          | 0/3 [00:00<?, ?it/s]

Next Iteration - 8: 1 packages


Iteration 8:   0%|          | 0/1 [00:00<?, ?it/s]

Next Iteration - 9: 0 packages


In [102]:
# divide into scoped and unscoped packages
scoped = []
unscoped = []
for package in feats.keys():
    if package.find("@") != -1:
        scoped.append((package, feats[package]["creation_date"]))
    else:
        unscoped.append(package)
print(f'scoped: {len(scoped)}\tunscoped: {len(unscoped)}')

scoped: 7672	unscoped: 10867


In [103]:
chunks = []
for i in range(0, len(unscoped), 128):
    chunks.append(unscoped[i:i + 128])
print(len(chunks))

85


In [105]:
# CAN GET DOWNLOAD COUNTS FOR UNSCOPED PACKAGES IN BULK

def get_all_time_download_counts_bulk(packages, format="%Y-%m-%d"):
    start_date = datetime(day=10, month=1, year=2015).date()
    curr_date = datetime.now().date()
    
    intervals = []
    while start_date < curr_date:
        interval_end = min(start_date + timedelta(days=365), curr_date)
        intervals.append(f'{start_date.strftime(format)}:{interval_end.strftime(format)}')
        start_date = interval_end + timedelta(days=1)
        
    download_counts = {package: 0 for package in packages}
    for interval in intervals:
        url = f"https://api.npmjs.org/downloads/point/{interval}/{','.join(packages)}"
        res = requests.get(url)
        if res.status_code == 200:
            download_data = res.json()
            for package in packages:
                try:
                    download_counts[package] += download_data[package]["downloads"]
                except Exception as e:
                    print(f"[ERROR] getting download count for {package}")
    
    return download_counts
    

In [106]:
with ThreadPoolExecutor(max_workers=5) as executor:
    futures = { executor.submit(get_all_time_download_counts_bulk, chunk): chunk for chunk in chunks }
    
    for future in tqdm(as_completed(futures), total=len(futures), desc="Getting download counts for unscoped packages"):
        for package, count in future.result().items():
            feats[package]["download_count"] = count

Getting download counts for unscoped packages:   0%|          | 0/85 [00:00<?, ?it/s]

In [107]:
print(f'{feats["semver"]["download_count"]:,}')

50,318,547,289


In [108]:
# FOR UNSCOPED, update download function to process one package concurrently
def get_year_downloads(package, interval):
    url = f"https://api.npmjs.org/downloads/point/{interval}/{package}"
    res = requests.get(url)
    if res.status_code == 200:
        return res.json()["downloads"]
    
    return 0


def get_all_time_download_count_concurrent(package, creation_date, format="%Y-%m-%d", max_workers=3):
    create_date = creation_date.replace('Z', '+00:00')
    create_date = datetime.fromisoformat(create_date)
    
    start_date = max(datetime(day=10, month=1, year=2015).date(), create_date.date())
    curr_date = datetime.now().date()
    
    intervals = []
    while start_date < curr_date:
        interval_end = min(start_date + timedelta(days=547), curr_date)
        intervals.append(f'{start_date.strftime(format)}:{interval_end.strftime(format)}')
        start_date = interval_end + timedelta(days=1)
            
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = { executor.submit(get_year_downloads, package, interval): interval for interval in intervals }
        download_count = 0

        for future in futures:
            download_count += future.result()

        return download_count

# test function
count = get_all_time_download_count_concurrent("semver", "2011-02-12T00:20:25.690Z")
print(f'{count:,}')
    

50,312,083,152


In [109]:
def get_package_download_count(package, date, max_retries=3, max_workers=3):
    for attempt in range(max_retries):
        try:
            download_count = get_all_time_download_count_concurrent(package, date, max_workers=max_workers)
            return download_count
        except requests.exceptions.ConnectionError as e:
            if attempt < max_retries - 1:
                print(f'Failed getting count for {package}: attempt {attempt + 1}')
            else:
                print(f'Failed getting count for {package}: max retries exceeded')
    return -1

def get_download_counts_scoped_concurrent(packages, max_retries=3, max_workers=2, max_sub_workers=3):
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = { executor.submit(get_package_download_count, package, date, max_workers=max_sub_workers): package for package, date in packages }
        
        for future in tqdm(as_completed(futures), total=len(futures), desc="Scoped Packages"):
            package = futures[future]
            download_count = future.result()
            feats[package]["download_count"] = download_count
            

In [112]:
get_download_counts_scoped_concurrent(scoped, max_workers=4, max_sub_workers=3)

Scoped Packages:   0%|          | 0/7672 [00:00<?, ?it/s]

In [113]:
with open("./dataset/dep-graph.json", mode="w") as file:
    json.dump(graph, file, indent=4)

In [114]:
with open("./dataset/package-features.json", mode="w") as file:
    json.dump(feats, file, indent=4)

In [None]:
def save_dataset(features, edges, feature_filename="./dataset/package-features.json", edges_filename="./dataset/dep-graph.json"):
    with open(feature_filename, "w") as feat_file:
        print(f"Saving features to {feature_filename}")
        json.dump(features, feat_file, indent=4)
    with open(edges_filename, "w") as edge_file:
        print(f"Saving edges to {edges_filename}")
        json.dump(edges, edge_file, indent=4)
    

In [115]:
print(len(feats.keys()))
print(len(graph.keys()))

18539
18539


In [116]:
edges = []

for package, deps in graph.items():
    if len(deps) > 0:
        for dep, version in deps.items():
            edges.append((package, dep, version))

headers = ['package', 'dependency', 'version']
df = pd.DataFrame(edges, columns=headers)
df.head()

Unnamed: 0,package,dependency,version
0,@floating-ui/react,@floating-ui/utils,^0.2.8
1,@floating-ui/react,tabbable,^6.0.0
2,@floating-ui/react,@floating-ui/react-dom,^2.1.2
3,from2-string,from2,^2.0.3
4,github-url-to-object,is-url,^1.1.0


In [117]:
len(df)

53208

In [59]:
df.to_csv("./dataset/dep-graph.csv", index=False)

In [None]:
feat_data = []
headers = ['package', 'latest_version', 'keywords', 'publish_time', 'description', 'creation_date', 'number_of_versions', 'download_count']
for package, feats in feats.items():
    feat_data.append([package] + list(feats.values()))

feat_df = pd.DataFrame(feat_data, columns=headers)
feat_df.head()

In [21]:
feat_df.to_csv("./dataset/package-feats.csv", index=False)

In [73]:
len(feat_df)

12370

## Pagerank

In [60]:
import pandas as pd
import networkx as nx
from scipy.sparse import csr_matrix

dep_df = pd.read_csv("./dataset/dep-graph.csv")

In [61]:
sources = set(dep_df['package'].unique())
deps = set(dep_df['dependency'].unique())
pkg_space = sources.union(deps)

print("Unique packages in packages column", len(sources))
print("Unique packages in dependency column", len(deps))
print("Total unique", len(pkg_space))

Unique packages in packages column 11717
Unique packages in dependency column 13220
Total unique 16568


In [62]:
no_out = {p for p, d in graph.items() if len(d) == 0}
print("Nodes with 0 out-degree:", len(no_out))

with_in = set()
for dependencies in graph.values():
    with_in.update(dependencies)

no_in = no_out - with_in

print("Nodes with 0 in-degree and 0 out-degree:", len(no_in))

json_str = json.dumps(json.load(open("./playground/dep-graph.json")))
from_original = [p for p in no_in if json_str.find(p) != -1]
print("Nodes with 0 in- and out-degree from source:", len(from_original))

Nodes with 0 out-degree: 6833
Nodes with 0 in-degree and 0 out-degree: 1999
Nodes with 0 in- and out-degree from source: 295


In [147]:
# build adjacency matrix
G = nx.from_pandas_edgelist(dep_df, source='package', target='dependency', create_using=nx.Graph)
adj_matrix_sparse = csr_matrix(nx.adjacency_matrix(G))
print("Adjacency Matrix shape:", adj_matrix_sparse.shape)
print("G:", G)

Adjacency Matrix shape: (16568, 16568)
G: Graph with 16568 nodes and 53258 edges


In [148]:
# Perform Pagerank
ranks = nx.pagerank(G, alpha=0.85)
ranks = list(ranks.items())
ranks.sort(key=lambda x: x[1], reverse=True)
ranks[:10]

[('tslib', 0.005779945670985727),
 ('lodash', 0.004425281934455987),
 ('@babel/runtime', 0.003919284586123476),
 ('debug', 0.0034820792545919264),
 ('chalk', 0.003448821614934509),
 ('semver', 0.002818809443048703),
 ('@types/node', 0.002583595635643175),
 ('@twilio-paste/core', 0.0024559105426107015),
 ('prop-types', 0.0022969511182988275),
 ('fs-extra', 0.0019448800436943)]

In [149]:
from bs4 import BeautifulSoup

res = requests.get("https://gist.github.com/anvaka/8e8fa57c7ee1350e3491")

full_pagerank = []

if res.status_code != 200:
    print("Request failed")
else:
    parsed_html = BeautifulSoup(res.content, "html.parser")
    github_pagerank = parsed_html.select("#file-03-pagerank-md-readme > article > ol > li")
    full_pagerank = [(html.find('a').text, float(html.text.split(" - ")[1])) for html in github_pagerank]
    
full_pagerank[:10]

[('lodash', 0.010834060106227733),
 ('tslib', 0.0073019517213535595),
 ('request', 0.0057722307215751405),
 ('debug', 0.0054921316717398815),
 ('prop-types', 0.005007180072168596),
 ('ms', 0.004915039436489446),
 ('chalk', 0.004908854475195843),
 ('object-assign', 0.004052459060484455),
 ('minimist', 0.003987477138102914),
 ('loose-envify', 0.003959896618027232)]

In [150]:
# Compare how many packages appear in both
rank_set = set([p for p,c in ranks[:1000]])
full_set = set([p for p,c in full_pagerank])
print(f"Number of common packages: {len(rank_set.intersection(full_set))}")

Number of common packages: 308


In [151]:
count = 0
for pkg, score in full_pagerank:
    if pkg not in pkg_space:
        count += 1
        print(pkg)
        
print(f"-------------\nCount = {count}")

nan
@alifd/next
async-limiter
angular
datafire
@polymer/polymer
unique-random-array
fs
concat-map
unique-random
big.js
emojis-list
@types/cordova
serialport
redis-commands
when
windows.foundation
double-ended-queue
user
amqplib
@angular/http
web3
xmlhttprequest
require-main-filename
which-module
babel
@linclark/pkg
keypress
gulp-rename
@sailshq/lodash
elasticsearch
@phosphor/algorithm
has-binary2
http
ssh2-streams
babel-plugin-transform-runtime
diagnostics
string
vue-hot-reload-api
restify
ethereumjs-util
strict-uri-encode
hexlet-pairs
hapi
noble
soap
get-func-name
nice-try
material-ui
alfy
nedb
child_process
string-template
grpc
wrench
unzip
aurelia-pal
indexof
ref
tslint-eslint-rules
ffi
unirest
detect-conflict
babelify
-------------
Count = 64


## Pagerank with Weighted Nodes

In [152]:
weights = {p: data["download_count"] for p, data in feats.items()}
weights

{'@tsd/typescript': 15419169,
 'lodash._reinterpolate': 2003593317,
 '@floating-ui/react': 125057024,
 'from2-string': 20357660,
 'github-url-to-object': 9792148,
 'read-pkg': 10467844982,
 'indent-string': 8618054229,
 'generate-function': 1051448799,
 'eslint-plugin-mocha': 210783105,
 '@parcel/watcher-android-arm64': 25739596,
 'libnpmconfig': 144157515,
 '@lerna/conventional-commits': 253403730,
 'core-js': 10350460029,
 '@firebase/storage-compat': 147863312,
 'object-copy': 4188652811,
 'ajv-keywords': 8028408615,
 'retry-request': 969492258,
 'postgres-interval': 948557416,
 'dataloader': 711270037,
 'bundle-name': 309097249,
 'big-integer': 1593360749,
 '@vue/cli-overlay': 203765039,
 '@types/babel-template': 36535029,
 'cwd': 146960811,
 'dictionary-en': 1875230,
 'spawn-wrap': 620736264,
 'tempusdominus-core': 3925573,
 'jetifier': 141895707,
 'multiformats': 72185950,
 '@types/resolve-path': 650194,
 'window-size': 1985213042,
 'vite-plugin-vue-inspector': 25280040,
 '@stdlib

In [153]:
ranks = nx.pagerank(G, alpha=0.85, personalization=weights)
ranks = list(ranks.items())
ranks.sort(key=lambda x: x[1], reverse=True)
ranks[:20]

[('chalk', 0.004833755254636757),
 ('tslib', 0.004592781984897626),
 ('debug', 0.004490075911121394),
 ('semver', 0.003964439118642751),
 ('lodash', 0.003658907331253371),
 ('@babel/helper-plugin-utils', 0.0029487838735198623),
 ('@babel/runtime', 0.002490371726200329),
 ('@types/node', 0.002472840027948716),
 ('fs-extra', 0.0023466741780323276),
 ('glob', 0.0020093308794086905),
 ('@babel/preset-env', 0.0018949235882510852),
 ('minimatch', 0.0018675903957154063),
 ('yargs', 0.0018598793057386328),
 ('@babel/types', 0.0017612211300281557),
 ('readable-stream', 0.001715740544749124),
 ('commander', 0.0016514039693861899),
 ('@babel/core', 0.0015866077232691833),
 ('gatsby', 0.0015824285617362026),
 ('es-abstract', 0.0015398630326599637),
 ('strip-ansi', 0.0014943765681296281)]

## Pagerank with Keywords

In [164]:
# define a function to calculate similarity scores based on keywords
def jaccard_similarity(a, b):
    a = set(a)
    b = set(b)
    return len(a & b) / len(a | b)

def pagerank_keywords(keyword_list: list, features, top=100):
    weights = {pkg: jaccard_similarity(data["keywords"], keyword_list)  for pkg, data in features.items()}

    scores = nx.pagerank(G, alpha=0.0, personalization=weights)
    ranks = list(scores.items())
    print(scores["react"])
    ranks.sort(key=lambda x: x[1], reverse=True)
    return ranks[:top]

pagerank_keywords(['express'], feats)

0.0


[('conventional-changelog-express', 0.03087235268721656),
 ('brute-knex', 0.03087235268721656),
 ('express-logging', 0.03087235268721656),
 ('serve-favicon', 0.03087235268721656),
 ('launch-editor-middleware', 0.03087235268721656),
 ('express-ws', 0.03087235268721656),
 ('r19', 0.03087235268721656),
 ('path-to-regexp', 0.023154264515412422),
 ('cors', 0.023154264515412422),
 ('morgan', 0.023154264515412422),
 ('i18next-http-middleware', 0.023154264515412422),
 ('i18next-express-middleware', 0.023154264515412422),
 ('connect-redis', 0.023154264515412422),
 ('altair-express-middleware', 0.023154264515412422),
 ('exegesis-express', 0.023154264515412422),
 ('csurf', 0.023154264515412422),
 ('cookie-session', 0.023154264515412422),
 ('csrf-sync', 0.023154264515412422),
 ('connect-flash', 0.023154264515412422),
 ('koa-connect', 0.023154264515412422),
 ('http-status', 0.018523411612329937),
 ('express-hbs', 0.018523411612329937),
 ('liquidjs', 0.018523411612329937),
 ('passport', 0.0185234116

## Pagerank with Description

In [162]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def compute_similarity(string1, string2):
    vectorizer = TfidfVectorizer()
    
    tfidf_matrix = vectorizer.fit_transform([string1, string2])
    
    similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
    return similarity

def pagerank_description(desc: str, features, top=100):
    weights = {pkg: compute_similarity(data["description"], desc)  for pkg, data in features.items()}

    scores = nx.pagerank(G, alpha=0.05, personalization=weights)
    ranks = list(scores.items())
    print(scores["react"])
    ranks.sort(key=lambda x: x[1], reverse=True)
    return ranks[:top]

pagerank_description('node app for auth and security', feats)

0.00010828704387129419


[('@types/node', 0.0008919903307305153),
 ('tslib', 0.0006340140680164201),
 ('debug', 0.0005167548899455916),
 ('basic-auth-connect', 0.000511249919315825),
 ('@readme/openapi-parser', 0.0004242950119159201),
 ('@apidevtools/swagger-parser', 0.00042301224613231056),
 ('prop-types', 0.00041943908819866183),
 ('@types/react', 0.0004044762072799767),
 ('@embroider/util', 0.0003872367498357537),
 ('unxhr', 0.00038640995275195106),
 ('node-datachannel', 0.00038535111621038445),
 ('formdata-polyfill', 0.0003839235201680398),
 ('@bugsnag/cuid', 0.0003828071332777226),
 ('gm', 0.00038181157690354275),
 ('cuid', 0.0003812712640262451),
 ('tar', 0.0003762475952506643),
 ('react-scripts', 0.0003649168075339614),
 ('@sanity/eventsource', 0.00036290415171413295),
 ('xmlhttprequest-ssl', 0.00036255450460453644),
 ('semaphore', 0.0003624827195622904),
 ('vow', 0.00036180828811885275),
 ('@opentelemetry/instrumentation-http', 0.0003555531384335659),
 ('process', 0.00035522291680798964),
 ('raf', 0.00