In [31]:
import json
import math
from collections import defaultdict

import pandas as pd
import requests
from tqdm.notebook import tqdm
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

In [2]:
def filter_valid_package_names(packages):
    packages = set(packages)
    remove = set()
    for package in packages:
        if (not package.startswith('@')) and (not package[0].islower()):
            remove.add(package)
                
    return packages - remove

# test function
print(filter_valid_package_names({'../cli', 'axios', '@blah/blah', '345'}))

{'axios', '@blah/blah'}


In [3]:
def create_package_space_from_project_json(filepath):
    package_space = set()
    project_data = json.load(open(filepath))
    for _, deps in project_data.items():
        for pkg in deps.keys():
            if pkg.startswith("npm:"):
                pkg = pkg[4:]
            package_space.add(pkg)
            
    return package_space

In [4]:
from bs4 import BeautifulSoup
import os

def create_package_list():
    res = requests.get("https://gist.github.com/anvaka/8e8fa57c7ee1350e3491")
    if res.status_code == 200:
        parsed_html = BeautifulSoup(res.content, "html.parser")
        selector = "#gistcomment-4447488 > div.edit-comment-hide > task-lists > div > p:nth-child(3)"
        package_list = parsed_html.select(selector)
        with open("./playground/package_list.txt", "w") as f:
            f.write(package_list[0].text)
            
def load_package_list():
    if not os.path.exists("./playground/package_list.txt"):
        create_package_list()
    else:
        with open("./playground/package_list.txt", "r") as f:
            return [text for text in f.read().splitlines()]
    
def create_package_space_from_github_gist():
    # packages = load_package_list()
    # print(len(packages))
    return set(load_package_list())
    
                
print(len(create_package_space_from_github_gist()))

5248


In [5]:
pkg_space = create_package_space_from_project_json("./GithubScrape/Data/40_Projects/append_dependencies_40_1.json")
print("Number of packages:", len(pkg_space))

Number of packages: 16784


In [6]:
from datetime import datetime, timedelta

def convert_time_to_epoch(timestamp):
    timestamp = timestamp.replace('Z', '+00:00')
    timestamp = datetime.fromisoformat(timestamp)
    return datetime.strftime(timestamp, '%s')
    

In [33]:
def fetch_package_data(package, base_url="https://registry.npmjs.org/", max_retries=3):
    for attempt in range(max_retries):
        try:
            res = requests.get(base_url + package)
            data = res.json()
            
            if res.status_code == 200:
                if 'dist-tags' not in data:
                    print(f"{package} is unpublished")
                    return {}
            
                latest_version = data["dist-tags"]["latest"]
                
                package_details = {
                    "latest_version": latest_version,
                    "keywords": data.get("keywords", []),
                    "publish_time": convert_time_to_epoch(data["time"][latest_version]),
                    "description": data.get("description", ""),
                    "creation_date": data["time"]["created"],
                    "number_of_versions": len(data["versions"]),
                    "dependencies": data["versions"][latest_version].get("dependencies", {}), 
                    "devDependencies": data["versions"][latest_version].get("devDependencies", {}), 
                    "peerDependencies": data["versions"][latest_version].get("peerDependencies", {}), 
                }
                
                return package_details
            
            else:
                # print(f"{package} failed with status code {res.status_code}")
                return -res.status_code
                
        except requests.exceptions.ConnectionError as ce:
            if attempt < max_retries - 1:
                print(f"Failed to fetch {package} due to connection error: attempt {attempt + 1}")
            else:
                print("Failed to fetch {package}: max attempts exceeded")
                return {}
        except Exception as e:
            print("EXCEPTION OCCURED", package, e)

In [34]:
fetch_package_data("@types/recordrtc")

{'latest_version': '5.6.14',
 'keywords': [],
 'publish_time': '1699340797',
 'description': 'TypeScript definitions for recordrtc',
 'creation_date': '2020-08-06T00:41:07.911Z',
 'number_of_versions': 15,
 'dependencies': {},
 'devDependencies': {},
 'peerDependencies': {}}

In [35]:
def limit_package_space(full_space, n):
    limit_space = set()
    for _ in range(n):
        limit_space.add(full_space.pop())
    return limit_space

In [10]:
# def clean_graph(graph, valid):
#     for p in graph.keys():
#         if p not in valid:
#             del graph[p]
#             
#     for p, deps in graph.items():
#         for d, v  in deps:
#             if d not in valid:
#                 del graph[p][d]
        

In [46]:
def build_package_space(max_workers=20, max_iters=2, test_mode=False, src='project'):
    package_space = []
    iter_log = defaultdict(list)
    
    if src == 'github':
        package_space = create_package_space_from_github_gist()
    elif src == 'project':
        package_space = create_package_space_from_project_json("./GithubScrape/Data/40_With_Description/append_dependencies_40_4.json")
        
    
    if test_mode:
        package_space = limit_package_space(package_space, 20)
        
    package_space = filter_valid_package_names(package_space)
    
    fetched = set()
    package_features = dict()
    dep_graph = dict()
    
    feat_lock = threading.Lock()
    graph_lock = threading.Lock()
    fetched_lock = threading.Lock()
    
    data_fields = [
        "latest_version",
        "keywords",
        "publish_time",
        "description",
        "creation_date", 
        "number_of_versions"
    ]
    
    # THIS CODE IS FOR IF WE WANT TO INCLUDE MULTIPLE DEPENDENCY TYPES
    # ----------------------------------------------------------------
    # dep_types = [
    #     ("dependencies", "PROD"), 
    #     ("devDependencies", "DEV"), 
    #     ("peerDependencies", "PEER")
    # ]
    
    next_iter = set()
    next_iter.update(package_space)
    # print(next_iter)
    
    curr_iter = 1
    
    while (max_iters == -1 and len(next_iter) > 0) or curr_iter <= max_iters:
        package_set, next_iter = next_iter, set()
        
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = { executor.submit(fetch_package_data, package): package for package in package_set }
            
            try: 
                for future in tqdm(as_completed(futures), total=len(futures), desc=f'Iteration {curr_iter}'):
                    pkg = futures[future]
                    package_data = future.result()
                    
                    with fetched_lock:
                        fetched.add(pkg)
                        
                    if type(package_data) == type(0) and package_data < 0:
                        iter_log[curr_iter].append(f'{pkg} failed with status code {package_data}')
                    
                    elif package_data != {}:
                        with feat_lock, graph_lock:
                            package_features[pkg] = {field: package_data[field] for field in data_fields}
                            deps = {d: package_data["dependencies"][d] for d in filter_valid_package_names(package_data["dependencies"])}
                            dep_graph[pkg] = list(deps.keys())
                            next_iter.update(deps.keys())
    
                            # THIS CODE IS FOR IF WE WANT TO INCLUDE MULTIPLE DEPENDENCY TYPES
                            # ----------------------------------------------------------------
                            # dep = dict()
                            # for d_type, code in dep_types:
                            #     for d,v in package_data[d_type].items():
                            #         dep[d] = (v, code)
                            #         
                            # dep_graph[pkg] = dep
                            # 
                            # next_iter.update(package_data["dependencies"].keys())
                            # next_iter.update(package_data["devDependencies"].keys())
                            # next_iter.update(package_data["peerDependencies"].keys())
                        
            except Exception as e:
                # print(e)
                iter_log[curr_iter].append(e)
                
        next_iter.difference_update(fetched)
        
        print(f"Failed in current iteration - {curr_iter + 1}: {len(iter_log[curr_iter])} packages. Check log.txt for details")
        print(f"Next Iteration - {curr_iter + 1}: {len(next_iter)} packages")
        curr_iter += 1
        
    # Output log
    with open("log.txt", "w") as f:
        for i in range(1,curr_iter):
            f.write(f"============= Iteration {i} =============\n")
            f.writelines('\n'.join(iter_log[i-1]))
            
    return package_features, dep_graph
            

In [47]:
feats, graph = build_package_space(max_iters=-1)

Iteration 1:   0%|          | 0/19359 [00:00<?, ?it/s]

Failed to fetch express-http-proxy due to connection error: attempt 1
install-test is unpublished
Failed in current iteration - 2: 490 packages. Check log.txt for details
Next Iteration - 2: 1358 packages


Iteration 2:   0%|          | 0/1358 [00:00<?, ?it/s]

Failed in current iteration - 3: 4 packages. Check log.txt for details
Next Iteration - 3: 581 packages


Iteration 3:   0%|          | 0/581 [00:00<?, ?it/s]

Failed in current iteration - 4: 11 packages. Check log.txt for details
Next Iteration - 4: 160 packages


Iteration 4:   0%|          | 0/160 [00:00<?, ?it/s]

Failed in current iteration - 5: 0 packages. Check log.txt for details
Next Iteration - 5: 29 packages


Iteration 5:   0%|          | 0/29 [00:00<?, ?it/s]

Failed in current iteration - 6: 0 packages. Check log.txt for details
Next Iteration - 6: 16 packages


Iteration 6:   0%|          | 0/16 [00:00<?, ?it/s]

Failed in current iteration - 7: 0 packages. Check log.txt for details
Next Iteration - 7: 1 packages


Iteration 7:   0%|          | 0/1 [00:00<?, ?it/s]

Failed in current iteration - 8: 0 packages. Check log.txt for details
Next Iteration - 8: 0 packages


In [48]:
def retryable_request(func, *args, max_retries=10, fail_value=None, **kwargs,):
    for i in range(max_retries):
        try:
            return func(*args, **kwargs)
        except requests.exceptions.ConnectionError as ce:
            if i < max_retries - 1:
                print(f"Failed request {func} {args} {kwargs} due to connection error: attempt {i + 1}")
            else:
                print(f"Failed to fetch {package}: max attempts exceeded")
                return fail_value

def get_download_counts_bulk(packages):
    url = f"https://api.npmjs.org/downloads/point/last-year/{','.join(packages)}"
    res = requests.get(url)
    
    if res.status_code == 200:
        data = res.json()
        return [(p, d['downloads']) for p,d in data.items()]
        
    else:
        print(f"[ERROR] failed for chunk {packages} with status code {res.status_code}")
        return -1
        
get_download_counts_bulk(["semver", "axios"])

def get_download_count(package):
    url = f"https://api.npmjs.org/downloads/point/last-year/{package}"
    res = requests.get(url)
    
    if res.status_code == 200:
        data = res.json()
        return data['downloads']
    
    else:
        print(f"[ERROR] failed for package {package} with status code {res.status_code}")
        return -1
      
# Test  
print(get_download_counts_bulk(["semver", "axios"]))
print(get_download_count("semver"))

retryable_request(get_download_count, "semver")

[('semver', 16072016131), ('axios', 2562406661)]
16072016131


16072016131

In [49]:
# divide into scoped and unscoped packages
def fetch_download_counts_unscoped(packages, max_workers=5):
    chunks = []
    for i in range(0, len(packages), 128):
        chunks.append(packages[i:i + 128])
    print("Number of chunks for unscoped:", len(chunks))
    
    counts = []
    
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = { executor.submit(retryable_request, get_download_counts_bulk, chunk): chunk for chunk in chunks }
        
        for future in tqdm(as_completed(futures), total=len(futures), desc="Unscoped packages"):
            res = future.result()
            if res != -1:
                counts += res
            
    return counts
    
def fetch_download_counts_scoped(packages, max_workers=5):
    counts = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = { executor.submit(retryable_request, get_download_count, package): package for package in packages }
        
        for future in tqdm(as_completed(futures), total=len(futures), desc="Scoped Packages"):
            res = future.result()
            if res != -1:
                counts.append((futures[future], res))
                
    return counts


def fetch_download_counts(packages, max_workers_unscoped=5, max_workers_scoped=5):
    scoped = []
    unscoped = []
    for package in packages:
        if package.find("@") != -1:
            scoped.append(package)
        else:
            unscoped.append(package)
    print(f'scoped: {len(scoped)}\tunscoped: {len(unscoped)}')
    
    counts = fetch_download_counts_unscoped(unscoped, max_workers=max_workers_unscoped)
    counts += fetch_download_counts_scoped(scoped, max_workers=max_workers_scoped)

    return counts

def update_download_counts(features_dict):
    counts = fetch_download_counts(list(features_dict.keys()), max_workers_scoped=10)
    for p, c in counts:
        features_dict[p]["download_count"] = c
    
# Test function
fetch_download_counts(["semver", "axios", "@vue/cli-plugin-unit-mocha"])

scoped: 1	unscoped: 2
Number of chunks for unscoped: 1


Unscoped packages:   0%|          | 0/1 [00:00<?, ?it/s]

Scoped Packages:   0%|          | 0/1 [00:00<?, ?it/s]

[('semver', 16072016131),
 ('axios', 2562406661),
 ('@vue/cli-plugin-unit-mocha', 1215114)]

In [50]:
update_download_counts(feats)

scoped: 8601	unscoped: 12397
Number of chunks for unscoped: 97


Unscoped packages:   0%|          | 0/97 [00:00<?, ?it/s]

Scoped Packages:   0%|          | 0/8601 [00:00<?, ?it/s]

In [59]:
def save_dataset(features, edges, feature_filename="./dataset/package-features.json", edges_filename="./dataset/dep-graph.json"):
    with open(feature_filename, "w") as feat_file:
        print(f"Saving features to {feature_filename}")
        json.dump(features, feat_file, indent=4)
    with open(edges_filename, "w") as edge_file:
        print(f"Saving edges to {edges_filename}")
        json.dump(edges, edge_file, indent=4)
        
save_dataset(feats, graph)

Saving features to ./dataset/package-features.json
Saving edges to ./dataset/dep-graph.json


In [52]:
def load_dataset(feature_filename="./dataset/package-features.json", edges_filename="./dataset/dep-graph.json"):
    feats = json.load(open(feature_filename, "r"))
    edges = json.load(open(edges_filename, "r"))
    return feats, edges

In [54]:
def clean_graph(graph, valid):
    for p in graph.keys():
        if p not in valid:
            del graph[p]
            
        else:
            graph[p] = [p for p in graph[p] if p in valid]

In [56]:
feats, graph = load_dataset()

In [57]:
clean_graph(graph, feats.keys())

In [58]:
print(len(feats.keys()))
print(len(graph.keys()))

20998
20998


In [62]:
edges = []

for package, deps in graph.items():
    if len(deps) > 0:
        for dep in deps:
            edges.append((package, dep))

headers = ['package', 'dependency']
df = pd.DataFrame(edges, columns=headers)
df.head()

Unnamed: 0,package,dependency
0,esquery,estraverse
1,@web3modal/wallet,@walletconnect/logger
2,@web3modal/wallet,zod
3,@web3modal/wallet,@web3modal/common
4,@web3modal/wallet,@web3modal/polyfills


In [63]:
len(df)

60970

In [64]:
df.to_csv("./dataset/dep-graph.csv", index=False)

In [65]:
feat_data = []
headers = ['package', 'latest_version', 'keywords', 'publish_time', 'description', 'creation_date', 'number_of_versions', 'download_count']
for package, feats in feats.items():
    feat_data.append([package] + list(feats.values()))

feat_df = pd.DataFrame(feat_data, columns=headers)
feat_df.head()

Unnamed: 0,package,latest_version,keywords,publish_time,description,creation_date,number_of_versions,download_count
0,esquery,1.6.0,"[ast, ecmascript, javascript, query]",1720427718,A query library for ECMAScript AST using a CSS...,2013-07-24T00:20:02.985Z,17,1652655533
1,napi-build-utils,1.0.2,"[n-api, prebuild, prebuild-install]",1583419393,A set of utilities to assist developers of too...,2018-09-12T23:11:05.489Z,3,288107438
2,@web3modal/wallet,5.1.11,"[web3, crypto, ethereum, web3modal, walletconn...",1727852066,#### 🔗 [Website](https://web3modal.com),2023-12-05T11:45:35.301Z,183,1857226
3,caniuse-lite,1.0.30001680,[support],1731253639,"A smaller version of caniuse-db, with only the...",2017-01-27T16:04:02.256Z,855,2353828288
4,fullcalendar,6.1.15,"[calendar, event, full-sized, fullcalendar]",1720792076,Easily render a full-sized drag & drop calenda...,2015-03-09T09:09:27.809Z,97,7134336


In [66]:
feat_df.to_csv("./dataset/package-feats.csv", index=False)

In [67]:
len(feat_df)

20998

## Pagerank

In [68]:
import pandas as pd
import numpy as np
import networkx as nx
import json
from scipy.sparse import csr_matrix

dep_df = pd.read_csv("./dataset/dep-graph.csv")
graph = json.load(open("./dataset/dep-graph.json"))
feats = json.load(open("./dataset/package-features.json"))

In [69]:
def show_edge_info(df):
    sources = set(df['package'].unique())
    deps = set(df['dependency'].unique())
    pkg_space = sources.union(deps)
    
    print("Unique packages in packages column", len(sources))
    print("Unique packages in dependency column", len(deps))
    print("Total unique", len(pkg_space))

show_edge_info(dep_df)

Unique packages in packages column 13261
Unique packages in dependency column 14629
Total unique 18586


In [70]:
def show_node_info():
    no_out = {p for p, d in graph.items() if len(d) == 0}
    print("Nodes with 0 out-degree:", len(no_out))
    
    with_in = set()
    for dependencies in graph.values():
        with_in.update(dependencies)
    
    no_in = no_out - with_in
    
    print("Nodes with 0 in-degree and 0 out-degree:", len(no_in))
    
    json_str = json.dumps(json.load(open("./playground/dep-graph.json")))
    from_original = [p for p in no_in if json_str.find(p) != -1]
    print("Nodes with 0 in- and out-degree from source:", len(from_original))
    
show_node_info()

Nodes with 0 out-degree: 7737
Nodes with 0 in-degree and 0 out-degree: 2412
Nodes with 0 in- and out-degree from source: 314


In [71]:
# build adjacency matrix
G = nx.from_pandas_edgelist(dep_df, source='package', target='dependency', create_using=nx.Graph)
adj_matrix_sparse = csr_matrix(nx.adjacency_matrix(G))
print("Adjacency Matrix shape:", adj_matrix_sparse.shape)
print("G:", G)

Adjacency Matrix shape: (18586, 18586)
G: Graph with 18586 nodes and 60941 edges


In [115]:
# build adjacency matrix using project data
from itertools import combinations
from collections import Counter
import math

def filter_top_by_download_count(packages, top=500):
    packages.sort(key=lambda x: feats[x]['download_count'], reverse=True)
    return packages[:top]

def filter_valid_packages(packages):
    return [p for p in packages if p in feats]

def create_co_dependency_graph(projects, limit=500, include_edge_weights=False):
    edges = []
    
    for project, data in tqdm(projects.items()):
        deps = [p for p in data.keys() if p in feats]
        deps = filter_top_by_download_count(deps, top=limit)
        deps = filter_valid_packages(deps)
        
        for dep1, dep2 in combinations(deps, 2):
            edges.append(tuple(sorted((dep1, dep2)))) 
    
    # Convert the set of edges to a list
    if not include_edge_weights:
        return list(edges)
    else:
        Counter(edges)

project_data = json.load(open("./GithubScrape/Data/40_Projects/append_dependencies_40_1.json"))
edges = create_co_dependency_graph(project_data, limit=100)
edge_df = pd.DataFrame(edges, columns=['from', 'to'])
edge_df

  0%|          | 0/139 [00:00<?, ?it/s]

Unnamed: 0,from,to
0,chalk,path-to-regexp
1,axios,chalk
2,chalk,chokidar
3,chalk,eslint
4,chalk,core-js
...,...,...
562202,agent-base,globby
562203,globby,mime
562204,@babel/code-frame,agent-base
562205,@babel/code-frame,mime


In [74]:
edge_df = edge_df.drop_duplicates()
edge_df.to_csv("./dataset/edges.csv", index=False)

In [75]:
print(len(edge_df))
sources = set(edge_df['from'].unique())
deps = set(edge_df['to'].unique())
pkg_space = sources.union(deps)

print("Unique packages in packages column", len(sources))
print("Unique packages in dependency column", len(deps))
print("Total unique", len(pkg_space))

78929
Unique packages in packages column 1092
Unique packages in dependency column 1121
Total unique 1124


In [76]:
G = nx.from_edgelist(edges, create_using=nx.Graph)
adj_matrix_sparse = csr_matrix(nx.adjacency_matrix(G))
print("Adjacency Matrix shape:", adj_matrix_sparse.shape)
print("G:", G)

Adjacency Matrix shape: (1124, 1124)
G: Graph with 1124 nodes and 78929 edges


In [77]:
# Perform Pagerank
ranks = nx.pagerank(G, alpha=0.85)
ranks = list(ranks.items())
ranks.sort(key=lambda x: x[1], reverse=True)
ranks[:10]

[('eslint', 0.004274396175829344),
 ('lodash', 0.004168111556486117),
 ('debug', 0.0036053499828414865),
 ('semver', 0.0033273687045163483),
 ('react', 0.0031157225122793045),
 ('react-dom', 0.0031157225122793045),
 ('chalk', 0.0030168474997097254),
 ('typescript', 0.002896055338433709),
 ('webpack', 0.0028571995392590688),
 ('@types/node', 0.0027731154552677576)]

## Pagerank with Weighted Nodes

In [78]:
weights = {p: data["download_count"] for p, data in feats.items()}
weights

{'esquery': 1652655533,
 'napi-build-utils': 288107438,
 '@web3modal/wallet': 1857226,
 'caniuse-lite': 2353828288,
 'fullcalendar': 7134336,
 'input-format': 27376897,
 '@msgpackr-extract/msgpackr-extract-linux-arm64': 26584862,
 'gulp.spritesmith': 882235,
 'vue-server-renderer': 19266277,
 'dotenv': 1949577235,
 '@types/supports-color': 44086061,
 'gl-surface3d': 2260504,
 'npm-profile': 32917356,
 '@aws-sdk/middleware-endpoint': 89581744,
 '@egjs/component': 4807896,
 '@oxlint/linux-x64-musl': 1430443,
 '@tryghost/metrics': 223466,
 '@stoplight/types': 103828415,
 '@sentry/replay': 179753171,
 'parse-imports': 14025942,
 'css-font-parser': 700219,
 'connect-mongo': 6650514,
 'loopback': 1016674,
 '@tryghost/webhook-mock-receiver': 185203,
 '@styled-icons/fluentui-system-filled': 772125,
 '@standardnotes/spreadsheets': 630,
 '@docusaurus/logger': 16059358,
 'buffer-xor': 447396111,
 'unist-util-inspect': 17282203,
 '@ng-select/ng-select': 17644923,
 '@types/mjml-core': 9905685,
 '@s

In [79]:
ranks = nx.pagerank(G, alpha=0.85, personalization=weights)
ranks = list(ranks.items())
ranks.sort(key=lambda x: x[1], reverse=True)
ranks[:20]

[('semver', 0.0051758672171208844),
 ('debug', 0.005130059837030141),
 ('chalk', 0.004972869283451725),
 ('ansi-styles', 0.004659955512107173),
 ('supports-color', 0.004598838295338345),
 ('ms', 0.004202300102638984),
 ('has-flag', 0.004094778340963557),
 ('commander', 0.004012626669122668),
 ('color-convert', 0.0039425636058244085),
 ('color-name', 0.003930824272528445),
 ('glob', 0.003910646519682998),
 ('lodash', 0.0038448208883635217),
 ('minimatch', 0.0037279046102919435),
 ('tslib', 0.003643869474862734),
 ('source-map', 0.003598726650727819),
 ('lru-cache', 0.0035682791368990647),
 ('@types/node', 0.003471374554200045),
 ('strip-ansi', 0.003421555376007238),
 ('brace-expansion', 0.003405240080488884),
 ('safe-buffer', 0.003399802183144039)]

## Pagerank with Keywords

In [139]:
# define a function to calculate similarity scores based on keywords
def jaccard_similarity(a, b):
    a = set(a)
    b = set(b)
    return len(a & b) / len(a | b)

def pagerank_keywords(keyword_list: list, features, top=100, weights=None):
    if len(keyword_list) > 0:
        personalization = {pkg: jaccard_similarity(features[pkg]["keywords"], keyword_list)  for pkg in G.nodes}
        if sum(personalization.values()) == 0:
            personalization = {pkg: 1 for pkg in features.keys()}
        
        scores = nx.pagerank(G, alpha=0.1, personalization=personalization, weight=weights)
        
    else:
        scores = nx.pagerank(G, alpha=0.1, weight=weights)
        
    ranks = list(scores.items())
    ranks.sort(key=lambda x: x[1], reverse=True)
    return ranks[:top]

pagerank_keywords([
        "admin",
        "admin-dashboard",
        "admin-template",
        "axios",
        "dashboard",
        "desktop",
        "element",
        "element-ui",
        "i18n",
        "management-system",
        "mock",
        "tinymce",
        "vue",
        "vue-admin",
        "vue-cli",
        "vuex",
        "webpack",
        "webpack4",
        "xlsx"
    ], feats)

[('@vue/cli-plugin-unit-mocha', 0.03796220505765626),
 ('vue-i18n', 0.02326790930147401),
 ('quasar', 0.01901914941789684),
 ('vue', 0.014702999432852387),
 ('file-loader', 0.014266854004038643),
 ('webpack-command', 0.014163368799256422),
 ('@vue/shared', 0.014067148159081604),
 ('@vue/compiler-sfc', 0.014067148159081604),
 ('@vue/compiler-core', 0.014067148159081604),
 ('@vue/compiler-dom', 0.014067148159081604),
 ('@vue/compiler-ssr', 0.014067148159081604),
 ('@vue/reactivity', 0.014067148159081604),
 ('@vue/server-renderer', 0.014067148159081604),
 ('@vue/runtime-core', 0.014067148159081604),
 ('@vue/runtime-dom', 0.014067148159081604),
 ('@vue/reactivity-transform', 0.014067148159081604),
 ('karma-webpack', 0.014049966768512882),
 ('style-loader', 0.013993145453743383),
 ('raw-loader', 0.013993145453743383),
 ('imports-loader', 0.013993145453743383),
 ('schema-utils', 0.013905538526479664),
 ('vue-template-compiler', 0.013810859897932366),
 ('@vue/cli-service', 0.01381085989793236

####  Run tests to see how many of the top 100 packages predicted by pagerank are actually present as project dependencies

In [81]:
project_data = json.load(open("./GithubScrape/Data/40_With_Description/append_dependencies_40_4.json"))

In [144]:
def test_pagerank(project_deps, package_feats, search_terms, weights=None):
    suggested_deps = [p for p,s in pagerank_keywords(search_terms, package_feats, weights=weights)]
    count = 0
    for d in suggested_deps:
        if d in project_deps:
            count += 1
    
    return count, suggested_deps
    
test_pagerank(project_data["gpt4free"].keys(), feats, ['chatbot', 'chatbots', 'chatgpt', 'chatgpt-4', 'chatgpt-api', 'chatgpt-free', 'chatgpt4', 'free-gpt', 'gpt', 'gpt-3', 'gpt-4', 'gpt3', 'gpt4', 'gpt4-api', 'language-model', 'openai', 'openai-api', 'openai-chatgpt', 'python', 'reverse-engineering'])

(26,
 ['eslint',
  'lodash',
  'debug',
  'react',
  'react-dom',
  'semver',
  'typescript',
  'lint-staged',
  'webpack',
  'husky',
  'chalk',
  'axios',
  'eslint-plugin-import',
  '@types/node',
  'sass',
  'glob',
  'prettier',
  'postcss',
  'rimraf',
  'once',
  'jquery',
  'commander',
  'eslint-plugin-react',
  'yargs',
  'resolve',
  'mocha',
  'express',
  'mkdirp',
  'moment',
  'mime',
  'chai',
  '@babel/core',
  '@types/react',
  '@types/react-dom',
  'antd',
  'uuid',
  'ms',
  'source-map',
  'chokidar',
  'dayjs',
  'esbuild',
  'picocolors',
  'micromatch',
  '@babel/runtime',
  'inherits',
  'autoprefixer',
  'wrappy',
  'diff',
  'clipboard',
  'function-bind',
  'graceful-fs',
  'normalize-path',
  'safe-buffer',
  'eslint-plugin-jsx-a11y',
  'yaml',
  'strip-ansi',
  'buffer',
  'babel-eslint',
  'is-number',
  'braces',
  'fill-range',
  'to-regex-range',
  'vue',
  'vue-router',
  'picomatch',
  '@babel/eslint-parser',
  'dotenv',
  'minimatch',
  'brace-expan

In [147]:
project_df = pd.read_csv("./GithubScrape/Data/40_With_Description/github_40_projects_with_topics_4.csv")

def convert_str_to_list(string):
    string = string.strip()
    if not string.startswith("[") or not string.endswith("]"):
        return []
    
    string = string[1:-1].strip()
    
    if not string:
        return []
    
    keywords = [item.strip().strip("'").strip('"') for item in string.split(",")]
    
    return [keyword for keyword in keywords if keyword]

def test_pagerank_kewords_all_projects(project_data, project_df, weights=None):
    total = 0
    for i, r in project_df.iterrows():
        project = r['name']
        keywords = convert_str_to_list(r['topics'])
        if project in project_data:
            dependencies = project_data[project].keys()
            test_count, _ = test_pagerank(project_data[project], feats, keywords, weights=weights)
            print(f"{project:70}\t{test_count}/{len(dependencies)}")
            total += test_count
            
    print(f"----------\ntotal {total}")
    
    
test_pagerank_kewords_all_projects(project_data, project_df)
    

vue-element-admin                                                     	29/54
grafana                                                               	51/2574
strapi                                                                	62/2724
AdminLTE                                                              	22/1343
metabase                                                              	63/2399
tabler                                                                	17/1003
ant-design-pro                                                        	14/38
tesseract.js                                                          	64/618
hugo                                                                  	1/20
Ghost                                                                 	72/3170
halo                                                                  	40/1678
directus                                                              	48/1921
payload                                                     

#### Try to enhance graph with package dependency graph

In [84]:
dep_df = dep_df.rename(columns={"package": "from", "dependency": "to"})

In [95]:
combined_df = pd.concat([dep_df, edge_df]).drop_duplicates()
combined_df = combined_df.dropna()
combined_df

Unnamed: 0,from,to
0,esquery,estraverse
1,@web3modal/wallet,@walletconnect/logger
2,@web3modal/wallet,zod
3,@web3modal/wallet,@web3modal/common
4,@web3modal/wallet,@web3modal/polyfills
...,...,...
552302,countup,iview-area
552303,countup,v-org-tree
552304,tree-table-vue,iview-area
552305,tree-table-vue,v-org-tree


In [96]:
G = nx.from_pandas_edgelist(combined_df, source='from', target='to', create_using=nx.Graph)
adj_matrix_sparse = csr_matrix(nx.adjacency_matrix(G))
print("Adjacency Matrix shape:", adj_matrix_sparse.shape)
print("G:", G)

Adjacency Matrix shape: (18668, 18668)
G: Graph with 18668 nodes and 139066 edges


In [97]:
pagerank_keywords(["react"], feats)

[('prop-types', 0.008684007139889167),
 ('@react-email/components', 0.005554705716167797),
 ('hoist-non-react-statics', 0.004876583931877228),
 ('react-is', 0.004816007593449088),
 ('react', 0.004728965208355337),
 ('scheduler', 0.0046910299792120485),
 ('react-dom', 0.004620654043264042),
 ('react-reconciler', 0.004571199437576779),
 ('react-imgix', 0.00450241036905947),
 ('ui-box', 0.004499099399828514),
 ('create-react-class', 0.004497625132562109),
 ('react-deep-force-update', 0.004495059192143982),
 ('server-only', 0.0044944628672286225),
 ('client-only', 0.0044937859143970044),
 ('react-refresh', 0.004483848822196121),
 ('staged-components', 0.004482147021611535),
 ('rc-util', 0.0031408269401321503),
 ('classnames', 0.0027612777254695537),
 ('react-inlinesvg', 0.002328217514790969),
 ('@alifd/field', 0.0023227244384336444),
 ('@react-email/render', 0.0023172209502169234),
 ('@reach/router', 0.002275672001817534),
 ('rc-dropdown', 0.0022750353638575087),
 ('react-pivottable', 0.00

In [99]:
test_pagerank_kewords_all_projects(project_data, project_df)

vue-element-admin                                                     	7/54
grafana                                                               	24/2579
strapi                                                                	20/2730
AdminLTE                                                              	22/1343
metabase                                                              	18/2397
tabler                                                                	6/1003
ant-design-pro                                                        	5/38
tesseract.js                                                          	12/618
hugo                                                                  	0/20
Ghost                                                                 	18/3170
halo                                                                  	20/1678
directus                                                              	9/1921
payload                                                         

### Include Edge Weights based on number of co-occurrences

In [141]:
edges = create_co_dependency_graph(project_data, limit=100)
print(len(edges))

  0%|          | 0/139 [00:00<?, ?it/s]

562207


In [142]:
edges_weighted = [(e1,e2,w) for (e1,e2),w in Counter(edges).items()]
edges_weighted_df = pd.DataFrame(edges_weighted, columns=['from', 'to', 'weight'])
G = nx.from_pandas_edgelist(edges_weighted_df, source='from', target='to', edge_attr='weight', create_using=nx.Graph)
adj_matrix_sparse = csr_matrix(nx.adjacency_matrix(G))
print("Adjacency Matrix shape:", adj_matrix_sparse.shape)
print("G:", G)

ranks = nx.pagerank(G, alpha=0.1, weight='weights')
sorted(ranks.items(), key=lambda x: x[1], reverse=True)

Adjacency Matrix shape: (1124, 1124)
G: Graph with 1124 nodes and 78929 edges


[('eslint', 0.0014047142078523764),
 ('lodash', 0.0012855008938199837),
 ('debug', 0.0012302959622737379),
 ('react', 0.0012249104908250246),
 ('react-dom', 0.0012249104908250246),
 ('semver', 0.0012097131477822555),
 ('typescript', 0.0011803274143015901),
 ('lint-staged', 0.0011787008721218253),
 ('webpack', 0.0011750324030241955),
 ('husky', 0.001148749838860699),
 ('chalk', 0.001146021314887613),
 ('axios', 0.001140278617782112),
 ('eslint-plugin-import', 0.0011387279743365728),
 ('@types/node', 0.0011368211969868753),
 ('sass', 0.001129157164223125),
 ('glob', 0.0011220252149478668),
 ('prettier', 0.001118022618843929),
 ('postcss', 0.001115701246292736),
 ('rimraf', 0.001101499907640227),
 ('once', 0.0010919519225716986),
 ('jquery', 0.0010769331261211005),
 ('commander', 0.0010706834292360794),
 ('eslint-plugin-react', 0.0010668107036943047),
 ('yargs', 0.0010656884690392544),
 ('resolve', 0.0010654276333799191),
 ('mocha', 0.0010591593229035082),
 ('express', 0.00105614198322502

In [148]:
test_pagerank_kewords_all_projects(project_data, project_df, weights='weights')

vue-element-admin                                                     	29/54
grafana                                                               	51/2574
strapi                                                                	62/2724
AdminLTE                                                              	22/1343
metabase                                                              	63/2399
tabler                                                                	17/1003
ant-design-pro                                                        	14/38
tesseract.js                                                          	64/618
hugo                                                                  	1/20
Ghost                                                                 	72/3170
halo                                                                  	40/1678
directus                                                              	48/1921
payload                                                     

## Pagerank with Description

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def compute_similarity(string1, string2):
    vectorizer = TfidfVectorizer()
    
    tfidf_matrix = vectorizer.fit_transform([string1, string2])
    
    similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
    return similarity

def pagerank_description(desc: str, features, top=100):
    weights = {pkg: compute_similarity(data["description"], desc)  for pkg, data in features.items()}

    scores = nx.pagerank(G, alpha=0.05, personalization=weights)
    ranks = list(scores.items())
    print(scores["react"])
    ranks.sort(key=lambda x: x[1], reverse=True)
    return ranks[:top]

pagerank_description(':tada: A magical vue admin', feats)

1.9282602533617736e-07


[('@vue/compat', 0.010448450041348263),
 ('@vue/reactivity', 0.009768515569990324),
 ('@strapi/admin', 0.00954382495935774),
 ('vue-router', 0.00891760348747064),
 ('@vue/compiler-sfc', 0.007782016829624084),
 ('@vue/compiler-dom', 0.0076883764970911974),
 ('@vue/runtime-dom', 0.007641998160956878),
 ('@vue/compiler-core', 0.007608993594030151),
 ('@vue/compiler-ssr', 0.007594015172613479),
 ('@vue/runtime-core', 0.007589247069952104),
 ('@intlify/vue-i18n-extensions', 0.007519486266899021),
 ('@vue/server-renderer', 0.007512828244070423),
 ('vue-jest', 0.007491931495056499),
 ('@vitejs/plugin-vue', 0.007482651004727015),
 ('vue-codemod', 0.007470379090831667),
 ('@vue/reactivity-transform', 0.007450890534216568),
 ('@storybook/vue3', 0.007435337929129417),
 ('docsify', 0.007404747733468548),
 ('@storybook/vue', 0.00740076346111362),
 ('@loki/integration-vue', 0.007399667783852435),
 ('vue-cli-plugin-mockjs', 0.00739861911358024),
 ('mavon-editor', 0.00739825402980637),
 ('@vue/vue3-je