In [1]:
import pandas as bpd
import numpy as np
import matplotlib.pyplot as plt
import requests as req
import json
import urllib

In [2]:
# Headers sent with every API request in this notebook.
# The explicit User-Agent makes it clear to the registries who is calling.
default_headers = {'User-Agent': "COGS9-PackageManager-DataAnalysis"}

In [3]:
#def make_pypi_index(): # Not needed, contains over 100K packages with no recent downloads
#    uri = "https://pypi.org/simple/" # simple index URI
#    pypi_index_response = req.get(uri, headers=default_headers)
#    return np.array(
#        list(
#            map(
#                (
#                    lambda first_part:
#                    first_part.split("</a")[0]
#                ),
#                pypi_index_response.text.split("\">")[1:]
#            )
#        )
#    )

In [4]:
#len(make_pypi_index())

In [5]:
def make_gem_index(page=0, previous_data=None): # Build the gems index page by page
    """Collect name, download URI and download count of gems from the RubyGems search API.

    Search pages are requested one after another, starting at ``page``, until
    the API answers with an empty JSON array or ``max_pages`` is reached.

    Parameters
    ----------
    page : int
        Page number used for the first request.
    previous_data : iterable of dict, optional
        Records gathered earlier; newly read records are placed after them.
        The argument itself is never modified.

    Returns
    -------
    numpy.ndarray
        One-dimensional object array of dicts with the keys ``"name"``,
        ``"uri"`` and ``"downloads"`` (downloads converted to ``int``).

    Raises
    ------
    requests.HTTPError
        If a search request comes back with an HTTP error status.
    """
    max_pages = 1000 # Stop after 1000 pages
    # None replaces a mutable [] default so no list object is shared between calls
    collected = [] if previous_data is None else list(previous_data)
    # Plain loop instead of one recursive call per page: a deep page count
    # would otherwise run into Python's recursion limit
    while page < max_pages:
        uri = "https://rubygems.org/api/v1/search.json?query=downloads%3A+>0&page=" + str(page) # Generate search URI
        gem_index_response = req.get(uri, headers=default_headers) # Get search URI
        gem_index_response.raise_for_status() # Fail with a clear HTTP error instead of a confusing JSON/key error
        if gem_index_response.text == "[]": # Page after last page returns empty array
            break
        collected.extend(
            {
                "name": package["name"],
                "uri": package["gem_uri"],
                "downloads": int(package["downloads"])
            }
            for package in json.loads(gem_index_response.text) # Read page into JSON
        )
        page += 1 # Move on to the next page
    # Convert once at the end; appending to an array per page copies everything each time
    return np.array(collected, dtype=object)

In [6]:
# Download the full gem index and turn its records into a DataFrame keyed by gem name
gem_index = bpd.DataFrame([*make_gem_index()]).set_index(keys="name")
gem_index

Unnamed: 0_level_0,uri,downloads
name,Unnamed: 1_level_1,Unnamed: 2_level_1
rspec-expectations,https://rubygems.org/gems/rspec-expectations-3...,554814069
rspec-core,https://rubygems.org/gems/rspec-core-3.10.0.gem,553992963
rspec-mocks,https://rubygems.org/gems/rspec-mocks-3.10.0.gem,549115801
diff-lcs,https://rubygems.org/gems/diff-lcs-1.4.4.gem,548459800
rspec-support,https://rubygems.org/gems/rspec-support-3.10.0...,527719512
...,...,...
malloc,https://rubygems.org/gems/malloc-1.5.1.gem,86480
omniauth-twitch,https://rubygems.org/gems/omniauth-twitch-1.1....,86449
css_sprite,https://rubygems.org/gems/css_sprite-2.4.0.gem,86430
yieldmanager,https://rubygems.org/gems/yieldmanager-0.9.13.gem,86412


In [7]:
# Load the NPM package list from the CSV shipped with the repo.
# If this errors, you might have to manually download the file from the repo.
node_data = bpd.read_csv(filepath_or_buffer="data/npm_csv.csv")

In [8]:
# Keep only the packages that have a download URI, then index them by package name.
# A missing URI is read from the CSV as a null value; notna() detects it directly,
# whatever dtype the column ends up with, instead of guessing from the Python type of each cell.
node_data = node_data[node_data.get("uri").notna()].set_index("name")
node_data

Unnamed: 0_level_0,uri
name,Unnamed: 1_level_1
format-text,https://registry.npmjs.org/format-text/-/forma...
new-chain,https://registry.npmjs.org/new-chain/-/new-cha...
uniques,https://registry.npmjs.org/uniques/-/uniques-0...
ansi-codes,https://registry.npmjs.org/ansi-codes/-/ansi-c...
door,https://registry.npmjs.org/door/-/door-0.0.1-s...
...,...
vue3-scroll-picker,https://registry.npmjs.org/vue3-scroll-picker/...
com.fight4dream.locomotors.moveinplace.unity,https://registry.npmjs.org/com.fight4dream.loc...
demo11.12,https://registry.npmjs.org/demo11.12/-/demo11....
promisebao,https://registry.npmjs.org/promisebao/-/promis...


In [24]:
def get_node_download_count(package):
    """Look up the download count the npm downloads API reports for a package.

    The point endpoint is queried for the fixed range 2010-01-01 to 2030-01-01.

    Parameters
    ----------
    package : str
        Package name, appended to the API path as-is.

    Returns
    -------
    int
        The ``"downloads"`` value of the API answer, or -1 when the request
        does not return status 200, the body is not a JSON object, the answer
        carries an ``"error"`` entry, or ``"downloads"`` is absent.
    """
    uri = "https://api.npmjs.org/downloads/point/2010-01-01:2030-01-01/" + package # Build API URI
    info_response = req.get(uri, headers=default_headers) # Read API
    if info_response.status_code != 200: # Make sure entry exists
        return -1
    try:
        payload = json.loads(info_response.text) # Parse the body once and reuse it
    except ValueError: # Body was not valid JSON
        return -1
    if not isinstance(payload, dict) or "error" in payload: # API reports problems through an "error" entry
        return -1
    return payload.get("downloads", -1) # -1 is the error sentinel used throughout

In [9]:
def get_uri_size(uri, timeout=10000): # URI size in bytes
    """Return the size in bytes that the server announces for a download URI.

    The URI is opened and the ``Content-Length`` response header is read; the
    body itself is not read.

    Parameters
    ----------
    uri : str
        Address of the file. An empty string (or ``None``) means "no URI known".
    timeout : float
        Timeout handed to ``urlopen``, in seconds.

    Returns
    -------
    int
        The ``Content-Length`` value, or -1 when no URI was given or the
        response carries no ``Content-Length`` header.
    """
    # `import urllib` alone does not load the `request` submodule; import it
    # explicitly instead of relying on another library having loaded it
    import urllib.request

    if uri is None or uri == "":
        return -1
    # The context manager closes the connection even if reading the headers fails
    with urllib.request.urlopen(uri, timeout=timeout) as response:
        content_length = response.info().get("Content-Length") # Content of "Content-Length" header
    if content_length is None: # Header missing: report "unknown" with the usual sentinel
        return -1
    return int(content_length)

In [10]:
# Read the PyPI CSV, index it by package name and rename num_downloads -> downloads.
# rename() keeps the column where it is; errors="raise" makes a missing
# num_downloads column fail loudly instead of being silently ignored.
pypi_index = (
    bpd.read_csv("data/pypi_csv.csv")
    .set_index("name")
    .rename(columns={"num_downloads": "downloads"}, errors="raise")
)
pypi_index

Unnamed: 0_level_0,downloads
name,Unnamed: 1_level_1
urllib3,2301518
six,1668852
botocore,1646210
requests,1572936
certifi,1521324
...,...
outliers-remover-101883060-1,1
octopussh,1
riskscore,1
docx-xslt,1


In [11]:
def get_pypi_uri(project): # Generate download URI
    """Find the download URI of the first file the PyPI JSON API lists for a project.

    Parameters
    ----------
    project : object
        Project name; converted with ``str`` and inserted into the API path.

    Returns
    -------
    str
        ``urls[0]["url"]`` from the API answer, or an empty string when the
        request does not return status 200 or no files are listed
        (``get_uri_size`` maps an empty string to -1).
    """
    uri = "https://pypi.org/pypi/" + str(project) + "/json" # Make API URI
    info_response = req.get(uri, headers=default_headers) # Read API
    if info_response.status_code != 200: # Make sure entry exists
        return ""
    response_json = json.loads(info_response.text) # Load JSON from response
    release_files = response_json.get("urls") or [] # Absent or empty list both mean "nothing to download"
    if not release_files:
        print("package with no uri")
        return ""
    return release_files[0]["url"] # Get first download URI

In [12]:
def get_pypi_size(project):
    """Return the download size of a PyPI package, given its name.

    The name is resolved to a download URI with ``get_pypi_uri`` and that URI
    is then measured with ``get_uri_size``.
    """
    download_uri = get_pypi_uri(project)
    return get_uri_size(download_uri)

In [13]:
# Measure the download size of a random sample of 100 gems
gem_sizes = (
    gem_index
    .sample(100)
    .get("uri")
    .apply(get_uri_size)
    .rename("size") # Name the series so that it can be merged
)
# Join the sizes onto the gem index by gem name
gem_with_sizes = gem_index.merge(gem_sizes, left_index=True, right_index=True)
gem_with_sizes

Unnamed: 0_level_0,uri,downloads,size
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
actionmailer_inline_css,https://rubygems.org/gems/actionmailer_inline_...,422899,10752
activerecord-immutable,https://rubygems.org/gems/activerecord-immutab...,191339,7168
aquarium,https://rubygems.org/gems/aquarium-0.7.3.gem,168018,139264
asciidoctor-plantuml,https://rubygems.org/gems/asciidoctor-plantuml...,5535892,7680
async-dns,https://rubygems.org/gems/async-dns-1.2.5.gem,488149,54272
...,...,...,...
vlad,https://rubygems.org/gems/vlad-2.7.0.gem,136495,29184
websocket-client-simple,https://rubygems.org/gems/websocket-client-sim...,748567,9728
wicked,https://rubygems.org/gems/wicked-1.3.4.gem,3662558,92672
wombat,https://rubygems.org/gems/wombat-2.10.0.gem,132020,1473024


In [29]:
# Work on a random sample of 100 NPM packages
node_sample = node_data.sample(100)
# Download counts: the package name is the index label of each row
node_downloads = (
    node_sample
    .apply(lambda pkg: get_node_download_count(pkg.name), axis=1)
    .rename("downloads")
)
# Download sizes, read from each package's URI
node_sizes = node_sample.get("uri").apply(get_uri_size).rename("size")
# Attach size first, then downloads, matching rows by package name
node_with_sizes = (
    node_data
    .merge(node_sizes, left_index=True, right_index=True)
    .merge(node_downloads, left_index=True, right_index=True)
)
# Show only the packages whose download count could be fetched (-1 marks an error)
node_with_sizes[node_with_sizes.get("downloads") != -1]

Unnamed: 0_level_0,uri,size,downloads
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
@adonisdavid20/platzom,https://registry.npmjs.org/@adonisdavid20/plat...,2668,200
@alifd/theme-6665,https://registry.npmjs.org/@alifd/theme-6665/-...,897035,347
@annotation-studio/plugin-transcriber,https://registry.npmjs.org/@annotation-studio/...,4536,11928
@bianic-ui/css-reset,https://registry.npmjs.org/@bianic-ui/css-rese...,6159,146
@choojs/nanocache,https://registry.npmjs.org/@choojs/nanocache/-...,5897,152
...,...,...,...
xengine,https://registry.npmjs.org/xengine/-/xengine-1...,3169,230
xingxing,https://registry.npmjs.org/xingxing/-/xingxing...,394,124
xtion-label,https://registry.npmjs.org/xtion-label/-/xtion...,477145,198
yuyii_scribe_send,https://registry.npmjs.org/yuyii_scribe_send/-...,625,121


In [None]:
# Measure the download size of a random sample of 100 PyPI packages;
# the package name is the index label of each row
pypi_sizes = (
    pypi_index
    .sample(100)
    .apply(lambda pkg: get_pypi_size(pkg.name), axis=1)
    .rename("size")
)
# Join the sizes onto the index and drop packages whose size is unknown (-1)
pypi_with_sizes = (
    pypi_index
    .merge(pypi_sizes, left_index=True, right_index=True)
    .loc[lambda merged: merged.get("size") != -1]
)
pypi_with_sizes