In [10]:
import requests
from bs4 import BeautifulSoup

In [11]:
url = "https://gist.github.com/anvaka/8e8fa57c7ee1350e3491"
res = requests.get(url)
data = res.content
# print(BeautifulSoup(data, 'html.parser').prettify())

In [12]:
parsedData = BeautifulSoup(data, "html.parser")

In [19]:
topDependencies = parsedData.select("#file-01-most-dependent-upon-md-readme > article > ol > li")
selectors = [
    '#file-01-most-dependent-upon-md-readme > article > ol > li',
    '#file-02-with-most-dependencies-md-readme > article > ol > li',
    '#file-03-pagerank-md-readme > article > ol > li'
]
packages = set()
top_k = 25
for selector in selectors:
    topPackages = parsedData.select(selector)
    for html in topPackages[:top_k]:
        packages.add(html.find('a').text)
packages


{'3.0.0',
 '@ericmcornelius/ease',
 '@ngxvoice/ngx-voicelistner',
 '@rstacruz/pnpm',
 'async',
 'axios',
 'axios-yet-another-proxy-fix',
 'babel-runtime',
 'bloater',
 'bluebird',
 'chalk',
 'classnames',
 'colors',
 'commander',
 'debug',
 'digital-keyboard-demos',
 'express',
 'fhir2',
 'fs-extra',
 'glob',
 'inquirer',
 'js-tokens',
 'lodash',
 'loose-envify',
 'merino',
 'miguelcostero-ng2-toasty',
 'minimist',
 'mkdirp',
 'moment',
 'ms',
 'no-one-left-behind',
 'object-assign',
 'pb-schema',
 'primeng-custom',
 'prop-types',
 'react',
 'react-angular-component',
 'react-dom',
 'react-misc-toolbox',
 'react-native-handcheque-engine',
 'react-native-ok-sdk',
 'react-native-template-vife',
 'react-native-version-manager',
 'react-redux-demo1',
 'readable-stream',
 'regenerator-runtime',
 'request',
 'safe-buffer',
 'search-list-react',
 'sindresorhus.js',
 'tslib',
 'underscore',
 'uuid',
 'vue',
 'vue-compment',
 'wc-starterkit',
 'webche',
 'webpack',
 'yargs'}

In [23]:
from datetime import datetime

packageDependencies = dict()
packageFeatures = dict()

for package in packages:
    npmUrl = "https://registry.npmjs.org/" + package
    res = requests.get(npmUrl)
    
    if res.status_code == 200:
        try:
            jsonData = res.json()
            latestVersion = jsonData["dist-tags"]["latest"]
            packageFeatures[package] = dict()
            
            packageFeatures[package]["latestVersion"] = latestVersion
            packageFeatures[package]["versions"] = dict()
            
            for version in jsonData["versions"]:
                packageFeatures[package]["versions"][version] = dict()
                
                # Extract dependencies for each version of the package
                try:
                    deps = jsonData["versions"][version]["dependencies"]
                    packageFeatures[package]["versions"][version]["deps"] = list(deps.items())
                except KeyError:
                    packageFeatures[package]["versions"][version]["deps"] = []
            
                # Extract keywords for each version of the package
                if "keywords" in jsonData.keys():
                    packageFeatures[package]["versions"][version]["keywords"] = jsonData["keywords"]
                else:
                    packageFeatures[package]["versions"][version]["keywords"] = []
                    
                # Time is in ISO format. Convert to epoch
                publishTime = jsonData["time"][version]
                publishTime = publishTime.replace('Z', '+00:00')
                publishTime = datetime.fromisoformat(publishTime)
                timestamp = datetime.strftime(publishTime, '%s')
                packageFeatures[package]["versions"][version]["timestamp"] = timestamp
            
                # Extract description for each version of the package
                packageFeatures[package]["versions"][version]["description"] = jsonData["description"]
        
            # For each package, get the download count for last 1 year
            period = 'last-year'
            apiUrl = f'https://api.npmjs.org/downloads/point/{period}/{package}'
            res = requests.get(apiUrl)
            packageFeatures[package]["downloads"] = res.json()["downloads"]
            
        except Exception as e:
            print(f"[ERROR] for {package}: {e}")
        
packageFeatures

ERROR for 3.0.0: 'dist-tags'


{'fhir2': {'latestVersion': '1.0.0',
  'versions': {'1.0.0': {'deps': [('@angular/common', '2.0.0'),
     ('@angular/compiler', '2.3.1'),
     ('@angular/core', '2.3.1'),
     ('@angular2-material/core', '^2.0.0-alpha.8-3'),
     ('abbrev', '^1.1.0'),
     ('accepts', '^1.3.3'),
     ('acorn', '^4.0.11'),
     ('acorn-dynamic-import', '^2.0.2'),
     ('adm-zip', '^0.4.7'),
     ('after', '^0.8.2'),
     ('agent-base', '^2.0.1'),
     ('ajv', '^4.11.8'),
     ('ajv-keywords', '^1.5.1'),
     ('align-text', '^0.1.4'),
     ('alphanum-sort', '^1.0.2'),
     ('amdefine', '^1.0.1'),
     ('angular2-infinite-scroll', '^0.3.5'),
     ('ansi-align', '^2.0.0'),
     ('ansi-escapes', '^1.4.0'),
     ('ansi-html', '^0.0.7'),
     ('ansi-regex', '^2.1.1'),
     ('ansi-styles', '^2.2.1'),
     ('any-promise', '^1.3.0'),
     ('anymatch', '^1.3.0'),
     ('app-root-path', '^2.0.1'),
     ('append-transform', '^0.4.0'),
     ('aproba', '^1.1.1'),
     ('are-we-there-yet', '^1.1.4'),
     ('argparse',

In [24]:
import pandas as pd

dep_data = []
feat_data = []

for package, feats in packageFeatures.items():
    latestVersion = feats['latestVersion']
    downloads = feats['downloads']
    
    for version, versionData in feats['versions'].items():
        for dep in versionData['deps']:
            dep_data.append((package, version, dep[0], dep[1]))
            
        feat_data.append((package, version, versionData["keywords"], versionData["description"], versionData['timestamp'], latestVersion, downloads))

In [25]:
dep_df = pd.DataFrame(dep_data, columns=['Package', 'Version', 'Dependency', 'Dependency_Version'])
dep_df

Unnamed: 0,Package,Version,Dependency,Dependency_Version
0,fhir2,1.0.0,@angular/common,2.0.0
1,fhir2,1.0.0,@angular/compiler,2.3.1
2,fhir2,1.0.0,@angular/core,2.3.1
3,fhir2,1.0.0,@angular2-material/core,^2.0.0-alpha.8-3
4,fhir2,1.0.0,abbrev,^1.1.0
...,...,...,...,...
161416,bloater,1.0.0,vuepress,^0.11.0
161417,bloater,1.0.0,webpack,^4.16.1
161418,bloater,1.0.0,webscaledb,^0.1.6
161419,bloater,1.0.0,yo,^2.0.3


In [26]:
feat_df = pd.DataFrame(feat_data, columns=['Package', 'Version', 'Keywords', 'Description', 'Publish_Time', 'Latest_Version', 'Downloads'])
feat_df

Unnamed: 0,Package,Version,Keywords,Description,Publish_Time,Latest_Version,Downloads
0,fhir2,1.0.0,[],This project was generated with [Angular CLI](...,1495542546,1.0.0,104
1,express,0.14.0,"[express, framework, sinatra, web, http, rest,...","Fast, unopinionated, minimalist web framework",1293622705,4.21.1,1519354973
2,express,0.14.1,"[express, framework, sinatra, web, http, rest,...","Fast, unopinionated, minimalist web framework",1293622705,4.21.1,1519354973
3,express,1.0.0,"[express, framework, sinatra, web, http, rest,...","Fast, unopinionated, minimalist web framework",1293622705,4.21.1,1519354973
4,express,1.0.1,"[express, framework, sinatra, web, http, rest,...","Fast, unopinionated, minimalist web framework",1293622705,4.21.1,1519354973
...,...,...,...,...,...,...,...
8218,bloater,0.2.3,[],For when your node_modules folder is not large...,1531998366,1.0.0,344
8219,bloater,0.2.4,[],For when your node_modules folder is not large...,1531998535,1.0.0,344
8220,bloater,0.2.5,[],For when your node_modules folder is not large...,1531999544,1.0.0,344
8221,bloater,0.2.6,[],For when your node_modules folder is not large...,1588244103,1.0.0,344


In [82]:
# TODO: use | for separator as descriptions contains commas and versions contain dots

#### Versioning Info

- `~version` **“Approximately equivalent to version”**, will automatically update you to all future patch versions that are backwards-compatible, without incrementing the minor version. `~1.2.3` will use releases from `1.2.3` to `< 1.3.0`.


- `^version` **“Compatible with version”**, will automatically update you to all future minor/patch versions that are backwards-compatible, without incrementing the major version. `^1.2.3` will use releases from `1.2.3` to `< 2.0.0`.