In [73]:
import requests
from bs4 import BeautifulSoup

In [74]:
url = "https://gist.github.com/anvaka/8e8fa57c7ee1350e3491"
res = requests.get(url)
data = res.content
print(data)



In [75]:
parsedData = BeautifulSoup(data, "html.parser")

In [76]:
topDependencies = parsedData.select("#file-01-most-dependent-upon-md-readme > article > ol > li")

In [77]:
top = 25
packages = [(item.find('a').text, item.find('a')['href']) for item in topDependencies[:top]]
packages

[('lodash', 'https://www.npmjs.org/package/lodash'),
 ('chalk', 'https://www.npmjs.org/package/chalk'),
 ('request', 'https://www.npmjs.org/package/request'),
 ('commander', 'https://www.npmjs.org/package/commander'),
 ('react', 'https://www.npmjs.org/package/react'),
 ('express', 'https://www.npmjs.org/package/express'),
 ('debug', 'https://www.npmjs.org/package/debug'),
 ('async', 'https://www.npmjs.org/package/async'),
 ('fs-extra', 'https://www.npmjs.org/package/fs-extra'),
 ('moment', 'https://www.npmjs.org/package/moment'),
 ('prop-types', 'https://www.npmjs.org/package/prop-types'),
 ('react-dom', 'https://www.npmjs.org/package/react-dom'),
 ('bluebird', 'https://www.npmjs.org/package/bluebird'),
 ('underscore', 'https://www.npmjs.org/package/underscore'),
 ('vue', 'https://www.npmjs.org/package/vue'),
 ('axios', 'https://www.npmjs.org/package/axios'),
 ('tslib', 'https://www.npmjs.org/package/tslib'),
 ('mkdirp', 'https://www.npmjs.org/package/mkdirp'),
 ('glob', 'https://www.n

In [78]:
from datetime import datetime

packageDependencies = dict()
packageFeatures = dict()

for package, url in packages:
    npmUrl = "https://registry.npmjs.org/" + package
    res = requests.get(npmUrl)
    jsonData = res.json()
    latestVersion = jsonData["dist-tags"]["latest"]
    packageFeatures[package] = dict()
    
    packageFeatures[package]["latestVersion"] = latestVersion
    packageFeatures[package]["versions"] = dict()
    
    for version in jsonData["versions"]:
        packageFeatures[package]["versions"][version] = dict()
        
        # Extract dependencies for each version of the package
        try:
            deps = jsonData["versions"][version]["dependencies"]
            packageFeatures[package]["versions"][version]["deps"] = list(deps.items())
        except KeyError:
            packageFeatures[package]["versions"][version]["deps"] = []
    
        # Extract keywords for each version of the package
        if "keywords" in jsonData.keys():
            packageFeatures[package]["versions"][version]["keywords"] = jsonData["keywords"]
        else:
            packageFeatures[package]["versions"][version]["keywords"] = []
            
        # Time is in ISO format. Convert to epoch
        publishTime = jsonData["time"][version]
        publishTime = publishTime.replace('Z', '+00:00')
        publishTime = datetime.fromisoformat(publishTime)
        timestamp = datetime.strftime(publishTime, '%s')
        packageFeatures[package]["versions"][version]["timestamp"] = timestamp
    
        # Extract description for each version of the package
        packageFeatures[package]["versions"][version]["description"] = jsonData["description"]

    # For each package, get the download count for last 1 year
    period = 'last-year'
    apiUrl = f'https://api.npmjs.org/downloads/point/{period}/{package}'
    res = requests.get(apiUrl)
    packageFeatures[package]["downloads"] = res.json()["downloads"]
        
packageFeatures

{'lodash': {'latestVersion': '4.17.21',
  'versions': {'0.1.0': {'deps': [],
    'keywords': ['modules', 'stdlib', 'util'],
    'timestamp': '1335170232',
    'description': 'Lodash modular utilities.'},
   '0.2.0': {'deps': [],
    'keywords': ['modules', 'stdlib', 'util'],
    'timestamp': '1337630784',
    'description': 'Lodash modular utilities.'},
   '0.2.1': {'deps': [],
    'keywords': ['modules', 'stdlib', 'util'],
    'timestamp': '1337867588',
    'description': 'Lodash modular utilities.'},
   '0.2.2': {'deps': [],
    'keywords': ['modules', 'stdlib', 'util'],
    'timestamp': '1338335786',
    'description': 'Lodash modular utilities.'},
   '0.3.0': {'deps': [],
    'keywords': ['modules', 'stdlib', 'util'],
    'timestamp': '1338984109',
    'description': 'Lodash modular utilities.'},
   '0.3.1': {'deps': [],
    'keywords': ['modules', 'stdlib', 'util'],
    'timestamp': '1339359171',
    'description': 'Lodash modular utilities.'},
   '0.3.2': {'deps': [],
    'keywor

In [79]:
import pandas as pd

dep_data = []
feat_data = []

for package, feats in packageFeatures.items():
    latestVersion = feats['latestVersion']
    downloads = feats['downloads']
    
    for version, versionData in feats['versions'].items():
        for dep in versionData['deps']:
            dep_data.append((package, version, dep[0], dep[1]))
            
        feat_data.append((package, version, versionData["keywords"], versionData["description"], versionData['timestamp'], latestVersion, downloads))

In [80]:
dep_df = pd.DataFrame(dep_data, columns=['Package', 'Version', 'Dependency', 'Dependency_Version'])
dep_df

Unnamed: 0,Package,Version,Dependency,Dependency_Version
0,chalk,0.1.0,has-color,~0.1.0
1,chalk,0.1.0,ansi-styles,~0.1.0
2,chalk,0.1.1,has-color,~0.1.0
3,chalk,0.1.1,ansi-styles,~0.1.0
4,chalk,0.2.0,has-color,~0.1.0
...,...,...,...,...
34239,webpack,5.95.0,schema-utils,^3.2.0
34240,webpack,5.95.0,tapable,^2.1.1
34241,webpack,5.95.0,terser-webpack-plugin,^5.3.10
34242,webpack,5.95.0,watchpack,^2.4.1


In [81]:
feat_df = pd.DataFrame(feat_data, columns=['Package', 'Version', 'Keywords', 'Description', 'Publish_Time', 'Latest_Version', 'Downloads'])
feat_df

Unnamed: 0,Package,Version,Keywords,Description,Publish_Time,Latest_Version,Downloads
0,lodash,0.1.0,"[modules, stdlib, util]",Lodash modular utilities.,1335170232,4.17.21,2544258246
1,lodash,0.2.0,"[modules, stdlib, util]",Lodash modular utilities.,1337630784,4.17.21,2544258246
2,lodash,0.2.1,"[modules, stdlib, util]",Lodash modular utilities.,1337867588,4.17.21,2544258246
3,lodash,0.2.2,"[modules, stdlib, util]",Lodash modular utilities.,1338335786,4.17.21,2544258246
4,lodash,0.3.0,"[modules, stdlib, util]",Lodash modular utilities.,1338984109,4.17.21,2544258246
...,...,...,...,...,...,...,...
7534,classnames,2.3.2,"[react, css, classes, classname, classnames, u...",A simple utility for conditionally joining cla...,1663022088,2.5.1,677332625
7535,classnames,2.3.3,"[react, css, classes, classname, classnames, u...",A simple utility for conditionally joining cla...,1703484299,2.5.1,677332625
7536,classnames,2.4.0,"[react, css, classes, classname, classnames, u...",A simple utility for conditionally joining cla...,1703565116,2.5.1,677332625
7537,classnames,2.5.0,"[react, css, classes, classname, classnames, u...",A simple utility for conditionally joining cla...,1703694374,2.5.1,677332625
