## Get data

Read data from anaconda and pypi. Get package data from repodata.json

```bash
curl https://repo.anaconda.com/pkgs/main/linux-64/repodata.json -o repodata.json
jq -r '.packages[].name' repodata.json | uniq > anaconda.txt
```

Used [pypinfo](https://github.com/ofek/pypinfo) to query the top 5000 pypi downloads in Google Big Query. Follow the directions to create a project on BigQuery. Note that this query is reduced to the top 1000.

```bash
pypinfo -l 5000 -j --days 365 "" project > popular-pypi-downloads.json
```

In [70]:
import json
from operator import itemgetter

anaconda_data = open('anaconda.txt')
anaconda_pkgs = set()
for pkg in anaconda_data:
    anaconda_pkgs.add(pkg.rstrip())

pypi_data = open('popular-pypi-downloads.json')
pypi_json = json.load(pypi_data)
pypi_pkgs = set()
pypi_projects = pypi_json['rows']
pypi_top1k = pypi_projects[:1000]                          
for project in pypi_top1k:
    pypi_pkgs.add(project.get('project'))

## Compare the difference

In [71]:
pkg_diff = sorted(pypi_pkgs.difference(anaconda_pkgs))
pkg_intersect = sorted(pypi_pkgs.intersection(anaconda_pkgs))
print('Number of Anaconda packages that are not in PyPI:\n {}'.format(len(pkg_diff)))
print('\nAnaconda packages not in PyPI:\n')
for pkg in pkg_diff:
      print(pkg)
print('\nNumber of Anaconda packages that are in PyPI:\n {}'.format(len(pkg_intersect)))
print('Anaconda packages in PyPi:\n')
for pkg in pkg_intersect:
      print(pkg)

Number of Anaconda packages that are not in PyPI:
 613

Anaconda packages not in PyPI:

acme
adal
altgraph
amqp
analytics-python
aniso8601
ansible
antlr4-python3-runtime
apache-beam
applicationinsights
appnope
apscheduler
argh
argparse
aspy-yaml
attrdict
autobahn
avro
awscli
awscli-cwlogs
awsebcli
azure-batch
azure-cli-acr
azure-cli-acs
azure-cli-advisor
azure-cli-appservice
azure-cli-backup
azure-cli-batch
azure-cli-batchai
azure-cli-billing
azure-cli-cdn
azure-cli-cloud
azure-cli-cognitiveservices
azure-cli-command-modules-nspkg
azure-cli-configure
azure-cli-consumption
azure-cli-container
azure-cli-core
azure-cli-cosmosdb
azure-cli-dla
azure-cli-dls
azure-cli-eventgrid
azure-cli-extension
azure-cli-feedback
azure-cli-find
azure-cli-interactive
azure-cli-iot
azure-cli-keyvault
azure-cli-lab
azure-cli-monitor
azure-cli-network
azure-cli-nspkg
azure-cli-profile
azure-cli-rdbms
azure-cli-redis
azure-cli-reservations
azure-cli-resource
azure-cli-role
azure-cli-servicefabric
azure-cli-sql

In [72]:
missing_pkg_list = []
for pkg in pkg_diff:
    for prj in by_download_counts:
        if pkg == prj.get('project'):
            missing_pkg_list.append(prj)

by_download_counts = sorted(missing_pkg_list, key = itemgetter('download_count'), reverse=True)

for download in by_download_counts:
    print(download)

{'download_count': 220589827, 'project': 'awscli'}
{'download_count': 220184575, 'project': 'rsa'}
{'download_count': 100903711, 'project': 'awscli-cwlogs'}
{'download_count': 47059653, 'project': 'google-api-core'}
{'download_count': 42725585, 'project': 'argparse'}
{'download_count': 42312992, 'project': 'google-cloud-core'}
{'download_count': 38210088, 'project': 'cachetools'}
{'download_count': 36510505, 'project': 'google-auth'}
{'download_count': 35081177, 'project': 'httplib2'}
{'download_count': 33981589, 'project': 'oauth2client'}
{'download_count': 33693132, 'project': 'google-api-python-client'}
{'download_count': 29790532, 'project': 'uritemplate'}
{'download_count': 29473624, 'project': 'google-resumable-media'}
{'download_count': 28892088, 'project': 'docker'}
{'download_count': 28719299, 'project': 'docker-pycreds'}
{'download_count': 28552132, 'project': 'prompt-toolkit'}
{'download_count': 27532499, 'project': 'google-cloud-bigquery'}
{'download_count': 26027857, 'proj

## Calculate total downloads

In [73]:
download_total = 0
for pkg in pypi_top1k:
    download_total = download_total + pkg.get('download_count')
    
missing_total = 0
for pkg in missing_pkg_list:
    missing_total = missing_total + pkg.get('download_count')
    
print('Total PyPi downloads 15561303313')
print('Total top 1K downloads: {}'.format(download_total))
print('Total missing pkg downloads: {}'.format(missing_total))
print('Percentage of top 1K downloads: {}'.format(download_total/15561303313*100))
print('Percentage of missing packages in top 1k downloads: {}'.format(missing_total/download_total*100))
print('Percentage of missing packages in total downloads: {}'.format(missing_total/15561303313*100))


Total PyPi downloads 15561303313
Total top 1K downloads: 14403311239
Total missing pkg downloads: 3323612917
Percentage of top 1K downloads: 92.55851485760445
Percentage of missing packages in top 1k downloads: 23.075339148407885
Percentage of missing packages in total downloads: 21.35819121412173


## Get conda-forge data

In [74]:
from bs4 import BeautifulSoup
import requests

url = 'http://conda-forge.org/feedstocks/'
cf_feedstocks = requests.get(url)
feedstocks = set()
soup = BeautifulSoup(cf_feedstocks.content, 'html.parser')
for item in soup.find_all('li', attrs={'class':'list-group-item'}):
    feedstocks.add(item.find('a').contents[0].lstrip().rstrip())

for fs in feedstocks:
    print(fs)


flask-potion
r-bwstest
sphinxcontrib-srclinks
httpretty
distro
allennlp
osmnx
r-simcomp
maya
port-for
pyjwt
r-gsubfn
setuptools-markdown
agg-regrid
nauty
pseudonetcdf
schema-salad
coursera-dl
r-msgps
c99-to-c89
pywin32
r-rastervis
pytest-ignore-flaky
cwltool
r-geor
request
sunpy
r-htmltools
pymemcache
r-spacyr
easy-thumbnails
r-lightgbm
r-lmertest
r-pscl
pygments
reg
r-npsurv
pyct
r-pinfsc50
keystoneauth1
openmp
mycli
r-rngtools
r-prettygraphs
r-doparallel
nipy
pyotp
isl
diff-match-patch
pysot
netcdf-scm
r-intamap
r-ptw
nidaqmx-python
wq.core
pyjnius
r-bmp
shutilwhich-cwdpatch
cudatoolkit-dev
m2r
r-sn
shapelib
halide
xorg-xtrans
libobjcryst
r-biclust
leptonica
r-goodpractice
backports_abc
pep8-naming
dis3
freecad
r-promises
pylint
r-rccmisc
r-nloptr
flake8-builtins
lalsimulation
r-subselect
ldaptor
enum34
spyder-reports
aioftp
r-tfdatasets
doc2dash
bisonpp
terraform-provider-logentries
tempdir
ipaddress
sphinx-confluence
r-sp
r-waffle
aioeasywebdav
jupyterlab_launcher
pdbpp
auditwheel


r-shinyalert
arpack
wsgiref
kenjutsu
r-obistools
aws-iam-authenticator
django-bakery
r-docopt
pyaudio
r-uuid
gwosc
xsv
f2c
pygithub
py_stringsimjoin
mshr
psyplot-gui
radical.analytics
scs
click-completion
datreant.data
phonenumbers
intel-hybrid-driver
r-lsa
r-anytime
r-relaimpo
typing_extensions
pytorch-pretrained-bert
r-tidyposterior
vue
markov_draftjs
xorg-xf86bigfontproto
oset
r-textclean
r-goftest
atomap
cfitsio
calliope
pysdl2
mmtf-python
ujson
jug
configobj
r-rsm
larray_eurostat
bunch
jupyter-hdfscm
r-svd
libgd
ijroi
r-metrics
r-paco
r-worrms
pygc
x264
jupyter_server
pygypsy
pymsgbox
python-memcached
r-clustermq
r-intervals
wasabi
py6s
weightedcalcs
dependencies
bob
python-libarchive-c
r-proj4
python-twitter
sedflux
conda-devenv
r-ttr
vifm
urllib3
perl-xml-parser
r-plspm
ipaddr
autodocsumm
r-snftool
jpeg
premailer
xorg-font-util
databases
r-idpmisc
astcheck
cmt
r-r2d3
r-geometry
yarn
aioslacker
pyreadline
r-cvxbiclustr
ipywidget-pivot-table
otwrapy
pyorbital
wrapt
r-gsalib
ps2ff


drms
clrng
r-tibble
r-functional
r-ca
neurdflib
ntplib
google-auth
r-ddrtree
public-wrappers
r-palr
ete3
paramz
mizani
mpi4py
pvl
occt
r-xlconnectjars
r-aplpack
zsh
pycocotools
versiontools
cyclus-build-deps
freetds
r-smatr
r-mgcv
django-controlcenter
r-venndiagram
r-maxlik
pytools
vaex-ui
pygraphml
r-rda
udunits2
avrocpp
textract
lp_solve
argon2_cffi
jansson
esmf
pip
s3cmd
pymca
pywebhdfs
r-wesanderson
r-fastcluster
electron
r-r.filesets
cartopy
stir
r-diagram
r-gptk
r-chemospec
ldas-tools-frameapi
terraform-provider-arukas
regionmask
r-summarytools
pytest-cookies
gwdatafind
python-constraint
xattr
r-wikipedir
palp
jupyterlab-git
chrpath
breathe
mailer
libnl
qimage2ndarray
r-rcmdcheck
r-r.devices
sleek-lvalert
r-proxy
r-fuzzyjoin
afterimage
swi-prolog
pynac
metafone
r-nor1mix
libassuan
flake8-rst
mysql
d3
r-magrittr
intake-spark
pyiron_dft
escapism
r-ggmap
r-qlcmatrix
whitebox_tools
terraform-provider-vault
jupyter_pivottablejs
r-celestial
r-hms
setuptools_markdown
python-highcharts
r

## Compare conda-forge feedstocks to packages that Anaconda is missing from the top 1000K PyPI packages

In [75]:
cf_pkg_diff = pypi_pkgs.difference(feedstocks)
print("Number of conda-forge pkgs that Anaconda lacks from pypi top 1K: {}\n".format(len(cf_pkg_diff)))
cf_diff_sorted = sorted(cf_pkg_diff)
for cf_pkg in cf_diff_sorted:
    print(cf_pkg)

Number of conda-forge pkgs that Anaconda lacks from pypi top 1K: 340

acme
analytics-python
antlr4-python3-runtime
aspy-yaml
awsebcli
azure-batch
azure-cli-acr
azure-cli-acs
azure-cli-advisor
azure-cli-appservice
azure-cli-backup
azure-cli-batch
azure-cli-batchai
azure-cli-billing
azure-cli-cdn
azure-cli-cloud
azure-cli-cognitiveservices
azure-cli-command-modules-nspkg
azure-cli-configure
azure-cli-consumption
azure-cli-container
azure-cli-cosmosdb
azure-cli-dla
azure-cli-dls
azure-cli-eventgrid
azure-cli-extension
azure-cli-feedback
azure-cli-find
azure-cli-interactive
azure-cli-iot
azure-cli-keyvault
azure-cli-lab
azure-cli-monitor
azure-cli-network
azure-cli-nspkg
azure-cli-profile
azure-cli-rdbms
azure-cli-redis
azure-cli-reservations
azure-cli-resource
azure-cli-role
azure-cli-servicefabric
azure-cli-sql
azure-cli-storage
azure-cli-vm
azure-cosmosdb-nspkg
azure-cosmosdb-table
azure-datalake-store
azure-eventgrid
azure-mgmt-advisor
azure-mgmt-applicationinsights
azure-mgmt-batch
az