## R package downloads

CRAN download stats are taken from [http://cran-logs.rstudio.com/](http://cran-logs.rstudio.com/) and were uploaded to [BigQuery](https://bigquery.cloud.google.com/dataset/cran-downloads-235616) so they can be queried.  


In [1]:
import json
import requests
from operator import itemgetter
from operator import itemgetter
import helpers

# Load the CRAN download counts from a local JSON export; each record is
# read through its "package" and "download_count" keys below.
with open('results-20190327-143800.json') as cran_json:
    cran_data = json.load(cran_json)

# Ascending by download count, so more-downloaded packages appear later in
# cran_list (this ordering is used as the tie-breaker when picking matches).
cran_sorted = sorted(cran_data, key=itemgetter('download_count'), reverse=False)
cran_list = [pkg.get("package") for pkg in cran_sorted]

# Fetch the package index of the Anaconda R channel (linux-64).
r = requests.get("https://repo.continuum.io/pkgs/r/linux-64/repodata.json")
r.raise_for_status()  # fail loudly on an HTTP error instead of parsing an error page
data = r.json()
pkgs = data.get("packages")

ad_r_list = []
for k, v in pkgs.items():
    name = v.get("name")
    # Drop only the leading "r-" prefix. str.replace("r-", "") would also
    # remove "r-" inside the name (e.g. "r-cluster-x" -> "clustex").
    ad_r_list.append(name[2:] if name.startswith("r-") else name)

# helpers.unique is defined outside this notebook; presumably it removes
# duplicate names (the same package is listed once per build) — confirm.
ad_r_list = helpers.unique(ad_r_list)

# R package names may differ slightly from Anaconda package names,
# find similar package names and map them to each other.
# helpers.similarity is defined outside this notebook; presumably it returns
# a ratio where 1.0 means identical names — confirm.
SIMILARITY_THRESHOLD = 0.8
similarity_list = []
for r_pkg in ad_r_list:
    for cran_pkg in cran_list:
        score = helpers.similarity(r_pkg, cran_pkg)
        if score > SIMILARITY_THRESHOLD:
            similarity_list.append({"anaconda": r_pkg, "cran": cran_pkg, "score": score})

# Keep exactly one CRAN candidate per Anaconda package: the best-scoring one.
# On equal scores the later candidate wins, i.e. the one with more downloads,
# because cran_list is in ascending download order. Simply keeping the last
# candidate would let a near match override an exact match.
best_match = {}
for match in similarity_list:
    current = best_match.get(match["anaconda"])
    if current is None or match["score"] >= current["score"]:
        best_match[match["anaconda"]] = match
scores = sorted(best_match.values(), key=lambda i: i["anaconda"])

# CRAN-side names of the packages available from the Anaconda R channel.
anaconda_pkgs = [pkg.get("cran") for pkg in scores]
anaconda_pkgs_set = set(anaconda_pkgs)


CRAN Downloads


In [4]:
import os

from google.cloud import bigquery
from google.oauth2 import service_account

# Location of the service-account key file. GOOGLE_APPLICATION_CREDENTIALS is
# honoured when set so the notebook is not tied to one machine; otherwise the
# original hard-coded location is used.
credentials_path = os.environ.get(
    'GOOGLE_APPLICATION_CREDENTIALS',
    '/Users/sparafina/Documents/cran-downloads-cf6f22173f3d.json',
)
credentials = service_account.Credentials.from_service_account_file(credentials_path)
project_id = 'cran-downloads-235616'

client = bigquery.Client(credentials=credentials, project=project_id)

# Top 1000 packages by number of download rows in the date window.
# Both date bounds are zero-padded: if the `date` column is stored as a
# string, "2018-03-1" compares lexicographically and would silently drop
# 2018-03-01 .. 2018-03-09; the padded form is correct for DATE and STRING.
QUERY = ('SELECT COUNT(*) AS download_count, package '
         'FROM cran_downloads20182303.downloads '
         'WHERE date between "2018-03-01" AND "2019-03-01" '
         'GROUP BY package '
         'ORDER BY download_count DESC '
         'LIMIT 1000')
query_job = client.query(QUERY)
rows = query_job.result()  # blocks until the query job has finished

# Convert the BigQuery result rows to plain Python structures: a list of
# records, the package names in result order, and the names as a set.
cran_pkgs = [
    {"download_count": row.download_count, "package": row.package}
    for row in rows
]
cran_pkgs_list = [record["package"] for record in cran_pkgs]

cran_pkgs_set = set(cran_pkgs_list)


Compare Anaconda R packages with CRAN R packages

In [5]:
# Packages from the Anaconda R channel that also appear in the top 1000 CRAN packages.
shared_pkgs = anaconda_pkgs_set & cran_pkgs_set
# Top CRAN packages that have no counterpart in the Anaconda R repository.
unavailable_pkgs = cran_pkgs_set - anaconda_pkgs_set

# helpers.compare_packages is defined outside this notebook; presumably it
# returns the cran_pkgs records whose package is in the given set — confirm.
# The records are ordered from most to least downloaded.
unavailable_pkgs_list = sorted(
    helpers.compare_packages(unavailable_pkgs, cran_pkgs),
    key=lambda record: record["download_count"],
    reverse=True,
)

# Persist the ranked list for use outside the notebook.
with open('unavailable_anaconda_r_pkgs.json', 'w') as out:
    json.dump(unavailable_pkgs_list, out)
    

Find Conda-Forge R packages


In [7]:
# Read the conda-forge R package listing, one package name per line, and
# reduce each name to its CRAN-style form by dropping the "r-" prefix.
cf_r_pkg = []
with open('conda-forge-R-pkgs.txt') as cf_r_data:
    for pkg in cf_r_data:
        name = pkg.strip()
        if not name:
            continue  # ignore blank lines
        # Remove the literal "r-" prefix only. str.lstrip('r-') strips every
        # leading 'r' or '-' character, turning "r-rcpp" into "cpp".
        cf_r_pkg.append(name[2:] if name.startswith('r-') else name)
cf_r_pkgs_set = set(cf_r_pkg)

# Top CRAN packages that conda-forge provides ...
cf_r_shared_pkgs = cf_r_pkgs_set.intersection(cran_pkgs_set)
# ... and top CRAN packages that conda-forge does NOT provide. This must be a
# set difference; an intersection would just repeat the shared packages.
cf_r_unavailable = cran_pkgs_set.difference(cf_r_pkgs_set)

# helpers.compare_packages is defined outside this notebook; presumably it
# returns the cran_pkgs records whose package is in the given set — confirm.
cf_unavailable_pkgs_list = helpers.compare_packages(cf_r_unavailable, cran_pkgs)
# Most-downloaded missing packages first.
cf_unavailable_pkgs = sorted(cf_unavailable_pkgs_list, key=itemgetter("download_count"), reverse=True)

with open('unavailable_conda_forge_r_pkgs.json', 'w') as out:
    json.dump(cf_unavailable_pkgs, out)

