In [None]:
import dask.dataframe as dd
import intake
import pandas as pd
import hvplot.pandas
from cranlogs import cran_downloads

### Python package - Conda data

In [None]:
# Open the Anaconda package-download catalog and fetch the 2018 and 2019 tables as dask frames.
cat = intake.open_catalog('https://raw.githubusercontent.com/ContinuumIO/anaconda-package-data/master/catalog/anaconda_package_data.yaml')
df2018 = cat.anaconda_package_data_by_year(year=2018).to_dask()
df2019 = cat.anaconda_package_data_by_year(year=2019).to_dask()
# Stack both years with concat: DataFrame.append is deprecated/removed in current pandas and dask.
dfconda = dd.concat([df2018, df2019])
# Calendar month and year taken from the `time` column; used as grouping keys further down.
dfconda['month'] = dfconda.time.dt.month
dfconda['year'] = dfconda.time.dt.year


In [None]:
# Show the last five rows of the combined frame (five is the default row count).
dfconda.tail(5)

In [None]:
# Monthly conda download totals for the packages compared in this notebook.
dfconda = (
    dfconda
    .loc[dfconda.pkg_name.isin(['pandas','scikit-learn','statsmodels','xgboost','keras','tensorflow',
                                'matplotlib','bokeh','networkx','panel','voila','plotly'])]
    # Aggregate `counts` only: a bare .sum() would also try to add up non-numeric
    # columns such as `time`, which current pandas/dask reject instead of dropping.
    .groupby(['year','month','pkg_name'])[['counts']]
    .sum()
    .reset_index()
    .compute()
)
# Keep only (year, month, package) groups with at least one download, and renumber
# the rows so the column assignment below works on a fresh frame rather than a slice.
dfconda = dfconda.loc[dfconda.counts > 0].reset_index(drop=True)
# First-of-month label such as '2018-03-01'. Built column-wise instead of a row-wise
# apply that indexed each row by position (x[0], x[1]), which pandas has deprecated.
dfconda['Month'] = (
    dfconda['year'].astype(str)
    + '-'
    + dfconda['month'].astype(str).str.zfill(2)
    + '-01'
)

In [None]:
# Load the cached aggregate when conda.csv exists; otherwise write the frame computed
# by the preceding cells to conda.csv so later runs can load it. This replaces toggling
# a commented-out to_csv line by hand, which made a fresh run fail when the file was absent.
try:
    dfconda = pd.read_csv('conda.csv')
except FileNotFoundError:
    dfconda.to_csv('conda.csv', index=False)
# `Month` holds 'YYYY-MM-01' strings (in memory and after the CSV round trip); parse to timestamps.
dfconda['Month'] = pd.to_datetime(dfconda.Month)

In [None]:
# First five rows of the monthly conda table (five is the default row count).
dfconda.head(5)

In [None]:
# dfconda['pkg_name'] = 'Conda-'+dfconda['pkg_name']
# dfconda_wide = dfconda.pivot(index='Month', columns='pkg_name', values='counts')

### Python package - PyPI data

PyPI download data is published in Google BigQuery (https://packaging.python.org/guides/analyzing-pypi-package-downloads/). I used the following queries to retrieve the data.

In [None]:
# Read the PyPI download CSV, then parse its YYYYMM `month` field into a timestamp column `Month`.
dfpypi = pd.read_csv('pypi.csv')
dfpypi = dfpypi.assign(Month=pd.to_datetime(dfpypi['month'], format="%Y%m"))

In [None]:
# First five rows of the PyPI table (five is the default row count).
dfpypi.head(5)

In [None]:
# dfpypi['pkg_name'] = 'PyPI-'+dfpypi['pkg_name']
# dfpypi_wide = dfpypi.pivot(index='Month', columns='pkg_name', values='counts')

### Merge conda and PyPI 

In [None]:
# Conda side of the upcoming merge: first five rows.
dfconda.head(n=5)

In [None]:
# PyPI side of the upcoming merge: first five rows.
dfpypi.head(n=5)

In [None]:
# Row counts of the two inputs, as a (conda, pypi) pair.
(dfconda.shape[0], dfpypi.shape[0])

In [None]:
# Outer-join conda and PyPI on (pkg_name, Month). Each side first drops its raw
# date-part columns and renames `counts` so the two sources stay distinguishable.
dfpython = pd.merge(
    dfconda.drop(['year', 'month'], axis=1).rename(columns={'counts': 'conda_counts'}),
    dfpypi.drop(['month'], axis=1).rename(columns={'counts': 'pypi_counts'}),
    how='outer',
    on=['pkg_name', 'Month'],
)

In [None]:
# Total Python downloads per (package, month). The frame comes from an outer merge,
# so a row may have only one of the two sources; fill_value=0 treats the missing side
# as zero instead of letting NaN wipe out the side that is present.
dfpython['counts'] = dfpython['conda_counts'].add(dfpython['pypi_counts'], fill_value=0)
# Prefix package names so they remain identifiable as Python packages once pivoted to columns.
dfpython['pkg_name'] = 'Python-'+dfpython['pkg_name']

In [None]:
# First five rows of the merged Python table (five is the default row count).
dfpython.head(5)

In [None]:
# Number of rows in the merged Python table.
dfpython.shape[0]

In [None]:
# Reshape to wide form: one row per Month, one column per pkg_name, cells hold `counts`.
dfpython_wide = dfpython.set_index(['Month', 'pkg_name'])['counts'].unstack('pkg_name')

### R package - CRAN data

In [None]:
# Request CRAN download counts for the R packages in the comparison,
# covering 2018-01-01 through 2019-07-31.
dfcran = cran_downloads(
    [
        'dplyr',
        'tidyverse',
        'e1071',
        'caret',
        'keras',
        'tensorflow',
        'ggplot2',
        'shiny',
        'data.table',
    ],
    when='',
    start="2018-01-01",
    end="2019-07-31",
)


In [None]:
# Label every row with the first day of its calendar month, then total downloads
# per (Month, package).
dfcran['date'] = pd.to_datetime(dfcran['date'])
dfcran['Month'] = dfcran['date'].dt.strftime('%Y-%m-01')
# Sum `count` only: the frame still carries the datetime `date` column, and summing
# datetimes raises TypeError on pandas >= 2.0 (older versions silently dropped it).
dfcran = (
    dfcran
    .groupby(['Month', 'package'])[['count']]
    .sum()
    .reset_index()
)


In [None]:
# Load the cached aggregate when dfcran.csv exists; otherwise write the frame computed
# by the preceding cells to dfcran.csv so later runs can load it. This replaces toggling
# a commented-out to_csv line by hand, which made a fresh run fail when the file was absent.
try:
    dfcran = pd.read_csv('dfcran.csv')
except FileNotFoundError:
    dfcran.to_csv('dfcran.csv', index=False)
# `Month` holds 'YYYY-MM-01' strings (in memory and after the CSV round trip); parse to timestamps.
dfcran['Month'] = pd.to_datetime(dfcran.Month)


In [None]:
# Prefix every CRAN package name with 'R-'.
dfcran['package'] = dfcran['package'].radd('R-')

# Reshape to wide form: one row per Month, one column per package, cells hold `count`.
dfcran_wide = dfcran.set_index(['Month', 'package'])['count'].unstack('package')

### Merge data and plot

In [None]:
# Inner-join the wide Python and R tables on Month, keeping only months present in both.
df = pd.merge(dfpython_wide, dfcran_wide, how='inner', on='Month')

In [None]:
# Data-wrangling packages: each plotted column is paired with its line colour.
series_colors = {
    'Python-pandas': '#1E90FF',
    'R-dplyr': '#FFA07A',
    'R-tidyverse': '#CD5C5C',
    'R-data.table': '#8B0000',
}
df.hvplot(
    x='Month',
    y=list(series_colors),
    line_color=list(series_colors.values()),
    ylabel='counts',
)


In [None]:
# Machine-learning / statistics packages: each plotted column is paired with its line colour.
series_colors = {
    'Python-scikit-learn': '#1E90FF',
    'Python-statsmodels': '#87CEFA',
    'R-caret': '#FF4500',
    'R-e1071': '#FFA500',
}
df.hvplot(
    x='Month',
    y=list(series_colors),
    line_color=list(series_colors.values()),
    ylabel='counts',
)


In [None]:
# keras / tensorflow on both sides: each plotted column is paired with its line colour.
series_colors = {
    'Python-keras': '#87CEFA',
    'Python-tensorflow': '#1E90FF',
    'R-keras': '#FF4500',
    'R-tensorflow': '#FFA500',
}
df.hvplot(
    x='Month',
    y=list(series_colors),
    line_color=list(series_colors.values()),
    ylabel='counts',
)


In [None]:
# Plotting libraries: each plotted column is paired with its line colour.
series_colors = {
    'Python-matplotlib': '#1E90FF',
    'Python-plotly': '#87CEFA',
    'Python-bokeh': '#B0E0E6',
    'R-ggplot2': '#FF7F50',
}
df.hvplot(
    x='Month',
    y=list(series_colors),
    line_color=list(series_colors.values()),
    ylabel='counts',
)


In [None]:
# panel / voila versus shiny: each plotted column is paired with its line colour.
series_colors = {
    'Python-panel': '#1E90FF',
    'Python-voila': '#87CEFA',
    'R-shiny': '#FF7F50',
}
df.hvplot(
    x='Month',
    y=list(series_colors),
    line_color=list(series_colors.values()),
    ylabel='counts',
)
