Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions .github/workflows/daily.yaml
Original file line number Diff line number Diff line change
@@ -1,18 +1,19 @@
name: Daily Collection

on:
workflow_dispatch:
schedule:
- cron: '0 0 * * *'

jobs:
collect:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v1
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v1
uses: actions/setup-python@v5
with:
python-version: 3.8
python-version: '3.13'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/dryrun.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@ jobs:
collect:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v1
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v1
uses: actions/setup-python@v5
with:
python-version: 3.8
python-version: '3.13'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
Expand Down
23 changes: 23 additions & 0 deletions .github/workflows/lint.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Workflow that runs the project's style checks.
name: Style Checks

# Triggered by every push, and by pull requests when they are opened or reopened.
on:
  push:
  pull_request:
    types:
      - opened
      - reopened

jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      # Fetch the repository contents.
      - uses: actions/checkout@v4

      # Provision the interpreter used for the checks.
      - name: Set up Python 3.13
        uses: actions/setup-python@v5
        with:
          python-version: '3.13'

      # Upgrade pip, then install invoke and the package with its `dev` extra.
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install invoke .[dev]

      # Delegate the actual checks to the `lint` invoke task.
      - name: Run lint checks
        run: invoke lint

6 changes: 3 additions & 3 deletions .github/workflows/manual.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,11 @@ jobs:
collect:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v1
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v1
uses: actions/setup-python@v5
with:
python-version: 3.8
python-version: '3.13'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
Expand Down
6 changes: 5 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -59,11 +59,15 @@ install: clean-build clean-pyc ## install the package to the active Python's sit

# Development dependencies are installed through the package's `dev` extra.
.PHONY: install-develop
install-develop: clean-build clean-pyc ## install the package in editable mode and dependencies for development
	pip install -e .[dev]


# LINT TARGETS

# Both targets delegate to the invoke task of the same name.
.PHONY: lint
lint: ## check style with flake8 and isort
	invoke lint

.PHONY: fix-lint
fix-lint: ## run the invoke fix-lint task to fix style issues
	invoke fix-lint
26 changes: 0 additions & 26 deletions dev-requirements.txt

This file was deleted.

83 changes: 60 additions & 23 deletions download_analytics/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,15 +71,21 @@ def _valid_date(arg):
def _get_parser():
# Logging
logging_args = argparse.ArgumentParser(add_help=False)
logging_args.add_argument('-v', '--verbose', action='count', default=0,
help='Be verbose. Use `-vv` for increased verbosity.')
logging_args.add_argument('-l', '--logfile',
help='If given, file where the logs will be written.')
logging_args.add_argument(
'-v',
'--verbose',
action='count',
default=0,
help='Be verbose. Use `-vv` for increased verbosity.',
)
logging_args.add_argument(
'-l', '--logfile', help='If given, file where the logs will be written.'
)

parser = argparse.ArgumentParser(
prog='download-analytics',
description='Download Analytics Command Line Interface',
parents=[logging_args]
parents=[logging_args],
)
parser.set_defaults(action=None)
action = parser.add_subparsers(title='action')
Expand All @@ -90,36 +96,67 @@ def _get_parser():
collect.set_defaults(action=_collect)

collect.add_argument(
'-o', '--output-folder', type=str, required=False,
'-o',
'--output-folder',
type=str,
required=False,
help=(
'Path to the folder where data will be stored. It can be a local path or a'
' Google Drive folder path in the format gdrive://<folder-id>'
)
),
)
collect.add_argument(
'-a', '--authentication-credentials', type=str, required=False,
help='Path to the GCP (BigQuery) credentials file to use.')
'-a',
'--authentication-credentials',
type=str,
required=False,
help='Path to the GCP (BigQuery) credentials file to use.',
)
collect.add_argument(
'-c', '--config-file', type=str, default='config.yaml',
help='Path to the configuration file.')
'-c',
'--config-file',
type=str,
default='config.yaml',
help='Path to the configuration file.',
)
collect.add_argument(
'-p', '--projects', nargs='*',
help='List of projects to collect. If not given use the configured ones.')
'-p',
'--projects',
nargs='*',
help='List of projects to collect. If not given use the configured ones.',
)
collect.add_argument(
'-s', '--start-date', type=_valid_date, required=False,
help='Date from which to start pulling data.')
'-s',
'--start-date',
type=_valid_date,
required=False,
help='Date from which to start pulling data.',
)
collect.add_argument(
'-m', '--max-days', type=int, required=False,
help='Max days of data to pull if start-date is not given.')
'-m',
'--max-days',
type=int,
required=False,
help='Max days of data to pull if start-date is not given.',
)
collect.add_argument(
'-d', '--dry-run', action='store_true',
help='Do not run the actual query, only simulate it.')
'-d',
'--dry-run',
action='store_true',
help='Do not run the actual query, only simulate it.',
)
collect.add_argument(
'-f', '--force', action='store_true',
help='Force the download even if the data already exists or there is a gap')
'-f',
'--force',
action='store_true',
help='Force the download even if the data already exists or there is a gap',
)
collect.add_argument(
'-M', '--add-metrics', action='store_true',
help='Compute the aggregation metrics and create the corresponding spreadsheets.')
'-M',
'--add-metrics',
action='store_true',
help='Compute the aggregation metrics and create the corresponding spreadsheets.',
)

return parser

Expand Down
11 changes: 7 additions & 4 deletions download_analytics/bq.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,10 @@ def _get_bq_client(credentials_file):
scopes=['https://www.googleapis.com/auth/cloud-platform'],
)

return bigquery.Client(credentials=credentials, project=credentials.project_id,)
return bigquery.Client(
credentials=credentials,
project=credentials.project_id,
)


def run_query(query, dry_run=False, credentials_file=None):
Expand All @@ -41,14 +44,14 @@ def run_query(query, dry_run=False, credentials_file=None):

job_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)
dry_run_job = client.query(query, job_config=job_config)
LOGGER.info('Estimated processed GBs: %.2f', dry_run_job.total_bytes_processed / 1024 ** 3)
LOGGER.info('Estimated processed GBs: %.2f', dry_run_job.total_bytes_processed / 1024**3)

if dry_run:
return None

query_job = client.query(query)
data = query_job.to_dataframe()
LOGGER.info('Total processed GBs: %.2f', query_job.total_bytes_processed / 1024 ** 3)
LOGGER.info('Total billed GBs: %.2f', query_job.total_bytes_billed / 1024 ** 3)
LOGGER.info('Total processed GBs: %.2f', query_job.total_bytes_processed / 1024**3)
LOGGER.info('Total billed GBs: %.2f', query_job.total_bytes_billed / 1024**3)

return data
6 changes: 1 addition & 5 deletions download_analytics/drive.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,11 +91,7 @@ def upload(content, filename, folder, convert=False):
except FileNotFoundError:
file_config = {
'title': filename,
'parents': [
{
'id': folder
}
],
'parents': [{'id': folder}],
}
drive_file = drive.CreateFile(file_config)

Expand Down
14 changes: 11 additions & 3 deletions download_analytics/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,16 @@
LOGGER = logging.getLogger(__name__)


def collect_downloads(projects, output_folder, start_date=None, max_days=1, credentials_file=None,
dry_run=False, force=False, add_metrics=True):
def collect_downloads(
projects,
output_folder,
start_date=None,
max_days=1,
credentials_file=None,
dry_run=False,
force=False,
add_metrics=True,
):
"""Pull data about the downloads of a list of projects.

Args:
Expand Down Expand Up @@ -49,7 +57,7 @@ def collect_downloads(projects, output_folder, start_date=None, max_days=1, cred
max_days=max_days,
credentials_file=credentials_file,
dry_run=dry_run,
force=force
force=force,
)

if pypi_downloads.empty:
Expand Down
19 changes: 8 additions & 11 deletions download_analytics/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import logging
import re

import pandas as pd

from download_analytics.output import create_spreadsheet

LOGGER = logging.getLogger(__name__)
Expand Down Expand Up @@ -35,24 +37,21 @@ def _historical_groupby(downloads, groupbys=None):

for groupby in groupbys:
grouped = downloads.groupby([year_month, groupby])
grouped_sizes = grouped.size().unstack(-1)
grouped_sizes = grouped.size().unstack(-1) # noqa: PD010
if len(groupbys) > 1:
grouped_sizes.columns = f"{groupby}='" + grouped_sizes.columns + "'"

base[grouped_sizes.columns] = grouped_sizes.fillna(0)

totals = base.sum()
totals.name = 'total'
base = base.append(totals)
base = pd.concat([base, totals], ignore_index=True)

return base.reset_index().iloc[::-1]


def _get_sheet_name(column):
words = [
f'{word[0].upper()}{word[1:]}'
for word in column.split('_')
]
words = [f'{word[0].upper()}{word[1:]}' for word in column.split('_')]
return ' '.join(['By'] + words)


Expand Down Expand Up @@ -121,7 +120,7 @@ def _version_element_order_key(version):
# while it shouldn't enter the `if`.
pass

components.append(last_component[len(last_numeric):])
components.append(last_component[len(last_numeric) :])

return components

Expand All @@ -133,7 +132,7 @@ def _version_order_key(version_column):
def _mangle_columns(downloads):
downloads = downloads.rename(columns=RENAME_COLUMNS)
downloads['full_python_version'] = downloads['python_version']
downloads['python_version'] = downloads['python_version'].str.rsplit('.', 1).str[0]
downloads['python_version'] = downloads['python_version'].str.rsplit('.', n=1).str[0]
downloads['project_version'] = downloads['project'] + '-' + downloads['version']
downloads['distro_version'] = downloads['distro_name'] + ' ' + downloads['distro_version']
downloads['distro_kernel'] = downloads['distro_version'] + ' - ' + downloads['distro_kernel']
Expand All @@ -150,9 +149,7 @@ def compute_metrics(downloads, output_path=None):
downloads = _mangle_columns(downloads)

LOGGER.debug('Aggregating by month')
sheets = {
'By Month': _by_month(downloads)
}
sheets = {'By Month': _by_month(downloads)}

for column in GROUPBY_COLUMNS:
name = _get_sheet_name(column)
Expand Down
Loading