![ga4](https://www.google-analytics.com/collect?v=2&tid=G-6VDTYWLKX6&cid=1&en=page_view&sid=1&dl=statmike%2Fvertex-ai-mlops%2Farchitectures%2Ftracking%2Fsetup%2Fgithub&dt=GitHub+Metrics+-+3+-+Reporting+Scheduled+Query.ipynb)

# GitHub Metrics: BQ Scheduled Query For Reporting

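This notebook builds the `reporting.github_contributions` table in BigQuery from the GitHub metrics tables and prepares a scheduled query that refreshes the most recent week of data.

Reference: [Scheduling queries - set up scheduled queries](https://cloud.google.com/bigquery/docs/scheduling-queries#set_up_scheduled_queries)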



---
## COLAB SETUP

To run this notebook in Colab click [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/statmike/vertex-ai-mlops/blob/main/architectures/tracking/setup/github/GitHub%20Metrics%20-%203%20-%20Reporting%20Scheduled%20Query.ipynb) and run the cells in this section.  Otherwise, skip this section.

In [1]:
try:
  from google.cloud import bigquery_datatransfer
except ImportError:
  !pip install google-cloud-bigquery-datatransfer -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.2/62.2 KB[0m [31m624.1 kB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h

In [2]:
# Authenticate the user when running inside Colab; do nothing elsewhere.
try:
    import google.colab
    from google.colab import auth
except ImportError:
    # Not running in Colab: skip the Colab-specific authentication step.
    pass
else:
    # Only the import is guarded, so a real authentication failure is raised
    # here instead of being silently swallowed.
    auth.authenticate_user()

In [3]:
PROJECT_ID = 'vertex-ai-mlops-369716' # replace with project ID
!gcloud config set project {PROJECT_ID}

Updated property [core/project].


---
## Setup

In [32]:
# Read the active project back from the gcloud configuration. The `!` capture
# yields a list of output lines; the first line is taken as the project ID.
project = !gcloud config get-value project
PROJECT_ID = project[0]
# Display the resolved project ID as the cell output.
PROJECT_ID

'vertex-ai-mlops-369716'

In [33]:
# Region name plus the BigQuery project and dataset of the reporting table.
REGION = "us-central1"

BQ_DATASET = "reporting"
BQ_PROJECT = PROJECT_ID

In [6]:
import requests
import json
import time
from datetime import datetime
import pandas as pd
from io import StringIO
import os, shutil

from google.cloud import bigquery
from google.cloud import bigquery_datatransfer

In [34]:
# API clients: `bq` submits queries, `transfer_client` talks to the
# BigQuery Data Transfer Service.
transfer_client = bigquery_datatransfer.DataTransferServiceClient()
bq = bigquery.Client(project=PROJECT_ID)

## BQ Query Design

In [9]:
# Full reporting query: weekly contributions per author.
# Any weekly totals in `weekly_commits` that are not accounted for by the per-author
# rows in `author_weekly_commits` are attributed to 'statmike', then merged with the
# per-author rows. Source tables are read from BQ_PROJECT rather than a hard-coded
# project ID so the query follows the project configured in Setup.
query = f"""
WITH
  # calculate contributions not attributed to author and assign to statmike
  RESIDUALS AS (
    SELECT 'statmike' as author, week,
      overall.additions - authors.additions AS additions,
      overall.deletions - authors.deletions as deletions,
      overall.commits - authors.commits as commits,
      github_account, github_repo
    FROM (SELECT * FROM `{BQ_PROJECT}.github_metrics.weekly_commits`) AS overall
    JOIN (SELECT week, sum(additions) as additions, sum(deletions) as deletions, sum(commits) as commits
        FROM `{BQ_PROJECT}.github_metrics.author_weekly_commits`
        GROUP BY week) AS authors
    USING(week)
    WHERE overall.additions != authors.additions
      OR overall.deletions != authors.deletions
      OR overall.commits != authors.commits),
  # combine attributions back with authors
  COMBINED AS (
    SELECT * FROM RESIDUALS
    UNION ALL
    SELECT * FROM `{BQ_PROJECT}.github_metrics.author_weekly_commits`)
# combine attributions for author statmike
SELECT author, week, SUM(additions) as additions, SUM(deletions) as deletions, sum(commits) as commits, github_account, github_repo
FROM COMBINED
GROUP BY author, week, github_account, github_repo
"""
# print() keeps the multi-line SQL readable (a bare expression would show escaped newlines).
print(query)


WITH
  # calculate contributions not attributed to author and assign to statmike
  RESIDUALS AS (
    SELECT 'statmike' as author, week, 
      overall.additions - authors.additions AS additions,
      overall.deletions - authors.deletions as deletions,
      overall.commits - authors.commits as commits,
      github_account, github_repo
    FROM (SELECT * FROM `vertex-ai-mlops-369716.github_metrics.weekly_commits`) AS overall
    JOIN (SELECT week, sum(additions) as additions, sum(deletions) as deletions, sum(commits) as commits
        FROM `vertex-ai-mlops-369716.github_metrics.author_weekly_commits`
        GROUP BY week) AS authors
    USING(week)
    WHERE overall.additions != authors.additions
      OR overall.deletions != authors.deletions
      OR overall.commits != authors.commits),
  # combine attributions back with authors
  COMBINED AS (
    SELECT * FROM RESIDUALS
    UNION ALL
    SELECT * FROM `vertex-ai-mlops-369716.github_metrics.author_weekly_commits`)
# combine att

In [10]:
# Run the reporting query and show the result as a DataFrame.
bq.query(query).to_dataframe()

Unnamed: 0,author,week,additions,deletions,commits,github_account,github_repo
0,karticn-google,2021-03-28,0,0,0.0,statmike,vertex-ai-mlops
1,karticn-google,2021-04-04,0,0,0.0,statmike,vertex-ai-mlops
2,karticn-google,2021-04-11,0,0,0.0,statmike,vertex-ai-mlops
3,karticn-google,2021-04-18,0,0,0.0,statmike,vertex-ai-mlops
4,karticn-google,2021-04-25,0,0,0.0,statmike,vertex-ai-mlops
...,...,...,...,...,...,...,...
391,statmike,2023-02-05,37346,-2162,24.0,statmike,vertex-ai-mlops
392,statmike,2022-09-11,67894,-65551,27.0,statmike,vertex-ai-mlops
393,statmike,2022-10-02,405573,-389656,29.0,statmike,vertex-ai-mlops
394,statmike,2022-09-18,13432,-4092,34.0,statmike,vertex-ai-mlops


## BigQuery Initial Table Creation

In [12]:
# Materialize the reporting query as the `github_contributions` table
# (replacing any existing table) and wait for the job to finish.
create_table_sql = f"""CREATE OR REPLACE TABLE `{BQ_PROJECT}.{BQ_DATASET}.github_contributions` AS {query}"""
job = bq.query(create_table_sql)
job.result()

<google.cloud.bigquery.table._EmptyRowIterator at 0x7f0dc19ecf10>

## BigQuery Incremental Update

In [18]:
# Incremental refresh script: delete the rows for weeks within the last 7 days from the
# reporting table, then re-insert them using the same logic as the full query restricted
# to that window. Source tables are read from BQ_PROJECT (matching the target table
# reference) rather than a hard-coded project ID.
increment_query = f"""
DELETE FROM `{BQ_PROJECT}.{BQ_DATASET}.github_contributions` WHERE week >= DATE_SUB(CURRENT_DATE(), INTERVAL 1 WEEK);
INSERT INTO `{BQ_PROJECT}.{BQ_DATASET}.github_contributions`
  WITH
    # calculate contributions not attributed to author and assign to statmike
    RESIDUALS AS (
      SELECT 'statmike' as author, week,
        overall.additions - authors.additions AS additions,
        overall.deletions - authors.deletions as deletions,
        overall.commits - authors.commits as commits,
        github_account, github_repo
      FROM (SELECT *
        FROM `{BQ_PROJECT}.github_metrics.weekly_commits`
        WHERE week >= DATE_SUB(CURRENT_DATE(), INTERVAL 1 WEEK)) AS overall
      JOIN (SELECT week, sum(additions) as additions, sum(deletions) as deletions, sum(commits) as commits
          FROM `{BQ_PROJECT}.github_metrics.author_weekly_commits`
          WHERE week >= DATE_SUB(CURRENT_DATE(), INTERVAL 1 WEEK)
          GROUP BY week) AS authors
      USING(week)
      WHERE overall.additions != authors.additions
        OR overall.deletions != authors.deletions
        OR overall.commits != authors.commits),
    # combine attributions back with authors
    COMBINED AS (
      SELECT * FROM RESIDUALS
      UNION ALL
      SELECT *
        FROM `{BQ_PROJECT}.github_metrics.author_weekly_commits`
        WHERE week >= DATE_SUB(CURRENT_DATE(), INTERVAL 1 WEEK))
  # combine attributions for author statmike
  SELECT author, week, SUM(additions) as additions, SUM(deletions) as deletions, sum(commits) as commits, github_account, github_repo
  FROM COMBINED
  GROUP BY author, week, github_account, github_repo;
"""
# print() keeps the multi-line SQL readable (a bare expression would show escaped newlines).
print(increment_query)


DELETE FROM `vertex-ai-mlops-369716.reporting.github_contributions` WHERE week >= DATE_SUB(CURRENT_DATE(), INTERVAL 1 WEEK);
INSERT INTO `vertex-ai-mlops-369716.reporting.github_contributions`
  WITH
    # calculate contributions not attributed to author and assign to statmike
    RESIDUALS AS (
      SELECT 'statmike' as author, week, 
        overall.additions - authors.additions AS additions,
        overall.deletions - authors.deletions as deletions,
        overall.commits - authors.commits as commits,
        github_account, github_repo
      FROM (SELECT *
        FROM `vertex-ai-mlops-369716.github_metrics.weekly_commits`
        WHERE week >= DATE_SUB(CURRENT_DATE(), INTERVAL 1 WEEK)) AS overall
      JOIN (SELECT week, sum(additions) as additions, sum(deletions) as deletions, sum(commits) as commits
          FROM `vertex-ai-mlops-369716.github_metrics.author_weekly_commits`
          WHERE week >= DATE_SUB(CURRENT_DATE(), INTERVAL 1 WEEK)
          GROUP BY week) AS authors

In [19]:
# Execute the incremental refresh script once and block until it completes.
job = bq.query(increment_query)
job.result()

<google.cloud.bigquery.table._EmptyRowIterator at 0x7f0dc1957460>

## Schedule BigQuery Query

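**For now, set up the scheduled query in the BigQuery console using the `increment_query` printed above.** The cells below are a work-in-progress sketch of the same setup through the BigQuery Data Transfer API.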


In [26]:
# Scheduled-query definition: submit `increment_query` on the "every mon 08:00" schedule.
transfer_config = bigquery_datatransfer.TransferConfig(
    data_source_id="scheduled_query",
    display_name="github_contributions - update monday",
    schedule="every mon 08:00",
    params={"query": increment_query},
    dataset_region="us",
)

In [None]:
# Register the scheduled query with the BigQuery Data Transfer Service.
# The parent path is derived from BQ_PROJECT instead of a hard-coded project number,
# so the request targets whichever project the notebook is configured for.
# NOTE: `transfer_config` is rebound to the object returned by the API call.
transfer_config = transfer_client.create_transfer_config(
    bigquery_datatransfer.CreateTransferConfigRequest(
        parent=transfer_client.common_location_path(BQ_PROJECT, "us"),
        transfer_config=transfer_config,
    )
)

In [20]:
# Build the project-level resource path for BQ_PROJECT with the client's
# path helper, then display it as the cell output.
parent = transfer_client.common_project_path(BQ_PROJECT)
parent

'projects/vertex-ai-mlops-369716'