![ga4](https://www.google-analytics.com/collect?v=2&tid=G-6VDTYWLKX6&cid=1&en=page_view&sid=1&dl=statmike%2Fvertex-ai-mlops%2Farchitectures%2Ftracking%2Fgithub&dt=GitHub+Metrics+-+1+-+Initial+Creation.ipynb)

# GitHub Statistics For /statmike/vertex-ai-mlops

Using the GitHub API for [/metrics/statistics](https://docs.github.com/en/rest/metrics/statistics?apiVersion=2022-11-28)

TODO:
- [X] Weekly commit data
- [X] Weekly author level commit data
- [ ] Get Commit History
- [ ] Get Files from Each Commit
- [ ] Traffic and Referrer Information


---
## COLAB SETUP

To run this notebook in Colab click [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/statmike/vertex-ai-mlops/blob/main/architectures/tracking/github/GitHub%20Metrics%20-%201%20-%20Initial%20Creation.ipynb) and run the cells in this section.  Otherwise, skip this section.

In [6]:
try:
  from google.cloud import secretmanager
except ImportError:
  !pip install google-cloud-secret-manager -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.4/100.4 KB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [7]:
# Authenticate the user when the notebook runs inside Colab.
# Outside Colab the `google.colab` package is missing and this cell is a no-op.
try:
    import google.colab
    from google.colab import auth
except ImportError:
    pass
else:
    try:
        auth.authenticate_user()
    except Exception as error:
        # stay best-effort (do not stop the notebook) but make the failure visible
        print(f'Colab authentication failed: {error}')

In [8]:
# Project to work in; the gcloud CLI configuration is pointed at the same project
PROJECT_ID = 'vertex-ai-mlops-369716' # replace with project ID
!gcloud config set project {PROJECT_ID}

Updated property [core/project].


---
## Setup

In [9]:
# Read the active project back from the gcloud CLI; the shell output is captured and its first element is used
project = !gcloud config get-value project
PROJECT_ID = project[0]
# display the resolved project id
PROJECT_ID

'vertex-ai-mlops-369716'

In [57]:
# Region setting (not referenced by the cells shown in this notebook section)
REGION = 'us-central1'

# GitHub account and repository the statistics are collected for
github_user, github_repo = 'statmike', 'vertex-ai-mlops'

# BigQuery destination: dataset name and the project that holds it
BQ_DATASET = 'github_metrics'
BQ_PROJECT = PROJECT_ID

In [11]:
import requests
import json
import time
from datetime import datetime
import pandas as pd
from io import StringIO
import os, shutil

from google.cloud import bigquery
from google.cloud import storage
from google.cloud import secretmanager

In [13]:
# Clients for the Google Cloud services used in this notebook
secret_client = secretmanager.SecretManagerServiceClient()
gcs = storage.Client(project = PROJECT_ID)
bq = bigquery.Client(project = PROJECT_ID)

In [15]:
# Fetch the latest version of the `github_api` secret and decode its payload to text;
# `pat` is later sent as the Bearer credential on GitHub API requests
secret_name = f'projects/{PROJECT_ID}/secrets/github_api/versions/latest'
secret = secret_client.access_secret_version(request = {"name": secret_name})
pat = secret.payload.data.decode('utf-8')

## GitHub API

In [16]:
# Base URL for GitHub REST API calls against the configured account/repository
github_api_url = f'https://api.github.com/repos/{github_user}/{github_repo}'

In [19]:
def metric_get(metric_type, max_retries = 30, wait_seconds = 10, timeout = 60):
  """Request a repository endpoint from the GitHub API, waiting out 202 responses.

  The request is repeated while the API answers with status code 202, so the
  caller receives the first response with any other status code.

  Args:
    metric_type: path appended to `github_api_url`, e.g. 'stats/code_frequency'.
    max_retries: how many additional requests are made while the answer stays 202.
    wait_seconds: pause between two consecutive requests, in seconds.
    timeout: per-request timeout in seconds handed to `requests.get`.

  Returns:
    The first response whose status code is not 202. The status code is not
    validated any further; callers inspect `response.status_code` themselves.

  Raises:
    TimeoutError: if the API still answers 202 after `max_retries` retries.
  """
  url = f'{github_api_url}/{metric_type}'
  headers = {'Authorization': f'Bearer {pat}', 'Accept': 'application/vnd.github+json'}

  response = requests.get(url, headers = headers, timeout = timeout)
  retries = 0
  while response.status_code == 202:
    # bound the wait so a result that never becomes ready cannot hang the notebook
    if retries >= max_retries:
      raise TimeoutError(f'{metric_type} still returned status 202 after {max_retries} retries')
    retries += 1
    time.sleep(wait_seconds)
    response = requests.get(url, headers = headers, timeout = timeout)
  return response

---
## code_frequency
- stats/code_frequency
- https://docs.github.com/en/rest/metrics/statistics?apiVersion=2022-11-28#get-the-weekly-commit-activity
- Full history of repository
- schema: list of list for each week
  - 3 elements are integers: UNIX Timestamp for 12AM each Sunday, additions, deletions
- LRO may return status_code = 202 while running, do retry until 200


In [41]:
# Request the code frequency statistic; `metric_get` repeats the call while the API answers 202
metric_type = 'stats/code_frequency'
response = metric_get(metric_type)
# display the final HTTP status code
response.status_code

200

In [42]:
# Each row of the JSON payload becomes one (week, additions, deletions) record;
# the epoch-seconds `week` value is converted to a datetime
code_frequency_rows = json.loads(response.text)
code_frequency = (
    pd.DataFrame(code_frequency_rows, columns = ['week', 'additions', 'deletions'])
    .assign(week = lambda frame: pd.to_datetime(frame['week'], unit = 's'))
)

In [43]:
# Display the weekly additions/deletions frame
code_frequency

Unnamed: 0,week,additions,deletions
0,2021-03-28,2983,-547
1,2021-04-04,7461,-3499
2,2021-04-11,12394,-6314
3,2021-04-18,7904,-6179
4,2021-04-25,0,0
...,...,...,...
94,2023-01-15,5367,-1536
95,2023-01-22,9295,-2389
96,2023-01-29,14897,-20539
97,2023-02-05,37346,-2162


---
## commit_activity
- stats/commit_activity
- https://docs.github.com/en/rest/metrics/statistics?apiVersion=2022-11-28#get-the-last-year-of-commit-activity
- last year of commit activity
- schema: list of dict for each week
  - week: UNIX Timestamp for 12AM each Sunday
  - total: int of number of commits
  - days: list of ints with commit per day [sunday, ..., saturday]
- LRO may return status_code = 202 while running, do retry until 200


In [45]:
# Request the commit activity statistic; `metric_get` repeats the call while the API answers 202
metric_type = 'stats/commit_activity'
response = metric_get(metric_type)
# display the final HTTP status code
response.status_code

200

In [46]:
# Keep the weekly totals only: drop the per-day breakdown, rename `total` to `commits`,
# and convert the epoch-seconds `week` value to a datetime
commit_activity = (
    pd.DataFrame(json.loads(response.text))
    .drop(columns = ['days'])
    .rename(columns = {'total' : 'commits'})
    .assign(week = lambda frame: pd.to_datetime(frame['week'], unit = 's'))
)

In [47]:
# Display the weekly commit counts frame
commit_activity

Unnamed: 0,commits,week
0,2,2022-02-20
1,0,2022-02-27
2,10,2022-03-06
3,14,2022-03-13
4,6,2022-03-20
5,7,2022-03-27
6,6,2022-04-03
7,4,2022-04-10
8,2,2022-04-17
9,0,2022-04-24


---
## contributors
- stats/contributors
- https://docs.github.com/en/rest/metrics/statistics?apiVersion=2022-11-28#get-all-contributor-commit-activity
- full history of repository
- schema: list of dict for contributor
  - total: total commits all time
  - weeks: list of dicts for each week:
    - w: UNIX Timestamp for 12AM each Sunday
    - a: additions
    - d: deletions
    - c: commits
  - author: dict
    - login: user name on GitHub
    - ...
- LRO may return status_code = 202 while running, do retry until 200


In [48]:
# Request the contributors statistic; `metric_get` repeats the call while the API answers 202
metric_type = 'stats/contributors'
response = metric_get(metric_type)
# display the final HTTP status code
response.status_code

200

In [60]:
# Flatten the nested payload into one record per (contributor, week);
# deletions are stored as negative numbers
parsed_response = [
    {
        'author': contributor['author']['login'],
        'week': week_stats['w'],
        'additions': week_stats['a'],
        'deletions': -1*week_stats['d'],
        'commits': week_stats['c'],
    }
    for contributor in json.loads(response.text)
    for week_stats in contributor['weeks']
]

In [61]:
# Inspect the first flattened record
parsed_response[0]

{'author': 'karticn-google',
 'week': 1616889600,
 'additions': 0,
 'deletions': 0,
 'commits': 0}

In [62]:
# Build the frame from the flattened records and convert the epoch-seconds `week` to a datetime
contributors = pd.DataFrame(parsed_response).assign(
    week = lambda frame: pd.to_datetime(frame['week'], unit = 's')
)

In [63]:
# Display the per-author weekly frame
contributors

Unnamed: 0,author,week,additions,deletions,commits
0,karticn-google,2021-03-28,0,0,0
1,karticn-google,2021-04-04,0,0,0
2,karticn-google,2021-04-11,0,0,0
3,karticn-google,2021-04-18,0,0,0
4,karticn-google,2021-04-25,0,0,0
...,...,...,...,...,...
391,statmike,2023-01-15,981,-380,7
392,statmike,2023-01-22,828,-450,5
393,statmike,2023-01-29,8087,-20539,13
394,statmike,2023-02-05,37346,-2162,24


---
## Pandas Tables

### weekly_commits

In [124]:
# Outer-join additions/deletions with commit counts on `week`,
# then tag every row with the GitHub account and repository
weekly_commits = (
    code_frequency
    .merge(commit_activity, on = 'week', how = 'outer')
    .assign(github_account = github_user, github_repo = github_repo)
)

### author_weekly_commits

In [127]:
# Tag every per-author row with the GitHub account and repository.
# `.assign` returns a new frame, so `contributors` itself is left unmodified
# (a plain `author_weekly_commits = contributors` would only alias the same object).
author_weekly_commits = contributors.assign(
    github_account = github_user,
    github_repo = github_repo,
)

---
## BigQuery Tables: Initial Creation


### weekly_commits

In [128]:
# Initial creation: load the full frame, replacing the table if it already exists
weekly_commits_destination = bigquery.TableReference.from_string(f"{BQ_PROJECT}.{BQ_DATASET}.weekly_commits")
weekly_commits_job_config = bigquery.LoadJobConfig(
    # WRITE_TRUNCATE = replace if exists, WRITE_APPEND = append if exists, WRITE_EMPTY = write new but dont overwrite
    write_disposition = 'WRITE_TRUNCATE',
    # detect schema
    autodetect = True,
)
weekly_commits_job = bq.load_table_from_dataframe(
    dataframe = weekly_commits,
    destination = weekly_commits_destination,
    job_config = weekly_commits_job_config,
)
# wait for the load job to finish
weekly_commits_job.result()

LoadJob<project=vertex-ai-mlops-369716, location=US, id=5f854c9b-045f-4b25-8827-331bc5c7b40b>

### author_weekly_commits

In [129]:
# Initial creation: load the full per-author frame, replacing the table if it already exists
author_weekly_commits_destination = bigquery.TableReference.from_string(f"{BQ_PROJECT}.{BQ_DATASET}.author_weekly_commits")
author_weekly_commits_job_config = bigquery.LoadJobConfig(
    # WRITE_TRUNCATE = replace if exists, WRITE_APPEND = append if exists, WRITE_EMPTY = write new but dont overwrite
    write_disposition = 'WRITE_TRUNCATE',
    # detect schema
    autodetect = True,
)
author_weekly_commits_job = bq.load_table_from_dataframe(
    dataframe = author_weekly_commits,
    destination = author_weekly_commits_destination,
    job_config = author_weekly_commits_job_config,
)
# wait for the load job to finish
author_weekly_commits_job.result()

LoadJob<project=vertex-ai-mlops-369716, location=US, id=11e2ba73-7106-4b9b-8e85-be3b6b8a783e>

## BigQuery Tables: Increment

- get the last week of data already in BigQuery: prior_weekly_commits
- keep all freshly pulled records for that 'last week' and newer: new_weekly_commits
- conditions:
  - if same number of rows and rows are identical: do nothing
  - elif rows are the same for the last week: remove it from new, then append any remaining new records
  - else (the row for the last week differs): drop it from BQ, then append

### weekly_commits

In [130]:
# Most recent week currently stored in BigQuery (row_number over week desc keeps a single row)
prior_weekly_commits_query = f"""SELECT t.* FROM `{BQ_PROJECT}.{BQ_DATASET}.weekly_commits` t WHERE 1=1 QUALIFY row_number() over(order by week desc) = 1"""
prior_weekly_commits = bq.query(query = prior_weekly_commits_query).to_dataframe()

In [131]:
# Display the most recent week read back from BigQuery
prior_weekly_commits

Unnamed: 0,week,additions,deletions,commits,github_account,github_repo
0,2023-02-12,26391,-6586,10.0,statmike,vertex-ai-mlops


In [132]:
# Freshly pulled rows for the most recent stored week and anything newer
latest_stored_week = prior_weekly_commits['week'].max()
new_weekly_commits = weekly_commits.loc[weekly_commits['week'] >= latest_stored_week]

In [133]:
# Display the candidate rows for the incremental load
new_weekly_commits

Unnamed: 0,week,additions,deletions,commits,github_account,github_repo
98,2023-02-12,26391,-6586,10.0,statmike,vertex-ai-mlops


In [134]:
# Incremental refresh of the BigQuery `weekly_commits` table.
# `prior_weekly_commits` is the most recent week already stored; `new_weekly_commits`
# is that same week plus anything newer from the fresh pull.
weekly_commits_table = f"{BQ_PROJECT}.{BQ_DATASET}.weekly_commits"
last_loaded_week = prior_weekly_commits['week'].max()
is_last_loaded_week = new_weekly_commits['week'] == last_loaded_week

# Did the stored week change in the fresh pull? Both sides are compared as plain lists
# (a DataFrame has `.values.tolist()` but no `.tolist()` of its own).
last_week_changed = prior_weekly_commits.values.tolist() != new_weekly_commits[is_last_loaded_week].values.tolist()

if last_week_changed:
  # stored week is stale: reload it together with any newer weeks
  weekly_commits_to_load = new_weekly_commits
else:
  # stored week is current: only the newer weeks (if any) remain to be appended
  weekly_commits_to_load = new_weekly_commits[~is_last_loaded_week]

# Nothing to load means nothing to do; in particular a stored week is never deleted
# unless replacement rows are about to be appended.
if not weekly_commits_to_load.empty:
  if last_week_changed:
    # remove old week from BQ
    job = bq.query(query = f"""DELETE FROM `{weekly_commits_table}` WHERE week = '{last_loaded_week}'""")
    job.result()
  # append updated and/or new week(s)
  new_weekly_commits_job = bq.load_table_from_dataframe(
      dataframe = weekly_commits_to_load,
      destination = bigquery.TableReference.from_string(weekly_commits_table),
      job_config = bigquery.LoadJobConfig(
          write_disposition = 'WRITE_APPEND', # WRITE_TRUNCATE = replace if exists, WRITE_APPEND = append if exists, WRITE_EMPTY = write new but dont overwrite
          autodetect = True, # detect schema
      )
  )
  new_weekly_commits_job.result()

### author_weekly_commits

In [135]:
# Most recent stored week for every author (row_number partitioned by author keeps one row each),
# sorted by week and author
prior_author_weekly_commits_query = f"""SELECT t.* FROM `{BQ_PROJECT}.{BQ_DATASET}.author_weekly_commits` t WHERE 1=1 QUALIFY row_number() over(partition by author order by week desc) = 1"""
prior_author_weekly_commits = (
    bq.query(query = prior_author_weekly_commits_query)
    .to_dataframe()
    .sort_values(by = ['week', 'author'])
)

In [136]:
# Display the most recent per-author rows read back from BigQuery
prior_author_weekly_commits

Unnamed: 0,author,week,additions,deletions,commits,github_account,github_repo
3,PavelPetukhov,2023-02-12,0,0,0,statmike,vertex-ai-mlops
2,goodrules,2023-02-12,0,0,0,statmike,vertex-ai-mlops
0,karticn-google,2023-02-12,0,0,0,statmike,vertex-ai-mlops
1,statmike,2023-02-12,26391,-6586,10,statmike,vertex-ai-mlops


In [137]:
# Freshly pulled per-author rows for the most recent stored week and anything newer,
# sorted by week and author
latest_stored_author_week = prior_author_weekly_commits['week'].max()
new_author_weekly_commits = (
    author_weekly_commits
    .loc[author_weekly_commits['week'] >= latest_stored_author_week]
    .sort_values(by = ['week', 'author'])
)

In [138]:
# Display the candidate per-author rows for the incremental load
new_author_weekly_commits

Unnamed: 0,author,week,additions,deletions,commits,github_account,github_repo
197,PavelPetukhov,2023-02-12,0,0,0,statmike,vertex-ai-mlops
296,goodrules,2023-02-12,0,0,0,statmike,vertex-ai-mlops
98,karticn-google,2023-02-12,0,0,0,statmike,vertex-ai-mlops
395,statmike,2023-02-12,26391,-6586,10,statmike,vertex-ai-mlops


In [139]:
# Incremental refresh of the BigQuery `author_weekly_commits` table.
# `prior_author_weekly_commits` holds the most recent stored row per author;
# `new_author_weekly_commits` holds the fresh rows for that week and newer.
# Both frames are expected to be sorted by week and author so the row-wise comparison lines up.
author_weekly_commits_table = f"{BQ_PROJECT}.{BQ_DATASET}.author_weekly_commits"
last_loaded_author_week = prior_author_weekly_commits['week'].max()
is_last_loaded_author_week = new_author_weekly_commits['week'] == last_loaded_author_week

# Did the stored week change in the fresh pull? Both sides are compared as plain lists
# (a DataFrame has `.values.tolist()` but no `.tolist()` of its own).
last_author_week_changed = prior_author_weekly_commits.values.tolist() != new_author_weekly_commits[is_last_loaded_author_week].values.tolist()

if last_author_week_changed:
  # stored week is stale: reload it together with any newer weeks
  author_weekly_commits_to_load = new_author_weekly_commits
else:
  # stored week is current: only the newer weeks (if any) remain to be appended
  author_weekly_commits_to_load = new_author_weekly_commits[~is_last_loaded_author_week]

# Nothing to load means nothing to do; in particular a stored week is never deleted
# unless replacement rows are about to be appended.
if not author_weekly_commits_to_load.empty:
  if last_author_week_changed:
    # remove old week from BQ (from the per-author table, not `weekly_commits`)
    job = bq.query(query = f"""DELETE FROM `{author_weekly_commits_table}` WHERE week = '{last_loaded_author_week}'""")
    job.result()
  # append updated and/or new week(s) to the per-author table
  new_author_weekly_commits_job = bq.load_table_from_dataframe(
      dataframe = author_weekly_commits_to_load,
      destination = bigquery.TableReference.from_string(author_weekly_commits_table),
      job_config = bigquery.LoadJobConfig(
          write_disposition = 'WRITE_APPEND', # WRITE_TRUNCATE = replace if exists, WRITE_APPEND = append if exists, WRITE_EMPTY = write new but dont overwrite
          autodetect = True, # detect schema
      )
  )
  new_author_weekly_commits_job.result()