In [1]:
import boto3

In [2]:
import os
# Use the 'ghactivity' AWS profile unless AWS_PROFILE is already set in the
# environment. setdefault() returns the value now in effect, which is what the
# cell displays.
os.environ.setdefault('AWS_PROFILE', 'ghactivity')

'ghactivity'

In [3]:
def get_job_details(job_name):
    """Fetch a job's item from the DynamoDB ``jobs`` table.

    Args:
        job_name: Value of the ``job_id`` key to look up.

    Returns:
        dict: The stored item exactly as boto3 returns it.

    Raises:
        KeyError: If the table holds no item whose ``job_id`` equals
            ``job_name``.
    """
    dynamodb = boto3.resource('dynamodb')
    table = dynamodb.Table('jobs')
    response = table.get_item(Key={'job_id': job_name})
    # The response has no 'Item' entry when nothing matches the key; report
    # which job was missing instead of failing with a bare KeyError('Item').
    job_details = response.get('Item')
    if job_details is None:
        raise KeyError(f"No job with job_id={job_name!r} in table 'jobs'")
    return job_details

In [4]:
# Load the stored configuration for the ingest job and display it.
job_details = get_job_details('ghactivity_ingest')
job_details

{'job_description': 'Ingest ghactivity data to s3',
 'is_active': 'Y',
 'job_id': 'ghactivity_ingest',
 'baseline_days': Decimal('3')}

In [5]:
# Number of days to look back for the first run. The value comes back from
# DynamoDB as a Decimal (see the output), not an int.
baseline_days = job_details['baseline_days']
baseline_days

Decimal('3')

In [6]:
from datetime import datetime as dt

In [7]:
from datetime import timedelta as td

In [8]:
# Step back `baseline_days` whole days from today's date. baseline_days is a
# Decimal here, hence the int() conversion before building the timedelta.
# NOTE(review): dt.now() reads the local clock, while the archive events shown
# further down carry UTC timestamps -- confirm which day boundary is intended.
lookback = td(days=int(baseline_days))
start_time = dt.now().date() - lookback
start_time

datetime.date(2022, 6, 5)

In [9]:
# Name of the hour-0 archive file for the baseline day: '<YYYY-MM-DD>-0.json.gz'.
start_day = start_time.strftime('%Y-%m-%d')
start_file = f"{start_day}-0.json.gz"

In [10]:
start_file

'2022-06-05-0.json.gz'

In [11]:
# Rebuild the job item by hand, now carrying a bookmark that points at the
# baseline file. This rebinds `job_details`, replacing the item fetched earlier.
job_details = dict(
    job_id='ghactivity_ingest',
    job_description='Ingest ghactivity data to s3',
    is_active='Y',
    baseline_days=3,
    job_run_bookmark_details=dict(last_run_file_name=start_file),
)

In [12]:
# Notebook-level DynamoDB resource handle, used by the cells below.
dynamodb = boto3.resource('dynamodb')

In [13]:
# Handle to the 'jobs' table, the same table get_job_details() reads from.
table = dynamodb.Table('jobs')

In [14]:
# Write the hand-built job item, including the bookmark, to the 'jobs' table.
# The response metadata is displayed as the cell output.
table.put_item(Item=job_details)

{'ResponseMetadata': {'RequestId': 'FP3KCO881COGKQARCKP9EIQ7O3VV4KQNSO5AEMVJF66Q9ASUAAJG',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'server': 'Server',
   'date': 'Tue, 07 Jun 2022 23:03:30 GMT',
   'content-type': 'application/x-amz-json-1.0',
   'content-length': '2',
   'connection': 'keep-alive',
   'x-amzn-requestid': 'FP3KCO881COGKQARCKP9EIQ7O3VV4KQNSO5AEMVJF66Q9ASUAAJG',
   'x-amz-crc32': '2745614147'},
  'RetryAttempts': 0}}

In [15]:
# Read the item back to check that job_run_bookmark_details was stored.
table.get_item(Key={'job_id': 'ghactivity_ingest'})

{'Item': {'job_description': 'Ingest ghactivity data to s3',
  'is_active': 'Y',
  'job_id': 'ghactivity_ingest',
  'baseline_days': Decimal('3'),
  'job_run_bookmark_details': {'last_run_file_name': '2022-06-05-0.json.gz'}},
 'ResponseMetadata': {'RequestId': '5LM46Q438QJBF9GVIILTV1H0AJVV4KQNSO5AEMVJF66Q9ASUAAJG',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'server': 'Server',
   'date': 'Tue, 07 Jun 2022 23:03:38 GMT',
   'content-type': 'application/x-amz-json-1.0',
   'content-length': '233',
   'connection': 'keep-alive',
   'x-amzn-requestid': '5LM46Q438QJBF9GVIILTV1H0AJVV4KQNSO5AEMVJF66Q9ASUAAJG',
   'x-amz-crc32': '2433398915'},
  'RetryAttempts': 0}}

In [16]:
# Fetch the job item again and keep only its bookmark sub-document.
job_item = table.get_item(Key={'job_id': 'ghactivity_ingest'})['Item']
job_run_bookmark_details = job_item['job_run_bookmark_details']

In [17]:
job_run_bookmark_details

{'last_run_file_name': '2022-06-05-0.json.gz'}

In [18]:
# Everything before the first '.' is the '<date>-<hour>' stem of the file name.
last_run_file_name = job_run_bookmark_details['last_run_file_name']
dt_part = last_run_file_name.partition('.')[0]

In [19]:
dt_part

'2022-06-05-0'

In [21]:
# Advance the bookmark by one hour to get the next archive file to download.
# File names use an unpadded hour ('...-1.json.gz'), so the hour is taken from
# the .hour attribute instead of the '%-H' directive, which is a
# platform-specific strftime extension and fails where the C library lacks it.
next_run_time = dt.strptime(dt_part, '%Y-%m-%d-%H') + td(hours=1)
next_file = f"{next_run_time:%Y-%m-%d}-{next_run_time.hour}.json.gz"

In [22]:
next_file

'2022-06-05-1.json.gz'

In [23]:
import requests

In [24]:
# Download the next hourly archive. requests has no default timeout, so set one
# to keep a stalled connection from hanging the notebook.
res = requests.get(f'https://data.gharchive.org/{next_file}', timeout=60)
# Stop here on a 4xx/5xx response so that an error page is never written to
# disk as if it were an archive.
# NOTE(review): the 127-byte 2022-06-07-8.json.gz in data/ looks like such a
# saved error body -- unconfirmed.
res.raise_for_status()

In [29]:
# Open the local target file in binary-write mode, truncating it if it exists.
# NOTE(review): the handle stays open until the later file.close() cell runs;
# a `with open(...)` block in a single cell would close it even if the write
# fails.
file = open(f'data/{next_file}', 'wb')

In [30]:
# Write the downloaded response body as raw bytes. write() returns the number
# of bytes written, which is the value displayed as the cell output.
file.write(res.content)

39078924

In [31]:
# Close the handle opened above so buffered bytes are flushed to disk.
file.close()

In [32]:
!ls -ltr data/

total 223936
-rw-r--r--  1 itversity  staff       127 Jun  7 10:46 2022-06-07-8.json.gz
-rw-r--r--  1 itversity  staff  41575819 Jun  7 11:02 2022-06-05-8.json.gz
-rw-r--r--  1 itversity  staff  33989636 Jun  8 04:22 2022-06-05-0.json.gz
-rw-r--r--  1 itversity  staff  39078924 Jun  8 04:44 2022-06-05-1.json.gz


In [33]:
import pandas as pd

In [34]:
# Load the downloaded archive: one JSON event per line. The .gz file is read
# directly, relying on pandas' default compression='infer'.
events_path = f'data/{next_file}'
df = pd.read_json(events_path, orient='records', lines=True)

In [35]:
df

Unnamed: 0,id,type,actor,repo,payload,public,created_at,org
0,22164283627,PushEvent,"{'id': 8517910, 'login': 'LombiqBot', 'display...","{'id': 86929735, 'name': 'Lombiq/Orchard', 'ur...","{'push_id': 10072883326, 'size': 0, 'distinct_...",True,2022-06-05 01:00:00+00:00,"{'id': 8158177, 'login': 'Lombiq', 'gravatar_i..."
1,22164283628,PushEvent,"{'id': 61752841, 'login': 'itzomen', 'display_...","{'id': 317517378, 'name': 'itzomen/itzomen', '...","{'push_id': 10072883324, 'size': 1, 'distinct_...",True,2022-06-05 01:00:00+00:00,
2,22164283631,PullRequestEvent,"{'id': 16241795, 'login': 'chethanuk-plutoflum...","{'id': 33884891, 'name': 'apache/airflow', 'ur...","{'action': 'opened', 'number': 24212, 'pull_re...",True,2022-06-05 01:00:00+00:00,"{'id': 47359, 'login': 'apache', 'gravatar_id'..."
3,22164283637,CreateEvent,"{'id': 106893877, 'login': 'o001k', 'display_l...","{'id': 499969746, 'name': 'o001k/o001k', 'url'...","{'ref': 'main', 'ref_type': 'branch', 'master_...",True,2022-06-05 01:00:00+00:00,
4,22164283640,PushEvent,"{'id': 29139614, 'login': 'renovate[bot]', 'di...","{'id': 420086962, 'name': 'AR10Dev/solid-tailw...","{'push_id': 10072883335, 'size': 2, 'distinct_...",True,2022-06-05 01:00:00+00:00,
...,...,...,...,...,...,...,...,...
89816,22164505936,PushEvent,"{'id': 10810283, 'login': 'direwolf-github', '...","{'id': 499977687, 'name': 'direwolf-github/eph...","{'push_id': 10073038554, 'size': 1, 'distinct_...",True,2022-06-05 01:59:59+00:00,
89817,22164505937,CreateEvent,"{'id': 41898282, 'login': 'github-actions[bot]...","{'id': 454846486, 'name': 'avik-pal/FastDEQ.jl...",{'ref': 'compathelper/new_version/2022-06-05-0...,True,2022-06-05 01:59:59+00:00,
89818,22164505938,PushEvent,"{'id': 41898282, 'login': 'github-actions[bot]...","{'id': 420744186, 'name': 'WelderBM/WelderBM',...","{'push_id': 10073038553, 'size': 1, 'distinct_...",True,2022-06-05 01:59:59+00:00,
89819,22164505940,PushEvent,"{'id': 66223278, 'login': 'Jordanseggers', 'di...","{'id': 499976889, 'name': 'Capstone-May22-Team...","{'push_id': 10073038558, 'size': 1, 'distinct_...",True,2022-06-05 01:59:59+00:00,"{'id': 106623008, 'login': 'Capstone-May22-Tea..."


In [36]:
df.columns

Index(['id', 'type', 'actor', 'repo', 'payload', 'public', 'created_at',
       'org'],
      dtype='object')

In [37]:
df.dtypes

id                          int64
type                       object
actor                      object
repo                       object
payload                    object
public                       bool
created_at    datetime64[ns, UTC]
org                        object
dtype: object

In [38]:
df.shape

(89821, 8)