In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
import pandas as pd
import datetime as dt
import time
import jinja2
from jinja2 import Environment, BaseLoader
import athena

In [2]:
sql_template = """
UNLOAD (
SELECT 
    u.machine_id,
    u.url_idc, 
    u.person_id, 
    u.ss2k, 
    u.time_id, 
    u.domain_name, 
    u.url_host, 
    u.url_dir, 
    u.url_page, 
    u.url_refer_domain, 
    u.url_refer_host, 
    u.url_refer_dir, 
    u.url_refer_page, 
    u.mimetype, 
    u.http_rc, 
    u.keywords, 
    u.html_title, 
    u.pattern_id
FROM "comscore"."url" as u
WHERE 
    u.year='{{year}}'
    and u.month='{{month}}'
    and u.day='{{day}}'
    and (u.domain_name LIKE '%twitch%')
) 
TO 's3://kdc-comscore/parquet-extracts/url_{{domain}}/year={{year}}/month={{month}}/day={{day}}' 
WITH ( format = 'parquet', compression = 'snappy' )
"""

template = Environment(loader=BaseLoader()).from_string(sql_template)

In [3]:
%%time

results = athena.run_daily(
    "kdc-admin",
    "us-east-2",
    template,
    start_date = dt.datetime(2019, 1, 1),
    end_date = dt.datetime(2019, 1, 2),
    database = "comscore",
    args = {
    },
    batch_size = 50
)
print(f"number of results: {len(results)}")

number of results: 2
CPU times: user 358 ms, sys: 83.6 ms, total: 441 ms
Wall time: 30.6 s


In [4]:
! aws s3 sync s3://kdc-comscore/parquet-extracts ./output/parquet --profile kdc-admin

download: s3://kdc-comscore/parquet-extracts/url_/year=2019/month=01/day=01/20230117_055525_00007_7jgea_271e412b-4cd5-46b5-b135-15427c82ab1d to output/parquet/url_/year=2019/month=01/day=01/20230117_055525_00007_7jgea_271e412b-4cd5-46b5-b135-15427c82ab1d
download: s3://kdc-comscore/parquet-extracts/url_/year=2019/month=01/day=01/20230117_055525_00007_7jgea_39108a85-004f-4d03-9ef2-05f546367edf to output/parquet/url_/year=2019/month=01/day=01/20230117_055525_00007_7jgea_39108a85-004f-4d03-9ef2-05f546367edf
download: s3://kdc-comscore/parquet-extracts/url_/year=2019/month=01/day=01/20230117_055525_00007_7jgea_cb1fb8d5-12a6-400d-8d8c-14a1c49f6c7c to output/parquet/url_/year=2019/month=01/day=01/20230117_055525_00007_7jgea_cb1fb8d5-12a6-400d-8d8c-14a1c49f6c7c
download: s3://kdc-comscore/parquet-extracts/url_/year=2019/month=01/day=01/20230117_055525_00007_7jgea_37195e09-fc21-4f61-a2c7-316de42872c8 to output/parquet/url_/year=2019/month=01/day=01/20230117_055525_00007_7jgea_37195e09-fc21-4f6

In [8]:
df = pd.read_parquet("./output/parquet/url_domain_twitch")

In [9]:
df.shape

(873668, 21)

In [10]:
df.head()

Unnamed: 0,machine_id,url_idc,person_id,ss2k,time_id,domain_name,url_host,url_dir,url_page,url_refer_domain,...,url_refer_dir,url_refer_page,mimetype,http_rc,keywords,html_title,pattern_id,year,month,day
0,248349067,HNN7xYtNAgxFKQBsiDVyab,452606243,599780225,6941,twitch.tv,pubsub-edge.twitch.tv,/,v1,,...,,,,101,SECURE,SECURE,10355136,2019,1,1
1,263072658,mnOnzogikHbGW91fDq5i93,434113017,599786289,6941,twitch.tv,client-event-reporter.twitch.tv,/,/,,...,,,,200,SECURE,SECURE,10355136,2019,1,1
2,218778557,2OC8FWVwQumUMTh51$tLf0,392721004,599746847,6941,twitch.tv,pubsub-edge.twitch.tv,/,/,,...,,,,200,SECURE,SECURE,10355136,2019,1,1
3,218778557,Mqoo7ZXs4MNoHwDsnHxp0e,392721004,599752699,6941,twitch.tv,pubsub-edge.twitch.tv,/,/,,...,,,,200,SECURE,SECURE,10355136,2019,1,1
4,218778557,UwWxtCLDAHbWm95QD7vIf0,392721004,599724558,6941,twitch.tv,pubsub-edge.twitch.tv,/,/,,...,,,,200,SECURE,SECURE,10355136,2019,1,1
