In [1]:
import awswrangler as wr
import pandas as pd

from data_sci_toolkit.aws_tools import permission_tools 

# Disable pandas' display truncation so every row and column is rendered
# when a DataFrame is shown in the notebook.
pd.options.display.max_rows = None
pd.options.display.max_columns = None



In [2]:
# AWS session for the "Dev-DataScienceAdmin" profile; it is passed as
# `boto3_session` to every Athena query in this notebook.
# NOTE(review): the recorded output shows this call starting an interactive
# SSO browser login, so the cell cannot run unattended.
SESSION = permission_tools.get_aws_session("Dev-DataScienceAdmin")

Attempting to automatically open the SSO authorization page in your default browser.
If the browser does not open or you wish to use a different device to authorize this request, open the following URL:

https://device.sso.us-east-1.amazonaws.com/

Then enter the code:

HNRS-CPPR
Successfully logged into Start URL: https://stellaralgo.awsapps.com/start#/
Authorized as AROAQBUNWK7XN5DTYWPLJ:rkazmerik@stellaralgo.com


In [3]:
# Default Athena database for the queries below (passed as `database=`).
DATABASE = "integrations_milb_aviators"
# S3 location handed to awswrangler as `s3_output=` for Athena query results.
S3_BUCKET = "s3://dev-athena-testing-us-east-1"

In [4]:
# Exploratory 10-row sample: non-returned tickets (with resales, forwards and
# scans unnested) joined to their events and to customers through the
# customers' unnested ticketing account ids.
df = wr.athena.read_sql_query(
    sql="""
        WITH customers as (
            select * 
            from integrations_milb_aviators.customers c
            CROSS JOIN UNNEST(account_ids['ticketing'], account_ids['retail']) as t(ticket_ids, retail_ids)
        ),
        ticketing AS (
            select *
            from integrations_milb_aviators.tickets t
            CROSS JOIN UNNEST(resales, forwards, scans) as t(resale_data, forward_data, scan_data)
            where returned_at IS NULL
        )
        
        select * 
        from ticketing t
        INNER JOIN integrations_milb_aviators.ticketing_events e
        on t.event_id = e.id
        INNER JOIN customers c
        on c.ticket_ids = t.account_id limit 10 
    """,
    database=DATABASE,
    s3_output=S3_BUCKET,
    ctas_approach=False,
    boto3_session=SESSION,
)
df.shape

(10, 76)

In [26]:
# Season-level purchase summary per customer, distance and product.
#
# * customers: one row per (customer, source system, account id, venue),
#   produced by unnesting account_ids and venues.
# * ticketing_customers: tickets joined to their events for the customers'
#   'ticketing' accounts; was_attended is true when the ticket has scans.
# * final select: aggregates per (scv_id, distance_to_venue, stlr_product,
#   season_year) and adds tenure = days since the customer's first purchase.
#
# is_next_year_buyer is computed by comparing the row's season with the
# customer's latest season. The previous `lead(count(*)) ... is not null`
# looked at the next row in a partition keyed only by scv_id, while rows are
# grouped by product and distance as well, so a customer's final season could
# still be flagged true whenever it had more than one row.
# NOTE(review): the flag means "bought in ANY later season", not strictly the
# following year (as before) - confirm that this is the intended definition.
sql_2 = """with 
customers as (
    select c.id as scv_id, source_system_type, account_id, venue
    from integrations_milb_aviators.customers as c
    cross join unnest(account_ids) as t(source_system_type, source_account_ids)
    cross join unnest(source_account_ids) as a(account_id)
    cross join unnest(venues) as v(venue)
),
ticketing_customers as (
    select 
        c.scv_id, 
        c.venue.distance as distance_to_venue, 
        te.season_year, 
        t.event_id,
        te.start_date as event_date,
        t.stlr_product, 
        t.seat_id,
        t.price,
        t.purchased_at,
        t.scans is not null as was_attended
    from customers as c
    join tickets as t on t.account_id = c.account_id and source_system_type = 'ticketing'
    join ticketing_events as te on te.id = t.event_id
)

select c.*, date_diff('day', first_purchase_date, CURRENT_DATE) as tenure
from (
    select 
        scv_id, 
        distance_to_venue, 
        stlr_product, 
        season_year, 
        max(event_date) as last_event_date,
        count(*) as tickets_purchased, 
        sum(price) as total_spent, 
        (1.0 * sum(case when was_attended then 1 else 0 end)) / count(*) as attendance_percentage,
        max(case when was_attended then event_date else null end) as last_attendance_date,
        coalesce(max(season_year) over (partition by scv_id) > season_year, false) as is_next_year_buyer
    from ticketing_customers
    group by scv_id, distance_to_venue, stlr_product, season_year) as c
join (
    select scv_id, min(purchased_at) as first_purchase_date
    from ticketing_customers
    group by scv_id
) as t
on c.scv_id = t.scv_id"""

In [27]:
# Execute sql_2 on Athena and load the result into a DataFrame; the cell
# displays its (rows, columns) shape.
df_2 = wr.athena.read_sql_query(
    sql=sql_2,
    database=DATABASE,
    s3_output=S3_BUCKET,
    ctas_approach=False,
    boto3_session=SESSION,
)
df_2.shape

(201687, 11)

In [28]:
# Preview the first rows of the sql_2 result.
df_2.head()

Unnamed: 0,scv_id,distance_to_venue,stlr_product,season_year,last_event_date,tickets_purchased,total_spent,attendance_percentage,last_attendance_date,is_next_year_buyer,tenure
0,00005d638f4a29b761e274cd5610a15c,19.413923,Individual,2013,2013-05-13 00:00:00,2,28.0,1.0,2013-05-13 00:00:00,True,3613
1,00005d638f4a29b761e274cd5610a15c,19.413923,Individual,2022,2022-09-24 06:00:00,4,124.0,1.0,2022-09-24 06:00:00,False,3613
2,000306d7b9ba944964bc185463fdd6ea,544.084965,Group,2013,2013-05-31 00:00:00,1,14.0,0.0,NaT,False,3595
3,000587e1c86c0277ff9d85f5b9ca90c8,1.60848,Individual,2019,2019-06-21 00:00:00,2,54.0,1.0,2019-06-21 00:00:00,True,1444
4,000587e1c86c0277ff9d85f5b9ca90c8,1.60848,Individual,2021,2021-07-02 00:00:00,2,60.0,1.0,2021-07-02 00:00:00,False,1444


In [None]:
# Season-level features per (scv_id, product, season_year), built from
# integrations_ds_milb_aviators.cohort_ticketing.
#
# * customers: first purchase date per (scv_id, distance_to_venue).
# * tickets: one row per ticket; owner_scans keeps only the valid scans made
#   by the ticket's own account.
# * games: distinct games per customer/product/season with the next game the
#   customer holds (next_game_number) and the highest game number seen for
#   the product/season (max_game_number).
# * game_gaps: per customer/product/season, the number of games followed by a
#   gap of more than one game number.
# * season_totals: ticket-level aggregates.
# * seasons: season_totals plus consecutive_games_missed (same columns, in the
#   same order, as the earlier draft).
#
# Fixes relative to the earlier draft of this query:
# 1. Ticket aggregates are no longer computed on a tickets-x-games join. That
#    join matched on scv_id/season_year/product only, so every ticket was
#    repeated once per game, inflating tickets_purchased and total_spent, and
#    the gap count was repeated once per ticket. Gaps are now aggregated
#    separately in game_gaps and joined one-to-one.
# 2. The final select no longer left-joins games. None of its columns were
#    selected and its condition contained `s.product = s.product` (always
#    true), so the join only duplicated result rows.
# 3. A ticket counts as attended when it has at least one valid owner scan
#    (`cardinality(...) > 0`); the draft used `> 1`, which treated tickets
#    with exactly one scan as not attended.
#    NOTE(review): confirm that a single scan is sufficient for attendance.
# 4. is_next_year_buyer compares the row's season with the customer's latest
#    season instead of testing `lead(...) is not null` over a partition keyed
#    only by scv_id, which could flag a customer's final season when that
#    season had rows for several products.
#    NOTE(review): the flag means "bought in ANY later season", not strictly
#    the following year - confirm the intended definition.
sql_3 = """
with 
customers as (
    select scv_id, distance_to_venue, min(purchased_at) as first_purchase_date
    from integrations_ds_milb_aviators.cohort_ticketing
    group by scv_id, distance_to_venue
),
tickets as (
    select 
        scv_id, 
        product, 
        season_year, 
        event_date,
        price, 
        game_number,
        filter(scans, x -> x.account_id = account_id and x.is_valid) as owner_scans
    from integrations_ds_milb_aviators.cohort_ticketing
),
games as (
    select 
        *,
        lead(game_number, 1) over (partition by scv_id, product, season_year order by game_number) as next_game_number
    from (
        select distinct
            scv_id, 
            product,
            season_year,
            game_number,
            first_value(game_number) over (partition by product, season_year order by game_number desc) as max_game_number
        from tickets
    )
    where game_number is not null
),
game_gaps as (
    select
        scv_id,
        product,
        season_year,
        sum(
            case when coalesce(next_game_number, max_game_number) - game_number > 1 then 1 else 0 end
        ) as consecutive_games_missed
    from games
    group by scv_id, product, season_year
),
season_totals as (
    select
        scv_id,
        product,
        season_year,
        max(event_date) as last_event_date,
        count(*) as tickets_purchased,
        sum(price) as total_spent,
        (1.0 * sum(case when owner_scans is not null and cardinality(owner_scans) > 0 then 1 else 0 end)) / count(*) as attendance_percentage,
        max(case when owner_scans is not null and cardinality(owner_scans) > 0 then event_date else null end) as last_attendance_date,
        coalesce(max(season_year) over (partition by scv_id) > season_year, false) as is_next_year_buyer
    from tickets
    group by scv_id, product, season_year
),
seasons as (
    select
        st.*,
        coalesce(gg.consecutive_games_missed, 0) as consecutive_games_missed
    from season_totals as st
    left join game_gaps as gg
        on st.scv_id = gg.scv_id and st.season_year = gg.season_year and st.product = gg.product
)

select 
    s.*,
    date_diff('day', c.first_purchase_date, s.last_event_date) as tenure,
    date_diff('day', s.last_event_date, localtimestamp) as recency,
    c.distance_to_venue,
    c.first_purchase_date
from seasons as s
left join customers as c on s.scv_id = c.scv_id
"""

In [None]:
# Execute sql_3 on Athena and load the result into a DataFrame; the cell
# displays its (rows, columns) shape.
df_3 = wr.athena.read_sql_query(
    sql=sql_3,
    database=DATABASE,
    s3_output=S3_BUCKET,
    ctas_approach=False,
    boto3_session=SESSION,
)
df_3.shape

In [None]:
# Preview the first rows of the sql_3 result.
df_3.head()