In [7]:
import pandas as pd
import numpy as np
from catnip.fla_redshift import FLA_Redshift
from sqlalchemy import null
from datetime import datetime

from prefect.blocks.system import Secret
from typing import Dict
from concurrent.futures import ThreadPoolExecutor

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import random

In [8]:
def get_redshift_credentials() -> Dict:
    """Assemble the connection settings used to construct ``FLA_Redshift``.

    Secret values are read from Prefect ``Secret`` blocks at call time; the
    port, S3 subdirectory and verbosity flag are fixed literals.

    Returns
    -------
    Dict
        Keys: ``dbname``, ``host``, ``port``, ``user``, ``password``,
        ``aws_access_key_id``, ``aws_secret_access_key``, ``bucket``,
        ``subdirectory`` and ``verbose``.
    """

    def _secret(block_name):
        # Load the named Secret block and return the value it holds.
        return Secret.load(block_name).get()

    # Redshift connection settings.
    redshift_settings = {
        "dbname": _secret("stellar-redshift-db-name"),
        "host": _secret("stellar-redshift-host"),
        "port": 5439,
        "user": _secret("stellar-redshift-user-name"),
        "password": _secret("stellar-redshift-password"),
    }

    # S3 settings passed along with the Redshift ones.
    s3_settings = {
        "aws_access_key_id": _secret("fla-s3-aws-access-key-id-east-1"),
        "aws_secret_access_key": _secret("fla-s3-aws-secret-access-key-east-1"),
        "bucket": _secret("fla-s3-bucket-name-east-1"),
        "subdirectory": "us-east-1",
    }

    return {**redshift_settings, **s3_settings, "verbose": False}

# Resolve the credentials on a single worker thread and block until they are ready.
# NOTE(review): presumably done so Secret.load runs outside the notebook's
# already-running event loop -- confirm before replacing this with a direct call.
with ThreadPoolExecutor(max_workers=1) as executor:
    credentials_future = executor.submit(get_redshift_credentials)
    rs_creds = credentials_future.result()

In [9]:
# Attendance query: returns one row per (event_datetime, purch_client_crm_id)
# from the ticket view, restricted to events dated before current_date.
# The LEFT JOIN keeps ticket rows that have no matching attendance row, so
# did_attend is 1 when at least one joined attendance row has a non-null
# client_crm_id_scan and 0 otherwise.
q = """
SELECT
    ticket.event_datetime,
    purch_client_crm_id,
    MAX(CASE WHEN client_crm_id_scan IS NOT NULL THEN 1 ELSE 0 END) AS did_attend
FROM
    custom.cth_v_ticket_2526 ticket
LEFT JOIN
    custom.cth_v_attendance_2526 attendance
        ON ticket.event_datetime = attendance.event_datetime
        AND ticket.purch_client_crm_id = attendance.client_crm_id_purchase
WHERE
    ticket.event_datetime < current_date
GROUP BY
    ticket.event_datetime,
    purch_client_crm_id
"""

In [23]:
def compute_attendance_streaks(attendance: pd.DataFrame) -> pd.DataFrame:
    """Add consecutive-attendance streak columns to a per-client, per-event frame.

    Rows are ordered by client and event date. A streak is a run of consecutive
    rows (in that order) for the same client with ``did_attend == 1``; it is
    based on row sequence, not on the time gap between events.

    Parameters
    ----------
    attendance : pd.DataFrame
        Must contain ``purch_client_crm_id``, ``event_datetime`` and
        ``did_attend`` (1 = attended, 0 = did not attend).

    Returns
    -------
    pd.DataFrame
        A new frame (the input is not modified), sorted by client and event,
        with columns ``purch_client_crm_id``, ``event_datetime``,
        ``did_attend`` plus:

        * ``streak_length`` -- 1, 2, 3, ... along each attended run; 0 on rows
          where ``did_attend`` is not 1.
        * ``new_streak`` -- True when the row has the same client and the same
          ``did_attend`` value as the previous row and they are not both 0;
          for 0/1 data that means the row *extends* an attended run.
          NOTE(review): the name reads as the opposite of what the flag
          holds. The values are kept exactly as they have been exported so
          existing consumers of the table are unaffected -- confirm the
          intended meaning before renaming or inverting.
        * ``is_most_recent`` -- True on rows whose ``event_datetime`` equals
          the maximum ``event_datetime`` in the frame.
    """
    ordered = (
        attendance
        .sort_values(['purch_client_crm_id', 'event_datetime'])
        .reset_index(drop=True)
    )

    client = ordered['purch_client_crm_id']
    attended_flag = ordered['did_attend']
    prev_client = client.shift()
    prev_attended_flag = attended_flag.shift()

    # True when this row carries on the run of the row directly above it.
    # Comparisons against a missing value are False, so the first row and any
    # row with a null client id never continue a run.
    continues_streak = (
        (client == prev_client)
        & (attended_flag == prev_attended_flag)
        & ~((attended_flag == 0) & (prev_attended_flag == 0))
    )

    # Every row that does not continue a run opens a new group. The counter
    # also advances whenever the client changes, so group ids are unique across
    # clients and can be used as the only grouping key. Grouping on the client
    # id as well would make pandas drop rows whose id is null (groupby defaults
    # to dropna=True), leaving NaN streak lengths for those rows.
    streak_group = (~continues_streak).cumsum()
    position_in_group = ordered.groupby(streak_group).cumcount() + 1

    # Only attended rows carry a streak length; everything else stays 0.
    streak_length = position_in_group.where(attended_flag == 1, 0)

    most_recent = ordered['event_datetime'].max()

    return ordered.assign(
        streak_length=streak_length,
        new_streak=continues_streak,
        is_most_recent=ordered['event_datetime'] == most_recent,
    )[[
        'purch_client_crm_id',
        'event_datetime',
        'did_attend',
        'streak_length',
        'new_streak',
        'is_most_recent',
    ]]


# Pull the per-client, per-event attendance flags, then derive the streak columns.
df = FLA_Redshift(**rs_creds).query_warehouse(sql_string = q)
df = compute_attendance_streaks(df)

# Kept as a notebook-level name in case other cells refer to it.
most_recent_date = df['event_datetime'].max()

In [24]:
# Publish the streak table to the warehouse.
# NOTE(review): whether an existing table is replaced or appended to depends on
# FLA_Redshift.write_to_warehouse defaults -- confirm before re-running.
FLA_Redshift(**rs_creds).write_to_warehouse(
    df=df,
    table_name="cth_v_attendance_streaks_2526",
)