In [1]:
import requests
import boto3
import os
from dotenv import load_dotenv
import base64
import json
import datetime
import jwt
import pandas as pd
import pyarrow as pa
import logging
import pyarrow.parquet as pq
from io import BytesIO

In [None]:
load_dotenv()

In [10]:
# Key ID read from the KID environment variable. It is sent below as the JWT "kid" header and
# also selects the keys/AuthKey_<KID>.p8 private-key file (obtained from the Apple developer
# account) that is used to sign the JWT.
kID = os.getenv("KID")
# Issuer read from the ISS environment variable; used below as the JWT "iss" claim.
iss = os.getenv("ISS")

In [11]:
header = {
    "alg": "ES256",
    "kid": kID
}

In [22]:
with open(f'keys/AuthKey_{kID}.p8', 'rb') as f:
    private_key = f.read()

In [26]:
claims = {
    "iss": iss,  # Issuer of the token - unique identifier for developer account obtained from apple's website
    "exp": datetime.datetime.now(datetime.timezone.utc) + datetime.timedelta(hours=24), # Expiration time
    "iat": datetime.datetime.now(datetime.timezone.utc), # Issued at time

}

In [27]:
encoded_jwt = jwt.encode(claims, private_key, algorithm='ES256', headers=header)

In [28]:
response = requests.get('https://api.music.apple.com/v1/test',
                             headers = {"Authorization": f"Bearer  {encoded_jwt}"})

In [None]:
response

In [12]:
storefront = 'us'
genre_id = '1765343179'

In [13]:
response = requests.get('https://api.music.apple.com/v1/catalog/us/genres/14',
                             headers = {"Authorization": f"Bearer  {encoded_jwt}"})

In [None]:
response.json()

In [15]:
type = 'albums'

In [16]:
chart_response = requests.get(f'https://api.music.apple.com/v1/catalog/{storefront}/charts?types={type}&chart=most-played',
                              headers = {"Authorization": f"Bearer  {encoded_jwt}"})

In [None]:
chart_response.json()['results']['albums'][0]

In [None]:
for i in chart_response.json()['results']['albums'][0]:
    print(i)

In [None]:
chart_response.json()['results']['albums'][0]['data'][1]

In [None]:
len(chart_response.json()['results']['albums'][0]['data'])

In [17]:
album_list = []

In [18]:
# Decode the chart payload once instead of re-parsing the JSON body for every field of every
# album, and rebuild the list in one go so re-running this cell does not append duplicates.
chart_albums = chart_response.json()['results']['albums'][0]['data']
album_list = [
    [
        key,  # position of the album in the chart response
        album['attributes']['name'],
        album['attributes']['artistName'],
        album['attributes']['releaseDate'],
    ]
    for key, album in enumerate(chart_albums)
]

In [19]:
df = pd.DataFrame(album_list, columns=['key', 'album_name', 'album_artist', 'release_date'])

In [20]:
df['ingest_ts'] = datetime.datetime.now()

In [28]:
# The outer f-string uses double quotes: reusing the same quote character inside the
# replacement field is only valid on Python 3.12+ and is a SyntaxError on older interpreters.
df.to_parquet(f"data/apple_music_{pd.to_datetime('today').date()}.parquet", engine='pyarrow')

In [24]:

s3 = boto3.client('s3')


In [30]:
bucket_name = 'apple-albums'
# Derive both the local path and the object key from today's date so they match the file the
# export cell wrote, instead of a hard-coded 2025-05-09 that only exists for that one run.
# The key keeps the original layout: MM-DD-YYYY folder, ISO date in the file name.
run_date = pd.to_datetime('today').date()
s3.upload_file(
    f'data/apple_music_{run_date}.parquet',
    Bucket=bucket_name,
    Key=f'{run_date:%m-%d-%Y}/apple_music_{run_date}.parquet',
)


In [None]:
str(pd.to_datetime('today').date()) +'/'+ os.path.basename('data/apple_music_2025-05-09.parquet')

In [None]:
def get_jwt_token(key_file_path=None, kID=kID, iss=iss):
    """Build a signed ES256 JWT from the ``AuthKey_<kID>.p8`` private key.

    :param key_file_path: Directory containing ``AuthKey_<kID>.p8``. Defaults to the
        ``keys`` folder under the current working directory.
    :param kID: Key identifier; sent as the ``kid`` header and used to pick the key file.
        The default is the module-level ``kID`` as it was when this function was defined.
    :param iss: Issuer placed in the ``iss`` claim. The default is the module-level ``iss``
        as it was when this function was defined.
    :return: The encoded token returned by ``jwt.encode``.
    :raises ValueError: If ``kID`` or ``iss`` is missing or empty.
    :raises OSError: If the key file cannot be opened.
    """
    # Fail early with a clear message instead of looking for "AuthKey_None.p8" or signing a
    # token without an issuer when the environment variables were not loaded.
    if not kID or not iss:
        raise ValueError("Both kID and iss are required to build the JWT.")

    if key_file_path is None:
        key_file_path = os.path.join(os.getcwd(), 'keys')

    with open(os.path.join(key_file_path, f'AuthKey_{kID}.p8'), 'rb') as f:
        private_key = f.read()

    header = {
        "alg": "ES256",
        "kid": kID
    }

    # Take one timestamp so the token lifetime is exactly 24 hours.
    issued_at = datetime.datetime.now(datetime.timezone.utc)
    claims = {
        "iss": iss,  # Issuer of the token
        "exp": issued_at + datetime.timedelta(hours=24),  # Expiration time
        "iat": issued_at,  # Issued at time
    }

    return jwt.encode(claims, private_key, algorithm='ES256', headers=header)

In [None]:
def get_albums():
    """Fetch the 'most-played' albums chart for the US storefront as a DataFrame.

    Requests the chart from the Apple Music API using a token from ``get_jwt_token()`` and
    flattens the first chart in the response.

    :return: DataFrame with columns ``key`` (position in the response), ``album_name``,
        ``album_artist``, ``release_date`` and ``ingest_ts`` (local time of this call).
    :raises requests.HTTPError: If the API answers with an error status.
    """
    storefront = 'us'
    chart_type = 'albums'  # named chart_type so the built-in `type` is not shadowed

    chart_response = requests.get(
        f'https://api.music.apple.com/v1/catalog/{storefront}/charts?types={chart_type}&chart=most-played',
        headers={"Authorization": f"Bearer  {get_jwt_token()}"},
        timeout=30,  # do not let a stalled connection hang the run indefinitely
    )
    # Surface auth/quota failures as an HTTP error rather than a KeyError on the payload below.
    chart_response.raise_for_status()

    # Decode the body once; the original re-parsed it for every field of every album.
    chart_albums = chart_response.json()['results']['albums'][0]['data']

    album_list = []
    for key, album in enumerate(chart_albums):
        attributes = album['attributes']
        album_list.append([
            key,
            attributes['name'],
            attributes['artistName'],
            # .get so an album without a release date yields a null instead of aborting the run
            attributes.get('releaseDate'),
        ])

    df = pd.DataFrame(album_list, columns=['key', 'album_name', 'album_artist', 'release_date'])
    df['ingest_ts'] = datetime.datetime.now()

    return df

In [146]:
def upload_df_to_s3_parquet(df, bucket_name, file_path=None):
    """Serialize a DataFrame to Parquet in memory and upload it to an S3 bucket.

    :param df: DataFrame to upload; converted with ``pa.Table.from_pandas``.
    :param bucket_name: Bucket to upload to.
    :param file_path: S3 object key. If not specified,
        ``silver/apple-albums/<today>/albums_<today>.parquet`` is used.
    :return: True if the object was uploaded.
    :raises Exception: Whatever the upload raised, re-raised after being logged.
    """
    if file_path is None:
        # Format the date into the key; the original added a str to a date object, which
        # raises TypeError whenever the default key is needed.
        today = pd.to_datetime('today').date()
        file_path = f'silver/apple-albums/{today}/albums_{today}.parquet'

    # Write the Parquet bytes to an in-memory buffer and rewind it for the upload.
    buffer = BytesIO()
    pq.write_table(pa.Table.from_pandas(df), buffer)
    buffer.seek(0)

    # Upload the file
    s3_client = boto3.client('s3')
    try:
        s3_client.upload_fileobj(buffer, bucket_name, file_path)
    except Exception:
        # logging.exception records the traceback alongside the message.
        logging.exception(f"Failed to upload {file_path} to {bucket_name}/{file_path}")
        raise

    return True

In [4]:
s3 = boto3.client('s3')
df = pd.read_parquet('s3://apple-albums/2025-06-09/albums_2025-06-09.parquet', engine='pyarrow')

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df

In [33]:
storefront = 'us'
type = 'albums'

In [34]:
chart_response = requests.get(f'https://api.music.apple.com/v1/catalog/us/charts?types={type}&chart=most-played',
                                  headers={"Authorization": f"Bearer  {encoded_jwt}"})

In [None]:
chart_response.json()['results']

In [43]:
new_chart_response = requests.get(f'https://api.music.apple.com/v1/catalog/us/charts?types={type}',
                                  headers={"Authorization": f"Bearer  {encoded_jwt}"})

In [None]:
chart_response.json()['results']['albums'][0]

In [5]:
# Query SeatGeek for the first five events in Austin.
# NOTE(review): `session` is only created in a later cell (requests.Session() with the SeatGeek
# credentials); on a fresh kernel that cell must run first or this line raises NameError.
sg_response = session.get('https://api.seatgeek.com/2/events', params={'venue.city': 'Austin', 'per_page': 5, 'page': 1})

In [3]:
def get_s3_object(bucket_name, key, region_name='us-east-2'):
    """Retrieve an object's raw bytes from an S3 bucket.

    :param bucket_name: Bucket that holds the object.
    :param key: Object key inside the bucket.
    :param region_name: Region used to create the S3 client. Defaults to ``us-east-2``,
        the value that was previously hard-coded.
    :return: The object body as bytes, or None if the retrieval failed. The failure is
        logged and not raised, so callers must check for None before using the result.
    """
    s3_client = boto3.client('s3', region_name=region_name)
    try:
        response = s3_client.get_object(Bucket=bucket_name, Key=key)
        return response['Body'].read()
    except Exception as e:
        logging.error(f"Failed to retrieve {key} from {bucket_name}: {e}")
        return None

In [None]:
 file = get_s3_object(bucket_name='artist-discovery-data', key='silver/apple-albums/albums_2025-06-16.parquet')


In [159]:
df = pd.read_parquet(BytesIO(file), engine='pyarrow')

In [None]:
df.head(20)

In [None]:
df['album_artist'].tolist()

In [None]:
def clean_artist_names(df):
    """Add an ``album_artist_clean`` column holding the first listed artist.

    The value is the text before the first '&' in ``album_artist`` with leading and
    trailing whitespace removed. Missing artist names stay missing.

    :param df: DataFrame with an ``album_artist`` column. It is modified in place.
    :return: The same DataFrame object, with ``album_artist_clean`` added.
    """
    # Split once and take the first piece with .str[0]. Indexing column 0 of an expanded
    # split fails on an empty frame, and the pieces after the first '&' are never used.
    first_artist = df['album_artist'].str.split('&', n=1).str[0]
    df['album_artist_clean'] = first_artist.str.strip()
    return df


In [None]:
clean_artist_names(df)

In [33]:
session = requests.Session()
session.auth = (os.getenv('SEATGEEK_CLIENT_ID'), os.getenv('SEATGEEK_CLIENT_SECRET'))

In [110]:
sg_response = session.get('https://api.seatgeek.com/2/performers', params={'q': 'Fuerza Regida', 'per_page': 5, 'page': 1})

In [132]:
def remove_nan_from_list(input_list):
    """Return a new list with only the elements that ``pd.notna`` reports as present."""
    kept = []
    for item in input_list:
        if pd.notna(item):
            kept.append(item)
    return kept

In [None]:
df.head(20)

In [40]:
def get_artist_id(name=None, session=None):
    """Look up a performer on SeatGeek by name and return the first match's ID.

    :param name: Search text sent as the ``q`` query parameter.
    :param session: Object with a ``get(url, params=...)`` method, such as a
        ``requests.Session`` carrying the SeatGeek credentials.
    :return: The ``id`` of the first entry in the response's ``performers`` list, or None
        when no usable name is given (None, NaN or blank) or no performer is returned.
    :raises requests.HTTPError: If the API answers with an error status (when ``session``
        is a ``requests.Session``).
    """
    # Skip the request when there is nothing to search for. NaN is the only float that is
    # not equal to itself, so this also catches missing values coming out of a DataFrame.
    if name is None or (isinstance(name, float) and name != name):
        return None
    if isinstance(name, str) and not name.strip():
        return None

    response = session.get('https://api.seatgeek.com/2/performers', params={'q': name, 'per_page': 5, 'page': 1})
    response.raise_for_status()

    # Decode the body once and reuse it for both the emptiness check and the lookup.
    performers = response.json()['performers']
    return performers[0]['id'] if performers else None

In [165]:
response = session.get('https://api.seatgeek.com/2/performers', params={'q': 'Lin-Manuel Miranda, Leslie Odom', 'per_page': 5, 'page': 1})


In [None]:
len(response.json()['performers'])

In [None]:

def transform_create_sg_data(df: pd.DataFrame):
    """Enrich the albums frame with SeatGeek artist IDs and collect upcoming events.

    Cleans the artist names, looks up a SeatGeek performer ID for each distinct cleaned
    name, then fetches upcoming events for every ID that was found.

    :param df: Albums DataFrame with an ``album_artist`` column. It is modified in place:
        ``album_artist_clean`` and ``artist_sg_index`` are added and ``ingest_ts`` is set
        to the time of this call.
    :return: A two-item list ``[artist_df, artist_events_df]``. ``artist_df`` is the
        enriched input frame; ``artist_events_df`` holds the events of all artists and has
        no rows when no artist has upcoming events.
    """
    artist_df = clean_artist_names(df)
    with requests.Session() as session:
        session.auth = (os.getenv('SEATGEEK_CLIENT_ID'), os.getenv('SEATGEEK_CLIENT_SECRET'))

        # Look each distinct, non-missing name up once and pass it as a plain string.
        # The row-wise apply handed get_artist_id a one-element Series and repeated the
        # request for every album by the same artist.
        artist_names = df['album_artist_clean']
        id_by_name = {
            artist_name: get_artist_id(name=artist_name, session=session)
            for artist_name in artist_names.dropna().unique()
        }
        # Missing names and names without a match both become <NA> in the nullable column.
        df['artist_sg_index'] = pd.array(
            [id_by_name.get(artist_name) for artist_name in artist_names],
            dtype='Int64',
        )
        df['ingest_ts'] = datetime.datetime.now()

        artist_id_list = remove_nan_from_list(df['artist_sg_index'].unique().tolist())

        artist_events_list = []
        for artist_id in artist_id_list:
            artist_events = get_upcoming_events(artist_id=artist_id, session=session)
            # get_upcoming_events returns None when an artist has nothing in the window.
            if artist_events is not None:
                artist_events_list.append(artist_events)

    # pd.concat raises ValueError on an empty list, so fall back to an empty frame.
    if artist_events_list:
        artist_events_df = pd.concat(artist_events_list, ignore_index=True)
    else:
        artist_events_df = pd.DataFrame()
    artist_events_df['ingest_ts'] = datetime.datetime.now()
    return [artist_df, artist_events_df]
        
    

    
    

In [None]:
df1, df2 = transform_create_sg_data(df)

In [None]:
ssm_client = boto3.client('ssm', region_name='us-east-2')
sg_key = ssm_client.get_parameter(
        Name='/seatgeek/client_key',
        WithDecryption=True
    )['Parameter']['Value']

In [179]:
sg_secret = ssm_client.get_parameter(
        Name='/seatgeek/client_secret',
        WithDecryption=True
    )['Parameter']['Value']

In [177]:
key = os.getenv('SEATGEEK_CLIENT_ID')
client = os.getenv('SEATGEEK_CLIENT_SECRET')

In [None]:
# Report only whether each credential is present: printing the values would store the
# SeatGeek client id and secret in the notebook's saved output.
print(f"key set: {bool(key)} , secret set: {bool(client)}")

In [None]:
# Show only whether a secret was retrieved; echoing the value would persist it in the
# notebook's saved output.
bool(sg_secret)

In [72]:
sg_response = session.get('https://api.seatgeek.com/2/events', params={'performers.id':774960, 'per_page': 5, 'page': 1})

In [None]:
sg_response = session.get('https://api.seatgeek.com/2/performers/774960')

In [None]:
for i in sg_response.json()['events']:
    print(i)

In [None]:
sg_response.json()['events'][1]

In [None]:
sg_response.json()['events'][1]['datetime_local']

In [None]:
dateobj = datetime.datetime.fromisoformat(sg_response.json()['events'][1]['datetime_local']) - datetime.datetime.now()
dateobj < datetime.timedelta(days=30)

In [None]:

# convert to string
date_time_str = datetime.datetime.fromisoformat(sg_response.json()['events'][1]['datetime_local']).strftime("%Y-%m-%d %H:%M:%S")
print('DateTime String:', date_time_str)

# Output 2021-07-20 16:26:24

In [None]:
def get_upcoming_events(artist_id, session=None, days_ahead=30):
    """Get a performer's SeatGeek events that start within the next ``days_ahead`` days.

    Only the first page of five events is requested, so events beyond that page are not
    considered even if they fall inside the window.

    :param artist_id: SeatGeek performer ID, sent as the ``performers.id`` filter.
    :param session: Object with a ``get(url, params=...)`` method, such as a
        ``requests.Session`` carrying the SeatGeek credentials.
    :param days_ahead: Size of the look-ahead window in days. Defaults to 30.
    :return: DataFrame with one row per event in the window and columns ``event_id``,
        ``artist_id``, ``event_name``, ``event_date_time``, ``venue_name``, ``venue_city``,
        ``venue_state``, ``venue_type`` and ``event_date``; None when ``session`` is None
        or no event falls inside the window.
    :raises requests.HTTPError: If the API answers with an error status (when ``session``
        is a ``requests.Session``).
    """
    if session is None:
        return None

    response = session.get('https://api.seatgeek.com/2/events', params={'performers.id': artist_id, 'per_page': 5, 'page': 1})
    response.raise_for_status()
    events = response.json()['events']

    # Fix the window once so every event is compared against the same bounds.
    # NOTE(review): `datetime_local` is compared with this machine's naive local time;
    # presumably it is the venue's local time without an offset - confirm.
    window_start = datetime.datetime.now()
    window_end = window_start + datetime.timedelta(days=days_ahead)

    upcoming_events = []
    for event in events:
        event_date_time = datetime.datetime.fromisoformat(event['datetime_local'])
        if window_start < event_date_time < window_end:
            venue = event['venue']
            upcoming_events.append({
                'event_id': event['id'],
                'artist_id': artist_id,
                'event_name': event['title'],
                'event_date_time': event_date_time,
                'venue_name': venue['name'],
                'venue_city': venue['city'],
                'venue_state': venue['state'],
                # Filled from the event's own `type` field; the column name is kept
                # because downstream cells select `venue_type`.
                'venue_type': event['type']
            })

    if not upcoming_events:
        logging.warning(f"No upcoming events found for artist ID {artist_id} in the next {days_ahead} days.")
        return None

    logging.info(f"Found {len(upcoming_events)} upcoming events for artist ID {artist_id}.")
    try:
        df = pd.DataFrame(upcoming_events)
        df['event_date'] = df['event_date_time'].dt.date
    except Exception:
        logging.error("Failed to create DataFrame from upcoming events.")
        raise
    return df
        
            


In [None]:
get_upcoming_events(artist_id=616548, session=session)

In [None]:
test = get_upcoming_events(artist_id=774960, session=session)

In [None]:
# Concatenate onto a throwaway empty frame instead of rebinding `df`: a later cell still
# passes `df` to transform_create_sg_data, which needs the albums frame and its
# 'album_artist' column.
pd.concat([pd.DataFrame(), test])

In [None]:
artist_df, artist_events_df = transform_create_sg_data(df)

In [None]:
bucket_name = 'artist-discovery-data'
# Compute the date once and separate it from the file stem with an underscore; the
# original produced "albums_enriched2025-..." while the keys read elsewhere in this
# notebook use "albums_enriched_<date>".
run_date = pd.to_datetime('today').date()
artist_df_file_path = f'silver/apple-albums-enriched/{run_date}/albums_enriched_{run_date}.parquet'

In [None]:
upload_df_to_s3_parquet(artist_df, bucket_name, file_path=artist_df_file_path)

In [None]:
import sys
print(sys.path)

In [151]:
test_file = None

In [7]:
bucket_name = 'artist-discovery-data'
album_key = 'silver/apple-albums-enriched/albums_enriched_2025-06-17.parquet'
album_file = get_s3_object(bucket_name, album_key)

In [8]:
events_key = 'silver/artist-events/events_2025-06-17.parquet'
events_file = get_s3_object(bucket_name, events_key)

In [10]:
album_df = pd.read_parquet(BytesIO(album_file), engine='pyarrow')
events_df = pd.read_parquet(BytesIO(events_file), engine='pyarrow')

In [11]:
album_df.head()

Unnamed: 0,key,album_name,album_artist,release_date,ingest_ts,album_artist_clean,artist_sg_index
0,0,I’m The Problem,Morgan Wallen,2025-05-16,2025-06-17 00:08:07.865876,Morgan Wallen,562860
1,1,DOPAMINE,Lil Tecca,2025-06-13,2025-06-17 00:08:07.865876,Lil Tecca,751467
2,2,Tha Carter VI (Bonus),Lil Wayne,2025-06-06,2025-06-17 00:08:07.865876,Lil Wayne,1109
3,3,My World,EST Gee,2025-06-13,2025-06-17 00:08:07.865876,EST Gee,797677
4,4,111XPANTIA,Fuerza Regida,2025-05-02,2025-06-17 00:08:07.865876,Fuerza Regida,774960


In [12]:
events_df.head()

Unnamed: 0,event_id,artist_id,event_name,event_date_time,venue_name,venue_city,venue_state,venue_type,event_date,ingest_ts
0,17357211,562860,Morgan Wallen with Corey Kent and Koe Wetzel,2025-06-20 17:30:00,NRG Stadium,Houston,TX,concert,2025-06-20,2025-06-17 00:08:10.206853
1,17357215,562860,Morgan Wallen with Corey Kent and Koe Wetzel,2025-06-21 17:30:00,NRG Stadium,Houston,TX,concert,2025-06-21,2025-06-17 00:08:10.206853
2,17357249,562860,Morgan Wallen with Miranda Lambert and Ella La...,2025-06-28 17:30:00,Camp Randall Stadium,Madison,WI,concert,2025-06-28,2025-06-17 00:08:10.206853
3,17365697,562860,Morgan Wallen with Miranda Lambert and Ella La...,2025-06-29 17:30:00,Camp Randall Stadium,Madison,WI,concert,2025-06-29,2025-06-17 00:08:10.206853
4,17357217,562860,Morgan Wallen with Miranda Lambert and Gavin A...,2025-07-11 16:30:00,Hard Rock Stadium,Miami Gardens,FL,concert,2025-07-11,2025-06-17 00:08:10.206853


In [17]:
# Narrow both frames to the columns used in the combined view.
album_set = album_df.loc[:, ['album_artist', 'album_name', 'release_date', 'artist_sg_index']]
event_set = events_df.loc[
    :,
    ['artist_id', 'event_id', 'event_name', 'event_date_time', 'venue_name', 'venue_city', 'venue_state', 'venue_type'],
]
# Left join keeps every album row, including albums whose artist has no events.
merged_df = pd.merge(album_set, event_set, how='left', left_on='artist_sg_index', right_on='artist_id')

def merge_album_events(album_df, events_df):
    """Left-join albums to events, matching ``artist_sg_index`` against ``artist_id``.

    Every album row is kept; albums without a matching event get missing values in the
    event columns.
    """
    join_options = {'how': 'left', 'left_on': 'artist_sg_index', 'right_on': 'artist_id'}
    return pd.merge(album_df, events_df, **join_options)



In [15]:
merged_df.head(10)

Unnamed: 0,album_artist,album_name,release_date,artist_sg_index,artist_id,event_name,event_date_time,venue_name,venue_city,venue_state,venue_type
0,Morgan Wallen,I’m The Problem,2025-05-16,562860,562860.0,Morgan Wallen with Corey Kent and Koe Wetzel,2025-06-20 17:30:00,NRG Stadium,Houston,TX,concert
1,Morgan Wallen,I’m The Problem,2025-05-16,562860,562860.0,Morgan Wallen with Corey Kent and Koe Wetzel,2025-06-21 17:30:00,NRG Stadium,Houston,TX,concert
2,Morgan Wallen,I’m The Problem,2025-05-16,562860,562860.0,Morgan Wallen with Miranda Lambert and Ella La...,2025-06-28 17:30:00,Camp Randall Stadium,Madison,WI,concert
3,Morgan Wallen,I’m The Problem,2025-05-16,562860,562860.0,Morgan Wallen with Miranda Lambert and Ella La...,2025-06-29 17:30:00,Camp Randall Stadium,Madison,WI,concert
4,Morgan Wallen,I’m The Problem,2025-05-16,562860,562860.0,Morgan Wallen with Miranda Lambert and Gavin A...,2025-07-11 16:30:00,Hard Rock Stadium,Miami Gardens,FL,concert
5,Lil Tecca,DOPAMINE,2025-06-13,751467,751467.0,The Summer Smash Music Festival - (3-Day Pass)...,2025-06-20 03:30:00,SeatGeek Stadium,Bridgeview,IL,music_festival
6,Lil Wayne,Tha Carter VI (Bonus),2025-06-06,1109,1109.0,Lil Wayne (21+),2025-06-21 22:30:00,Zouk Nightclub,Las Vegas,NV,concert
7,Lil Wayne,Tha Carter VI (Bonus),2025-06-06,1109,1109.0,Lil Wayne (21+),2025-07-12 22:30:00,Zouk Nightclub,Las Vegas,NV,concert
8,EST Gee,My World,2025-06-13,797677,797677.0,EST Gee,2025-07-16 19:00:00,Elevation 27,Virginia Beach,VA,concert
9,Fuerza Regida,111XPANTIA,2025-05-02,774960,774960.0,Fuerza Regida,2025-06-20 20:00:00,Madison Square Garden,New York,NY,concert


In [18]:
import uuid
import hashlib

def generate_unique_id(field1, field2, sep=""):
    """Derive a deterministic UUID from two values.

    The string forms of the two fields are joined with ``sep``, hashed with SHA-256, and
    the first 32 hex digits of the digest are turned into a ``uuid.UUID``.

    :param field1: First value; converted with ``str``.
    :param field2: Second value; converted with ``str``.
    :param sep: Text placed between the two values before hashing. The default empty
        string keeps previously generated IDs unchanged, but it lets different pairs
        collide (``("12", "3")`` and ``("1", "23")`` hash the same text). Pass a
        separator that cannot occur in the fields to rule that out.
    :return: ``uuid.UUID`` built from the truncated digest.
    """
    combined_string = str(field1) + sep + str(field2)
    hashed_string = hashlib.sha256(combined_string.encode()).hexdigest()
    return uuid.UUID(hashed_string[:32])

# Example usage
field1 = "value1"
field2 = 123
unique_id = generate_unique_id(field1, field2)
print(unique_id)

e7b63dd8-e6be-c465-462c-a451ba7e7d14


In [24]:
# Derive a deterministic row id from each (artist_sg_index, event_id) pair.
# NOTE(review): merged_df comes from a left merge, so albums without events have a missing
# event_id and their id is hashed from the string form of that missing value - confirm this
# is intended.
# NOTE(review): the column holds uuid.UUID objects; confirm the Parquet writer accepts them,
# or cast to str before uploading.
merged_df['id'] = merged_df[['artist_sg_index','event_id']].apply(lambda x: generate_unique_id(x.iloc[0], x.iloc[1]), axis=1)

In [28]:
merged_df = merged_df.filter(items=['id','album_artist','album_name','release_date','event_name','event_date_time','venue_name','venue_city','venue_state'])

In [31]:
upcoming_events_file_path = 'analytics/upcoming-events/date=' + str(pd.to_datetime('today').date()) + '/upcoming_events.parquet'
print(upcoming_events_file_path)

analytics/upcoming-events/date=2025-06-19/upcoming_events.parquet
