In [None]:
import pandas as pd
import plotly.express as px
import glob
import os

# Load Data
data_path = "streaming-history-data"
# glob returns matches in arbitrary order; sort so the concatenated row order
# is the same on every run and every machine.
files = sorted(glob.glob(os.path.join(data_path, "Streaming_History_Audio*.json")))
print(f"Found {len(files)} files.")

# Fail early with an actionable message: pd.concat([]) would otherwise raise
# an opaque "No objects to concatenate" error.
if not files:
    raise FileNotFoundError(
        f"No 'Streaming_History_Audio*.json' files found in {data_path!r}."
    )

# Combine all JSON files into a single DataFrame
data = pd.concat([pd.read_json(file) for file in files], ignore_index=True)
print(f"Data loaded with {data.shape[0]} rows.")


In [None]:
# Preprocess Data
data['ts'] = pd.to_datetime(data['ts'])  # Convert timestamp to datetime

# Short aliases for the verbose metadata columns.
# astype(str) alone would turn missing values into literal strings (e.g. 'None'
# or 'nan'), which would then be counted as a real artist/track/album. The
# .where(...) keeps missing values missing so value_counts/groupby skip them
# and str.contains(..., na=False) can exclude them.
data['artist'] = data['master_metadata_album_artist_name'].astype(str).where(
    data['master_metadata_album_artist_name'].notna()
)
data['track'] = data['master_metadata_track_name'].astype(str).where(
    data['master_metadata_track_name'].notna()
)
data['album'] = data['master_metadata_album_album_name'].astype(str).where(
    data['master_metadata_album_album_name'].notna()
)

print(data[['ts', 'artist', 'track']])


In [16]:

# # Top Artists/Songs/Albums
# def plot_top_entities(df, column, top_n=10):
#     top_entities = df[column].value_counts().head(top_n)
#     fig = px.bar(top_entities, x=top_entities.index, y=top_entities.values,
#                  labels={'x': column, 'y': 'Plays'}, title=f"Top {top_n} {column}")
#     fig.show()
    
# Function to plot top N entities
def plot_top_entities(df, column, top_n=10):
    """Show a bar chart of the most frequent values in ``column``.

    Args:
        df: DataFrame to summarise.
        column: Name of the column whose values are counted (e.g. ``'artist'``).
        top_n: Maximum number of values to include in the chart.

    Returns:
        The ``value_counts()`` Series restricted to the ``top_n`` most
        frequent values (index = value, data = number of rows).
    """
    top_entities = df[column].value_counts().head(top_n)

    # Plot from a frame with fixed column names and key `labels` on those
    # names. Passing the index/values as arrays leaves the label keys up to
    # how plotly names array-like arguments, so 'x'/'y' may be ignored.
    plot_df = pd.DataFrame({
        'entity': top_entities.index,
        'plays': top_entities.values,
    })
    fig = px.bar(
        plot_df,
        x='entity',
        y='plays',
        labels={'entity': column.capitalize(), 'plays': 'Count'},
        title=f"Top {top_n} {column.capitalize()}s by Play Count"
    )
    fig.show()
    return top_entities
    
def _time_groups(timestamps, time_resolution):
    """Map each timestamp to the start of its day/week/month/year bucket.

    Args:
        timestamps: Datetime Series.
        time_resolution: One of ``'day'``, ``'week'``, ``'month'``, ``'year'``.

    Returns:
        A Series aligned with ``timestamps``: ``datetime.date`` objects for
        ``'day'``, otherwise the start timestamp of the containing period.

    Raises:
        ValueError: If ``time_resolution`` is not one of the supported values.
    """
    if time_resolution == 'day':
        return timestamps.dt.date

    period_aliases = {'week': 'W', 'month': 'M', 'year': 'Y'}
    if time_resolution not in period_aliases:
        raise ValueError("Invalid time_resolution. Choose from 'day', 'week', 'month', or 'year'.")

    # Periods carry no timezone. Dropping it explicitly (keeping wall time)
    # avoids the warning to_period emits for tz-aware data.
    if timestamps.dt.tz is not None:
        timestamps = timestamps.dt.tz_localize(None)
    return timestamps.dt.to_period(period_aliases[time_resolution]).dt.start_time


def _artist_mask(artist_names, wanted):
    """Return a boolean mask of rows whose artist contains any wanted name.

    Matching is case-insensitive and literal (not regex), so names with
    characters such as ``$``, ``+`` or ``(`` match as written. Missing artist
    values never match. ``wanted`` may be one string or an iterable of strings.
    """
    if isinstance(wanted, str):
        wanted = [wanted]

    mask = None
    for name in wanted:
        hit = artist_names.str.contains(name, case=False, na=False, regex=False)
        mask = hit if mask is None else (mask | hit)

    if mask is None:  # empty iterable: nothing can match
        mask = pd.Series(False, index=artist_names.index)
    return mask


# Function to plot listening over time with adjustable time resolution
def plot_listening_over_time(df, artist=None, time_resolution='month'):
    """Show a line chart of the number of rows (plays) per time bucket.

    Args:
        df: DataFrame with a datetime ``'ts'`` column and an ``'artist'`` column.
        artist: Optional artist name; when given, only rows whose artist
            contains this text (case-insensitive, literal) are counted.
        time_resolution: ``'day'``, ``'week'``, ``'month'`` or ``'year'``.

    Raises:
        ValueError: If ``time_resolution`` is not supported.
    """
    plays = df
    if artist:
        plays = plays[_artist_mask(plays['artist'], artist)]

    # assign() builds a new frame, so the caller's DataFrame is never mutated
    # and no column is set on a filtered slice.
    plays = plays.assign(time_group=_time_groups(plays['ts'], time_resolution))

    # Aggregate listening activity
    time_data = plays.groupby('time_group').size().reset_index(name='Plays')

    # Plot the line chart
    fig = px.line(
        time_data,
        x='time_group',
        y='Plays',
        labels={'time_group': 'Time', 'Plays': 'Number of Plays'},
        title=f"Listening Activity Over Time{' for ' + artist if artist else ''} (Resolution: {time_resolution.capitalize()})"
    )
    fig.show()


# Function to plot listening over time with multiple artists and adjustable time resolution
def plot_listening_over_time_multi(df, artists=None, time_resolution='month'):
    """Show a line chart of plays per time bucket with one line per artist.

    Args:
        df: DataFrame with a datetime ``'ts'`` column and an ``'artist'`` column.
        artists: Optional list of artist names (a single string is also
            accepted); when given, only rows whose artist contains one of
            them (case-insensitive, literal) are kept.
        time_resolution: ``'day'``, ``'week'``, ``'month'`` or ``'year'``.

    Raises:
        ValueError: If ``time_resolution`` is not supported.
    """
    plays = df

    # Filter for the specified artists (if provided)
    if artists:
        plays = plays[_artist_mask(plays['artist'], artists)]

    plays = plays.assign(time_group=_time_groups(plays['ts'], time_resolution))

    # Aggregate listening activity by time group and artist
    time_data = (
        plays.groupby(['time_group', 'artist'])
        .size()
        .reset_index(name='Plays')
    )

    # Plot the line chart with one line per artist
    fig = px.line(
        time_data,
        x='time_group',
        y='Plays',
        color='artist',  # Each artist gets a separate line
        labels={'time_group': 'Time', 'Plays': 'Number of Plays', 'artist': 'Artist'},
        title=f"Listening Activity Over Time (Resolution: {time_resolution.capitalize()})"
    )
    fig.show()



# Function to analyze total listening duration
def plot_top_durations(df, column, top_n=10):
    """Show a bar chart of the values of ``column`` with the most listening time.

    Sums ``ms_played`` per distinct value of ``column``, keeps the ``top_n``
    largest totals and plots them in minutes.

    Args:
        df: DataFrame containing ``column`` and a numeric ``'ms_played'`` column.
        column: Name of the column to group by (e.g. ``'album'``).
        top_n: Maximum number of groups to include in the chart.
    """
    ms_per_minute = 60000
    total_ms = (
        df.groupby(column)['ms_played']
        .sum()
        .sort_values(ascending=False)
        .head(top_n)
    )
    minutes = total_ms / ms_per_minute  # Convert milliseconds to minutes

    # Plot from a frame with fixed column names and key `labels` on those
    # names. Passing the index/values as arrays leaves the label keys up to
    # how plotly names array-like arguments, so 'x'/'y' may be ignored.
    plot_df = pd.DataFrame({
        'entity': minutes.index,
        'minutes': minutes.values,
    })
    fig = px.bar(
        plot_df,
        x='entity',
        y='minutes',
        labels={'entity': column.capitalize(), 'minutes': 'Minutes Played'},
        title=f"Top {top_n} {column.capitalize()}s by Listening Time"
    )
    fig.show()


In [None]:

# Example Usage

# Rank artists and tracks by number of plays; the artist counts are kept
# in `top_artists` for later inspection.
top_artists = plot_top_entities(df=data, column='artist', top_n=40)
# print(top_artists.keys())
plot_top_entities(df=data, column='track', top_n=40)

# Overall listening trend, then the same view narrowed to a single artist.
plot_listening_over_time(df=data)
plot_listening_over_time(df=data, artist="Jon Bellion")

# One line per artist for a hand-picked comparison set.
plot_listening_over_time_multi(
    df=data,
    artists=['Kanye West', 'Jon Bellion', 'AJR', 'Dominic Fike', '070 Shake', 'The Script'],
)
# Alternative, longer comparison set:
# plot_listening_over_time_multi(data, artists=['Kanye West', 'Jon Bellion', 'AJR', 'Quinn XCII', 'Kid Cudi',
#        'Childish Gambino', 'Dominic Fike', 'Frank Ocean', 'BROCKHAMPTON',
#        'Chance the Rapper', 'Rex Orange County', 'The Beatles', '070 Shake',
#        'Post Malone', 'Imagine Dragons', 'Grady', 'OneRepublic',
#        'Macklemore & Ryan Lewis', 'Francis and the Lights', 'Kevin Abstract'])

# Albums ranked by total minutes listened.
plot_top_durations(df=data, column='album', top_n=40)

