### Import required libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")


### Load datasets

In [4]:
# Read the three raw tables; paths are relative to the notebook's working directory.
songs = pd.read_csv('../data/songs.csv')
streams = pd.read_csv('../data/streams1.csv')
users = pd.read_csv('../data/users.csv')

# Report (rows, columns) for each table as a quick sanity check on the load.
print(
    f"Shape of the songs : {songs.shape}",
    f"Shape of the streams : {streams.shape}",
    f"Shape of the users : {users.shape}",
    sep="\n",
)

Shape of the songs : (89741, 21)
Shape of the streams : (11346, 3)
Shape of the users : (50000, 5)


In [5]:
# Prepare data
# Parse the raw timestamp column once and derive both calendar fields from the
# same parsed Series instead of re-parsing `listen_time` for every field.
listen_ts = pd.to_datetime(streams['listen_time'])
streams['listen_date'] = listen_ts.dt.date
streams['listen_hour'] = listen_ts.dt.hour

# Default (inner) joins: a stream row survives only if its track_id is present
# in `songs` and its user_id is present in `users`.
full_data = streams.merge(songs, on='track_id').merge(users, on='user_id')

# KPI 1: Hourly Unique Listeners
# Number of distinct user_id values in each (listen_date, listen_hour) bucket.
hourly_unique_listeners = (
    full_data
    .groupby(['listen_date', 'listen_hour'])['user_id']
    .nunique()
    .rename('unique_listeners')
    .reset_index()
)

# KPI 2: Top Listened Artist of the Hour
# Step 1: count stream rows per artist within each hour.
artist_listen_counts = (
    full_data
    .groupby(['listen_date', 'listen_hour', 'artists'])
    .size()
    .reset_index(name='listen_counts')
)

# Step 2: for every hour keep the row holding the largest count. idxmax yields
# the label of the first maximum, so ties resolve to the earliest row.
busiest_row_labels = (
    artist_listen_counts
    .groupby(['listen_date', 'listen_hour'])['listen_counts']
    .idxmax()
)
top_artist = (
    artist_listen_counts
    .loc[busiest_row_labels]
    .rename(columns={'artists': 'top_artist'})
)

# KPI 3: Listening Sessions per User per Hour
# A "session" here is one distinct (user_id, listen_time) pair.
full_data['session_id'] = full_data['user_id'].astype(str) + '-' + full_data['listen_time'].astype(str)

# Count distinct sessions for each user within each hour. The column is
# selected explicitly: DataFrameGroupBy.nunique() takes `dropna` as its only
# argument, so passing a column name there does not select that column and
# instead computes distinct counts for every column of the frame.
sessions_per_user = (
    full_data
    .groupby(['listen_date', 'listen_hour', 'user_id'])['session_id']
    .nunique()
    .reset_index()
)

# Average the per-user session counts within each hour.
avg_sessions_per_user = (
    sessions_per_user
    .groupby(['listen_date', 'listen_hour'])['session_id']
    .mean()
    .reset_index(name='avg_sessions_per_user')
)

# KPI 4: Hourly Track Diversity Index
# diversity_index = distinct tracks / total track plays within the hour,
# so a value of 1.0 means no track was played more than once in that hour.
track_diversity = (
    full_data
    .groupby(['listen_date', 'listen_hour'])['track_id']
    .agg(['nunique', 'count'])
    .reset_index()
    .assign(diversity_index=lambda hourly: hourly['nunique'] / hourly['count'])
)

# KPI 5: Most Engaged User Group by Age per Hour
# Ages are bucketed with right-closed bins: (0, 25], (25, 35], ..., (65, 100].
# Ages outside (0, 100] get no bucket (NaN) and are left out by the groupby.
# NOTE(review): the first bucket is labelled '18-25' but its bin (0, 25] also
# captures ages below 18 -- confirm whether under-18 users exist / are intended.
users['age_group'] = pd.cut(
    users['user_age'],
    bins=[0, 25, 35, 45, 55, 65, 100],
    labels=['18-25', '26-35', '36-45', '46-55', '56-65', '66+'],
)

# Attach only the age bucket to the stream rows. Any `age_group` already on
# `full_data` (present when this cell is re-run, because `users` then carries
# the column into the earlier merge) is dropped first; otherwise the merge
# would emit age_group_x / age_group_y and the groupby below would raise
# KeyError. Merging just two columns also avoids duplicating every user column.
streams_with_age = (
    full_data
    .drop(columns='age_group', errors='ignore')
    .merge(users[['user_id', 'age_group']], on='user_id')
)

# observed=True keeps only age groups that actually occur in an hour instead
# of materialising zero-count rows for every category combination.
user_group_engagement = (
    streams_with_age
    .groupby(['listen_date', 'listen_hour', 'age_group'], observed=True)
    .size()
    .reset_index(name='streams')
)

# Keep, per hour, the age group with the most streams (first one wins ties).
top_group_labels = (
    user_group_engagement
    .groupby(['listen_date', 'listen_hour'])['streams']
    .idxmax()
)
most_engaged_group = (
    user_group_engagement
    .loc[top_group_labels]
    .rename(columns={'age_group': 'most_engaged_age_group'})
)

# Combine all KPIs into one DataFrame
# Every KPI frame is keyed by (listen_date, listen_hour); they are inner-joined
# one after another onto the unique-listener table, in the order listed.
hour_keys = ['listen_date', 'listen_hour']
final_kpis = hourly_unique_listeners
for kpi_frame in (
    top_artist,
    avg_sessions_per_user,
    track_diversity[['listen_date', 'listen_hour', 'diversity_index']],
    most_engaged_group[['listen_date', 'listen_hour', 'most_engaged_age_group']],
):
    final_kpis = final_kpis.merge(kpi_frame, on=hour_keys)

In [6]:
# Display the combined hourly KPI table (one row per listen_date / listen_hour).
final_kpis

Unnamed: 0,listen_date,listen_hour,unique_listeners,top_artist,listen_counts,avg_sessions_per_user,diversity_index,most_engaged_age_group
0,2024-06-25,0,376,Hank Williams,4,1.289894,0.997938,46-55
1,2024-06-25,1,383,BTS,3,1.234987,0.995772,36-45
2,2024-06-25,2,356,Dani Fernández,3,1.311798,0.995717,56-65
3,2024-06-25,3,369,Apollo 440,3,1.276423,0.993631,46-55
4,2024-06-25,4,379,George Jones,5,1.229551,0.995708,26-35
5,2024-06-25,5,372,AJR,2,1.263441,0.997872,46-55
6,2024-06-25,6,362,Exaltasamba,3,1.273481,1.0,26-35
7,2024-06-25,7,372,Sorriso Maroto,3,1.287634,1.0,26-35
8,2024-06-25,8,375,At The Gates,2,1.277333,1.0,36-45
9,2024-06-25,9,417,Arctic Monkeys,4,1.256595,0.998092,46-55
