### Import Required Libraries

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")


### Load Data

In [14]:
# Load the two CSV inputs used throughout the notebook.
songs, streams = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/songs.csv", "../data/streams1.csv")
)

# Report (rows, columns) of each frame as a quick sanity check on the load.
print(
    f"Shape of the songs dataset: {songs.shape}",
    f"Shape of streams dataset: {streams.shape}",
    sep="\n",
)

Shape of the songs dataset: (89741, 21)
Shape of streams dataset: (11346, 3)


### Genre Level KPIs

In [15]:
# Preprocess: parse the listen timestamp and derive a date-only column.
streams["listen_time"] = streams["listen_time"].astype("datetime64[ns]")
streams["listen_date"] = streams["listen_time"].dt.date  # get just the date information

# Left-join the song columns onto every stream row via track_id.
# The merge runs unconditionally so `merged_data` is always defined on a fresh kernel
# (previously it was read inside an assert before it had ever been assigned).
merged_data = streams.merge(songs, on="track_id", how="left")

# A left join returns exactly one row per stream row unless track_id repeats in
# `songs`, so any growth in row count means the join fanned out.
extra_rows = merged_data.shape[0] - streams.shape[0]
if extra_rows == 0:
    print("=== SUCCESS : NO DUPLICATE VALUES WHILE PERFORMING JOINING ====")
else:
    print("=== ERROR : DUPLICATE VALUES WHILE PERFORMING JOINING ===")
    print(f"Rows added by the join: {extra_rows}")

=== SUCCESS : NO DUPLICATE VALUES WHILE PERFORMING JOINING ====


#### KPI 1: Daily Genre Listen Count

In [20]:
# Number of stream rows for every (listen_date, track_genre) pair.
rows_per_genre_day = merged_data.groupby(['listen_date', 'track_genre']).size()
genre_listen_count = rows_per_genre_day.rename("listen_count").reset_index()

#### KPI 2: Average Listening Duration per Genre per Day

In [23]:
# Convert duration_ms from milliseconds to seconds.
merged_data["duration_seconds"] = merged_data["duration_ms"].div(1000)

# Mean duration_seconds for every (listen_date, track_genre) pair.
seconds_by_genre_day = merged_data.groupby(["listen_date", "track_genre"])["duration_seconds"]
avg_duration = seconds_by_genre_day.mean().rename("average_duration").reset_index()

#### KPI 3: Daily Genre Popularity Index

In [25]:
# Total number of stream rows per day; this is the denominator of the popularity index.
total_listens = merged_data.groupby('listen_date').size().reset_index(name='total_listens')

# Drop columns left behind by an earlier run of this cell before merging. Without this,
# re-running the cell yields total_listens_x / total_listens_y and the division below
# fails with KeyError: 'total_listens'.
genre_listen_count = (
    genre_listen_count
    .drop(columns=['total_listens', 'popularity_index'], errors='ignore')
    .merge(total_listens, on='listen_date')
)

# Share of each day's listens that belongs to each genre.
genre_listen_count['popularity_index'] = genre_listen_count['listen_count'] / genre_listen_count['total_listens']

#### KPI 4: Most Popular Track per Genre per Day

In [24]:
# Count stream rows per (listen_date, track_genre, track_id).
genre_day_cols = ['listen_date', 'track_genre']
track_counts = merged_data.groupby(genre_day_cols + ['track_id']).size().reset_index(name='track_count')

# Within each day and genre, order tracks from most to least played.
ranked_tracks = track_counts.sort_values(by=genre_day_cols + ['track_count'], ascending=[True, True, False])

# Keep only the first (highest-count) row of every day/genre pair.
is_top_row = ~ranked_tracks.duplicated(subset=genre_day_cols, keep='first')
most_popular_track = ranked_tracks[is_top_row].rename(columns={'track_id': 'most_popular_track_id'})

#### Combine all KPIs into one DataFrame

In [26]:
# Join the KPI frames on the shared (listen_date, track_genre) key.
kpi_key_cols = ['listen_date', 'track_genre']
final_kpis = (
    genre_listen_count.loc[:, kpi_key_cols + ['listen_count', 'popularity_index']]
    .merge(avg_duration, on=kpi_key_cols)
    .merge(most_popular_track.loc[:, kpi_key_cols + ['most_popular_track_id']], on=kpi_key_cols)
)

In [27]:
# Show the combined per-day, per-genre KPI table as the cell's output.
final_kpis

Unnamed: 0,listen_date,track_genre,listen_count,popularity_index,average_duration,most_popular_track_id
0,2024-06-25,acoustic,118,0.010400,215.379034,0WWuB1F1H4dr3Bdoe1vtHs
1,2024-06-25,afrobeat,124,0.010929,248.978960,1KME77F9mu2RQS8vo6JVwa
2,2024-06-25,alt-rock,119,0.010488,236.073731,0GO8y8jQk1PkHzS31d699N
3,2024-06-25,alternative,45,0.003966,202.768200,0YwBrYaPYYc8e18ZYkqhJc
4,2024-06-25,ambient,119,0.010488,255.296067,2tr4oclswJ6v3dfDlI01HD
...,...,...,...,...,...,...
108,2024-06-25,techno,46,0.004054,296.812717,1Of5q9FxoyHnMsXEdl84bO
109,2024-06-25,trance,97,0.008549,298.194134,3W0wCpIAMV6NZeoYlCd4JY
110,2024-06-25,trip-hop,122,0.010753,263.405664,3cg38isdTrBH63B4BMywsw
111,2024-06-25,turkish,112,0.009871,223.225420,1067ZkQkZgEHszEfnFWEgM
