In [1]:
from rca import Db, Spotify
import pandas as pd
import numpy as np
import statistics as stats
import datetime as datetime
from datetime import timedelta

# Release Date Function (just for predicting release date)

In [2]:
def cleanReleaseDate(df):
    """
    Looks at the streaming data for a song surrounding the release date provided by Nielsen
    and predicts what the actual release date of that song is based on the largest day-to-day
    streaming increase within 21 days of the first recorded non-zero streaming day
    ----
    Also works with a df having than 1 song
    """
    
    # Preliminary cleaning
    df.fillna(0, inplace=True)
    df['streams'] = df['streams'].astype(int)
    df['date'] = pd.to_datetime(df['date'])
    df['release_date'] = pd.to_datetime(df['release_date'])
    df['artist'] = df['artist'].astype(str)
    
    def clean_streaming_data(df):

        # Creating a day-to-day streams difference column
        df_copy = df.copy()
        df_copy['streams_diff'] = df_copy['streams'].diff()

        def get_release_date(x):
            first_streams = x[x['streams'] > 0]['date'].min()
            if pd.isnull(first_streams):
                return np.nan
            window = x[
                (x['date'] >= first_streams) & 
                (x['date'] <= first_streams + pd.Timedelta(days=21))
            ]
            return window['streams_diff'].idxmax()
        
        # release_dates gets the song id and streams on the date with the largest jump (1 row per song)
        release_dates = df_copy.groupby('song_id').apply(get_release_date)
        release_dates = release_dates.dropna()

        # gets df_copy with 1 row per song, row represents the predicted release date
        release_dates = df_copy.loc[release_dates]
        
        clean_df = pd.DataFrame()
        
        for song_id, release_row in release_dates.iterrows():
            song_data = df[df['song_id'] == release_row['song_id']]
            release_date = release_row['date']
            # end_date = release_date + pd.Timedelta(days=364)
            
            # Get data in release window
            window_data = song_data[
                (song_data['date'] >= release_date) 
                # & 
                # (song_data['date'] <= end_date)
            ]
            
            # Check for >330 active stream days in window
            # active_days = len(window_data[window_data['streams'] > 50])
            
            # if active_days >= 330:
            post_release = song_data[song_data['date'] >= release_date].copy()
            post_release.loc[:, 'release_date'] = release_date
            clean_df = pd.concat([clean_df, post_release])

        return clean_df.reset_index(drop=True)
    
    return clean_streaming_data(df)

In [3]:
country_df = pd.read_csv('country.csv')
country_df

Unnamed: 0,song_id,artist,title,release_date,date,streams
0,1531675044,Grant Gilbert,Turn It Down,2023-08-25,2023-08-13,0.0
1,1531675044,Grant Gilbert,Turn It Down,2023-08-25,2023-08-14,0.0
2,1531675044,Grant Gilbert,Turn It Down,2023-08-25,2023-08-15,0.0
3,1531675044,Grant Gilbert,Turn It Down,2023-08-25,2023-08-16,0.0
4,1531675044,Grant Gilbert,Turn It Down,2023-08-25,2023-08-17,0.0
...,...,...,...,...,...,...
664397,995180282,Graham Barham,Beer By My Bed,2023-02-10,2024-11-26,7889.0
664398,995180282,Graham Barham,Beer By My Bed,2023-02-10,2024-11-27,7857.0
664399,995180282,Graham Barham,Beer By My Bed,2023-02-10,2024-11-28,5784.0
664400,995180282,Graham Barham,Beer By My Bed,2023-02-10,2024-11-29,5988.0


In [5]:
clean_country = cleanReleaseDate(country_df)
clean_country

  release_dates = df_copy.groupby('song_id').apply(get_release_date)


Unnamed: 0,song_id,artist,title,release_date,date,streams
0,934299146,Josh Ross,Trouble,2023-01-04,2023-01-04,98508
1,934299146,Josh Ross,Trouble,2023-01-04,2023-01-05,83909
2,934299146,Josh Ross,Trouble,2023-01-04,2023-01-06,144376
3,934299146,Josh Ross,Trouble,2023-01-04,2023-01-07,113416
4,934299146,Josh Ross,Trouble,2023-01-04,2023-01-08,92796
...,...,...,...,...,...,...
647239,2888532856,Phillip Good & Johnny Walker,Bad Boyz,2024-11-12,2024-11-26,1809
647240,2888532856,Phillip Good & Johnny Walker,Bad Boyz,2024-11-12,2024-11-27,1771
647241,2888532856,Phillip Good & Johnny Walker,Bad Boyz,2024-11-12,2024-11-28,1213
647242,2888532856,Phillip Good & Johnny Walker,Bad Boyz,2024-11-12,2024-11-29,1286


In [6]:
print(len(country_df['song_id'].unique()))
print(len(clean_country['song_id'].unique()))

1295
1295
