In [2]:
import csv
import json
import re
import string
import warnings
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sns
import statsmodels.api as sm
import statsmodels.stats.api as sms
from statsmodels.formula.api import ols
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.stats.anova import anova_lm

In [18]:
# Configuration: `year_range` is the only value meant to be edited; every
# path below is derived from it.
year_range = '2024-2025'

# Input: play-count export that the preprocessing step reads.
with_play_count = f'exportify_play_count/Combined_Play_Count{year_range}.csv'
# Output: preprocessed table written by process_csv and re-read afterwards.
# Later cells use this name, so it has to be defined here for a fresh-kernel
# run to work.
combined = f'all_combined/Combined_data{year_range}.csv'
# Output path for the song-age data, used for analysis later.
song_age = f'song_age/song_age_data{year_range}.csv'

In [4]:
def remove_nan_or_empty_rows(df):
    """Return a new frame without the rows that contain a missing value.

    Empty strings ("") count as missing: they are turned into NaN first and
    the affected rows are then dropped together with ordinary NaN rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` holding only rows with no NaN and no empty string.
    """
    # Non-inplace replace: the caller's frame must keep its "" cells, otherwise
    # re-running a cell that still holds the raw frame sees altered data.
    return df.replace("", float("nan")).dropna()

def preprocess_data(df, reference_date=None):
    """Add a 'Song Age' column and coerce 'Key' / 'Time Signature' to integers.

    'Release Date' is parsed, converted to an age in years (days / 365.25)
    relative to ``reference_date`` and then removed from the frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Release Date', 'Key' and 'Time Signature'.
        The frame is modified in place.
    reference_date : datetime-like or str, optional
        Point in time the age is measured from. Defaults to the current local
        time; pass a fixed value to get reproducible ages.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenience.

    Raises
    ------
    KeyError
        If one of the required columns is missing (e.g. when called a second
        time on the same frame, because 'Release Date' has been dropped).
    """
    def parse_release_date(date):
        """Parse one raw release-date value; return NaT when it is unusable."""
        if pd.isna(date):
            return pd.NaT
        date_str = str(date)
        try:
            if len(date_str) == 4 and date_str.isdigit():
                # Year-only value: pin it to 1 January of that year.
                return pd.Timestamp(year=int(date_str), month=1, day=1)
            return pd.to_datetime(date, errors='coerce')
        except (ValueError, TypeError, OverflowError):
            # Only parsing problems are treated as "missing"; anything else
            # should surface instead of being silently swallowed.
            return pd.NaT

    # Parsed element by element because the column may mix year-only values
    # with full dates. The outer to_datetime guarantees a datetime dtype even
    # for an empty or all-missing column, so the `.dt` access below is safe.
    release = pd.to_datetime(df['Release Date'].apply(parse_release_date), errors='coerce')

    now = pd.Timestamp.now() if reference_date is None else pd.Timestamp(reference_date)
    df['Song Age'] = (now - release).dt.days / 365.25
    df.drop(columns=['Release Date'], inplace=True)

    # Coerce to plain integers; values that are missing or not numeric become 0.
    # NOTE(review): 0 may also be a legitimate 'Key' value, in which case
    # missing keys are indistinguishable from real ones - confirm.
    df['Key'] = pd.to_numeric(df['Key'], errors='coerce').fillna(0).astype(int)
    df['Time Signature'] = pd.to_numeric(df['Time Signature'], errors='coerce').fillna(0).astype(int)

    return df

def process_csv(input_file, output_file):
    """Read a CSV, run ``preprocess_data`` on it and save the result as CSV.

    Rows with missing values are kept; apply ``remove_nan_or_empty_rows``
    separately if they should be dropped.

    Parameters
    ----------
    input_file : str or path-like
        CSV file to read.
    output_file : str or path-like
        Destination CSV. Missing parent folders are created.

    Returns
    -------
    pandas.DataFrame
        The preprocessed frame that was written to ``output_file``.
    """
    df = preprocess_data(pd.read_csv(input_file))

    # to_csv does not create folders, so the first run on a fresh checkout
    # would fail if the output folder is missing. Anything that is not a path
    # (e.g. an open buffer) is passed through untouched.
    if isinstance(output_file, (str, Path)):
        Path(output_file).parent.mkdir(parents=True, exist_ok=True)

    df.to_csv(output_file, index=False)
    print(f"Processed data saved to {output_file}")
    return df

In [5]:
# Preprocess the play-count export and save it at `combined`; the frame that
# process_csv returns is not used here. `df` is loaded from the saved CSV.
process_csv(input_file=with_play_count, output_file=combined)
df = pd.read_csv(combined)

Processed data saved to all_combined/Combined_data2024-2025.csv


In [6]:
# Select a fixed subset of columns, in this order, and write it to the path
# held in `combined`, replacing any file already there.
arranged_columns = [
    "Track Name",
    "Album Name",
    "Artist Name(s)",
    "Added At",
    "Genres",
    "Duration (ms)",
    "Popularity",
    "Danceability",
    "Key",
    "Loudness",
    "Speechiness",
    "Acousticness",
    "Instrumentalness",
    "Liveness",
    "Valence",
    "Tempo",
    "Time Signature",
    "Song Age",
    "Play Count",
]
df_arranged = df[arranged_columns]
df_arranged.to_csv(combined, index=False)