In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

In [3]:
# --- load the table and name the seven emotion score columns -----------------
full_data = pd.read_csv('final_df_with_emotions.csv')
sentiment_columns = ['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']

# the ten most frequent genre labels, counted over the rows of the raw table
top_genres = full_data['genre'].value_counts().nlargest(10).index

# keep rows whose genre is one of those ten and whose popularity is a positive,
# non-missing number
filtered_df = full_data.loc[
    lambda rows: rows['genre'].isin(top_genres)
    & (rows['popularity'] > 0)
    & rows['popularity'].notna()
]

# the split is done on song identifiers, not on rows, so that all rows of a
# song end up in exactly one partition
unique_songs = filtered_df['song_artist'].unique()

# first cut: 80% of the songs for train + tune, 20% for test
songs_train_tune, songs_test = train_test_split(unique_songs, test_size=0.2, random_state=42)

# second cut: the tune share is expressed relative to the remaining 80% so
# that it amounts to 10% of all songs
tune_ratio = 0.1 / (1 - 0.2)
songs_train, songs_tune = train_test_split(songs_train_tune, test_size=tune_ratio, random_state=42)

# turn the song-level assignment back into row-level frames
train_df, tune_df, test_df = (
    filtered_df[filtered_df['song_artist'].isin(song_subset)]
    for song_subset in (songs_train, songs_tune, songs_test)
)

# ensure all songs have valid data (20x7)
def validate_data(df, expected_windows=20):
    """Keep only the rows of songs that have exactly ``expected_windows`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``song_artist`` column identifying the song of each row.
    expected_windows : int, default 20
        Number of rows a song must have to be kept.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df``, in their original order and with their original
        index, whose ``song_artist`` value occurs exactly
        ``expected_windows`` times in ``df``.
    """
    # rows per song, computed once and looked up for every row; this avoids
    # calling a Python lambda for each song
    rows_per_song = df['song_artist'].value_counts()
    # a missing song_artist maps to NaN, so the comparison drops that row
    has_expected_count = df['song_artist'].map(rows_per_song) == expected_windows
    return df[has_expected_count]

# run the same per-song check on every split, keeping the split names
train_df, tune_df, test_df = map(validate_data, (train_df, tune_df, test_df))

# report the number of rows and of distinct songs left in each split
print(
    f"Train set: {len(train_df)} rows, {train_df['song_artist'].nunique()} unique songs",
    f"Tune set: {len(tune_df)} rows, {tune_df['song_artist'].nunique()} unique songs",
    f"Test set: {len(test_df)} rows, {test_df['song_artist'].nunique()} unique songs",
    sep="\n",
)


Train set: 95060 rows, 4753 unique songs
Tune set: 13600 rows, 680 unique songs
Test set: 27180 rows, 1359 unique songs


In [4]:
# preview the validated training split (the notebook renders a truncated view of the frame)
train_df

Unnamed: 0,song_artist,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,...,stride,window_index,window_text,anger,disgust,fear,joy,neutral,sadness,surprise
360,anna sun - walk the moon,61,2012,alt-rock,0.472,0.844,10,-6.578,1,0.0540,...,20,0,screen falling off the door door hanging off t...,0.038343,0.014882,0.018864,0.005592,0.086255,0.783576,0.052488
361,anna sun - walk the moon,61,2012,alt-rock,0.472,0.844,10,-6.578,1,0.0540,...,20,1,fringes we tore up the walls we slept on couch...,0.833283,0.022907,0.058656,0.002851,0.039539,0.014575,0.028188
362,anna sun - walk the moon,61,2012,alt-rock,0.472,0.844,10,-6.578,1,0.0540,...,20,2,the east my car parked south your hands on my ...,0.403547,0.151907,0.099213,0.016981,0.198571,0.086270,0.043511
363,anna sun - walk the moon,61,2012,alt-rock,0.472,0.844,10,-6.578,1,0.0540,...,20,3,the wall on the west mezzanine we rattle this ...,0.545919,0.055544,0.080076,0.011758,0.215630,0.052453,0.038619
364,anna sun - walk the moon,61,2012,alt-rock,0.472,0.844,10,-6.578,1,0.0540,...,20,4,what do you know? this house is falling apart ...,0.051798,0.009747,0.018578,0.003834,0.017812,0.029777,0.868455
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211855,recently - jim croce,12,2011,singer-songwriter,0.690,0.214,0,-17.495,1,0.0537,...,8,15,doesn't matter now who was wrong the future is,0.301191,0.063810,0.007405,0.004125,0.597054,0.017089,0.009325
211856,recently - jim croce,12,2011,singer-songwriter,0.690,0.214,0,-17.495,1,0.0537,...,8,16,tomorrow 'cause the past is gone and i'm,0.004772,0.006103,0.018585,0.004189,0.043673,0.916892,0.005787
211857,recently - jim croce,12,2011,singer-songwriter,0.690,0.214,0,-17.495,1,0.0537,...,8,17,findin' that i'm not as strong as i thought,0.010412,0.016579,0.293162,0.020691,0.045054,0.389997,0.224106
211858,recently - jim croce,12,2011,singer-songwriter,0.690,0.214,0,-17.495,1,0.0537,...,8,18,that i used to be 'cause recently it,0.034753,0.017376,0.028584,0.039856,0.353018,0.131015,0.395397


In [5]:
# non-sentiment regression
# columns that describe an individual lyric window rather than the song
window_columns = ['window_size', 'stride', 'window_index', 'window_text']

# Everything that is neither an emotion score nor a window descriptor is kept.
# Columns named 'Unnamed: N' are also dropped: the frame carries an
# 'Unnamed: 0' column (presumably a row index written into the CSV -- TODO
# confirm), and averaging it per song would feed a meaningless row number
# into the regression as a feature.
non_sentiment_columns = [
    col for col in train_df.columns
    if col not in sentiment_columns + window_columns
    and not str(col).startswith('Unnamed:')
]
train_df_non = train_df[non_sentiment_columns]
tune_df_non = tune_df[non_sentiment_columns]
test_df_non = test_df[non_sentiment_columns]

# aggregate data by song_artist for non-sentiment features
def aggregate_data(df):
    """Collapse ``df`` to one row per (song_artist, genre) pair.

    Every other column is replaced by its mean within the pair. The result
    has a fresh integer index, with ``song_artist`` and ``genre`` as ordinary
    columns followed by the averaged columns.
    """
    per_song = df.groupby(['song_artist', 'genre'], as_index=False)
    return per_song.mean()

# one averaged row per song for each split
train_agg = aggregate_data(train_df_non)
tune_agg = aggregate_data(tune_df_non)
test_agg = aggregate_data(test_df_non)

# train regression models by genre
genres = train_agg['genre'].unique()
models = {}             # genre -> fitted LinearRegression
genre_performance = {}  # genre -> {'MSE': ..., 'R^2': ...} measured on the test split

for genre in genres:
    train_genre = train_agg[train_agg['genre'] == genre]
    test_genre = test_agg[test_agg['genre'] == genre]

    # The genre list comes from the training split only, so a genre can be
    # rare or absent in the test split; predicting on an empty frame raises
    # and R^2 is not defined for fewer than two samples. Same threshold and
    # message as the sentiment regression loop.
    if len(train_genre) < 5 or len(test_genre) < 5:
        print(f"Skipping genre {genre} due to insufficient data.")
        continue

    # prepare features and labels: every remaining column except the
    # identifier, the genre label and the target is a feature
    X_train = train_genre.drop(columns=['song_artist', 'genre', 'popularity'])
    y_train = train_genre['popularity']
    X_test = test_genre.drop(columns=['song_artist', 'genre', 'popularity'])
    y_test = test_genre['popularity']

    # linear regression model
    model = LinearRegression()
    model.fit(X_train, y_train)
    models[genre] = model

    # evaluate the model on the held-out test songs of this genre
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    genre_performance[genre] = {'MSE': mse, 'R^2': r2}

# summarize performance
print("\nGenre-wise Model Performance:")
for genre, performance in genre_performance.items():
    print(f"Genre: {genre}, MSE: {performance['MSE']:.4f}, R^2: {performance['R^2']:.4f}")



Genre-wise Model Performance:
Genre: funk, MSE: 64.0811, R^2: 0.0150
Genre: disco, MSE: 136.5690, R^2: 0.0517
Genre: dance, MSE: 168.0464, R^2: 0.1643
Genre: alt-rock, MSE: 99.0345, R^2: 0.0174
Genre: hard-rock, MSE: 95.5835, R^2: 0.1071
Genre: country, MSE: 123.0018, R^2: 0.1596
Genre: pop, MSE: 144.8851, R^2: 0.0245
Genre: folk, MSE: 115.3945, R^2: 0.1509
Genre: blues, MSE: 88.8052, R^2: 0.0707
Genre: singer-songwriter, MSE: 54.5092, R^2: 0.1215


In [6]:
# baseline sentiment regression
# flatten the (n_windows x n_emotions) sentiment data into one feature row per song
def flatten_sentiment_data(df, sentiment_columns, n_windows=20):
    """Turn per-window emotion scores into one wide feature row per song.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per (song, window). Must contain ``song_artist``,
        ``window_index``, ``genre``, ``popularity`` and every column listed
        in ``sentiment_columns``.
    sentiment_columns : sequence of str
        Emotion score columns, in the order they should appear inside each
        window of the flattened row.
    n_windows : int, default 20
        Number of rows every song is required to have.

    Returns
    -------
    pandas.DataFrame
        Columns ``song_artist``, ``genre``, ``popularity`` followed by
        ``n_windows * len(sentiment_columns)`` feature columns named
        ``"<emotion>_<i>"``, where ``i`` is the position of the window within
        the song after sorting by ``window_index``. Rows follow the order in
        which the songs first appear in ``df``.

    Raises
    ------
    ValueError
        If any song does not have exactly ``n_windows`` rows.
    """
    sentiment_columns = list(sentiment_columns)

    # Put the rows of each song next to each other, ordered by window, so that
    # a plain reshape yields one row per song. This replaces a per-song
    # groupby.apply, which also operated on the grouping column and triggered
    # a pandas deprecation warning.
    ordered = (
        df.dropna(subset=['song_artist'])
        .sort_values(['song_artist', 'window_index'])
    )

    # fail early with a clear message instead of building a ragged table
    window_counts = ordered['song_artist'].value_counts(sort=False)
    mismatched = window_counts[window_counts != n_windows]
    if not mismatched.empty:
        raise ValueError(
            f"flatten_sentiment_data expects exactly {n_windows} rows per song; "
            f"{len(mismatched)} song(s) differ, e.g. {mismatched.index[0]!r}"
        )

    # songs in the order their blocks appear in `ordered`
    song_index = pd.Index(ordered['song_artist'].unique(), name='song_artist')
    flattened_values = ordered[sentiment_columns].to_numpy().reshape(
        len(song_index), n_windows * len(sentiment_columns)
    )

    # row-major layout: all emotions of window 0, then all emotions of window 1, ...
    flattened_columns = [f"{emotion}_{i}" for i in range(n_windows) for emotion in sentiment_columns]
    flattened_df = pd.DataFrame(flattened_values, columns=flattened_columns, index=song_index)

    # add metadata columns (genre, popularity) back to the df
    metadata = df[['song_artist', 'genre', 'popularity']].drop_duplicates().set_index('song_artist')
    final_df = metadata.join(flattened_df)

    return final_df.reset_index()

# one flattened feature row per song, built split by split (train, tune, test)
train_flattened, tune_flattened, test_flattened = (
    flatten_sentiment_data(split_df, sentiment_columns)
    for split_df in (train_df, tune_df, test_df)
)

print(
    f"Train flattened shape: {train_flattened.shape}",
    f"Tune flattened shape: {tune_flattened.shape}",
    f"Test flattened shape: {test_flattened.shape}",
    sep="\n",
)

# fit and score one linear model per genre seen in the training split
genres = train_flattened['genre'].unique()
models = {}
genre_performance = {}

for genre in genres:
    train_genre = train_flattened.loc[train_flattened['genre'] == genre]
    test_genre = test_flattened.loc[test_flattened['genre'] == genre]

    # both sides need at least five songs, otherwise the genre is skipped
    if min(len(train_genre), len(test_genre)) < 5:
        print(f"Skipping genre {genre} due to insufficient data.")
        continue

    # targets, then the feature matrices: every column that is neither an
    # identifier, the genre label nor the target
    y_train, y_test = train_genre['popularity'], test_genre['popularity']
    X_train = train_genre.drop(columns=['song_artist', 'genre', 'popularity'])
    X_test = test_genre.drop(columns=['song_artist', 'genre', 'popularity'])

    # ordinary least squares on the flattened emotion scores
    model = LinearRegression().fit(X_train, y_train)
    models[genre] = model

    # score on the held-out test songs of this genre
    y_pred = model.predict(X_test)
    mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
    genre_performance[genre] = {'MSE': mse, 'R^2': r2}

print("\nGenre-wise Model Performance:")
for genre, performance in genre_performance.items():
    print(f"Genre: {genre}, MSE: {performance['MSE']:.4f}, R^2: {performance['R^2']:.4f}")


  flattened_features = df.groupby(['song_artist'], group_keys=False).apply(flatten_song)
  flattened_features = df.groupby(['song_artist'], group_keys=False).apply(flatten_song)


Train flattened shape: (4753, 143)
Tune flattened shape: (680, 143)
Test flattened shape: (1359, 143)

Genre-wise Model Performance:
Genre: alt-rock, MSE: 131.3872, R^2: -0.3036
Genre: blues, MSE: 169.0752, R^2: -0.7693
Genre: country, MSE: 163.6372, R^2: -0.1180
Genre: dance, MSE: 268.1320, R^2: -0.3335
Genre: disco, MSE: 522.3632, R^2: -2.6272
Genre: folk, MSE: 287.9866, R^2: -1.1190
Genre: funk, MSE: 158.8396, R^2: -1.4415
Genre: hard-rock, MSE: 146.2474, R^2: -0.3662
Genre: pop, MSE: 211.1714, R^2: -0.4217
Genre: singer-songwriter, MSE: 115.0302, R^2: -0.8539


  flattened_features = df.groupby(['song_artist'], group_keys=False).apply(flatten_song)
