In [2]:
import pandas as pd

In [None]:
df_new = pd.read_csv('../datasets/songs.csv')

In [5]:
df_new.columns

Index(['id', 'name', 'album_name', 'artists', 'danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'duration_ms', 'lyrics', 'year',
       'genre', 'popularity', 'total_artist_followers',
       'avg_artist_popularity', 'artist_ids', 'niche_genres'],
      dtype='object')

In [8]:
df_2025 = df_raw[df_raw['year'] == 2025].copy()
print(f"2025 songs: {len(df_2025)}")
print(df_2025['popularity'].describe())


2025 songs: 2
count     2.000000
mean     31.000000
std       1.414214
min      30.000000
25%      30.500000
50%      31.000000
75%      31.500000
max      32.000000
Name: popularity, dtype: float64


In [10]:
df_2025.isna().sum().sum()

np.int64(0)

In [7]:
import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report

df_raw = pd.read_csv('../datasets/songs.csv')

def preprocess_and_evaluate(df_raw, model_path, columns_path, model_name):
    df = df_raw.copy()
    
    # ---- RENAME TO MATCH TRAINING ----
    df = df.rename(columns={'year': 'release_year'})
    
    # ---- NO EXPLICIT COLUMN — DEFAULT TO 0 ----
    df['explicit'] = 0
    
    # ---- DROP ROWS WITH MISSING POPULARITY ----
    df = df.dropna(subset=['popularity'])
    
    # ---- OHE ----
    df_ohe = pd.get_dummies(df, columns=['key', 'mode'])
    df_ohe['explicit_0'] = 1
    df_ohe['explicit_1'] = 0

    # ---- ALIGN COLUMNS ----
    model = joblib.load(model_path)
    model_columns = joblib.load(columns_path)
    X = df_ohe.reindex(columns=model_columns, fill_value=0).fillna(0)

    # ---- VERIFY ----
    print(f"Shape: {X.shape} | Nulls: {X.isnull().sum().sum()}")

    # ---- BUCKET ACTUAL POPULARITY ----
    df['actual_bucket'] = pd.cut(
        df['popularity'],
        bins=[0, 25, 50, 100],
        labels=['Low', 'Medium', 'High']
    )

    # ---- PREDICT ----
    predictions = model.predict(X)
    df['predicted_bucket'] = predictions

    # ---- RESULTS ----
    valid = df['actual_bucket'].notna()
    print(f"\n{'='*50}")
    print(f"Model: {model_name}")
    print(f"{'='*50}")
    print(classification_report(
        df.loc[valid, 'actual_bucket'],
        df.loc[valid, 'predicted_bucket']
    ))

    # ---- SPOT CHECK ----
    print(df[['name', 'artists', 'popularity', 
              'actual_bucket', 'predicted_bucket']].head(10))
    
    return df

# ---- RUN ALL 4 MODELS ----
results_rf28 = preprocess_and_evaluate(
    df_raw,
    '../models/rf_28.pkl',
    '../models/model_columns.pkl',
    'Random Forest 28 Features'
)

results_rf11 = preprocess_and_evaluate(
    df_raw,
    '../models/rf_11_clean.pkl',
    '../models/model_columns_clean.pkl',
    'Random Forest 11 Features'
)

results_ens28 = preprocess_and_evaluate(
    df_raw,
    '../models/ensemble_28.pkl',
    '../models/model_columns.pkl',
    'Ensemble 28 Features'
)

results_ens11 = preprocess_and_evaluate(
    df_raw,
    '../models/ensemble_11_clean.pkl',
    '../models/model_columns_clean.pkl',
    'Ensemble 11 Features'
)

Shape: (550622, 28) | Nulls: 0

Model: Random Forest 28 Features
              precision    recall  f1-score   support

        High       0.16      0.49      0.24     27881
         Low       0.62      0.28      0.38    231523
      Medium       0.33      0.49      0.39    142440

    accuracy                           0.37    401844
   macro avg       0.37      0.42      0.34    401844
weighted avg       0.48      0.37      0.38    401844

                                           name                artists  \
0                                             !           ["HELLYEAH"]   
1                                            !!            ["Yxngxr1"]   
2                           !!Noble Stabbings!!     ["Dillinger Four"]   
3                                !I'll Be Back!         ["Ril\u00e8s"]   
4                                        !Lost!         ["Ril\u00e8s"]   
5                     !Que Vida! - Mono Version               ["Love"]   
6                                   

In [11]:
import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report

df_raw = pd.read_csv('../datasets/songs.csv')

# Filter to 2025 only
df_2025 = df_raw[df_raw['year'] == 2025].copy()
print(f"2025 songs: {len(df_2025)}")
print(df_2025['popularity'].describe())

def preprocess_and_evaluate(df_raw, model_path, columns_path, model_name):
    df = df_raw.copy()

    # ---- RENAME TO MATCH TRAINING ----
    df = df.rename(columns={'year': 'release_year'})

    # ---- NO EXPLICIT COLUMN — DEFAULT TO 0 ----
    df['explicit'] = 0

    # ---- DROP ROWS WITH MISSING POPULARITY ----
    df = df.dropna(subset=['popularity'])

    # ---- OHE ----
    df_ohe = pd.get_dummies(df, columns=['key', 'mode'])
    df_ohe['explicit_0'] = 1
    df_ohe['explicit_1'] = 0

    # ---- ALIGN COLUMNS ----
    model = joblib.load(model_path)
    model_columns = joblib.load(columns_path)
    X = df_ohe.reindex(columns=model_columns, fill_value=0).fillna(0)

    # ---- VERIFY ----
    print(f"Shape: {X.shape} | Nulls: {X.isnull().sum().sum()}")

    # ---- BUCKET ACTUAL POPULARITY ----
    df['actual_bucket'] = pd.cut(
        df['popularity'],
        bins=[0, 25, 50, 100],
        labels=['Low', 'Medium', 'High']
    )

    # ---- PREDICT ----
    predictions = model.predict(X)
    df['predicted_bucket'] = predictions

    # ---- RESULTS ----
    valid = df['actual_bucket'].notna()
    print(f"\n{'='*50}")
    print(f"Model: {model_name}")
    print(f"{'='*50}")
    print(classification_report(
        df.loc[valid, 'actual_bucket'],
        df.loc[valid, 'predicted_bucket']
    ))

    # ---- SPOT CHECK ----
    print(df[['name', 'artists', 'popularity',
              'actual_bucket', 'predicted_bucket']].head(10))

    return df

# ---- RUN ALL 4 MODELS ----
results_rf28 = preprocess_and_evaluate(
    df_2025,
    '../models/rf_28.pkl',
    '../models/model_columns.pkl',
    'Random Forest 28 Features'
)

results_rf11 = preprocess_and_evaluate(
    df_2025,
    '../models/rf_11_clean.pkl',
    '../models/model_columns_clean.pkl',
    'Random Forest 11 Features'
)

results_ens28 = preprocess_and_evaluate(
    df_2025,
    '../models/ensemble_28.pkl',
    '../models/model_columns.pkl',
    'Ensemble 28 Features'
)

results_ens11 = preprocess_and_evaluate(
    df_2025,
    '../models/ensemble_11_clean.pkl',
    '../models/model_columns_clean.pkl',
    'Ensemble 11 Features'
)

2025 songs: 2
count     2.000000
mean     31.000000
std       1.414214
min      30.000000
25%      30.500000
50%      31.000000
75%      31.500000
max      32.000000
Name: popularity, dtype: float64
Shape: (2, 28) | Nulls: 0

Model: Random Forest 28 Features
              precision    recall  f1-score   support

         Low       0.00      0.00      0.00       0.0
      Medium       0.00      0.00      0.00       2.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0

                              name           artists  popularity  \
131350                 Earthwalker  ["Radikal Guru"]          30   
163209  Fer Sure S3RL REMIX - S3rl   ["TMD", "S3RL"]          32   

       actual_bucket predicted_bucket  
131350        Medium              Low  
163209        Medium              Low  


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Shape: (2, 11) | Nulls: 0

Model: Random Forest 11 Features
              precision    recall  f1-score   support

         Low       0.00      0.00      0.00       0.0
      Medium       0.00      0.00      0.00       2.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0

                              name           artists  popularity  \
131350                 Earthwalker  ["Radikal Guru"]          30   
163209  Fer Sure S3RL REMIX - S3rl   ["TMD", "S3RL"]          32   

       actual_bucket predicted_bucket  
131350        Medium              Low  
163209        Medium              Low  


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Shape: (2, 28) | Nulls: 0

Model: Ensemble 28 Features
              precision    recall  f1-score   support

         Low       0.00      0.00      0.00       0.0
      Medium       0.00      0.00      0.00       2.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0

                              name           artists  popularity  \
131350                 Earthwalker  ["Radikal Guru"]          30   
163209  Fer Sure S3RL REMIX - S3rl   ["TMD", "S3RL"]          32   

       actual_bucket predicted_bucket  
131350        Medium              Low  
163209        Medium              Low  


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Shape: (2, 11) | Nulls: 0

Model: Ensemble 11 Features
              precision    recall  f1-score   support

         Low       0.00      0.00      0.00       0.0
      Medium       0.00      0.00      0.00       2.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0

                              name           artists  popularity  \
131350                 Earthwalker  ["Radikal Guru"]          30   
163209  Fer Sure S3RL REMIX - S3rl   ["TMD", "S3RL"]          32   

       actual_bucket predicted_bucket  
131350        Medium              Low  
163209        Medium              Low  


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [12]:
df_2025

Unnamed: 0,id,name,album_name,artists,danceability,energy,key,loudness,mode,speechiness,...,tempo,duration_ms,lyrics,year,genre,popularity,total_artist_followers,avg_artist_popularity,artist_ids,niche_genres
131350,4M5NkB7cUsiIammerNmCmy,Earthwalker,Subconscious,"[""Radikal Guru""]",0.729,0.824,7,-6.145,1,0.038,...,139.997,289714,hello hi\n my name is jack\n i have three foot...,2025,Electronic,30,30636,30.0,"[""5yGoMjmASLRvhNL7jZndxG""]","[""dub"", ""ragga"", ""reggae""]"
163209,60MMucOXIrQ5beS2HwhGtt,Fer Sure S3RL REMIX - S3rl,The Medic Droid (Original Singles),"[""TMD"", ""S3RL""]",0.532,0.992,1,-5.941,1,0.0756,...,149.985,203389,"Fer sure, maybe, fer sure, not\n Fer sure, eh-...",2025,Electronic,32,532275,44.0,"[""4XHS0yjkCDJDEtIf73FW0q"", ""11aa081aKYUzmeFm0y...","[""crunk"", ""happy hardcore"", ""hyperpop"", ""night..."


In [17]:
 import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report

df_raw = pd.read_csv('../datasets/songs.csv')

# Filter to 2022-2023, drop 0 popularity
df_recent = df_raw[df_raw['year'].between(2022, 2023)].copy()
df_recent = df_recent[df_recent['popularity'] > 0]
print(f"2022-2023 songs (no zero popularity): {len(df_recent)}")
print(f"Year breakdown:\n{df_recent['year'].value_counts().sort_index()}")
print(df_recent['popularity'].describe())

def preprocess_and_evaluate(df_raw, model_path, columns_path, model_name):
    df = df_raw.copy()

    # ---- RENAME TO MATCH TRAINING ----
    df = df.rename(columns={'year': 'release_year'})

    # ---- NO EXPLICIT COLUMN — DEFAULT TO 0 ----
    df['explicit'] = 0

    # ---- DROP ROWS WITH MISSING POPULARITY ----
    df = df.dropna(subset=['popularity'])

    # ---- USE QUANTILE BUCKETS TO MATCH THIS DATASET'S DISTRIBUTION ----
    df['actual_bucket'] = pd.qcut(
        df['popularity'],
        q=3,
        labels=['Low', 'Medium', 'High']
    )
    print(f"Bucket distribution:\n{df['actual_bucket'].value_counts()}")

    # ---- OHE ----
    df_ohe = pd.get_dummies(df, columns=['key', 'mode'])
    df_ohe['explicit_0'] = 1
    df_ohe['explicit_1'] = 0

    # ---- ALIGN COLUMNS ----
    model = joblib.load(model_path)
    model_columns = joblib.load(columns_path)
    X = df_ohe.reindex(columns=model_columns, fill_value=0).fillna(0)

    # ---- VERIFY ----
    print(f"Shape: {X.shape} | Nulls: {X.isnull().sum().sum()}")

    # ---- PREDICT ----
    predictions = model.predict(X)
    df['predicted_bucket'] = predictions

    # ---- RESULTS ----
    valid = df['actual_bucket'].notna()
    print(f"\n{'='*50}")
    print(f"Model: {model_name}")
    print(f"{'='*50}")
    print(classification_report(
        df.loc[valid, 'actual_bucket'],
        df.loc[valid, 'predicted_bucket']
    ))

    # ---- SPOT CHECK ----
    print(df[['name', 'artists', 'popularity',
              'actual_bucket', 'predicted_bucket']].head(10))

    return df

# ---- RUN ALL 4 MODELS ----
results_rf28 = preprocess_and_evaluate(
    df_recent,
    '../models/rf_28.pkl',
    '../models/model_columns.pkl',
    'Random Forest 28 Features'
)

results_rf11 = preprocess_and_evaluate(
    df_recent,
    '../models/rf_11_clean.pkl',
    '../models/model_columns_clean.pkl',
    'Random Forest 11 Features'
)

results_ens28 = preprocess_and_evaluate(
    df_recent,
    '../models/ensemble_28.pkl',
    '../models/model_columns.pkl',
    'Ensemble 28 Features'
)

results_ens11 = preprocess_and_evaluate(
    df_recent,
    '../models/ensemble_11_clean.pkl',
    '../models/model_columns_clean.pkl',
    'Ensemble 11 Features'
)

2022-2023 songs (no zero popularity): 17302
Year breakdown:
year
2022    13175
2023     4127
Name: count, dtype: int64
count    17302.000000
mean        27.982603
std         16.488072
min          1.000000
25%         15.000000
50%         27.000000
75%         39.000000
max         85.000000
Name: popularity, dtype: float64
Bucket distribution:
actual_bucket
Medium    5995
Low       5785
High      5522
Name: count, dtype: int64
Shape: (17302, 28) | Nulls: 0

Model: Random Forest 28 Features
              precision    recall  f1-score   support

        High       0.37      0.81      0.51      5522
         Low       0.45      0.37      0.40      5785
      Medium       0.29      0.02      0.05      5995

    accuracy                           0.39     17302
   macro avg       0.37      0.40      0.32     17302
weighted avg       0.37      0.39      0.31     17302

                                                  name  \
14                           "40" - Songs Of Surrender   
36   

In [20]:
import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.utils import resample

df_2022 = df_raw[df_raw['year'] == 2022].copy()
df_2022 = df_2022[df_2022['popularity'] > 0]

print(f"2022 songs (no zero popularity): {len(df_2022)}")
print(f"\nPopularity distribution:")
print(df_2022['popularity'].describe())

# Check raw bucket distribution
df_2022['actual_bucket'] = pd.cut(
    df_2022['popularity'],
    bins=[0, 25, 50, 100],
    labels=['Low', 'Medium', 'High']
)
print(f"\nRaw bucket distribution:")
print(df_2022['actual_bucket'].value_counts())
print(df_2022['actual_bucket'].value_counts(normalize=True).round(3))

# ---- RESAMPLE TO MATCH TRAINING PROPORTIONS ----
df_low = df_recent[df_recent['actual_bucket'] == 'Low']
df_med = df_recent[df_recent['actual_bucket'] == 'Medium']
df_high = df_recent[df_recent['actual_bucket'] == 'High']

print(f"\nClass sizes before resampling:")
print(f"Low:    {len(df_low)}")
print(f"Medium: {len(df_med)}")
print(f"High:   {len(df_high)}")

n_total = len(df_recent)

df_low_resampled = resample(df_low,   n_samples=int(n_total * 0.26), random_state=42)
df_med_resampled = resample(df_med,   n_samples=int(n_total * 0.55), random_state=42)
df_high_resampled = resample(df_high, n_samples=int(n_total * 0.20), random_state=42)

df_balanced = pd.concat([df_low_resampled, df_med_resampled, df_high_resampled])
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"\nResampled distribution:")
print(df_balanced['actual_bucket'].value_counts())
print(df_balanced['actual_bucket'].value_counts(normalize=True).round(3))

# ---- PREPROCESS AND EVALUATE FUNCTION ----
def preprocess_and_evaluate(df_input, model_path, columns_path, model_name):
    df = df_input.copy()

    # ---- RENAME TO MATCH TRAINING ----
    df = df.rename(columns={'year': 'release_year'})

    # ---- NO EXPLICIT COLUMN — DEFAULT TO 0 ----
    df['explicit'] = 0

    # ---- DROP ROWS WITH MISSING POPULARITY ----
    df = df.dropna(subset=['popularity'])

    # ---- OHE ----
    df_ohe = pd.get_dummies(df, columns=['key', 'mode'])
    df_ohe['explicit_0'] = 1
    df_ohe['explicit_1'] = 0

    # ---- ALIGN COLUMNS ----
    model = joblib.load(model_path)
    model_columns = joblib.load(columns_path)
    X = df_ohe.reindex(columns=model_columns, fill_value=0).fillna(0)

    # ---- VERIFY ----
    print(f"Shape: {X.shape} | Nulls: {X.isnull().sum().sum()}")

    # ---- PREDICT ----
    predictions = model.predict(X)
    df['predicted_bucket'] = predictions

    # ---- RESULTS ----
    valid = df['actual_bucket'].notna()
    print(f"\n{'='*50}")
    print(f"Model: {model_name}")
    print(f"{'='*50}")
    print(classification_report(
        df.loc[valid, 'actual_bucket'],
        df.loc[valid, 'predicted_bucket']
    ))

    # ---- SPOT CHECK ----
    print(df[['name', 'artists', 'popularity',
              'actual_bucket', 'predicted_bucket']].head(10))

    return df

# ---- RUN ALL 4 MODELS ----
results_rf28 = preprocess_and_evaluate(
    df_balanced,
    '../models/rf_28.pkl',
    '../models/model_columns.pkl',
    'Random Forest 28 Features'
)

results_rf11 = preprocess_and_evaluate(
    df_balanced,
    '../models/rf_11_clean.pkl',
    '../models/model_columns_clean.pkl',
    'Random Forest 11 Features'
)

results_ens28 = preprocess_and_evaluate(
    df_balanced,
    '../models/ensemble_28.pkl',
    '../models/model_columns.pkl',
    'Ensemble 28 Features'
)

results_ens11 = preprocess_and_evaluate(
    df_balanced,
    '../models/ensemble_11_clean.pkl',
    '../models/model_columns_clean.pkl',
    'Ensemble 11 Features'
)

2022 songs (no zero popularity): 13175

Popularity distribution:
count    13175.000000
mean        29.739051
std         16.462610
min          1.000000
25%         17.000000
50%         29.000000
75%         41.000000
max         85.000000
Name: popularity, dtype: float64

Raw bucket distribution:
actual_bucket
Medium    6177
Low       5501
High      1497
Name: count, dtype: int64
actual_bucket
Medium    0.469
Low       0.418
High      0.114
Name: proportion, dtype: float64

Class sizes before resampling:
Low:    8031
Medium: 7574
High:   1697

Resampled distribution:
actual_bucket
Medium    9516
Low       4498
High      3460
Name: count, dtype: int64
actual_bucket
Medium    0.545
Low       0.257
High      0.198
Name: proportion, dtype: float64
Shape: (17474, 28) | Nulls: 0

Model: Random Forest 28 Features
              precision    recall  f1-score   support

        High       0.23      0.85      0.36      3460
         Low       0.39      0.36      0.37      4498
      Medium     

In [21]:
df_2022 = df_raw[df_raw['year'] == 2022].copy()
df_2022 = df_2022[df_2022['popularity'] > 0]

# ---- BUCKET ----
df_2022['actual_bucket'] = pd.cut(
    df_2022['popularity'],
    bins=[0, 25, 50, 100],
    labels=['Low', 'Medium', 'High']
)

# ---- RESAMPLE TO MATCH TRAINING PROPORTIONS ----
df_low   = df_2022[df_2022['actual_bucket'] == 'Low']
df_med   = df_2022[df_2022['actual_bucket'] == 'Medium']
df_high  = df_2022[df_2022['actual_bucket'] == 'High']

print(f"Class sizes before resampling:")
print(f"Low:    {len(df_low)}")
print(f"Medium: {len(df_med)}")
print(f"High:   {len(df_high)}")

n_total = len(df_2022)

df_low_resampled  = resample(df_low,  n_samples=int(n_total * 0.26), random_state=42)
df_med_resampled  = resample(df_med,  n_samples=int(n_total * 0.55), random_state=42)
df_high_resampled = resample(df_high, n_samples=int(n_total * 0.20), random_state=42)

df_balanced_2022 = pd.concat([df_low_resampled, df_med_resampled, df_high_resampled])
df_balanced_2022 = df_balanced_2022.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"\nResampled distribution:")
print(df_balanced_2022['actual_bucket'].value_counts())

# ---- RUN ALL 4 MODELS ----
results_rf28 = preprocess_and_evaluate(
    df_balanced_2022,
    '../models/rf_28.pkl',
    '../models/model_columns.pkl',
    'Random Forest 28 Features'
)

results_rf11 = preprocess_and_evaluate(
    df_balanced_2022,
    '../models/rf_11_clean.pkl',
    '../models/model_columns_clean.pkl',
    'Random Forest 11 Features'
)

results_ens28 = preprocess_and_evaluate(
    df_balanced_2022,
    '../models/ensemble_28.pkl',
    '../models/model_columns.pkl',
    'Ensemble 28 Features'
)

results_ens11 = preprocess_and_evaluate(
    df_balanced_2022,
    '../models/ensemble_11_clean.pkl',
    '../models/model_columns_clean.pkl',
    'Ensemble 11 Features'
)

Class sizes before resampling:
Low:    5501
Medium: 6177
High:   1497

Resampled distribution:
actual_bucket
Medium    7246
Low       3425
High      2635
Name: count, dtype: int64
Shape: (13306, 28) | Nulls: 0

Model: Random Forest 28 Features
              precision    recall  f1-score   support

        High       0.23      0.85      0.36      2635
         Low       0.39      0.35      0.37      3425
      Medium       0.49      0.03      0.05      7246

    accuracy                           0.27     13306
   macro avg       0.37      0.41      0.26     13306
weighted avg       0.41      0.27      0.19     13306

                                       name                          artists  \
0  Love Runs Out - from One Night In Malibu                  ["OneRepublic"]   
1                                 penn hall                ["Origami Angel"]   
2                         The Floor Is Lava         ["Teddi Gold", "Coolio"]   
3                  Ms.Communication - Redux            