In [7]:
from google.colab import userdata, drive
import os

os.environ["KAGGLE_KEY"] = userdata.get('KAGGLE_KEY')
os.environ["KAGGLE_USERNAME"] = userdata.get('KAGGLE_USERNAME')

!kaggle datasets download -d rodolfofigueroa/spotify-12m-songs

! unzip "spotify-12m-songs.zip"

Dataset URL: https://www.kaggle.com/datasets/rodolfofigueroa/spotify-12m-songs
License(s): unknown
Downloading spotify-12m-songs.zip to /content
 90% 88.0M/97.4M [00:00<00:00, 914MB/s]
100% 97.4M/97.4M [00:00<00:00, 886MB/s]
Archive:  spotify-12m-songs.zip
  inflating: tracks_features.csv     


In [8]:
# Mount Google Drive at /content/drive (the `drive` helper is imported from
# google.colab in the first cell).
drive.mount("/content/drive")

Mounted at /content/drive


In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import subplots
from tqdm import tqdm
import seaborn as sns
import ast

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset


In [10]:
# Never elide columns when a frame is displayed.
pd.set_option('display.max_columns', None)

df = pd.read_csv("/content/tracks_features.csv")

# `artists` is read from the CSV as text; literal_eval turns each value back
# into a Python object, and the first entry (if any) becomes `main_artist`.
df["artists"] = df["artists"].apply(ast.literal_eval)
df["main_artist"] = df["artists"].apply(
    lambda names: names[0] if names else None
)

print(df.columns)
print(df.shape[1])
df.head()

Index(['id', 'name', 'album', 'album_id', 'artists', 'artist_ids',
       'track_number', 'disc_number', 'explicit', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms',
       'time_signature', 'year', 'release_date', 'main_artist'],
      dtype='object')
25


Unnamed: 0,id,name,album,album_id,artists,artist_ids,track_number,disc_number,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,release_date,main_artist
0,7lmeHLHBe4nmXzuXc0HDjk,Testify,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,[Rage Against The Machine],['2d0hyoQ5ynDBnkvAbJKORj'],1,1,False,0.47,0.978,7,-5.399,1,0.0727,0.0261,1.1e-05,0.356,0.503,117.906,210133,4.0,1999,1999-11-02,Rage Against The Machine
1,1wsRitfRRtWyEapl0q22o8,Guerrilla Radio,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,[Rage Against The Machine],['2d0hyoQ5ynDBnkvAbJKORj'],2,1,True,0.599,0.957,11,-5.764,1,0.188,0.0129,7.1e-05,0.155,0.489,103.68,206200,4.0,1999,1999-11-02,Rage Against The Machine
2,1hR0fIFK2qRG3f3RF70pb7,Calm Like a Bomb,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,[Rage Against The Machine],['2d0hyoQ5ynDBnkvAbJKORj'],3,1,False,0.315,0.97,7,-5.424,1,0.483,0.0234,2e-06,0.122,0.37,149.749,298893,4.0,1999,1999-11-02,Rage Against The Machine
3,2lbASgTSoDO7MTuLAXlTW0,Mic Check,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,[Rage Against The Machine],['2d0hyoQ5ynDBnkvAbJKORj'],4,1,True,0.44,0.967,11,-5.83,0,0.237,0.163,4e-06,0.121,0.574,96.752,213640,4.0,1999,1999-11-02,Rage Against The Machine
4,1MQTmpYOZ6fcMQc56Hdo7T,Sleep Now In the Fire,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,[Rage Against The Machine],['2d0hyoQ5ynDBnkvAbJKORj'],5,1,False,0.426,0.929,2,-6.729,1,0.0701,0.00162,0.105,0.0789,0.539,127.059,205600,4.0,1999,1999-11-02,Rage Against The Machine


In [11]:
# Descriptive columns kept alongside the model inputs (not fed to the
# preprocessor).
meta_cols = [
    "name",
    "album",
    "main_artist",
    "track_number",
    "disc_number",
    "release_date",
    "key",
]

# Columns passed through the preprocessing + PCA pipeline.
feature_cols = [
    "explicit",
    "danceability",
    "energy",
    "loudness",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo",
]

In [12]:
# Show the columns that belong to neither the metadata nor the feature list.
_unassigned = ~df.columns.isin(meta_cols + feature_cols)
df.loc[:, _unassigned]

Unnamed: 0,id,album_id,artists,artist_ids,mode,duration_ms,time_signature,year
0,7lmeHLHBe4nmXzuXc0HDjk,2eia0myWFgoHuttJytCxgX,[Rage Against The Machine],['2d0hyoQ5ynDBnkvAbJKORj'],1,210133,4.0,1999
1,1wsRitfRRtWyEapl0q22o8,2eia0myWFgoHuttJytCxgX,[Rage Against The Machine],['2d0hyoQ5ynDBnkvAbJKORj'],1,206200,4.0,1999
2,1hR0fIFK2qRG3f3RF70pb7,2eia0myWFgoHuttJytCxgX,[Rage Against The Machine],['2d0hyoQ5ynDBnkvAbJKORj'],1,298893,4.0,1999
3,2lbASgTSoDO7MTuLAXlTW0,2eia0myWFgoHuttJytCxgX,[Rage Against The Machine],['2d0hyoQ5ynDBnkvAbJKORj'],0,213640,4.0,1999
4,1MQTmpYOZ6fcMQc56Hdo7T,2eia0myWFgoHuttJytCxgX,[Rage Against The Machine],['2d0hyoQ5ynDBnkvAbJKORj'],1,205600,4.0,1999
...,...,...,...,...,...,...,...,...
1204020,0EsMifwUmMfJZxzoMPXJKZ,38O5Ys0W9PFS5K7dMb7yKb,[FVLCRVM],['7AjItKsRnEYRSiBt2OxK1y'],0,276213,4.0,2014
1204021,2WSc2TB1CSJgGE0PEzVeiu,38O5Ys0W9PFS5K7dMb7yKb,[FVLCRVM],['7AjItKsRnEYRSiBt2OxK1y'],0,363179,4.0,2014
1204022,6iProIgUe3ETpO6UT0v5Hg,38O5Ys0W9PFS5K7dMb7yKb,[FVLCRVM],['7AjItKsRnEYRSiBt2OxK1y'],0,385335,4.0,2014
1204023,37B4SXC8uoBsUyKCWnhPfX,38O5Ys0W9PFS5K7dMb7yKb,[FVLCRVM],['7AjItKsRnEYRSiBt2OxK1y'],0,324455,4.0,2014


In [13]:
# Numeric columns are standardised; all other columns are one-hot encoded,
# with two-category columns collapsed to one indicator (drop="if_binary").
numeric_transformer = make_pipeline(StandardScaler())
cat_transformer = make_pipeline(OneHotEncoder(drop="if_binary"))

# Columns are routed to a transformer purely by dtype.
_numeric_selector = make_column_selector(dtype_include=np.number)
_other_selector = make_column_selector(dtype_exclude=np.number)

feature_preprocessor = make_column_transformer(
    (numeric_transformer, _numeric_selector),
    (cat_transformer, _other_selector),
    remainder="drop",
)

class TrackPreprocessor():
  """Clean raw track rows and project their features through a PCA.

  The wrapped `preprocessor` is applied first (fit or transform), then the
  result is passed through `self.pca`. Both steps are fitted by
  `fit_transform` and reused by `transform`.
  """

  def __init__(self, preprocessor):
    """Store the column preprocessor and create the PCA step.

    Args:
      preprocessor: object exposing `fit_transform(df)` and `transform(df)`.
    """
    self.preprocessor = preprocessor
    # Whitening defaults to on; fit_transform/transform update it per call.
    self.pca = PCA(whiten=True)

  def clean_data(self, df):
    """Return a copy of `df` without incomplete or duplicated tracks.

    Rows containing any missing value are dropped, then rows repeating the
    same ("name", "main_artist", "year") combination are dropped (the first
    occurrence is kept).
    """
    deduped = df.dropna().drop_duplicates(subset=["name", "main_artist", "year"])
    return deduped.copy()

  def fit_transform(self, df, whiten=True):
    """Fit the preprocessor and the PCA on `df` and return the projection.

    Args:
      df: frame holding the feature columns.
      whiten: forwarded to the PCA's `whiten` parameter. Previously this
        argument was accepted but ignored (whitening was always on).

    Returns:
      A DataFrame of the projected values with a fresh 0..n-1 index and
      integer column labels (the index of `df` is not carried over).
    """
    self.pca.set_params(whiten=whiten)

    X = self.preprocessor.fit_transform(df)
    X = self.pca.fit_transform(X)

    return pd.DataFrame(X)

  def transform(self, df, whiten=True):
    """Project `df` with the already-fitted preprocessor and PCA.

    Args:
      df: frame holding the same feature columns used for fitting.
      whiten: forwarded to the PCA's `whiten` parameter before transforming.
        NOTE(review): this relies on the PCA reading `whiten` at transform
        time (scikit-learn's PCA does) -- confirm if the PCA class changes.

    Returns:
      A DataFrame of the projected values with a fresh 0..n-1 index.
    """
    self.pca.set_params(whiten=whiten)

    X = self.preprocessor.transform(df)
    X = self.pca.transform(X)

    return pd.DataFrame(X)


# Shared instance that wraps the column transformer defined above; later
# cells call its clean_data / fit_transform methods.
track_preprocessor = TrackPreprocessor(feature_preprocessor)

In [14]:
# Shuffle every row with a fixed seed, then drop incomplete/duplicate tracks.
cleaned_df = (
    df.sample(frac=1, random_state=5330)
      .pipe(track_preprocessor.clean_data)
)

# Split the cleaned rows into model inputs and descriptive metadata.
feature_df = cleaned_df[feature_cols]
meta_df = cleaned_df[meta_cols]

X = track_preprocessor.fit_transform(feature_df, whiten=True)
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.844154,-0.433272,-1.17906,-0.098168,2.060223,1.430463,0.130121,-0.224651,1.220989,-0.371878
1,0.91691,-0.183143,0.112213,-0.54441,-0.342717,-0.150677,1.397204,-0.473737,-0.157092,3.797265
2,-0.054467,0.095909,-0.753939,1.807115,-1.206196,-0.594574,-0.224142,0.042384,1.322532,0.033469
3,-1.822013,-0.764473,0.297403,0.1023,0.624735,0.356276,-0.332814,0.453374,-0.272904,0.204088
4,-1.536298,-0.021844,0.153684,0.084611,-0.872356,-1.601389,-0.048672,-2.542586,0.014689,-0.07596


In [15]:
# Preview the metadata frame; it keeps the shuffled original row labels.
meta_df

Unnamed: 0,name,album,main_artist,track_number,disc_number,release_date,key
728290,Struktur,Triadic Ballet,Metroland,10,1,2015-04-16,10
1110871,Distraction,SweetSexySavage (Deluxe),Kehlani,3,1,2017-01-27,5
211148,Nobody's Fault But Mine,Freedom Highway,The Staple Singers,16,1,1991-07-09,11
246548,Chanson de matin,Music For Weddings,Edward Elgar,13,1,2006-02-17,7
758957,Suite No. 4 in C Major: Ayre,Dancing in the Isles,Musica Pacifica,18,1,2010-10-20,11
...,...,...,...,...,...,...,...
309471,Rednote,Element,Rednote,2,1,2007-08-01,0
1038750,The World's Greatest,Chocolate Milk,ScribbleMonster & His Pals,4,1,2004-01-01,2
1042768,Ush-Ush,Once Upon a Time,Flying Pooh,31,1,2018-12-31,2
255083,Fandango (arr. F. Werle),Fiesta!,Frank Perkins,7,1,2012-03-01,8


In [16]:
# Sanity check: (rows, columns) of the PCA-projected feature frame.
X.shape

(1176724, 10)

In [17]:
# Preview the full cleaned frame (all original columns plus main_artist).
cleaned_df

Unnamed: 0,id,name,album,album_id,artists,artist_ids,track_number,disc_number,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,release_date,main_artist
728290,23sdz79mJ6dqiTGRITxxfm,Struktur,Triadic Ballet,6IbFLSrdaA7EwYYYNzC0ut,[Metroland],['2jkEEDisTNyfxfkLbez8N0'],10,1,False,0.723,0.94400,10,-8.492,0,0.0723,0.00873,0.918000,0.0666,0.6590,119.985,221347,4.0,2015,2015-04-16,Metroland
1110871,2wSQyp6VzUopSFBinRo1iD,Distraction,SweetSexySavage (Deluxe),4B4in9QlrlYWSHlYSRebdC,[Kehlani],['0cGUm45nv7Z6M6qdXYQGTX'],3,1,True,0.660,0.69800,5,-6.266,0,0.0467,0.02660,0.000000,0.2960,0.3960,120.959,235693,4.0,2017,2017-01-27,Kehlani
211148,4XEyQa6jYr5v6DWnfnDtHD,Nobody's Fault But Mine,Freedom Highway,5613XvsC5XhGi3gfNWXpjg,[The Staple Singers],['7xGGqA85UIWX1GoTVM4itC'],16,1,False,0.496,0.45000,11,-15.376,1,0.0404,0.76800,0.011400,0.1210,0.5530,172.407,107533,4.0,1991,1991-07-09,The Staple Singers
246548,5EmETgfPMGjmgAwuOlexGE,Chanson de matin,Music For Weddings,0hBG1EopaqfIqRvZHhcas7,"[Edward Elgar, Britten Sinfonia, Nicholas Cleo...","['430byzy0c5bPn5opiu0SRd', '3P1VtkpIYbw6YoTo0K...",13,1,False,0.189,0.14800,7,-20.678,1,0.0439,0.92700,0.941000,0.1620,0.0617,102.788,176093,4.0,2006,2006-02-17,Edward Elgar
758957,4SzITysmYDDBrGPkOowA4j,Suite No. 4 in C Major: Ayre,Dancing in the Isles,5NXCOyUueIw4LG1gXqjQaH,[Musica Pacifica],['01nVVeYv7zavrY48wDhGFz'],18,1,False,0.231,0.00763,11,-27.002,1,0.0443,0.58200,0.000001,0.0917,0.0764,113.106,273253,4.0,2010,2010-10-20,Musica Pacifica
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
309471,4NLhJXZI1tB0Xn0mPTdpaY,Rednote,Element,5VegmyqeEJftvko5cnZIhU,[Rednote],['5PhWRVt9ESEY6iD9Knp5G8'],2,1,False,0.741,0.28800,0,-13.791,1,0.8070,0.42700,0.000000,0.5760,0.9730,84.015,24187,3.0,2007,2007-08-01,Rednote
1038750,6c9oFAknaKcg3zigBrcdDK,The World's Greatest,Chocolate Milk,393RILZ2vZ8WThITnyocQi,[ScribbleMonster & His Pals],['4VbUTOWdbcIHRaPz7msunL'],4,1,False,0.459,0.93300,2,-3.066,1,0.1020,0.05450,0.472000,0.0944,0.5630,164.879,172267,4.0,2004,2004-01-01,ScribbleMonster & His Pals
1042768,5bA5zyJ27sDuUvhNXvVnzS,Ush-Ush,Once Upon a Time,3USopaTKXQrZ7Wb5OfuNRD,[Flying Pooh],['44rRG7sFbHGo7iI0jfeQaU'],31,1,False,0.528,0.76800,2,-8.679,1,0.0931,0.23200,0.000128,0.3820,0.7460,142.603,254133,4.0,2018,2018-12-31,Flying Pooh
255083,04TBngtNl63armsX5KpVsI,Fandango (arr. F. Werle),Fiesta!,7tseAK7YvqPaeIBkHPVPOf,"[Frank Perkins, Floyd Werle, Dallas Wind Symph...","['1RJSiJV6YS5nwyHYDy5e6r', '2MFgiUlTTMRdLpqzhV...",7,1,False,0.223,0.08000,8,-24.602,1,0.0406,0.70400,0.680000,0.2860,0.0524,147.457,238493,3.0,2012,2012-03-01,Frank Perkins


In [18]:
# Absolute path under the Drive mount point used by drive.mount(), so the
# writes below do not depend on the notebook's current working directory.
data_path = "/content/drive/MyDrive/Education/STAT 5330 Project/data"

feature_fp = "phase_1/encoder_training_data.csv"
meta_fp = "general/track_metadata.csv"
full_fp = "general/tracks_features.csv"

feature_path = os.path.join(data_path, feature_fp)
meta_path = os.path.join(data_path, meta_fp)
full_path = os.path.join(data_path, full_fp)

# to_csv does not create missing folders; make sure each target exists.
for _out_path in (feature_path, meta_path, full_path):
    os.makedirs(os.path.dirname(_out_path), exist_ok=True)

# All three files are written without the index, so rows correspond by
# position (X and meta_df are both derived from cleaned_df in the same order).
X.to_csv(feature_path, index=False)
meta_df.to_csv(meta_path, index=False)
cleaned_df.to_csv(full_path, index=False)