In [1]:
import pandas as pd
import numpy as np

# Read the raw track-features CSV (path is relative to the notebook) into `df`
df = pd.read_csv(filepath_or_buffer='../data/SpotifyFeatures.csv')

# Display the first five rows as a quick sanity check of the load
df.head(n=5)


Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Movie,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,0.611,0.389,99373,0.91,0.0,C#,0.346,-1.828,Major,0.0525,166.969,4/4,0.814
1,Movie,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,0.246,0.59,137373,0.737,0.0,F#,0.151,-5.559,Minor,0.0868,174.003,4/4,0.816
2,Movie,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3,0.952,0.663,170267,0.131,0.0,C,0.103,-13.879,Minor,0.0362,99.488,5/4,0.368
3,Movie,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0,0.703,0.24,152427,0.326,0.0,C#,0.0985,-12.178,Major,0.0395,171.758,4/4,0.227
4,Movie,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,4,0.95,0.331,82625,0.225,0.123,F,0.202,-21.15,Major,0.0456,140.576,4/4,0.39


In [2]:
# Structural overview of `df`: dimensions, column names, then the per-column
# non-null counts and dtypes reported by DataFrame.info()
print("Shape:", df.shape)
print("\nColumns:", list(df.columns))
df.info()


Shape: (232725, 18)

Columns: ['genre', 'artist_name', 'track_name', 'track_id', 'popularity', 'acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'speechiness', 'tempo', 'time_signature', 'valence']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232725 entries, 0 to 232724
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   genre             232725 non-null  object 
 1   artist_name       232725 non-null  object 
 2   track_name        232724 non-null  object 
 3   track_id          232725 non-null  object 
 4   popularity        232725 non-null  int64  
 5   acousticness      232725 non-null  float64
 6   danceability      232725 non-null  float64
 7   duration_ms       232725 non-null  int64  
 8   energy            232725 non-null  float64
 9   instrumentalness  232725 non-null  float64
 10  key               232725 non-null  object 
 

In [3]:
# Per-column count of missing entries (isna is the same check as isnull)
print("Missing values:\n", df.isna().sum())

# Number of rows that exactly repeat an earlier row across all columns
print("Number of duplicate rows:", df.duplicated(keep='first').sum())


Missing values:
 genre               0
artist_name         0
track_name          1
track_id            0
popularity          0
acousticness        0
danceability        0
duration_ms         0
energy              0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
speechiness         0
tempo               0
time_signature      0
valence             0
dtype: int64
Number of duplicate rows: 0


In [4]:
# The ten genres with the most rows (value_counts sorts by descending count)
print("Top 10 genres:")
print(df['genre'].value_counts().iloc[:10])

# The ten artist names with the most rows
print("\nTop 10 artists:")
print(df['artist_name'].value_counts().iloc[:10])


Top 10 genres:
genre
Comedy              9681
Soundtrack          9646
Indie               9543
Jazz                9441
Pop                 9386
Electronic          9377
Children’s Music    9353
Folk                9299
Hip-Hop             9295
Rock                9272
Name: count, dtype: int64

Top 10 artists:
artist_name
Giuseppe Verdi             1394
Giacomo Puccini            1137
Kimbo Children's Music      971
Nobuo Uematsu               825
Richard Wagner              804
Wolfgang Amadeus Mozart     800
Randy Newman                757
Georges Bizet               701
Juice Music                 684
Johann Sebastian Bach       632
Name: count, dtype: int64


In [5]:
# Create a combined text field for SBERT.
#
# Missing values are replaced with an empty string first: track_name has one
# null, and formatting it directly would put the literal word "nan" into the text.
text_parts = df[['artist_name', 'track_name', 'genre']].fillna('').astype(str)

# Vectorised concatenation instead of a row-wise df.apply(axis=1); the result is
# the same "Artist: ..., Track: ..., Genre: ..." string for every non-null row.
combined_raw = (
    'Artist: ' + text_parts['artist_name']
    + ', Track: ' + text_parts['track_name']
    + ', Genre: ' + text_parts['genre']
)

# Lowercase, then keep only a-z, digits, commas, colons and whitespace.
# The pattern is a raw string so that `\s` reaches the regex engine without
# Python flagging it as an invalid escape sequence.
# NOTE(review): this also drops accented letters (e.g. "fées" -> "fes") and
# leaves a double space where "&" was removed — confirm that is acceptable
# for the embedding step.
df['combined_text'] = combined_raw.str.lower().str.replace(
    r'[^a-z0-9,:\s]', '', regex=True
)
df['combined_text'].head()


  df['combined_text'] = df['combined_text'].str.lower().str.replace('[^a-z0-9,:\s]', '', regex=True)


0    artist: henri salvador, track: cest beau de fa...
1    artist: martin  les fes, track: perdu davance ...
2    artist: joseph williams, track: dont let me be...
3    artist: henri salvador, track: dismoi monsieur...
4    artist: fabien nataf, track: ouverture, genre:...
Name: combined_text, dtype: object

In [6]:
# Write the DataFrame (including the new combined_text column) to a new CSV;
# index=False keeps the row index out of the file.
df.to_csv(path_or_buf='../data/SpotifyFeatures_cleaned.csv', index=False)
print("Cleaned data saved!")


Cleaned data saved!
