In [49]:
#importing the libraries and reading the data
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from scipy import stats


In [40]:
#reading the data
df = pd.read_csv('spotify_Song_Dataset/dataset.csv')

#removing the row with null values
df = df.dropna()

#dropping duration and explicit columns 
df = df.drop(['duration_ms','explicit','mode','liveness','loudness','time_signature','key'],axis=1)

df.rename(columns={'Unnamed: 0': 'song_id'}, inplace=True)


df.head(5)


Unnamed: 0,song_id,track_id,artists,album_name,track_name,popularity,danceability,energy,speechiness,acousticness,instrumentalness,valence,tempo,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,0.676,0.461,0.143,0.0322,1e-06,0.715,87.917,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,0.42,0.166,0.0763,0.924,6e-06,0.267,77.489,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,0.438,0.359,0.0557,0.21,0.0,0.12,76.332,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,0.266,0.0596,0.0363,0.905,7.1e-05,0.143,181.74,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,0.618,0.443,0.0526,0.469,0.0,0.167,119.949,acoustic


In [48]:
# Scaling the audio features and popularity
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[['popularity', 'danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'valence', 'tempo']])

# Convert the scaled features back to a DataFrame
df_scaled = pd.DataFrame(scaled_features, columns=['popularity', 'danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'valence', 'tempo'])

# Add the original columns back to the DataFrame
df_scaled['song_id'] = df['song_id']
df_scaled['track_id'] = df['track_id']
df_scaled['artist_name'] = df['artists']
df_scaled['track_name'] = df['track_name']
df_scaled['album_name'] = df['album_name']
df_scaled['track_genre'] = df['track_genre']

# Encoding the track_genre
label_encoder = LabelEncoder()
df_scaled['track_genre_encoded'] = label_encoder.fit_transform(df['track_genre'])

# Handling outliers
z_scores = stats.zscore(df_scaled[['popularity', 'danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'valence', 'tempo']])
outliers = (z_scores > 3) | (z_scores < -3)
df_scaled = df_scaled[(~outliers).all(axis=1)]

# Handling missing data
df_scaled = df_scaled.dropna() 

# Re-scaling the audio features and popularity after handling outliers
scaled_features = scaler.fit_transform(df_scaled[['popularity', 'danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'valence', 'tempo']])
df_scaled[['popularity', 'danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'valence', 'tempo']] = scaled_features

# Display the final processed DataFrame
df_scaled.head(5)


KeyError: 'song_id'

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Assuming your DataFrame is named df

# Scaling the audio features and popularity
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[['popularity', 'danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'valence', 'tempo']])

# Convert the scaled features back to a DataFrame
df_scaled = pd.DataFrame(scaled_features, columns=['popularity', 'danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'valence', 'tempo'])

# Add the original columns back to the DataFrame
df_scaled['song_id'] = df['song_id']
df_scaled['track_id'] = df['track_id']
df_scaled['artist_name'] = df['artists']
df_scaled['track_name'] = df['track_name']
df_scaled['album_name'] = df['album_name']
df_scaled['track_genre'] = df['track_genre']

# encoding the track_genre
label_encoder = LabelEncoder()
df['track_genre_encoded'] = label_encoder.fit_transform(df['track_genre'])

# Add the encoded column to the DataFrame
df_scaled['track_genre_encoded'] = df['track_genre_encoded']

df_scaled.head(5)


Unnamed: 0,popularity,danceability,energy,speechiness,acousticness,instrumentalness,valence,tempo,song_id,track_id,artist_name,track_name,album_name,track_genre,track_genre_encoded
0,1.782624,0.629239,-0.717147,0.551843,-0.850193,-0.504111,0.929315,-1.141854,0.0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,acoustic,0.0
1,0.975625,-0.845908,-1.889974,-0.078995,1.831744,-0.504097,-0.798681,-1.489708,1.0,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost - Acoustic,Ghost (Acoustic),acoustic,0.0
2,1.065291,-0.742187,-1.122667,-0.273827,-0.315489,-0.504115,-1.365679,-1.528303,2.0,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,acoustic,0.0
3,1.692957,-1.733301,-2.312987,-0.457309,1.774605,-0.503886,-1.276965,1.987857,3.0,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Can't Help Falling In Love,Crazy Rich Asians (Original Motion Picture Sou...,acoustic,0.0
4,2.186123,0.295026,-0.788709,-0.303146,0.463409,-0.504115,-1.184394,-0.073343,4.0,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,acoustic,0.0
