In [4]:
# Import dependencies
import pandas as pd
import numpy as np
import sklearn
import category_encoders as ce
from sklearn.preprocessing import MinMaxScaler

In [5]:
# Read the raw track dataset from disk
df = pd.read_csv("../Resources/dataset.csv")

# Keep only the named track columns, which leaves out the duplicate
# index column stored in the csv
df = df.loc[:, [
    'artists', 'track_id', 'album_name', 'track_name', 'popularity',
    'duration_ms', 'explicit', 'danceability', 'energy', 'key',
    'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
    'liveness', 'valence', 'tempo', 'time_signature', 'track_genre',
]]
df.head()

Unnamed: 0,artists,track_id,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,Gen Hoshino,5SuOikwiRyPMVoIQDJUgSV,Comedy,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,Ben Woodward,4qPNDBW1i3p13qLCt0Ki3A,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,Ingrid Michaelson;ZAYN,1iJBSr7s7jYXzM8EGcbK5b,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,Kina Grannis,6lfxq3CG4xtTiEg7opyCyx,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,Chord Overstreet,5vjLSffimiIP26QG5WcN2K,Hold On,Hold On,82,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [6]:
# Pull the genre labels out as their own Series
track_genre_column = df.loc[:, 'track_genre']

# Tokenise each label on whitespace, flatten to one token per row,
# and collect the distinct tokens
unique_words = pd.unique(track_genre_column.str.split().explode())

print(unique_words)

['acoustic' 'afrobeat' 'alt-rock' 'alternative' 'ambient' 'anime'
 'black-metal' 'bluegrass' 'blues' 'brazil' 'breakbeat' 'british'
 'cantopop' 'chicago-house' 'children' 'chill' 'classical' 'club' 'comedy'
 'country' 'dance' 'dancehall' 'death-metal' 'deep-house' 'detroit-techno'
 'disco' 'disney' 'drum-and-bass' 'dub' 'dubstep' 'edm' 'electro'
 'electronic' 'emo' 'folk' 'forro' 'french' 'funk' 'garage' 'german'
 'gospel' 'goth' 'grindcore' 'groove' 'grunge' 'guitar' 'happy'
 'hard-rock' 'hardcore' 'hardstyle' 'heavy-metal' 'hip-hop' 'honky-tonk'
 'house' 'idm' 'indian' 'indie-pop' 'indie' 'industrial' 'iranian'
 'j-dance' 'j-idol' 'j-pop' 'j-rock' 'jazz' 'k-pop' 'kids' 'latin'
 'latino' 'malay' 'mandopop' 'metal' 'metalcore' 'minimal-techno' 'mpb'
 'new-age' 'opera' 'pagode' 'party' 'piano' 'pop-film' 'pop' 'power-pop'
 'progressive-house' 'psych-rock' 'punk-rock' 'punk' 'r-n-b' 'reggae'
 'reggaeton' 'rock-n-roll' 'rock' 'rockabilly' 'romance' 'sad' 'salsa'
 'samba' 'sertanejo' 'show

In [7]:
# Hard-coded list of the genre labels to keep (grouped by first letter)
unique_words = [
    'acoustic', 'afrobeat', 'alt-rock', 'alternative', 'ambient', 'anime',
    'black-metal', 'bluegrass', 'blues', 'brazil', 'breakbeat', 'british',
    'cantopop', 'chicago-house', 'children', 'chill', 'classical', 'club', 'comedy', 'country',
    'dance', 'dancehall', 'death-metal', 'deep-house', 'detroit-techno', 'disco', 'disney',
    'drum-and-bass', 'dub', 'dubstep',
    'edm', 'electro', 'electronic', 'emo',
    'folk', 'forro', 'french', 'funk',
    'garage', 'german', 'gospel', 'goth', 'grindcore', 'groove', 'grunge', 'guitar',
    'happy', 'hard-rock', 'hardcore', 'hardstyle', 'heavy-metal', 'hip-hop', 'honky-tonk', 'house',
    'idm', 'indian', 'indie-pop', 'indie', 'industrial', 'iranian',
    'j-dance', 'j-idol', 'j-pop', 'j-rock', 'jazz',
    'k-pop', 'kids',
    'latin', 'latino',
    'malay', 'mandopop', 'metal', 'metalcore', 'minimal-techno', 'mpb',
    'new-age',
    'opera',
    'pagode', 'party', 'piano', 'pop-film', 'pop', 'power-pop', 'progressive-house',
    'psych-rock', 'punk-rock', 'punk',
    'r-n-b', 'reggae', 'reggaeton', 'rock-n-roll', 'rock', 'rockabilly', 'romance',
    'sad', 'salsa', 'samba', 'sertanejo', 'show-tunes', 'singer-songwriter', 'ska', 'sleep',
    'songwriter', 'soul', 'spanish', 'study', 'swedish', 'synth-pop',
    'tango', 'techno', 'trance', 'trip-hop', 'turkish',
    'world-music',
]

# Keep only the rows whose genre label appears in the list above
filtered_genre = track_genre_column.loc[track_genre_column.isin(unique_words)]

# Tally how many rows carry each label
word_counts = filtered_genre.value_counts()

print(word_counts)

acoustic             1000
punk-rock            1000
progressive-house    1000
power-pop            1000
pop                  1000
                     ... 
folk                 1000
emo                  1000
electronic           1000
electro              1000
world-music          1000
Name: track_genre, Length: 114, dtype: int64


In [8]:
# Read the dataset again and, in the same expression, keep only the named
# track columns (this leaves out the csv's duplicate index column)
df = pd.read_csv("../Resources/dataset.csv")[[
    'artists', 'track_id', 'album_name', 'track_name', 'popularity',
    'duration_ms', 'explicit', 'danceability', 'energy', 'key',
    'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
    'liveness', 'valence', 'tempo', 'time_signature', 'track_genre',
]]
df.head()

Unnamed: 0,artists,track_id,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,Gen Hoshino,5SuOikwiRyPMVoIQDJUgSV,Comedy,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,Ben Woodward,4qPNDBW1i3p13qLCt0Ki3A,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,Ingrid Michaelson;ZAYN,1iJBSr7s7jYXzM8EGcbK5b,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,Kina Grannis,6lfxq3CG4xtTiEg7opyCyx,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,Chord Overstreet,5vjLSffimiIP26QG5WcN2K,Hold On,Hold On,82,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [9]:
# Discard exact duplicate rows, then renumber the index from 0
df = df.drop_duplicates().reset_index(drop=True)
df.shape

(113550, 20)

In [10]:
# Drop tracks whose duration is exactly 0 ms and renumber the rows
df = df.loc[df['duration_ms'].ne(0)].reset_index(drop=True)

# Express the duration in minutes (60000 ms per minute)
df['duration_min'] = df['duration_ms'].div(60000)

# Bin duration
# Bin labels: 1 = under 2 min, 2..5 = [n, n+1) min, 6 = 6 min or longer
duration_bins = list(range(1, 7))
minutes = df["duration_min"]
# One boolean mask per label, in the same order as duration_bins
duration_conditions = (
    [minutes < 2]
    + [(minutes >= low) & (minutes < low + 1) for low in range(2, 6)]
    + [minutes >= 6]
)
# np.select assigns each row the label of the first mask that matches it
df["duration_binned"] = np.select(duration_conditions, duration_bins)

# Confirm binning
df['duration_binned'].value_counts()

3    42269
2    25955
4    22836
5     8574
6     7737
1     6178
Name: duration_binned, dtype: int64

In [11]:
# The first name in the ';'-separated artist string is kept as the primary artist
df["primary_artist"] = df['artists'].str.split(";", n=1).str.get(0)
# Count how many ';'-separated names each track lists
# (the value is passed through str() first, so non-string entries count as 1)
df["number_artists"] = [len(str(names).split(";")) for names in df['artists']]
df.head()

Unnamed: 0,artists,track_id,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,...,instrumentalness,liveness,valence,tempo,time_signature,track_genre,duration_min,duration_binned,primary_artist,number_artists
0,Gen Hoshino,5SuOikwiRyPMVoIQDJUgSV,Comedy,Comedy,73,230666,False,0.676,0.461,1,...,1e-06,0.358,0.715,87.917,4,acoustic,3.844433,3,Gen Hoshino,1
1,Ben Woodward,4qPNDBW1i3p13qLCt0Ki3A,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,1,...,6e-06,0.101,0.267,77.489,4,acoustic,2.4935,2,Ben Woodward,1
2,Ingrid Michaelson;ZAYN,1iJBSr7s7jYXzM8EGcbK5b,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,...,0.0,0.117,0.12,76.332,4,acoustic,3.513767,3,Ingrid Michaelson,2
3,Kina Grannis,6lfxq3CG4xtTiEg7opyCyx,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,...,7.1e-05,0.132,0.143,181.74,3,acoustic,3.36555,3,Kina Grannis,1
4,Chord Overstreet,5vjLSffimiIP26QG5WcN2K,Hold On,Hold On,82,198853,False,0.618,0.443,2,...,0.0,0.0829,0.167,119.949,4,acoustic,3.314217,3,Chord Overstreet,1


In [12]:
# Print the column labels of df to check the columns added so far
print(df.columns)

Index(['artists', 'track_id', 'album_name', 'track_name', 'popularity',
       'duration_ms', 'explicit', 'danceability', 'energy', 'key', 'loudness',
       'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'time_signature', 'track_genre', 'duration_min',
       'duration_binned', 'primary_artist', 'number_artists'],
      dtype='object')


In [13]:
# Columns to leave out of the working frame
columns_to_drop = ["artists", "track_id", "album_name"]
# drop() returns a new frame; df itself is left unchanged
df_drop = df.drop(labels=columns_to_drop, axis="columns")

# Preview the reduced frame
df_drop.head()

Unnamed: 0,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,...,instrumentalness,liveness,valence,tempo,time_signature,track_genre,duration_min,duration_binned,primary_artist,number_artists
0,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,...,1e-06,0.358,0.715,87.917,4,acoustic,3.844433,3,Gen Hoshino,1
1,Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,...,6e-06,0.101,0.267,77.489,4,acoustic,2.4935,2,Ben Woodward,1
2,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,...,0.0,0.117,0.12,76.332,4,acoustic,3.513767,3,Ingrid Michaelson,2
3,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,...,7.1e-05,0.132,0.143,181.74,3,acoustic,3.36555,3,Kina Grannis,1
4,Hold On,82,198853,False,0.618,0.443,2,-9.681,1,0.0526,...,0.0,0.0829,0.167,119.949,4,acoustic,3.314217,3,Chord Overstreet,1


In [14]:
# Print the column labels of df_drop to confirm which columns remain
print(df_drop.columns)

Index(['track_name', 'popularity', 'duration_ms', 'explicit', 'danceability',
       'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature',
       'track_genre', 'duration_min', 'duration_binned', 'primary_artist',
       'number_artists'],
      dtype='object')


In [15]:
# Show every column when a DataFrame is displayed
pd.set_option('display.max_columns', None)

# One-hot encode the categorical columns: time_signature, key and binned duration
encoded_df = pd.get_dummies(
    data=df_drop,
    columns=['time_signature', 'key', 'duration_binned'],
)
encoded_df

Unnamed: 0,track_name,popularity,duration_ms,explicit,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,track_genre,duration_min,primary_artist,number_artists,time_signature_0,time_signature_1,time_signature_3,time_signature_4,time_signature_5,key_0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11,duration_binned_1,duration_binned_2,duration_binned_3,duration_binned_4,duration_binned_5,duration_binned_6
0,Comedy,73,230666,False,0.676,0.4610,-6.746,0,0.1430,0.0322,0.000001,0.3580,0.7150,87.917,acoustic,3.844433,Gen Hoshino,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,Ghost - Acoustic,55,149610,False,0.420,0.1660,-17.235,1,0.0763,0.9240,0.000006,0.1010,0.2670,77.489,acoustic,2.493500,Ben Woodward,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,To Begin Again,57,210826,False,0.438,0.3590,-9.734,1,0.0557,0.2100,0.000000,0.1170,0.1200,76.332,acoustic,3.513767,Ingrid Michaelson,2,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,Can't Help Falling In Love,71,201933,False,0.266,0.0596,-18.515,1,0.0363,0.9050,0.000071,0.1320,0.1430,181.740,acoustic,3.365550,Kina Grannis,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,Hold On,82,198853,False,0.618,0.4430,-9.681,1,0.0526,0.4690,0.000000,0.0829,0.1670,119.949,acoustic,3.314217,Chord Overstreet,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113544,Sleep My Little Boy,21,384999,False,0.172,0.2350,-16.393,1,0.0422,0.6400,0.928000,0.0863,0.0339,125.995,world-music,6.416650,Rainy Lullaby,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
113545,Water Into Light,22,385000,False,0.174,0.1170,-18.318,0,0.0401,0.9940,0.976000,0.1050,0.0350,85.239,world-music,6.416667,Rainy Lullaby,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
113546,Miss Perfumado,22,271466,False,0.629,0.3290,-10.895,0,0.0420,0.8670,0.000000,0.0839,0.7430,132.378,world-music,4.524433,Cesária Evora,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
113547,Friends,41,283893,False,0.587,0.5060,-10.889,1,0.0297,0.3810,0.000000,0.2700,0.4130,135.960,world-music,4.731550,Michael W. Smith,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0


In [16]:
# Columns holding flags: 'explicit' plus every one-hot indicator column
binary_columns = [
    'explicit',
    'time_signature_0', 'time_signature_1', 'time_signature_3',
    'time_signature_4', 'time_signature_5',
    'key_0', 'key_1', 'key_2', 'key_3', 'key_4', 'key_5',
    'key_6', 'key_7', 'key_8', 'key_9', 'key_10', 'key_11',
    'duration_binned_1', 'duration_binned_2', 'duration_binned_3',
    'duration_binned_4', 'duration_binned_5', 'duration_binned_6',
]

# Convert T/F columns to 1/0.
# The right-hand side keeps encoded_df's own index: the assignment aligns rows
# by index label, so renumbering the converted frame first would pair values
# with the wrong rows whenever encoded_df does not have a default 0..n-1 index.
encoded_df[binary_columns] = encoded_df[binary_columns].astype(int)
encoded_df.shape

(113549, 41)

In [17]:
# Display encoded_df to inspect the result of the 1/0 conversion
encoded_df

Unnamed: 0,track_name,popularity,duration_ms,explicit,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,track_genre,duration_min,primary_artist,number_artists,time_signature_0,time_signature_1,time_signature_3,time_signature_4,time_signature_5,key_0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11,duration_binned_1,duration_binned_2,duration_binned_3,duration_binned_4,duration_binned_5,duration_binned_6
0,Comedy,73,230666,0,0.676,0.4610,-6.746,0,0.1430,0.0322,0.000001,0.3580,0.7150,87.917,acoustic,3.844433,Gen Hoshino,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,Ghost - Acoustic,55,149610,0,0.420,0.1660,-17.235,1,0.0763,0.9240,0.000006,0.1010,0.2670,77.489,acoustic,2.493500,Ben Woodward,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,To Begin Again,57,210826,0,0.438,0.3590,-9.734,1,0.0557,0.2100,0.000000,0.1170,0.1200,76.332,acoustic,3.513767,Ingrid Michaelson,2,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,Can't Help Falling In Love,71,201933,0,0.266,0.0596,-18.515,1,0.0363,0.9050,0.000071,0.1320,0.1430,181.740,acoustic,3.365550,Kina Grannis,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,Hold On,82,198853,0,0.618,0.4430,-9.681,1,0.0526,0.4690,0.000000,0.0829,0.1670,119.949,acoustic,3.314217,Chord Overstreet,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113544,Sleep My Little Boy,21,384999,0,0.172,0.2350,-16.393,1,0.0422,0.6400,0.928000,0.0863,0.0339,125.995,world-music,6.416650,Rainy Lullaby,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
113545,Water Into Light,22,385000,0,0.174,0.1170,-18.318,0,0.0401,0.9940,0.976000,0.1050,0.0350,85.239,world-music,6.416667,Rainy Lullaby,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
113546,Miss Perfumado,22,271466,0,0.629,0.3290,-10.895,0,0.0420,0.8670,0.000000,0.0839,0.7430,132.378,world-music,4.524433,Cesária Evora,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
113547,Friends,41,283893,0,0.587,0.5060,-10.889,1,0.0297,0.3810,0.000000,0.2700,0.4130,135.960,world-music,4.731550,Michael W. Smith,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0


In [18]:
# Track name, primary artist and the two unbinned duration columns are left out here
columns_to_drop = ["track_name", "duration_ms", "duration_min", "primary_artist"]
# drop() returns a new frame; encoded_df itself is left unchanged
df_drop_durations = encoded_df.drop(labels=columns_to_drop, axis="columns")

# Display the reduced frame
df_drop_durations

Unnamed: 0,popularity,explicit,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,track_genre,number_artists,time_signature_0,time_signature_1,time_signature_3,time_signature_4,time_signature_5,key_0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11,duration_binned_1,duration_binned_2,duration_binned_3,duration_binned_4,duration_binned_5,duration_binned_6
0,73,0,0.676,0.4610,-6.746,0,0.1430,0.0322,0.000001,0.3580,0.7150,87.917,acoustic,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,55,0,0.420,0.1660,-17.235,1,0.0763,0.9240,0.000006,0.1010,0.2670,77.489,acoustic,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,57,0,0.438,0.3590,-9.734,1,0.0557,0.2100,0.000000,0.1170,0.1200,76.332,acoustic,2,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,71,0,0.266,0.0596,-18.515,1,0.0363,0.9050,0.000071,0.1320,0.1430,181.740,acoustic,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,82,0,0.618,0.4430,-9.681,1,0.0526,0.4690,0.000000,0.0829,0.1670,119.949,acoustic,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113544,21,0,0.172,0.2350,-16.393,1,0.0422,0.6400,0.928000,0.0863,0.0339,125.995,world-music,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
113545,22,0,0.174,0.1170,-18.318,0,0.0401,0.9940,0.976000,0.1050,0.0350,85.239,world-music,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
113546,22,0,0.629,0.3290,-10.895,0,0.0420,0.8670,0.000000,0.0839,0.7430,132.378,world-music,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
113547,41,0,0.587,0.5060,-10.889,1,0.0297,0.3810,0.000000,0.2700,0.4130,135.960,world-music,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0


In [19]:
# df_drop_durations.to_csv('test_dataset.csv', index=False)

In [20]:
# Collect the remaining column names into a plain list and print it
column_list = list(df_drop_durations.columns)
print(column_list)

['popularity', 'explicit', 'danceability', 'energy', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'track_genre', 'number_artists', 'time_signature_0', 'time_signature_1', 'time_signature_3', 'time_signature_4', 'time_signature_5', 'key_0', 'key_1', 'key_2', 'key_3', 'key_4', 'key_5', 'key_6', 'key_7', 'key_8', 'key_9', 'key_10', 'key_11', 'duration_binned_1', 'duration_binned_2', 'duration_binned_3', 'duration_binned_4', 'duration_binned_5', 'duration_binned_6']


In [21]:
# Create a scaler object that maps values into the range 0..1
# (the same object is re-fitted for each column scaled in the cells below)
scaler = MinMaxScaler(feature_range=(0, 1))

In [24]:
# Scale 'popularity'
# Double brackets select a one-column frame, giving the 2-D (rows, 1) array
# the scaler is fitted on
popularity_values = df_drop_durations[['popularity']].to_numpy()
scaled_popularity = scaler.fit(popularity_values).transform(popularity_values)
# Overwrite the raw column with its scaled values
df_drop_durations['popularity'] = scaled_popularity

In [26]:
# Scale 'loudness'
# Double brackets select a one-column frame, giving the 2-D (rows, 1) array
# the scaler is fitted on
loudness_values = df_drop_durations[['loudness']].to_numpy()
scaled_loudness = scaler.fit(loudness_values).transform(loudness_values)
# Overwrite the raw column with its scaled values
df_drop_durations['loudness'] = scaled_loudness

In [27]:
# Scale 'tempo'
# Double brackets select a one-column frame, giving the 2-D (rows, 1) array
# the scaler is fitted on
tempo_values = df_drop_durations[['tempo']].to_numpy()
scaled_tempo = scaler.fit(tempo_values).transform(tempo_values)
# Overwrite the raw column with its scaled values
df_drop_durations['tempo'] = scaled_tempo

In [28]:
# Update the 'popularity' column in the DataFrame with the scaled values
# NOTE(review): the cell that computes scaled_popularity already writes it into
# this column, so this assignment appears to repeat that step - confirm and
# consider removing it.
df_drop_durations['popularity'] = scaled_popularity
# Display the frame after scaling
df_drop_durations

Unnamed: 0,popularity,explicit,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,track_genre,number_artists,time_signature_0,time_signature_1,time_signature_3,time_signature_4,time_signature_5,key_0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11,duration_binned_1,duration_binned_2,duration_binned_3,duration_binned_4,duration_binned_5,duration_binned_6
0,0.73,0,0.676,0.4610,0.791392,0,0.1430,0.0322,0.000001,0.3580,0.7150,0.361245,acoustic,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,0.55,0,0.420,0.1660,0.597377,1,0.0763,0.9240,0.000006,0.1010,0.2670,0.318397,acoustic,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0.57,0,0.438,0.3590,0.736123,1,0.0557,0.2100,0.000000,0.1170,0.1200,0.313643,acoustic,2,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,0.71,0,0.266,0.0596,0.573701,1,0.0363,0.9050,0.000071,0.1320,0.1430,0.746758,acoustic,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,0.82,0,0.618,0.4430,0.737103,1,0.0526,0.4690,0.000000,0.0829,0.1670,0.492863,acoustic,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113544,0.21,0,0.172,0.2350,0.612952,1,0.0422,0.6400,0.928000,0.0863,0.0339,0.517705,world-music,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
113545,0.22,0,0.174,0.1170,0.577345,0,0.0401,0.9940,0.976000,0.1050,0.0350,0.350242,world-music,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
113546,0.22,0,0.629,0.3290,0.714648,0,0.0420,0.8670,0.000000,0.0839,0.7430,0.543933,world-music,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
113547,0.41,0,0.587,0.5060,0.714759,1,0.0297,0.3810,0.000000,0.2700,0.4130,0.558651,world-music,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0


In [29]:
# Re-list the column names after scaling and print them
column_list = list(df_drop_durations.columns)
print(column_list)

['popularity', 'explicit', 'danceability', 'energy', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'track_genre', 'number_artists', 'time_signature_0', 'time_signature_1', 'time_signature_3', 'time_signature_4', 'time_signature_5', 'key_0', 'key_1', 'key_2', 'key_3', 'key_4', 'key_5', 'key_6', 'key_7', 'key_8', 'key_9', 'key_10', 'key_11', 'duration_binned_1', 'duration_binned_2', 'duration_binned_3', 'duration_binned_4', 'duration_binned_5', 'duration_binned_6']


In [216]:
# df_drop_genres.to_csv('output.csv', index=False)

In [30]:
# Collect the distinct values of the 'track_genre' column
unique_genres = pd.unique(df_drop_durations['track_genre'])

# Print them
print(unique_genres)

['acoustic' 'afrobeat' 'alt-rock' 'alternative' 'ambient' 'anime'
 'black-metal' 'bluegrass' 'blues' 'brazil' 'breakbeat' 'british'
 'cantopop' 'chicago-house' 'children' 'chill' 'classical' 'club' 'comedy'
 'country' 'dance' 'dancehall' 'death-metal' 'deep-house' 'detroit-techno'
 'disco' 'disney' 'drum-and-bass' 'dub' 'dubstep' 'edm' 'electro'
 'electronic' 'emo' 'folk' 'forro' 'french' 'funk' 'garage' 'german'
 'gospel' 'goth' 'grindcore' 'groove' 'grunge' 'guitar' 'happy'
 'hard-rock' 'hardcore' 'hardstyle' 'heavy-metal' 'hip-hop' 'honky-tonk'
 'house' 'idm' 'indian' 'indie-pop' 'indie' 'industrial' 'iranian'
 'j-dance' 'j-idol' 'j-pop' 'j-rock' 'jazz' 'k-pop' 'kids' 'latin'
 'latino' 'malay' 'mandopop' 'metal' 'metalcore' 'minimal-techno' 'mpb'
 'new-age' 'opera' 'pagode' 'party' 'piano' 'pop-film' 'pop' 'power-pop'
 'progressive-house' 'psych-rock' 'punk-rock' 'punk' 'r-n-b' 'reggae'
 'reggaeton' 'rock-n-roll' 'rock' 'rockabilly' 'romance' 'sad' 'salsa'
 'samba' 'sertanejo' 'show

In [33]:
# Count how many distinct genres appear (missing values are not counted)
unique_genre_count = df_drop_durations['track_genre'].nunique(dropna=True)

# Report the count
print(f"Sum of unique genres: {unique_genre_count}")

Sum of unique genres: 114


In [38]:
# Genre labels in the order that defines their integer codes
# (grouped by first letter for readability; the order itself is significant)
genres = [
    'acoustic', 'afrobeat', 'alt-rock', 'alternative', 'ambient', 'anime',
    'black-metal', 'bluegrass', 'blues', 'brazil', 'breakbeat', 'british',
    'cantopop', 'chicago-house', 'children', 'chill', 'classical', 'club', 'comedy', 'country',
    'dance', 'dancehall', 'death-metal', 'deep-house', 'detroit-techno', 'disco', 'disney',
    'drum-and-bass', 'dub', 'dubstep',
    'edm', 'electro', 'electronic', 'emo',
    'folk', 'forro', 'french', 'funk',
    'garage', 'german', 'gospel', 'goth', 'grindcore', 'groove', 'grunge', 'guitar',
    'happy', 'hard-rock', 'hardcore', 'hardstyle', 'heavy-metal', 'hip-hop', 'honky-tonk', 'house',
    'idm', 'indian', 'indie-pop', 'indie', 'industrial', 'iranian',
    'j-dance', 'j-idol', 'j-pop', 'j-rock', 'jazz',
    'k-pop', 'kids',
    'latin', 'latino',
    'malay', 'mandopop', 'metal', 'metalcore', 'minimal-techno', 'mpb',
    'new-age',
    'opera',
    'pagode', 'party', 'piano', 'pop-film', 'pop', 'power-pop', 'progressive-house',
    'psych-rock', 'punk-rock', 'punk',
    'r-n-b', 'reggae', 'reggaeton', 'rock-n-roll', 'rock', 'rockabilly', 'romance',
    'sad', 'salsa', 'samba', 'sertanejo', 'show-tunes', 'singer-songwriter', 'ska', 'sleep',
    'songwriter', 'soul', 'spanish', 'study', 'swedish', 'synth-pop',
    'tango', 'techno', 'trance', 'trip-hop', 'turkish',
    'world-music',
]

# Map each genre to its position in the list (0-based)
genre_mapping = dict(zip(genres, range(len(genres))))

# Print one "genre code" pair per line
for code, genre in enumerate(genres):
    print(f"{genre} {code}")

acoustic 0
afrobeat 1
alt-rock 2
alternative 3
ambient 4
anime 5
black-metal 6
bluegrass 7
blues 8
brazil 9
breakbeat 10
british 11
cantopop 12
chicago-house 13
children 14
chill 15
classical 16
club 17
comedy 18
country 19
dance 20
dancehall 21
death-metal 22
deep-house 23
detroit-techno 24
disco 25
disney 26
drum-and-bass 27
dub 28
dubstep 29
edm 30
electro 31
electronic 32
emo 33
folk 34
forro 35
french 36
funk 37
garage 38
german 39
gospel 40
goth 41
grindcore 42
groove 43
grunge 44
guitar 45
happy 46
hard-rock 47
hardcore 48
hardstyle 49
heavy-metal 50
hip-hop 51
honky-tonk 52
house 53
idm 54
indian 55
indie-pop 56
indie 57
industrial 58
iranian 59
j-dance 60
j-idol 61
j-pop 62
j-rock 63
jazz 64
k-pop 65
kids 66
latin 67
latino 68
malay 69
mandopop 70
metal 71
metalcore 72
minimal-techno 73
mpb 74
new-age 75
opera 76
pagode 77
party 78
piano 79
pop-film 80
pop 81
power-pop 82
progressive-house 83
psych-rock 84
punk-rock 85
punk 86
r-n-b 87
reggae 88
reggaeton 89
rock-n-roll 90
roc

In [41]:
# Add a new column 'genre_code' to the DataFrame and populate it with the genre codes
# Series.map looks each 'track_genre' value up in genre_mapping; a value with
# no matching key yields NaN.
# NOTE(review): the saved output of this cell shows 'track_genre' already
# numeric and 'genre_code' empty, which is not what this code yields from the
# string labels - the output looks like it came from stale kernel state;
# confirm by re-running from a fresh kernel.
df_drop_durations['genre_code'] = df_drop_durations['track_genre'].map(genre_mapping)
df_drop_durations

Unnamed: 0,popularity,explicit,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,track_genre,number_artists,time_signature_0,time_signature_1,time_signature_3,time_signature_4,time_signature_5,key_0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11,duration_binned_1,duration_binned_2,duration_binned_3,duration_binned_4,duration_binned_5,duration_binned_6,genre_code
0,0.73,0,0.676,0.4610,0.791392,0,0.1430,0.0322,0.000001,0.3580,0.7150,0.361245,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
1,0.55,0,0.420,0.1660,0.597377,1,0.0763,0.9240,0.000006,0.1010,0.2670,0.318397,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,
2,0.57,0,0.438,0.3590,0.736123,1,0.0557,0.2100,0.000000,0.1170,0.1200,0.313643,0,2,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
3,0.71,0,0.266,0.0596,0.573701,1,0.0363,0.9050,0.000071,0.1320,0.1430,0.746758,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
4,0.82,0,0.618,0.4430,0.737103,1,0.0526,0.4690,0.000000,0.0829,0.1670,0.492863,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113544,0.21,0,0.172,0.2350,0.612952,1,0.0422,0.6400,0.928000,0.0863,0.0339,0.517705,113,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,
113545,0.22,0,0.174,0.1170,0.577345,0,0.0401,0.9940,0.976000,0.1050,0.0350,0.350242,113,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
113546,0.22,0,0.629,0.3290,0.714648,0,0.0420,0.8670,0.000000,0.0839,0.7430,0.543933,113,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,
113547,0.41,0,0.587,0.5060,0.714759,1,0.0297,0.3810,0.000000,0.2700,0.4130,0.558651,113,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,


In [45]:
# Build the final frame: carry the integer genre codes over into 'track_genre',
# then drop the helper 'genre_code' column.
# Dropping 'genre_code' on its own would discard the encoding and leave the
# textual labels in 'track_genre', whereas the saved output of this cell shows
# integer codes there.
# astype(int) raises if any genre failed to map (NaN), so an incomplete
# mapping is reported here instead of being written out silently.
genre_codes = df_drop_durations['genre_code'].astype(int)
drop_genre_code = (
    df_drop_durations
    .assign(track_genre=genre_codes)   # same column position, new values
    .drop(columns='genre_code')
)

# Display the DataFrame to see the changes
drop_genre_code

Unnamed: 0,popularity,explicit,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,track_genre,number_artists,time_signature_0,time_signature_1,time_signature_3,time_signature_4,time_signature_5,key_0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11,duration_binned_1,duration_binned_2,duration_binned_3,duration_binned_4,duration_binned_5,duration_binned_6
0,0.73,0,0.676,0.4610,0.791392,0,0.1430,0.0322,0.000001,0.3580,0.7150,0.361245,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,0.55,0,0.420,0.1660,0.597377,1,0.0763,0.9240,0.000006,0.1010,0.2670,0.318397,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0.57,0,0.438,0.3590,0.736123,1,0.0557,0.2100,0.000000,0.1170,0.1200,0.313643,0,2,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,0.71,0,0.266,0.0596,0.573701,1,0.0363,0.9050,0.000071,0.1320,0.1430,0.746758,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,0.82,0,0.618,0.4430,0.737103,1,0.0526,0.4690,0.000000,0.0829,0.1670,0.492863,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113544,0.21,0,0.172,0.2350,0.612952,1,0.0422,0.6400,0.928000,0.0863,0.0339,0.517705,113,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
113545,0.22,0,0.174,0.1170,0.577345,0,0.0401,0.9940,0.976000,0.1050,0.0350,0.350242,113,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
113546,0.22,0,0.629,0.3290,0.714648,0,0.0420,0.8670,0.000000,0.0839,0.7430,0.543933,113,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
113547,0.41,0,0.587,0.5060,0.714759,1,0.0297,0.3810,0.000000,0.2700,0.4130,0.558651,113,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0


In [47]:
# Write the processed frame to 'scaled_data.csv' (path relative to the working
# directory), omitting the row index (index=False)
drop_genre_code.to_csv('scaled_data.csv', index=False)

In [48]:
# List the column names of the exported frame and print them
column_list = list(drop_genre_code.columns)
print(column_list)

['popularity', 'explicit', 'danceability', 'energy', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'track_genre', 'number_artists', 'time_signature_0', 'time_signature_1', 'time_signature_3', 'time_signature_4', 'time_signature_5', 'key_0', 'key_1', 'key_2', 'key_3', 'key_4', 'key_5', 'key_6', 'key_7', 'key_8', 'key_9', 'key_10', 'key_11', 'duration_binned_1', 'duration_binned_2', 'duration_binned_3', 'duration_binned_4', 'duration_binned_5', 'duration_binned_6']


In [None]:
# hyperparameter tuning, neural network, logistic regression