# Putting spotify tracks together

### IMPORT LIBRARIES

In [1]:
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import config

### SPOTIPY CONNECTION

In [2]:
client_credentials_manager = SpotifyClientCredentials(client_id=config.client_id, client_secret=config.client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager, requests_timeout=120)

In [3]:
# CHECK SEARCH RESULTS
h = sp.search(q='year:2021', type='track', limit=50)

In [4]:
t= h["tracks"]["items"]

In [5]:
t

[{'album': {'album_type': 'album',
   'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/4oUHIQIBe0LHzYfvXNW4QM'},
     'href': 'https://api.spotify.com/v1/artists/4oUHIQIBe0LHzYfvXNW4QM',
     'id': '4oUHIQIBe0LHzYfvXNW4QM',
     'name': 'Morgan Wallen',
     'type': 'artist',
     'uri': 'spotify:artist:4oUHIQIBe0LHzYfvXNW4QM'}],
   'available_markets': ['AD',
    'AE',
    'AG',
    'AL',
    'AM',
    'AO',
    'AR',
    'AT',
    'AU',
    'AZ',
    'BA',
    'BB',
    'BD',
    'BE',
    'BF',
    'BG',
    'BH',
    'BI',
    'BJ',
    'BN',
    'BO',
    'BR',
    'BS',
    'BT',
    'BW',
    'BY',
    'BZ',
    'CA',
    'CD',
    'CG',
    'CH',
    'CI',
    'CL',
    'CM',
    'CO',
    'CR',
    'CV',
    'CW',
    'CY',
    'CZ',
    'DE',
    'DJ',
    'DK',
    'DM',
    'DO',
    'DZ',
    'EC',
    'EE',
    'EG',
    'ES',
    'ET',
    'FI',
    'FJ',
    'FM',
    'FR',
    'GA',
    'GB',
    'GD',
    'GE',
    'GH',
    'GM',
    'GN',
 

In [6]:
# track_id
t[0]["id"]

'3cBsEDNhFI9E82vPj3kvi3'

In [7]:
# track_name
t[0]["name"]

'Wasted On You'

In [8]:
# track_popularity
t[0]["popularity"]

88

In [9]:
# artist_name
t[0]["artists"][0]["name"]

'Morgan Wallen'

In [10]:
# function to get track info from the search
def get_tracks(to_search):
    track_id = []
    track_name = []
    track_popularity = []
    artist_id = []
    artist_name = []
    
    for i in (range(0,1000,50)):
        search = sp.search(q= to_search, type="track", limit=50, offset = i)
        for i, t in enumerate(search['tracks']['items']):
            track_id.append(t["id"])
            track_name.append(t["name"])
            track_popularity.append(t["popularity"])
            artist_name.append(t["artists"][0]['name'])
            artist_id.append(t["artists"][0]['id'])

    
    dict = {"track_id": track_id, "track_name": track_name, "track_popularity": track_popularity, "artist_id": artist_id, "artist_name": artist_name}
            
    return dict

**We will get add songs from 2020 to 2023 to the full song list from kaggle.**

In [11]:
latest_songs = pd.DataFrame()
for i in list(["year 2020", "year 2021", "year 2022", "year 2023"]):
    y = get_tracks(i)
    new_df = pd.DataFrame.from_dict(y)
    latest_songs = pd.concat([latest_songs, new_df])

In [12]:
latest_songs.head()

Unnamed: 0,track_id,track_name,track_popularity,artist_id,artist_name
0,0MlkZnfwsjwHHSfn07N8ao,Longest Year (2020),19,0VOR7Ie9xUSb45fzIIVJQ1,Hammock
1,4vaSRWOwLipBtCKPNL31WV,Year 2020,10,1edCXsyYTTXcwtSIdsTAzb,Razinn
2,0t2q2c43AATY4z1GI883Be,Summer 2020,56,5ZS223C6JyBfXasXxrRqOk,Jhené Aiko
3,0qSXoNHcm1sn9kWcAG7WCx,Ghosts N Goblins,25,6JMGrupbzJZ3yuQhTGyeHr,Year 200X
4,6dEPod8uKS1oqcD7RZYy6c,Year 2020,5,3p18oCfd7ERzBpfjjFArjV,Ruined Conflict


In [13]:
latest_songs.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   track_id          4000 non-null   object
 1   track_name        4000 non-null   object
 2   track_popularity  4000 non-null   int64 
 3   artist_id         4000 non-null   object
 4   artist_name       4000 non-null   object
dtypes: int64(1), object(4)
memory usage: 187.5+ KB


In [14]:
latest_songs.duplicated().sum()

1027

In [15]:
latest = latest_songs.drop_duplicates()

In [16]:
latest.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2973 entries, 0 to 999
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   track_id          2973 non-null   object
 1   track_name        2973 non-null   object
 2   track_popularity  2973 non-null   int64 
 3   artist_id         2973 non-null   object
 4   artist_name       2973 non-null   object
dtypes: int64(1), object(4)
memory usage: 139.4+ KB


### GET AUDIO FEATURES OF LATEST SONGS

In [17]:
def audio_features(track_ids_list):
    audio_features = pd.DataFrame()
    for i, o in zip(range(0, (len(track_ids_list)+100), 100), range(100, (len(track_ids_list)+100), 100)):
        try:
            features = sp.audio_features(track_ids_list[i:o])
            features_df = pd.DataFrame(features)
            audio_features = pd.concat([audio_features, features_df])
        except AttributeError:
            pass
    return audio_features

In [18]:
audio_feat = audio_features(latest.track_id)

In [19]:
audio_feat

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature
0,0.0705,0.2860,2,-14.865,1,0.0444,0.685000,0.675000,0.1870,0.0313,78.251,audio_features,0MlkZnfwsjwHHSfn07N8ao,spotify:track:0MlkZnfwsjwHHSfn07N8ao,https://api.spotify.com/v1/tracks/0MlkZnfwsjwH...,https://api.spotify.com/v1/audio-analysis/0Mlk...,533707,4
1,0.4150,0.0287,6,-27.175,0,0.0360,0.859000,0.851000,0.1080,0.0601,115.058,audio_features,4vaSRWOwLipBtCKPNL31WV,spotify:track:4vaSRWOwLipBtCKPNL31WV,https://api.spotify.com/v1/tracks/4vaSRWOwLipB...,https://api.spotify.com/v1/audio-analysis/4vaS...,82299,4
2,0.4010,0.4850,6,-10.787,0,0.2160,0.831000,0.000292,0.0907,0.5400,169.949,audio_features,0t2q2c43AATY4z1GI883Be,spotify:track:0t2q2c43AATY4z1GI883Be,https://api.spotify.com/v1/tracks/0t2q2c43AATY...,https://api.spotify.com/v1/audio-analysis/0t2q...,196000,4
3,0.2940,0.7880,7,-8.439,1,0.0441,0.000004,0.967000,0.0681,0.5410,93.028,audio_features,0qSXoNHcm1sn9kWcAG7WCx,spotify:track:0qSXoNHcm1sn9kWcAG7WCx,https://api.spotify.com/v1/tracks/0qSXoNHcm1sn...,https://api.spotify.com/v1/audio-analysis/0qSX...,106973,4
4,0.4050,0.6880,1,-7.562,1,0.0359,0.000947,0.637000,0.1880,0.0615,140.027,audio_features,6dEPod8uKS1oqcD7RZYy6c,spotify:track:6dEPod8uKS1oqcD7RZYy6c,https://api.spotify.com/v1/tracks/6dEPod8uKS1o...,https://api.spotify.com/v1/audio-analysis/6dEP...,264000,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,0.4300,0.8680,9,-4.469,1,0.0358,0.629000,0.000549,0.4010,0.6330,82.504,audio_features,1O276eC6ltdMUKlOIvJQLh,spotify:track:1O276eC6ltdMUKlOIvJQLh,https://api.spotify.com/v1/tracks/1O276eC6ltdM...,https://api.spotify.com/v1/audio-analysis/1O27...,102111,4
69,0.5700,0.8980,6,-2.956,1,0.0865,0.146000,0.000000,0.1310,0.9620,188.057,audio_features,6Cl8ej1NqfeFKHIwnzSjEc,spotify:track:6Cl8ej1NqfeFKHIwnzSjEc,https://api.spotify.com/v1/tracks/6Cl8ej1NqfeF...,https://api.spotify.com/v1/audio-analysis/6Cl8...,179427,4
70,0.1740,0.5020,2,-15.824,1,0.0501,0.082500,0.000162,0.4730,0.2950,134.060,audio_features,0HXW1L4rvRh4Ihib3RuTdK,spotify:track:0HXW1L4rvRh4Ihib3RuTdK,https://api.spotify.com/v1/tracks/0HXW1L4rvRh4...,https://api.spotify.com/v1/audio-analysis/0HXW...,132000,4
71,0.5220,0.5800,7,-9.652,1,0.2560,0.630000,0.000002,0.7030,0.7100,129.106,audio_features,4Rk6lzJ6WS1dIxv8FyanLK,spotify:track:4Rk6lzJ6WS1dIxv8FyanLK,https://api.spotify.com/v1/tracks/4Rk6lzJ6WS1d...,https://api.spotify.com/v1/audio-analysis/4Rk6...,227367,4


In [20]:
cols_to_drop = ['type', 'uri', 'analysis_url', 'time_signature']
audio_features = audio_feat.drop(columns=cols_to_drop)

In [21]:
audio_features.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,id,track_href,duration_ms
0,0.0705,0.286,2,-14.865,1,0.0444,0.685,0.675,0.187,0.0313,78.251,0MlkZnfwsjwHHSfn07N8ao,https://api.spotify.com/v1/tracks/0MlkZnfwsjwH...,533707
1,0.415,0.0287,6,-27.175,0,0.036,0.859,0.851,0.108,0.0601,115.058,4vaSRWOwLipBtCKPNL31WV,https://api.spotify.com/v1/tracks/4vaSRWOwLipB...,82299
2,0.401,0.485,6,-10.787,0,0.216,0.831,0.000292,0.0907,0.54,169.949,0t2q2c43AATY4z1GI883Be,https://api.spotify.com/v1/tracks/0t2q2c43AATY...,196000
3,0.294,0.788,7,-8.439,1,0.0441,4e-06,0.967,0.0681,0.541,93.028,0qSXoNHcm1sn9kWcAG7WCx,https://api.spotify.com/v1/tracks/0qSXoNHcm1sn...,106973
4,0.405,0.688,1,-7.562,1,0.0359,0.000947,0.637,0.188,0.0615,140.027,6dEPod8uKS1oqcD7RZYy6c,https://api.spotify.com/v1/tracks/6dEPod8uKS1o...,264000


In [22]:
# merge audio features with track infos
full_latest = latest.merge(audio_features, left_on = "track_id", right_on = "id")

In [23]:
# save to csv for safeguarding
#full_latest.to_csv('songs_2020-3.csv', index=False)

In [24]:
# drop some redundant columns
connect = full_latest.drop(columns=["id", "artist_id"])

In [25]:
connect

Unnamed: 0,track_id,track_name,track_popularity,artist_name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,track_href,duration_ms
0,0MlkZnfwsjwHHSfn07N8ao,Longest Year (2020),19,Hammock,0.0705,0.2860,2,-14.865,1,0.0444,0.685000,0.675000,0.1870,0.0313,78.251,https://api.spotify.com/v1/tracks/0MlkZnfwsjwH...,533707
1,4vaSRWOwLipBtCKPNL31WV,Year 2020,10,Razinn,0.4150,0.0287,6,-27.175,0,0.0360,0.859000,0.851000,0.1080,0.0601,115.058,https://api.spotify.com/v1/tracks/4vaSRWOwLipB...,82299
2,0t2q2c43AATY4z1GI883Be,Summer 2020,56,Jhené Aiko,0.4010,0.4850,6,-10.787,0,0.2160,0.831000,0.000292,0.0907,0.5400,169.949,https://api.spotify.com/v1/tracks/0t2q2c43AATY...,196000
3,0qSXoNHcm1sn9kWcAG7WCx,Ghosts N Goblins,25,Year 200X,0.2940,0.7880,7,-8.439,1,0.0441,0.000004,0.967000,0.0681,0.5410,93.028,https://api.spotify.com/v1/tracks/0qSXoNHcm1sn...,106973
4,6dEPod8uKS1oqcD7RZYy6c,Year 2020,5,Ruined Conflict,0.4050,0.6880,1,-7.562,1,0.0359,0.000947,0.637000,0.1880,0.0615,140.027,https://api.spotify.com/v1/tracks/6dEPod8uKS1o...,264000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2768,1O276eC6ltdMUKlOIvJQLh,Traditional Chinese New Year Music,0,Kyrylo Zaplotynskyi,0.4300,0.8680,9,-4.469,1,0.0358,0.629000,0.000549,0.4010,0.6330,82.504,https://api.spotify.com/v1/tracks/1O276eC6ltdM...,102111
2769,6Cl8ej1NqfeFKHIwnzSjEc,Christmas the Whole Year Round,2,Sabrina Carpenter,0.5700,0.8980,6,-2.956,1,0.0865,0.146000,0.000000,0.1310,0.9620,188.057,https://api.spotify.com/v1/tracks/6Cl8ej1NqfeF...,179427
2770,0HXW1L4rvRh4Ihib3RuTdK,Drown Cold Water Blues - 2023 Remastered Version,0,telesphore,0.1740,0.5020,2,-15.824,1,0.0501,0.082500,0.000162,0.4730,0.2950,134.060,https://api.spotify.com/v1/tracks/0HXW1L4rvRh4...,132000
2771,4Rk6lzJ6WS1dIxv8FyanLK,"2023 - Live at E54, West Chester, 12/8/2018",0,The Kitchen Thimbles,0.5220,0.5800,7,-9.652,1,0.2560,0.630000,0.000002,0.7030,0.7100,129.106,https://api.spotify.com/v1/tracks/4Rk6lzJ6WS1d...,227367


In [26]:
connect.shape # this dataframe has more columns so we will definitely drop some more to have the same attributes as the 160k songs

(2773, 17)

In [27]:
connect.columns

Index(['track_id', 'track_name', 'track_popularity', 'artist_name',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'track_href', 'duration_ms'],
      dtype='object')

In [28]:
concat = connect.drop(["track_href", "track_popularity"], axis=1)
concat.shape # now we have the same number of columns. we can proceed to concatenating the dataframes

(2773, 15)

In [29]:
list_cols = list(concat.columns) # columns to keep. we will use these to make sure we have the same columns from the df160k dataset
list_cols

['track_id',
 'track_name',
 'artist_name',
 'danceability',
 'energy',
 'key',
 'loudness',
 'mode',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'duration_ms']

### KAGGLE 160K DATASET

In [30]:
df_160k = pd.read_csv('160k_data.csv') # list of songs from kaggle from 1921 to june 2020
df_160k

Unnamed: 0,id,name,artists,duration_ms,release_date,year,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,mode,key,popularity,explicit
0,0gNNToCW3qjabgTyBSjt3H,!Que Vida! - Mono Version,['Love'],220560,11/1/66,1966,0.5250,0.600,0.540,0.003050,0.1000,-11.803,0.0328,125.898,0.547,1,9,26,0
1,0tMgFpOrXZR6irEOLNWwJL,"""40""",['U2'],157840,2/28/83,1983,0.2280,0.368,0.480,0.707000,0.1590,-11.605,0.0306,150.166,0.338,1,8,21,0
2,2ZywW3VyVx6rrlrX75n3JB,"""40"" - Live",['U2'],226200,8/20/83,1983,0.0998,0.272,0.684,0.014500,0.9460,-9.728,0.0505,143.079,0.279,1,8,41,0
3,6DdWA7D1o5TU2kXWyCLcch,"""40"" - Remastered 2008",['U2'],157667,2/28/83,1983,0.1850,0.371,0.545,0.582000,0.1830,-9.315,0.0307,150.316,0.310,1,8,37,0
4,3vMmwsAiLDCfyc1jl76lQE,"""40"" - Remastered 2008",['U2'],157667,2/28/83,1983,0.1850,0.371,0.545,0.582000,0.1830,-9.315,0.0307,150.316,0.310,1,8,35,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169902,4KiYPYpm4ultIp247ftPlI,텅 빈 마음 Empty Heart,['LEE SEUNG HWAN'],249893,10/15/89,1989,0.4290,0.547,0.304,0.000000,0.1430,-14.326,0.0336,74.164,0.157,0,11,35,0
169903,42tFTth2jcF7iSo0RBjfJF,피카부 Peek-A-Boo,['Red Velvet'],189050,11/17/17,2017,0.0868,0.839,0.902,0.002570,0.2720,-3.612,0.0536,114.953,0.639,0,1,69,0
169904,6XP9L7di5JnOc9WaeAW8oe,행복 (Happiness),['Red Velvet'],220987,8/1/14,2014,0.3230,0.727,0.871,0.000006,0.7250,-3.099,0.1580,121.438,0.681,0,10,58,0
169905,3Gpdzw72aBVJSrm5J1leVK,"헤어지지 못하는 여자, 떠나가지 못하는 남자 Can't Breakup Girl, C...","['Leessang', 'Jung In']",284627,10/6/09,2009,0.1500,0.523,0.841,0.000000,0.1640,-2.505,0.3310,173.903,0.383,0,10,52,0


In [31]:
# check columns
df_160k.columns

Index(['id', 'name', 'artists', 'duration_ms', 'release_date', 'year',
       'acousticness', 'danceability', 'energy', 'instrumentalness',
       'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'mode',
       'key', 'popularity', 'explicit'],
      dtype='object')

In [32]:
# rename columns to have the same as the latest songs dataset
df_160k.columns = ['track_id', 'track_name', 'artist_name', 'duration_ms', 'release_date', 'year',
       'acousticness', 'danceability', 'energy', 'instrumentalness',
       'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'mode',
       'key', 'track_popularity', 'explicit']

In [33]:
# new df with the same columns as the concat df
spotify_160k = df_160k[list_cols]
spotify_160k.head()

Unnamed: 0,track_id,track_name,artist_name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,0gNNToCW3qjabgTyBSjt3H,!Que Vida! - Mono Version,['Love'],0.6,0.54,9,-11.803,1,0.0328,0.525,0.00305,0.1,0.547,125.898,220560
1,0tMgFpOrXZR6irEOLNWwJL,"""40""",['U2'],0.368,0.48,8,-11.605,1,0.0306,0.228,0.707,0.159,0.338,150.166,157840
2,2ZywW3VyVx6rrlrX75n3JB,"""40"" - Live",['U2'],0.272,0.684,8,-9.728,1,0.0505,0.0998,0.0145,0.946,0.279,143.079,226200
3,6DdWA7D1o5TU2kXWyCLcch,"""40"" - Remastered 2008",['U2'],0.371,0.545,8,-9.315,1,0.0307,0.185,0.582,0.183,0.31,150.316,157667
4,3vMmwsAiLDCfyc1jl76lQE,"""40"" - Remastered 2008",['U2'],0.371,0.545,8,-9.315,1,0.0307,0.185,0.582,0.183,0.31,150.316,157667


### PLAYLIST TRACKS FROM PREVIOUS LAB

In [35]:
playlist_tracks = pd.read_csv('full_df_playlists.csv')
playlist_tracks.drop(['artist_id', 'id', 'time_signature'], axis=1, inplace=True)


In [None]:
playlist_tracks.head()

Unnamed: 0,track_id,track,artist,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,5ghIJDpPoe3CfHMGu71E6T,Smells Like Teen Spirit,Nirvana,0.502,0.912,1,-4.556,1,0.0564,2.5e-05,0.000173,0.106,0.72,116.761,301920
1,1FkoVC85Ds3mFoK0fVqEqP,Self Esteem,The Offspring,0.488,0.862,0,-7.595,1,0.0476,0.0204,0.0,0.359,0.706,104.56,257827
2,59WN2psjkt1tyaxjspN8fp,Killing In The Name,Rage Against The Machine,0.466,0.833,7,-4.215,1,0.304,0.0266,0.0,0.0327,0.661,88.785,313573
3,6L89mwZXSOwYl76YXfX13s,Basket Case,Green Day,0.442,0.943,3,-3.205,1,0.0602,0.00293,9e-06,0.091,0.781,85.064,181533
4,3d9DChrdc6BOeFsbrZ3Is0,Under the Bridge,Red Hot Chili Peppers,0.559,0.345,4,-13.496,1,0.0459,0.0576,0.000105,0.141,0.458,84.581,264307


In [36]:
playlist_tracks.columns = ['track_id', 'track_name', 'artist_name', 'danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'duration_ms']

In [37]:
# putting all the tracks together
tracks_features = pd.concat([spotify_160k, concat, playlist_tracks])
tracks_features # we now have over 174K tracks. we need to check for any duplicates and null values

Unnamed: 0,track_id,track_name,artist_name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,0gNNToCW3qjabgTyBSjt3H,!Que Vida! - Mono Version,['Love'],0.600,0.540,9,-11.803,1,0.0328,0.52500,0.003050,0.1000,0.547,125.898,220560
1,0tMgFpOrXZR6irEOLNWwJL,"""40""",['U2'],0.368,0.480,8,-11.605,1,0.0306,0.22800,0.707000,0.1590,0.338,150.166,157840
2,2ZywW3VyVx6rrlrX75n3JB,"""40"" - Live",['U2'],0.272,0.684,8,-9.728,1,0.0505,0.09980,0.014500,0.9460,0.279,143.079,226200
3,6DdWA7D1o5TU2kXWyCLcch,"""40"" - Remastered 2008",['U2'],0.371,0.545,8,-9.315,1,0.0307,0.18500,0.582000,0.1830,0.310,150.316,157667
4,3vMmwsAiLDCfyc1jl76lQE,"""40"" - Remastered 2008",['U2'],0.371,0.545,8,-9.315,1,0.0307,0.18500,0.582000,0.1830,0.310,150.316,157667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1698,3d0VZU6Mf2hdZRHA4zry8S,John Wayne,Lady Gaga,0.610,0.667,11,-5.825,0,0.0314,0.00131,0.000180,0.3400,0.510,96.954,174307
1699,4de1X6v99U7tfOXrNUCTbi,Candy,Doja Cat,0.689,0.516,8,-5.857,1,0.0444,0.51300,0.000000,0.1680,0.209,124.876,190907
1700,1GVF9369j7InydwGztCDIZ,Good In Goodbye,Madison Beer,0.658,0.698,11,-5.950,0,0.1770,0.43300,0.000000,0.1740,0.456,139.054,141949
1701,6qNB2ChCVPepl5ZjVJJTUW,STUPID (feat. Yung Baby Tate),Ashnikko,0.772,0.637,2,-6.881,1,0.1140,0.00459,0.000000,0.0778,0.540,149.906,167317


In [38]:
# clean artist_name column by removing special characters
tracks_features.artist_name = tracks_features.artist_name.apply(lambda x: x.translate({ord(i): None for i in """['"]"""}))

In [39]:
tracks_features

Unnamed: 0,track_id,track_name,artist_name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,0gNNToCW3qjabgTyBSjt3H,!Que Vida! - Mono Version,Love,0.600,0.540,9,-11.803,1,0.0328,0.52500,0.003050,0.1000,0.547,125.898,220560
1,0tMgFpOrXZR6irEOLNWwJL,"""40""",U2,0.368,0.480,8,-11.605,1,0.0306,0.22800,0.707000,0.1590,0.338,150.166,157840
2,2ZywW3VyVx6rrlrX75n3JB,"""40"" - Live",U2,0.272,0.684,8,-9.728,1,0.0505,0.09980,0.014500,0.9460,0.279,143.079,226200
3,6DdWA7D1o5TU2kXWyCLcch,"""40"" - Remastered 2008",U2,0.371,0.545,8,-9.315,1,0.0307,0.18500,0.582000,0.1830,0.310,150.316,157667
4,3vMmwsAiLDCfyc1jl76lQE,"""40"" - Remastered 2008",U2,0.371,0.545,8,-9.315,1,0.0307,0.18500,0.582000,0.1830,0.310,150.316,157667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1698,3d0VZU6Mf2hdZRHA4zry8S,John Wayne,Lady Gaga,0.610,0.667,11,-5.825,0,0.0314,0.00131,0.000180,0.3400,0.510,96.954,174307
1699,4de1X6v99U7tfOXrNUCTbi,Candy,Doja Cat,0.689,0.516,8,-5.857,1,0.0444,0.51300,0.000000,0.1680,0.209,124.876,190907
1700,1GVF9369j7InydwGztCDIZ,Good In Goodbye,Madison Beer,0.658,0.698,11,-5.950,0,0.1770,0.43300,0.000000,0.1740,0.456,139.054,141949
1701,6qNB2ChCVPepl5ZjVJJTUW,STUPID (feat. Yung Baby Tate),Ashnikko,0.772,0.637,2,-6.881,1,0.1140,0.00459,0.000000,0.0778,0.540,149.906,167317


In [40]:
# check missing values
tracks_features.isna().sum() # there are none. YAY!

track_id            0
track_name          0
artist_name         0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
duration_ms         0
dtype: int64

In [41]:
# checking for duplicates
tracks_features.duplicated().sum() # some duplicates were found.

343

In [42]:
# remove duplicates
tracks_features = tracks_features.drop_duplicates()
tracks_features.shape

(174040, 15)

In [43]:
tracks_features[tracks_features.track_name == "Smells Like Teen Spirit"] # a problem encountered: I can definitely see duplicates but they have different track_ids!

Unnamed: 0,track_id,track_name,artist_name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
124213,2Wtm3yc0paBifvBSjOhO3b,Smells Like Teen Spirit,Nirvana,0.502,0.912,1,-4.556,1,0.0564,2.5e-05,0.000173,0.106,0.72,116.761,301920
124214,6y1etHPLRsgA467gLBOgBm,Smells Like Teen Spirit,Nirvana,0.479,0.873,1,-5.896,1,0.0438,1.4e-05,0.000565,0.0974,0.816,116.71,301453
124215,5ghIJDpPoe3CfHMGu71E6T,Smells Like Teen Spirit,Nirvana,0.502,0.912,1,-4.556,1,0.0564,2.5e-05,0.000173,0.106,0.72,116.761,301920
124216,56Cub3qT0VhFQd2Wiblv2w,Smells Like Teen Spirit,Nirvana,0.502,0.912,1,-4.556,1,0.0564,2.5e-05,0.000173,0.106,0.72,116.761,301920
124217,75aLTVBSGIquqzQ6AkmK3Q,Smells Like Teen Spirit,Nirvana,0.502,0.912,1,-4.556,1,0.0564,2.5e-05,0.000173,0.106,0.72,116.761,301920


**Not sure if I'll have time to re-check which track_id is the correct one so this will be left alone for now.**

In [45]:
# saving the full tracks list to csv for safeguarding and the next step of the song recommendation project
#tracks_features.to_csv('final_full_list2.csv', index=False)