In [1]:
import pandas as pd
import numpy as np
import os


Generals

In [2]:
columns_to_keep = ['track_name', 'track_id', 'popularity', 'acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'key', 'mode', 'time_signature']

dataset_path = "../datasets/"

Format random_songs.csv to right structure

In [3]:
random_songs_df = pd.read_csv(f'{dataset_path}random_songs.csv')

# Original mappings
key_conversion = {"C": 0, "C#": 1, "D": 2, "D#": 3, "E": 4, "F": 5, "F#": 6, "G": 7, "G#": 8, "A": 9, "A#": 10, "B": 11}
mode_conversion = {"Major": 1, "Minor": 0}
time_signature_conversion = {"3/4": 3, "4/4": 4, "5/4": 5, "6/4": 6, "7/4": 7}

# Reverse mappings
reverse_key_conversion = {v: k for k, v in key_conversion.items()}
reverse_mode_conversion = {v: k for k, v in mode_conversion.items()}
reverse_time_signature_conversion = {v: k for k, v in time_signature_conversion.items()}

# Apply reverse mappings
random_songs_df['key'] = random_songs_df['key'].map(reverse_key_conversion)
random_songs_df['mode'] = random_songs_df['mode'].map(reverse_mode_conversion)
random_songs_df['time_signature'] = random_songs_df['time_signature'].map(reverse_time_signature_conversion)

# Change id to track_id
random_songs_df.rename(columns={'id': 'track_id'}, inplace=True)
random_songs_df.drop_duplicates(subset=['track_id'], inplace=True)
random_songs_df = random_songs_df[columns_to_keep]

#random_songs_df.to_csv('songs.csv', index=False)

random_songs_df.head()

Unnamed: 0,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,key,mode,time_signature
0,Prece a Iansã,5f74voxOH9UIgMmdIB5Sxk,2,0.905,0.907,178013,0.623,0.407,0.276,-10.502,0.177,125.658,0.902,G,Minor,4/4
1,Na Congo To Salakala,1JUtRzUVINB4PWYkiLF1do,0,0.832,0.643,78000,0.101,0.0,0.182,-16.933,0.129,156.946,0.623,C,Major,4/4
2,It Must Be Love,25gtHkSLEjoioXqmoGTK10,0,0.659,0.496,152600,0.618,1.1e-05,0.0711,-7.356,0.0342,78.071,0.96,D#,Major,4/4
3,Mister Boogie (Honky Tonk Train Blues),2BBHhUKbo3VxUJdQxLpyNd,1,0.561,0.866,137578,0.902,2e-06,0.0556,-8.005,0.0722,137.447,0.937,F,Major,4/4
4,Cages,1FiXVMLHXmf3uPrGEzabyk,0,0.0497,0.21,319359,0.452,0.804,0.0953,-17.386,0.0508,177.853,0.342,D,Major,4/4


Format SpotifyFeatures.csv into right structure

In [4]:
len(random_songs_df)

104951

In [5]:
spotify_features_df = pd.read_csv(f'{dataset_path}SpotifyFeatures.csv')

key_conversion = {"C": 0, "C#": 1, "D": 2, "D#": 3, "E": 4, "F": 5, "F#": 6, "G": 7, "G#": 8, "A": 9, "A#": 10, "B": 11}
mode_conversion = {"Major": 1, "Minor": 0}
time_signature_conversion = {"3/4": 3, "4/4": 4, "5/4": 5, "6/4": 6, "7/4": 7}

spotify_features_df['key'] = spotify_features_df['key'].map(key_conversion)
spotify_features_df['mode'] = spotify_features_df['mode'].map(mode_conversion)
spotify_features_df['time_signature'] = spotify_features_df['time_signature'].map(time_signature_conversion)
spotify_features_df.drop_duplicates(subset=['track_id'], inplace=True)
spotify_features_df = spotify_features_df[columns_to_keep]

spotify_features_df.head()

Unnamed: 0,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,key,mode,time_signature
0,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,0.611,0.389,99373,0.91,0.0,0.346,-1.828,0.0525,166.969,0.814,1,1,4.0
1,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,0.246,0.59,137373,0.737,0.0,0.151,-5.559,0.0868,174.003,0.816,6,0,4.0
2,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3,0.952,0.663,170267,0.131,0.0,0.103,-13.879,0.0362,99.488,0.368,0,0,5.0
3,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0,0.703,0.24,152427,0.326,0.0,0.0985,-12.178,0.0395,171.758,0.227,1,1,4.0
4,Ouverture,0IuslXpMROHdEPvSl1fTQK,4,0.95,0.331,82625,0.225,0.123,0.202,-21.15,0.0456,140.576,0.39,5,1,4.0


In [6]:
len(spotify_features_df)

176774

Format universal_top_spotify_songs.csv into right structure

In [7]:
universal_top_songs = pd.read_csv(f'{dataset_path}universal_top_spotify_songs.csv')

universal_top_songs.rename(columns={'spotify_id':'track_id'}, inplace=True)
universal_top_songs.rename(columns={'name':'track_name'}, inplace=True)
universal_top_songs.rename(columns={'artists':'artist'}, inplace=True)
universal_top_songs.drop_duplicates(subset=['track_id'], inplace=True)
universal_top_songs = universal_top_songs[columns_to_keep]

universal_top_songs.head()

Unnamed: 0,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,key,mode,time_signature
0,Espresso,2qSkIjg1o9h3YT9RAgYN75,99,0.107,0.701,175459,0.76,6.5e-05,0.185,-5.478,0.0285,103.969,0.69,0,1,4
1,BIRDS OF A FEATHER,6dOtVTDdiauQNBQEDOtlAB,98,0.2,0.747,210373,0.507,0.0608,0.117,-10.171,0.0358,104.978,0.438,2,1,4
2,Please Please Please,5N3hjp1WNayUPZrA8kJmJP,98,0.274,0.669,186365,0.586,0.0,0.104,-6.073,0.054,107.071,0.579,9,1,4
3,Not Like Us,6AI3ezQ4o3HUoP6Dhudph3,96,0.0107,0.898,274192,0.472,0.0,0.141,-7.001,0.0776,101.061,0.214,1,1,4
4,Gata Only,6XjDF6nds4DE2BBbagZol6,96,0.446,0.791,222000,0.499,2.4e-05,0.0899,-8.472,0.0509,99.986,0.669,8,0,4


In [8]:
len(universal_top_songs)

12845

Check if all dfs have the same columns

In [9]:
# Print columns of each dataframe
print("random_songs_df columns:", random_songs_df.columns)
print("spotify_features_df columns:", spotify_features_df.columns)
print("universal_top_songs columns:", universal_top_songs.columns)

# Check if all dataframes have the same columns
if set(random_songs_df.columns) == set(spotify_features_df.columns) == set(universal_top_songs.columns):
    print("All columns are the same")
else:
    print("Columns are different")
    if set(random_songs_df.columns) != set(spotify_features_df.columns):
        print(f'Differences between random_songs_df and spotify_features_df: {set(random_songs_df.columns) - set(spotify_features_df.columns)}')
        print(f'Differences between spotify_features_df and random_songs_df: {set(spotify_features_df.columns) - set(random_songs_df.columns)}')
    if set(random_songs_df.columns) != set(universal_top_songs.columns):
        print(f'Differences between random_songs_df and universal_top_songs: {set(random_songs_df.columns) - set(universal_top_songs.columns)}')
        print(f'Differences between universal_top_songs and random_songs_df: {set(universal_top_songs.columns) - set(random_songs_df.columns)}')
    if set(spotify_features_df.columns) != set(universal_top_songs.columns):
        print(f'Differences between spotify_features_df and universal_top_songs: {set(spotify_features_df.columns) - set(universal_top_songs.columns)}')
        print(f'Differences between universal_top_songs and spotify_features_df: {set(universal_top_songs.columns) - set(spotify_features_df.columns)}')

random_songs_df columns: Index(['track_name', 'track_id', 'popularity', 'acousticness', 'danceability',
       'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness',
       'speechiness', 'tempo', 'valence', 'key', 'mode', 'time_signature'],
      dtype='object')
spotify_features_df columns: Index(['track_name', 'track_id', 'popularity', 'acousticness', 'danceability',
       'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness',
       'speechiness', 'tempo', 'valence', 'key', 'mode', 'time_signature'],
      dtype='object')
universal_top_songs columns: Index(['track_name', 'track_id', 'popularity', 'acousticness', 'danceability',
       'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness',
       'speechiness', 'tempo', 'valence', 'key', 'mode', 'time_signature'],
      dtype='object')
All columns are the same


Concatinate all datasets

In [10]:
all_songs_df = pd.concat([random_songs_df, spotify_features_df, universal_top_songs], ignore_index=True)
all_songs_df.drop_duplicates(subset=['track_id'], inplace=True)
all_songs_df.head()


Unnamed: 0,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,key,mode,time_signature
0,Prece a Iansã,5f74voxOH9UIgMmdIB5Sxk,2,0.905,0.907,178013,0.623,0.407,0.276,-10.502,0.177,125.658,0.902,G,Minor,4/4
1,Na Congo To Salakala,1JUtRzUVINB4PWYkiLF1do,0,0.832,0.643,78000,0.101,0.0,0.182,-16.933,0.129,156.946,0.623,C,Major,4/4
2,It Must Be Love,25gtHkSLEjoioXqmoGTK10,0,0.659,0.496,152600,0.618,1.1e-05,0.0711,-7.356,0.0342,78.071,0.96,D#,Major,4/4
3,Mister Boogie (Honky Tonk Train Blues),2BBHhUKbo3VxUJdQxLpyNd,1,0.561,0.866,137578,0.902,2e-06,0.0556,-8.005,0.0722,137.447,0.937,F,Major,4/4
4,Cages,1FiXVMLHXmf3uPrGEzabyk,0,0.0497,0.21,319359,0.452,0.804,0.0953,-17.386,0.0508,177.853,0.342,D,Major,4/4


In [11]:
len(all_songs_df)

292758

Put in csv

In [12]:
all_songs_df.to_csv(f'{dataset_path}all_songs.csv', index=False)