In [5]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials, SpotifyOAuth
import configparser
import re
import webbrowser

# Read the config.ini file.
# ConfigParser.read() silently skips a missing file, so a missing config.ini only
# surfaces as an error on the config.get() calls below.
config = configparser.ConfigParser()
config.read("config.ini")

# Spotify API credentials and account name come from the [SPOTIFY] section of config.ini;
# only the redirect URI is hard-coded here.
CLIENT_ID = config.get("SPOTIFY", "client_id")
CLIENT_SECRET = config.get("SPOTIFY", "client_secret")
REDIRECT_URI = "https://rawcsav.com/callback"
USERNAME = config.get("SPOTIFY", "username")

# Authenticate the user with the required scope
auth_manager = SpotifyOAuth(
    username=USERNAME,
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
    redirect_uri=REDIRECT_URI,
    scope='playlist-read-private user-library-read user-follow-read user-follow-modify user-top-read',
    open_browser=True
)

# Manual authorization-code flow: show the authorize URL, let the user log in,
# then ask them to paste back the URL they were redirected to.
auth_url = auth_manager.get_authorize_url()

if auth_url is not None:
    print("Opening the following URL:")
    print(auth_url)
    webbrowser.open(auth_url)
    print("\nPlease log in to your Spotify account and authorize the application.")
    print("After you've been redirected to the specified redirect URI, please copy the URL and paste it here.")

# The pasted redirect URL carries the authorization code, which is exchanged for a token.
# NOTE(review): the pasted URL contains a live authorization code and is echoed into the
# cell output — clear the output before sharing the notebook.
redirect_url = input("\nPasted redirected URL: ")
code = auth_manager.parse_response_code(redirect_url)
# NOTE(review): `token` is not used again in the visible cells; the client below is
# built from auth_manager rather than from this value.
token = auth_manager.get_access_token(code, as_dict=False)

# Initialize Spotify API client with authenticated user
sp = spotipy.Spotify(auth_manager=auth_manager)
# Folder holding the local audio files.
# NOTE(review): absolute, machine-specific path — consider moving it to config.ini.
OUTPUT_DIR = "/Users/gavinmason/RawC/local"

url = "https://open.spotify.com/playlist/662StmFnG83CfpdygiWmeJ?si=e32f5b2764bd413a"

# The playlist ID is the path segment after "playlist/". It ends at the next "?", "/"
# or "#" — or at the end of the string, so share links without a "?si=..." suffix
# are handled too.
playlist_match = re.search(r"playlist/([^/?#]+)", url)
if playlist_match is None:
    raise ValueError(f"Could not find a playlist ID in URL: {url}")
playlist_id = playlist_match.group(1)

Opening the following URL:
https://accounts.spotify.com/authorize?client_id=7fbce07fe9a84404af10aca418eec95e&response_type=code&redirect_uri=https%3A%2F%2Frawcsav.com%2Fcallback&scope=playlist-read-private+user-library-read+user-follow-read+user-follow-modify+user-top-read

Please log in to your Spotify account and authorize the application.
After you've been redirected to the specified redirect URI, please copy the URL and paste it here.

Pasted redirected URL: https://rawcsav.com/callback?code=AQCdILWEOo7mCHcsKw48mVJJRkQD__kVM5ytf7IiSLR8pc5EJ0PEfYxQlpcubVGdAdWKkZrjo4DMphppwJdiyuEO9AujHJYqS5QuX3gJu0L73W6PHwMapUSt9ofgMgYRPicDcY9As4EUYsSF0i3-xsuBbAQdV2PGaI4Tpq85VnCbNbcd1iF1DFtVw7I2D-w57VqPF2kqDLNI8SClRquf6fkJ9QbWRFPaCp2_v-K68cqFpcvEICG7oJDC5rBtWix4-XRgCgsebtnjAbps7VylWIF-8FfRZgZnLmj0LZgQiA


In [11]:
import os
import youtube_dl

def download_soundcloud(playlist_url, num_tracks=None, output_folder='/Users/gavinmason/RawC/local2'):
    """Download a SoundCloud playlist (or Likes page) as MP3 files via youtube_dl.

    Args:
        playlist_url: URL handed to youtube_dl.
        num_tracks: If truthy, stored as the 'playlistend' option so only the first
            `num_tracks` playlist entries are fetched; None (or 0) leaves it unset.
        output_folder: Directory that receives the files; created if missing.
            Defaults to the folder this notebook has always written to.

    Each entry is saved as "<title> - <uploader>.<ext>", converted to 192 kbps MP3,
    tagged with metadata and given an embedded thumbnail.
    """
    os.makedirs(output_folder, exist_ok=True)

    ydl_opts = {
        'format': 'bestaudio/best',
        # %(...)s placeholders are filled in by youtube_dl, not by Python.
        'outtmpl': os.path.join(output_folder, '%(title)s - %(uploader)s.%(ext)s'),
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }, {
            'key': 'FFmpegMetadata',
        }, {
            'key': 'EmbedThumbnail',  # This processor embeds the thumbnail into the audio file
        }],
        'prefer_ffmpeg': True,
        'postprocessor_args': ['-vn'],
        'extractaudio': True,
        'writethumbnail': True,  # This option tells youtube_dl to download the thumbnail
    }

    if num_tracks:
        ydl_opts['playlistend'] = num_tracks

    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download([playlist_url])

if __name__ == '__main__':
    # Ask for the source URL and an optional cap on the number of tracks.
    playlist_url = input('Enter the SoundCloud playlist or Likes URL: ')
    num_tracks = input('Enter the number of most recent tracks to download (leave empty for all): ')

    # An empty answer means "no limit"; anything else must parse as an integer.
    if num_tracks:
        num_tracks = int(num_tracks)
    else:
        num_tracks = None

    download_soundcloud(playlist_url, num_tracks)


Enter the SoundCloud playlist or Likes URL: https://soundcloud.com/rawcsav/likes
Enter the number of most recent tracks to download (leave empty for all): 25
[soundcloud:user] rawcsav: Downloading user info
[soundcloud:user] 190603800: Downloading track page 1
[soundcloud:user] 190603800: Downloading track page 2
[soundcloud:user] 190603800: Downloading track page 3
[download] Downloading playlist: rawc (Likes)
[soundcloud:user] playlist rawc (Likes): Collected 335 video ids (downloading 25 of them)
[download] Downloading video 1 of 25
[soundcloud] okayybenji/ken-carson-kel-tec-remastered-cdq: Downloading info JSON
[soundcloud] 1498397299: Downloading JSON metadata
[soundcloud] 1498397299: Downloading JSON metadata
[soundcloud] 1498397299: Downloading JSON metadata
[soundcloud] 1498397299: Downloading thumbnail ...
[soundcloud] 1498397299: Writing thumbnail to: /Users/gavinmason/RawC/local2/ken carson - Kel Tec _ Everytime ft. Destroy Lonely @BENJI EXCLUSIVE - benji.jpg
[download] Dest

[soundcloud] tafkaine/fuck-distrokid-prod-bainz: Downloading info JSON
[soundcloud] 1488844168: Downloading JSON metadata
[soundcloud] 1488844168: Downloading JSON metadata
[soundcloud] 1488844168: Downloading JSON metadata
[soundcloud] 1488844168: Downloading thumbnail ...
[soundcloud] 1488844168: Writing thumbnail to: /Users/gavinmason/RawC/local2/Fuck DistroKid (Prod. Bainz) - Duwap Kaine.jpg
[download] Destination: /Users/gavinmason/RawC/local2/Fuck DistroKid (Prod. Bainz) - Duwap Kaine.mp3
[download] 100% of 1.63MiB in 00:0065MiB/s ETA 00:002
[ffmpeg] Post-process file /Users/gavinmason/RawC/local2/Fuck DistroKid (Prod. Bainz) - Duwap Kaine.mp3 exists, skipping
[ffmpeg] Adding metadata to '/Users/gavinmason/RawC/local2/Fuck DistroKid (Prod. Bainz) - Duwap Kaine.mp3'
[ffmpeg] Adding thumbnail to "/Users/gavinmason/RawC/local2/Fuck DistroKid (Prod. Bainz) - Duwap Kaine.mp3"
[download] Downloading video 10 of 25
[soundcloud] merijn-stam/yeat-straight-to-ella-extended: Downloading inf

[soundcloud] 1491060997: Downloading JSON metadata
[soundcloud] 1491060997: Downloading JSON metadata
[soundcloud] 1491060997: Downloading JSON metadata
[soundcloud] 1491060997: Downloading thumbnail ...
[soundcloud] 1491060997: Writing thumbnail to: /Users/gavinmason/RawC/local2/KANYE WEST JESSE  (DONDA 2 LEAK) - LONGMAN361.jpg
[download] Destination: /Users/gavinmason/RawC/local2/KANYE WEST JESSE  (DONDA 2 LEAK) - LONGMAN361.mp3
[download] 100% of 2.08MiB in 00:0106MiB/s ETA 00:003
[ffmpeg] Post-process file /Users/gavinmason/RawC/local2/KANYE WEST JESSE  (DONDA 2 LEAK) - LONGMAN361.mp3 exists, skipping
[ffmpeg] Adding metadata to '/Users/gavinmason/RawC/local2/KANYE WEST JESSE  (DONDA 2 LEAK) - LONGMAN361.mp3'
[ffmpeg] Adding thumbnail to "/Users/gavinmason/RawC/local2/KANYE WEST JESSE  (DONDA 2 LEAK) - LONGMAN361.mp3"
[download] Downloading video 18 of 25
[soundcloud] longman361/kanye-west-we-did-it-kid-ft-william-donda-2-leak: Downloading info JSON
[soundcloud] 1502528983: Downloa

[ffmpeg] Adding metadata to '/Users/gavinmason/RawC/local2/Yeat-Split (with build up intro) best version - 𝕴𝖈𝖊❆.mp3'
[ffmpeg] Adding thumbnail to "/Users/gavinmason/RawC/local2/Yeat-Split (with build up intro) best version - 𝕴𝖈𝖊❆.mp3"
[download] Finished downloading playlist: rawc (Likes)


In [12]:
def get_playlist_tracks(sp, playlist_id):
    """Collect every item of a playlist by requesting it in pages of 100.

    Args:
        sp: Client object exposing `playlist_tracks(playlist_id, offset=...)`.
        playlist_id: ID of the playlist to read.

    Returns:
        A list with the 'items' of every page, in page order.
    """
    first_page = sp.playlist_tracks(playlist_id)
    collected = list(first_page['items'])

    # The first response reports the playlist size; request the remaining
    # pages at offsets 100, 200, ... below that total.
    for offset in range(100, first_page['total'], 100):
        page = sp.playlist_tracks(playlist_id, offset=offset)
        collected += page['items']

    return collected

def filter_local(tracks):
    """Return only the playlist items whose track is flagged as a local file.

    Args:
        tracks: Iterable of playlist item dicts, each normally holding a 'track' dict.

    Returns:
        The items whose 'track' dict has a truthy 'is_local' value. Items whose
        'track' is missing or None are skipped instead of raising.
    """
    return [
        item for item in tracks
        # `or {}` covers both a missing 'track' key and an explicit None value.
        if (item.get('track') or {}).get('is_local', False)
    ]

def split_artists(name):
    """Split a combined artist credit into individual names.

    The separators ' and ', ' feat. ', ' ft. ', ' + ', ' & ' and ' - ' are applied
    one after another, in that order, to every piece produced so far.

    Returns:
        List of name pieces (a single-element list when no separator occurs).
    """
    pieces = [name]
    for delimiter in (' and ', ' feat. ', ' ft. ', ' + ', ' & ', ' - '):
        pieces = [part for chunk in pieces for part in chunk.split(delimiter)]
    return pieces

# Imported here so this cell also runs on a fresh kernel (it builds a DataFrame below).
import pandas as pd

local_uri = "playlist:3WwAyxz7KAAxvE8XcI7vIo"
local_playlist_id = "3WwAyxz7KAAxvE8XcI7vIo"
new_tracks = get_playlist_tracks(sp, local_playlist_id)
new_local_tracks = filter_local(new_tracks)

# One record per local track: its title plus a ", "-joined list of artist names,
# where combined credits ("A feat. B") are split into individual names first.
track_list = []

for track in new_local_tracks:
    local_track_title = track['track']['name']
    all_artist_names = []

    for artist in track['track']['artists']:
        all_artist_names.extend(split_artists(artist['name']))

    local_track_artist = ', '.join(all_artist_names)
    track_list.append({'title': local_track_title, 'artists': local_track_artist})

# Naming the columns keeps 'title' and 'artists' present even when no local tracks
# were found, so later cells can still index them.
df_local_tracks = pd.DataFrame(track_list, columns=['title', 'artists'])
df_local_tracks.head()

Unnamed: 0,title,artists
0,Buildings,Yung Lean / Thaiboy Digital / Bladee
1,NEKOBASU,Yung Lean
2,Motorola,Yung Lean
3,3D SPACESHIP,Yung Lean
4,DOUBLE CHECK,Lucki


In [25]:
import pandas as pd

# Load the "local_track_genres.csv" file into a DataFrame
df_existing_tracks = pd.read_csv("local_track_genres.csv")

# Titles already recorded in the CSV. The list is kept as before; the set is used
# for the membership test so it does not rescan the list for every row.
existing_titles = df_existing_tracks['title'].tolist()
existing_title_set = set(existing_titles)

# Keep only tracks that do not appear in the CSV. .copy() makes the result an
# independent frame, so the column assignments below (and in later cells) do not
# write into a slice of the original.
df_local_tracks = df_local_tracks[~df_local_tracks['title'].isin(existing_title_set)].copy()

track_artist_mapping = {}  # title -> artist string confirmed (or typed) by the user

# Ask the user to confirm or correct the artist of every new track
for index, row in df_local_tracks.iterrows():
    print(f"Title: {row['title']} - Current Artist: {row['artists']}")
    new_artist = input("Press Enter to keep the current artist or type a modified artist name: ")

    # If the user typed a new artist name, store it in the DataFrame
    if new_artist:
        df_local_tracks.at[index, 'artists'] = new_artist

    # An empty answer keeps the current artist
    track_artist_mapping[row['title']] = new_artist or row['artists']

# Display the track_artist_mapping
print(track_artist_mapping)

# Display the final DataFrame
df_local_tracks.head()

Title: ken carson - Kel Tec / Everytime ft. Destroy Lonely @BENJI EXCLUSIVE - Current Artist: Ken Carson, Destroy Lonely
Press Enter to keep the current artist or type a modified artist name: 
Title: 50 (produced by malenkiyyarche) - Current Artist: Duwap Kaine
Press Enter to keep the current artist or type a modified artist name: 
Title: hardrock - bleh [prod. saint] - Current Artist: Hardrock
Press Enter to keep the current artist or type a modified artist name: 
Title: KANYE WEST JESSE  (DONDA 2 LEAK) - Current Artist: Kanye West
Press Enter to keep the current artist or type a modified artist name: 
Title: Travis Scott - Lost Forever (feat. Westside Gunn) [leaked] - Current Artist: Travis Scott
Press Enter to keep the current artist or type a modified artist name: 
Title: Kanye West Never Forgive Yourself / Yeah I Know - Current Artist: Kanye West
Press Enter to keep the current artist or type a modified artist name: 
Title: destroy lonely - met gala - Current Artist: Destroy Lonel

Unnamed: 0,title,artists
280,ken carson - Kel Tec / Everytime ft. Destroy L...,"Ken Carson, Destroy Lonely"
281,50 (produced by malenkiyyarche),Duwap Kaine
282,hardrock - bleh [prod. saint],Hardrock
283,KANYE WEST JESSE (DONDA 2 LEAK),Kanye West
284,Travis Scott - Lost Forever (feat. Westside Gu...,Travis Scott


In [27]:
# Summarize the final DataFrame (index range, columns, non-null counts, dtypes)
df_local_tracks.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25 entries, 280 to 304
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    25 non-null     object
 1   artists  25 non-null     object
dtypes: object(2)
memory usage: 600.0+ bytes


In [35]:
import pandas as pd
import pickle

# Load the artist-name -> genres cache written by an earlier run.
# NOTE(review): pickle.load can execute arbitrary code from the file; only load a
# pickle this notebook wrote itself. JSON would be a safer format if the values are
# plain strings — confirm before switching.
with open('artist_genre_database.pickle', 'rb') as handle:
    artist_genre_database = pickle.load(handle)

def get_spotify_genres(artist_name):
    """Interactively resolve an artist name to Spotify genres.

    Searches Spotify for up to 5 matching artists with the module-level `sp`
    client, prints them, and asks the user to pick one by number.

    Args:
        artist_name: Name to search for.

    Returns:
        The chosen artist's genres joined with ", ", or "" when there are no
        results, the answer is empty, not a number or out of range, or the chosen
        artist has no genres.
    """
    search_result = sp.search(q=f"artist:{artist_name}", type="artist", limit=5)

    if search_result and search_result['artists']['total'] > 0:
        candidates = search_result['artists']['items']

        print(f"\nTop 5 search results for artist: {artist_name}")
        for position, artist in enumerate(candidates, start=1):
            print(f"{position}. {artist['name']}")

        answer = input("Enter the number of the correct artist, or press Enter if none are correct: ").strip()
        try:
            selected_artist = int(answer) if answer else 0
        except ValueError:
            # Anything that is not a number is treated as "none are correct".
            selected_artist = 0

        # Bound by the number of results actually returned (may be fewer than 5).
        if 0 < selected_artist <= len(candidates):
            chosen_artist = candidates[selected_artist - 1]
            if chosen_artist.get('genres'):
                print(f"Genres for artist {chosen_artist['name']}: {', '.join(chosen_artist['genres'])}")
                return ', '.join(chosen_artist['genres'])

    print(f"No genres found for artist: {artist_name}")
    return ""


# For every track, look up the genres of each listed artist (cache first, then an
# interactive Spotify search) and store the combined genres in a 'genres' column.
for index, row in df_local_tracks.iterrows():
    genres_combined = []

    for artist in row['artists'].split(','):
        artist_name = artist.strip()
        if not artist_name:
            # A stray comma leaves an empty name; there is nothing to search for.
            continue

        if artist_name in artist_genre_database:
            genres = artist_genre_database[artist_name]
        else:
            genres = get_spotify_genres(artist_name)
            # Cache the answer (including "") so the same artist is not asked about twice.
            artist_genre_database[artist_name] = genres

        if genres:
            # Combine genres
            genres_combined.extend([g.strip() for g in genres.split(',')])

    # dict.fromkeys removes duplicates while keeping first-seen order, so the stored
    # string is the same on every run (set() iteration order is not).
    df_local_tracks.at[index, 'genres'] = ', '.join(dict.fromkeys(genres_combined))

# Display the final DataFrame
df_local_tracks.head()


Top 5 search results for artist: Ken Carson
1. Ken Carson
2. Ken Carson
3. Ken Carson
4. Carson Kendrick
5. Kendel Carson
Enter the number of the correct artist, or press Enter if none are correct: 1
Genres for artist Ken Carson: rage rap

Top 5 search results for artist: Destroy Lonely
1. Destroy Lonely
2. Destroyed Lonely
3. Destroy Lonely
4. Destroying Lonely
5. Destroy Lonely
Enter the number of the correct artist, or press Enter if none are correct: 1
Genres for artist Destroy Lonely: rage rap

Top 5 search results for artist: Duwap Kaine
1. Duwap Kaine
2. Duwap Kaine
3. Duwap Kaine
4. Duwap Kaine
5. Duwap Kaine
Enter the number of the correct artist, or press Enter if none are correct: 1
Genres for artist Duwap Kaine: dark plugg, pluggnb, vapor trap

Top 5 search results for artist: Hardrock
1. Hardrock
2. HardRock
3. Hardrock
4. Hardrock
5. Hardrockgutta
Enter the number of the correct artist, or press Enter if none are correct: 1
Genres for artist Hardrock: rage rap

Top 5 sea

Unnamed: 0,title,artists,genres
280,ken carson - Kel Tec / Everytime ft. Destroy L...,"Ken Carson, Destroy Lonely",rage rap
281,50 (produced by malenkiyyarche),Duwap Kaine,"dark plugg, pluggnb, vapor trap"
282,hardrock - bleh [prod. saint],Hardrock,rage rap
283,KANYE WEST JESSE (DONDA 2 LEAK),Kanye West,"rap, chicago rap, hip hop"
284,Travis Scott - Lost Forever (feat. Westside Gu...,Travis Scott,"rap, slap house, hip hop"


In [37]:
# Persist the artist -> genres cache, including entries added while tagging tracks
with open('artist_genre_database.pickle', 'wb') as handle:
    pickle.dump(artist_genre_database, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Append the newly tagged tracks to the rows already stored in the CSV
existing_data = pd.read_csv("local_track_genres.csv")

combined_data = pd.concat([existing_data, df_local_tracks], ignore_index=True, sort=False)

# NOTE(review): this overwrites the CSV in place; re-running the cell without
# rebuilding df_local_tracks appends the same rows again.
combined_data.to_csv("local_track_genres.csv", index=False)

In [4]:
# NOTE(review): pd and train_test_split are only imported in other cells; this cell
# relies on those having been run first.
merged_df = pd.read_csv("spect_merged.csv")

# Fraction of rows to keep in the sample
sampling_fraction = 0.05

# Stratified sampling: the "test" split of size sampling_fraction is kept as the
# sample (stratified on the GENRE column); the larger split is discarded.
# random_state is fixed so the sample is reproducible.
_, sample_set = train_test_split(merged_df, test_size=sampling_fraction, stratify=merged_df['GENRE'], random_state=42)

print("Shape of merged_df:", merged_df.shape)
print("Shape of sample_set:", sample_set.shape)

Shape of merged_df: (558202, 16)
Shape of sample_set: (27911, 16)


In [7]:
# Save the sampled rows and preview them.
sample_set.to_csv("sample_feats.csv", index=False)
sample_set.head()

# Limiter shared by get_preview_urls: max_calls=30 per period=1 (presumably seconds —
# confirm against the RateLimiter implementation in use).
# NOTE(review): RateLimiter, tqdm and concurrent.futures are not imported in any
# visible cell; make sure they are imported before this cell runs.
rate_limiter = RateLimiter(max_calls=30, period=1)


def batch_request_preview_url(track_ids):
    """Fetch preview URLs for one batch of track IDs.

    Uses the module-level `sp` client's `tracks()` call.

    Args:
        track_ids: Track IDs to look up in a single request.

    Returns:
        Dict mapping track ID -> preview URL, containing only tracks that have a
        non-empty preview URL. Entries that are None (the response may contain
        them for tracks that could not be resolved) are skipped.
    """
    tracks_info = sp.tracks(track_ids) or {}
    preview_urls = {}
    for track_info in tracks_info.get('tracks') or []:
        if not track_info:
            # None placeholder — nothing to read from it.
            continue
        if 'id' in track_info and track_info.get('preview_url'):
            preview_urls[track_info['id']] = track_info['preview_url']
    return preview_urls

def get_preview_urls(track_ids, batch_size=50, max_workers=6):
    """Look up preview URLs for many tracks using batched, threaded requests.

    Args:
        track_ids: Sequence of track IDs (sliced into batches).
        batch_size: Number of IDs per request.
        max_workers: Size of the thread pool.

    Returns:
        Dict merging the results of `batch_request_preview_url` for every batch.
    """
    total_ids = len(track_ids)
    # Ceiling division: number of batches needed to cover every ID.
    num_batches = total_ids // batch_size + (1 if total_ids % batch_size else 0)

    collected = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        with tqdm(total=num_batches, desc="Fetching preview URLs") as progress:
            pending = []
            for start in range(0, total_ids, batch_size):
                # Each submission passes through the shared rate limiter.
                with rate_limiter:
                    pending.append(
                        pool.submit(batch_request_preview_url, track_ids[start:start + batch_size])
                    )

            # Merge results as the batches finish; one progress tick per batch.
            for finished in concurrent.futures.as_completed(pending):
                collected.update(finished.result())
                progress.update(1)

    return collected

track_ids = sample_set.track_id.tolist()
preview_urls = get_preview_urls(track_ids)

# Attach each track's preview URL, then keep only the rows that have one
sample_set = sample_set.assign(preview_url=sample_set['track_id'].map(preview_urls))
sample_set = sample_set.dropna(subset=['preview_url'])

# One dict per remaining row
song_list = sample_set.to_dict(orient='records')

Fetching preview URLs: 100%|██████████████████| 559/559 [00:21<00:00, 25.74it/s]


In [8]:
# Overwrite sample_feats.csv with the current sample_set, then preview the first rows
sample_set.to_csv("sample_feats.csv", index=False)
sample_set.head()

Unnamed: 0,track_id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,playlist_url,GENRE,preview_url
0,1FM9YzxKP7zLL2QAFAvgIU,0.803,0.838,1,-8.323,1,0.0569,0.0183,0.943,0.0949,0.191,124.004,496035,4,https://open.spotify.com/playlist/3bP1vgmYyNso...,latin tech house,https://p.scdn.co/mp3-preview/17c89a8af3b9579f...
1,5Dor3YR48c0O2wwttqqbs6,0.255,0.951,7,-8.296,0,0.0875,0.00194,0.901,0.673,0.109,138.012,187826,4,https://open.spotify.com/playlist/2BPocNoK7QF8...,dutch trance,https://p.scdn.co/mp3-preview/b262e94f4ed95b0f...
3,4Mmpp07tjOmp9NXFgCOw0L,0.141,0.00307,6,-29.915,1,0.0497,0.989,0.728,0.071,0.0452,74.408,503573,5,https://open.spotify.com/playlist/7v6C5Eq5kz3Y...,shakuhachi,https://p.scdn.co/mp3-preview/64097b410ce47d77...
4,2HsH7qEixVeySTOpV4bEBb,0.374,0.883,1,-5.21,0,0.0367,0.000883,0.0,0.165,0.885,100.27,122805,4,https://open.spotify.com/playlist/3JICuz8Q3T8m...,finnish punk,https://p.scdn.co/mp3-preview/312031d86bd5decc...
5,3g0YZStxYmSxRKJjB7myaA,0.595,0.656,7,-7.061,1,0.15,0.162,0.0,0.379,0.443,140.07,188732,4,https://open.spotify.com/playlist/5OFsiAHgwlYH...,pop edm,https://p.scdn.co/mp3-preview/028f4f7514ea023f...


In [None]:
import os
import asyncio
import random
import nest_asyncio
import pandas as pd
import requests

nest_asyncio.apply()

async def download_preview(url, destination):
    """Download one preview file to `destination`, retrying on failure.

    The blocking `requests.get` call runs in the event loop's default executor so
    several downloads started with `asyncio.gather` can make progress at the same time.

    Args:
        url: Address of the file to fetch.
        destination: Path of the file to write.

    Returns:
        True once the file has been written; False if all 5 attempts failed.
        Nothing is written for a failed attempt.
    """
    max_attempts = 5
    loop = asyncio.get_running_loop()

    for attempt in range(1, max_attempts + 1):
        try:
            # timeout keeps a stalled connection from hanging the attempt forever.
            response = await loop.run_in_executor(None, lambda: requests.get(url, timeout=30))
            if response.status_code == 429:
                raise Exception("Rate limit exceeded")
            # Treat other HTTP errors as failures instead of saving the error body as audio.
            response.raise_for_status()

            with open(destination, "wb") as file:
                file.write(response.content)
            return True
        except Exception as e:
            print(f"Attempt {attempt}: {e}")
            if attempt < max_attempts:
                # Randomized back-off that grows with the attempt number.
                await asyncio.sleep(random.uniform(1, 2) * attempt)

    return False


async def download_previews(df, audio_folder):
    """Download the preview of every row in `df` that is not on disk yet.

    Args:
        df: DataFrame with 'track_id' and 'preview_url' columns.
        audio_folder: Target directory (created if missing); files are named
            '<track_id>.mp3'.
    """
    os.makedirs(audio_folder, exist_ok=True)

    def _target_path(track_id):
        return os.path.join(audio_folder, f'{track_id}.mp3')

    # One coroutine per row whose target file does not exist yet.
    pending = [
        download_preview(row['preview_url'], _target_path(row['track_id']))
        for _, row in df.iterrows()
        if not os.path.exists(_target_path(row['track_id']))
    ]

    await asyncio.gather(*pending)

# sample_set (built in the earlier cells) supplies the track_id and preview_url columns

# Folder where the MP3 previews are saved (relative to the working directory)
audio_folder = "Sample_MP3"

# nest_asyncio.apply() above is what allows asyncio.run() inside the notebook's
# already-running event loop.
asyncio.run(download_previews(sample_set, audio_folder))

In [None]:
import pathlib
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from concurrent.futures import ThreadPoolExecutor
from PIL import Image

def input_files_iter(input_folder_path, output_folder_path):
    """Yield the MP3 files that do not have a spectrogram PNG yet.

    Both folders are searched recursively. A PNG anywhere under
    `output_folder_path` with the same stem as an MP3 marks that MP3 as done.

    Args:
        input_folder_path: pathlib.Path of the folder containing MP3 files.
        output_folder_path: pathlib.Path of the folder containing PNG files.

    Yields:
        The path of each unprocessed MP3, as a string.
    """
    finished_stems = {png.stem for png in output_folder_path.glob('**/*.png')}

    for mp3_path in input_folder_path.glob('**/*.mp3'):
        if mp3_path.stem not in finished_stems:
            yield str(mp3_path)

def process_audio(y, sr, output_path):
    """Render the log-frequency dB spectrogram of a signal to a borderless PNG.

    Args:
        y: Audio samples, passed to librosa.stft.
        sr: Sampling rate of `y`.
        output_path: Path of the PNG file to write.

    The image is written straight to `output_path` through an explicit figure
    and axes. This function runs on several worker threads at once, so it must
    not share a scratch file or rely on pyplot's "current figure".
    NOTE(review): pyplot figure creation itself is not documented as thread-safe;
    if rendering glitches appear, serialize this function or use processes.
    NOTE(review): another process_audio with a different signature is defined in a
    later cell and replaces this one once that cell runs.
    """
    D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)

    fig, ax = plt.subplots()
    try:
        librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='log', ax=ax)

        # Strip axes, ticks and margins so the image holds only the spectrogram.
        ax.axis('off')
        fig.subplots_adjust(top=1, bottom=0, left=0, right=1, hspace=0, wspace=0)
        ax.margins(0, 0)
        ax.xaxis.set_major_locator(plt.NullLocator())
        ax.yaxis.set_major_locator(plt.NullLocator())

        fig.savefig(output_path, dpi=75, bbox_inches='tight', pad_inches=0, format='png', facecolor='w', edgecolor='w')
    finally:
        # Always release the figure, even if rendering or saving failed.
        plt.close(fig)

def load_audio_chunk(audio_file):
    """Load an audio file with librosa's default settings.

    Returns:
        Tuple (samples, sampling_rate) as produced by librosa.load.
    """
    samples, rate = librosa.load(audio_file)
    return (samples, rate)

async def process_mp3_to_spectrograms(input_folder, output_folder):
    """Create a spectrogram PNG for every MP3 that does not have one yet.

    Args:
        input_folder: Folder searched for MP3 files.
        output_folder: Folder that receives '<stem>.png' files; created if missing.

    Work items come from `input_files_iter` and are processed by
    `process_audio_chunk` on a thread pool; the coroutine waits for all of them.
    """
    source_dir = pathlib.Path(input_folder)
    target_dir = pathlib.Path(output_folder)
    target_dir.mkdir(parents=True, exist_ok=True)

    # (mp3 path, png path) pairs for everything still to be rendered
    jobs = [
        (mp3, str(target_dir / f'{pathlib.Path(mp3).stem}.png'))
        for mp3 in input_files_iter(source_dir, target_dir)
    ]

    with ThreadPoolExecutor() as pool:
        loop = asyncio.get_event_loop()
        running = [loop.run_in_executor(pool, process_audio_chunk, mp3, png) for mp3, png in jobs]
        await asyncio.gather(*running)

def process_audio_chunk(audio_file, output_path):
    """Load one audio file and render its spectrogram to `output_path`."""
    process_audio(*load_audio_chunk(audio_file), output_path)

if __name__ == "__main__":
    # Turn every MP3 in Sample_MP3 into a PNG in Sample_Spectrograms (both relative
    # to the working directory).
    # NOTE(review): asyncio is not imported in this cell; it relies on the import
    # (and nest_asyncio.apply()) from the preview-download cell.
    input_folder = "Sample_MP3"
    output_folder = "Sample_Spectrograms"
    asyncio.run(process_mp3_to_spectrograms(input_folder, output_folder))

In [4]:
import os
import matplotlib.pyplot as plt
import librosa
import numpy as np
from pathlib import Path

def process_audio(audio_path, output_folder):
    """Save a spectrogram PNG of the middle 30 seconds of an audio file.

    Args:
        audio_path: Path of the audio file to read.
        output_folder: Folder that receives the image; the file name is the
            audio file's base name with '.mp3' replaced by '.png'.

    The file is decoded twice: once in full to measure its duration, then again
    for the 30-second window centred in the track (from the start when the track
    is 30 seconds or shorter).
    NOTE(review): this reuses the name of the process_audio(y, sr, output_path)
    defined in an earlier cell and replaces it once this cell runs.
    """
    # First pass: decode the whole file only to measure its length.
    full_signal, full_sr = librosa.load(audio_path, sr=22050, mono=True)
    duration = librosa.get_duration(y=full_signal, sr=full_sr)

    # Start of the centred 30-second window.
    offset = (duration - 30) / 2 if duration > 30 else 0

    # Second pass: decode just that window and convert it to a dB-scaled spectrogram.
    y, sr = librosa.load(audio_path, offset=offset, sr=22050, mono=True, duration=30.0)
    magnitude = np.abs(librosa.stft(y))
    D = librosa.amplitude_to_db(magnitude, ref=np.max)

    image_name = os.path.basename(audio_path).replace('.mp3', '.png')
    output_path = os.path.join(output_folder, image_name)

    # Render without axes and save as a tightly cropped PNG.
    plt.figure()
    librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='log')
    plt.axis('off')
    plt.savefig(output_path, dpi=75, bbox_inches='tight', pad_inches=0, format='png', facecolor='w', edgecolor='w')
    plt.close()

    print(f"Generated image: {output_path}")

def process_files_in_folder(input_folder, output_folder_name):
    """Generate spectrograms for the MP3 files of a folder, skipping finished ones.

    Args:
        input_folder: Folder whose direct entries ending in '.mp3' are processed.
        output_folder_name: Output folder, joined onto the current working
            directory and created if missing.

    An MP3 is skipped (with a message) when the matching '.png' already exists in
    the output folder; otherwise `process_audio` is called for it.
    """
    # In a notebook the working directory stands in for the "script directory".
    output_folder = os.path.join(os.getcwd(), output_folder_name)
    os.makedirs(output_folder, exist_ok=True)

    for file_name in os.listdir(input_folder):
        if not file_name.endswith(".mp3"):
            continue

        mp3_file = os.path.join(input_folder, file_name)
        spectrogram_file = os.path.join(output_folder, file_name.replace('.mp3', '.png'))

        if os.path.exists(spectrogram_file):
            print(f"Skipping {mp3_file} as the spectrogram image already exists.")
        else:
            process_audio(mp3_file, output_folder)

# Source folder of local MP3 files and the output folder name (created under the
# current working directory by process_files_in_folder).
# NOTE(review): absolute, machine-specific path — consider making it configurable.
input_folder = "/Users/gavinmason/RawC/local/"
output_folder = "Local_Spectrograms"

process_files_in_folder(input_folder, output_folder)

Skipping /Users/gavinmason/RawC/local/ Autumn & Summrs - Cliche (prod. luvniko x mike frost).mp3 as the spectrogram image already exists.
Skipping /Users/gavinmason/RawC/local/ Security.mp3 as the spectrogram image already exists.
Generated image: /Users/gavinmason/RawC/Jupyter/Botify/Local_Spectrograms/ken carson - Kel Tec _ Everytime ft. Destroy Lonely @BENJI EXCLUSIVE - benji.png
Skipping /Users/gavinmason/RawC/local/ Look What I Become (prod. Wifi).mp3 as the spectrogram image already exists.
Skipping /Users/gavinmason/RawC/local/ Desire.mp3 as the spectrogram image already exists.
Skipping /Users/gavinmason/RawC/local/ it might be me! [ Prod By. QuaXar ].mp3 as the spectrogram image already exists.
Skipping /Users/gavinmason/RawC/local/ Locked & Loaded.mp3 as the spectrogram image already exists.
Skipping /Users/gavinmason/RawC/local/ GRAVE PLOT.mp3 as the spectrogram image already exists.
Generated image: /Users/gavinmason/RawC/Jupyter/Botify/Local_Spectrograms/Travis Scott - Los

Generated image: /Users/gavinmason/RawC/Jupyter/Botify/Local_Spectrograms/Kendrick Lamar - Bitchface - Flacko3.png
Skipping /Users/gavinmason/RawC/local/ Shawty In Love.mp3 as the spectrogram image already exists.
Skipping /Users/gavinmason/RawC/local/ Lose my mind (prod. Maajins).mp3 as the spectrogram image already exists.
Generated image: /Users/gavinmason/RawC/Jupyter/Botify/Local_Spectrograms/mall grab x denzel curry x maxo kream - Dwells.png
Skipping /Users/gavinmason/RawC/local/ You Can Feel (Kering).mp3 as the spectrogram image already exists.
Skipping /Users/gavinmason/RawC/local/ Kankan - hat4hat.mp3 as the spectrogram image already exists.
Skipping /Users/gavinmason/RawC/local/ eye 2 eye (certified kaine).mp3 as the spectrogram image already exists.
Skipping /Users/gavinmason/RawC/local/ Marry This Game (Prod. Highway & Jonny).mp3 as the spectrogram image already exists.
Generated image: /Users/gavinmason/RawC/Jupyter/Botify/Local_Spectrograms/OTL Beezy - Hibachi [Prod - F1L

Generated image: /Users/gavinmason/RawC/Jupyter/Botify/Local_Spectrograms/supervillian cash carti - Cole Sterling.png
Skipping /Users/gavinmason/RawC/local/ Fuck The Streets.mp3 as the spectrogram image already exists.
Skipping /Users/gavinmason/RawC/local/ We On It (feat. Dboylo).mp3 as the spectrogram image already exists.
Skipping /Users/gavinmason/RawC/local/ Walk In.mp3 as the spectrogram image already exists.
Skipping /Users/gavinmason/RawC/local/ SSG Kobe - Go Hard.mp3 as the spectrogram image already exists.
Skipping /Users/gavinmason/RawC/local/ BitchCallMeCaptainMorgan.mp3 as the spectrogram image already exists.
Skipping /Users/gavinmason/RawC/local/ faded.mp3 as the spectrogram image already exists.
Skipping /Users/gavinmason/RawC/local/ Darkest Before Dawn prod. potentgyft.mp3 as the spectrogram image already exists.
Skipping /Users/gavinmason/RawC/local/ lil peep - suck my blood (prod lederrick).mp3 as the spectrogram image already exists.
Skipping /Users/gavinmason/RawC/

Generated image: /Users/gavinmason/RawC/Jupyter/Botify/Local_Spectrograms/hardrock - bleh [prod. saint] - Hitechstef.png
Skipping /Users/gavinmason/RawC/local/ WingRiddenAngel (Prod. Kellbender).mp3 as the spectrogram image already exists.
Skipping /Users/gavinmason/RawC/local/ WALKED IN (PROD. SAURON).mp3 as the spectrogram image already exists.
Skipping /Users/gavinmason/RawC/local/ Crash Out (prod Wonder X  Codylemont).mp3 as the spectrogram image already exists.
Skipping /Users/gavinmason/RawC/local/ Summrs! - 1017 ALYX (prod. Mingo)[YP4AM Exclusive].mp3 as the spectrogram image already exists.
Skipping /Users/gavinmason/RawC/local/ buku bandz ft Sgpwes [prod.pinkgrillz].mp3 as the spectrogram image already exists.
Skipping /Users/gavinmason/RawC/local/ Sl#t [prod rision + cubox].mp3 as the spectrogram image already exists.
Skipping /Users/gavinmason/RawC/local/ R.I.P Virgil ft dc the don [prod bhristo].mp3 as the spectrogram image already exists.
Skipping /Users/gavinmason/RawC/lo

In [None]:
# Reload the sample written earlier and summarize its columns
sample_set=pd.read_csv("sample_feats.csv")
sample_set.info()
# Local MP3 source folder and spectrogram output folder name
# NOTE(review): absolute, machine-specific path.
input_folder = "/Users/gavinmason/RawC/local/"
output_folder = "Local_Spectrograms"
# Names of the 12 audio-feature columns (the same list is repeated inside load_data)
audio_features_columns = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature']

In [3]:
import os
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from itertools import repeat
from concurrent.futures import ThreadPoolExecutor
from sklearn.model_selection import train_test_split, KFold
from torchvision.transforms import transforms
from PIL import Image
from torch.utils.data import DataLoader, TensorDataset
from torch.optim.lr_scheduler import StepLR
from torch.cuda.amp import GradScaler, autocast
from google.cloud import storage
from google.cloud.exceptions import NotFound, GoogleCloudError
from io import BytesIO, StringIO
from tqdm import tqdm

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Preprocessing applied to every spectrogram image: resize to 128x128, convert to
# a single grayscale channel, convert to a tensor, then normalize with
# mean 0.5 / std 0.5.
transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.Grayscale(),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

def load_spectrogram(track_id, spectrogram_bucket, spectrogram_folder_path):
    """Download one spectrogram PNG from Cloud Storage and preprocess it.

    Args:
        track_id: Track whose image is '<spectrogram_folder_path>/<track_id>.png'.
        spectrogram_bucket: Name of the bucket holding the images.
        spectrogram_folder_path: Folder prefix inside the bucket.

    Returns:
        The image after the module-level `transform`, or None when the blob does
        not exist or another Google Cloud error occurs (the latter is printed).
    """
    blob_name = f"{spectrogram_folder_path}/{track_id}.png"
    try:
        # A new client is created on every call.
        client = storage.Client()
        png_blob = storage.Blob(blob_name, client.bucket(spectrogram_bucket))

        image = Image.open(BytesIO(png_blob.download_as_bytes()))
        return transform(image)
    except NotFound:
        # No spectrogram for this track; the caller filters out None.
        return None
    except GoogleCloudError as err:
        print(f"Error loading spectrogram for track_id {track_id}: {err}")
        return None

def load_data(spotify_data_file, spectrogram_bucket, spectrogram_folder_path):
    """Load audio features and their spectrograms from Cloud Storage.

    Args:
        spotify_data_file: Blob name of the CSV with a 'track_id' column and the
            twelve audio-feature columns.
        spectrogram_bucket: Bucket holding both the CSV and the spectrogram PNGs.
        spectrogram_folder_path: Folder prefix of the PNGs inside the bucket.

    Returns:
        (spectrograms, audio_features): a list of preprocessed spectrograms and a
        NumPy array of feature rows, aligned row for row and covering only tracks
        whose spectrogram loaded. (None, None) if anything fails (the error is printed).
    """
    try:
        print("Loading data...")
        storage_client = storage.Client()
        bucket = storage_client.bucket(spectrogram_bucket)
        blob = storage.Blob(f"{spotify_data_file}", bucket)
        spotify_data_string = blob.download_as_text()
        spotify_df = pd.read_csv(StringIO(spotify_data_string))

        audio_features_columns = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness',
                                  'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature']

        track_ids = spotify_df["track_id"].tolist()

        print("Loading spectrograms...")
        with ThreadPoolExecutor() as executor:
            # executor.map returns results in input order, so spectrograms[i]
            # belongs to track_ids[i].
            spectrograms = list(tqdm(executor.map(
                    load_spectrogram,
                    track_ids,
                    repeat(spectrogram_bucket),
                    repeat(spectrogram_folder_path)
                ),
                total=len(track_ids),
                desc="Loading spectrograms"
            ))

        # Pair IDs with spectrograms BEFORE dropping failed loads; filtering first
        # would shift every later spectrogram onto the wrong track ID.
        track_ids_spectrogram_map = {
            track_id: spec
            for track_id, spec in zip(track_ids, spectrograms)
            if spec is not None
        }

        filtered_df = spotify_df[spotify_df["track_id"].isin(track_ids_spectrogram_map.keys())]
        audio_features = filtered_df[audio_features_columns].to_numpy()
        remaining_spectrograms = [track_ids_spectrogram_map[track_id] for track_id in filtered_df["track_id"]]

        return remaining_spectrograms, audio_features
    except GoogleCloudError as e:
        print(f"Error loading data: {str(e)}")
        return None, None
    except Exception as e:
        print(f"Unexpected error loading data: {str(e)}")
        return None, None

class DeeperCNNModel(nn.Module):
    """Convolutional regressor mapping a 1-channel image to ``num_features`` values.

    Three stages of conv -> ReLU -> 2x2 max-pool -> batch-norm widen the
    channels 1 -> 32 -> 64 -> 128, and a fully connected head with dropout
    produces the outputs. The head's first layer takes 128 * 16 * 16 inputs,
    which after three 2x poolings corresponds to 128x128 input images.
    """

    def __init__(self, num_features):
        super().__init__()
        # Attribute names and layer positions determine the state_dict keys,
        # so they are kept exactly as they were (conv1..conv3, fc).
        self.conv1 = self._conv_stage(1, 32)
        self.conv2 = self._conv_stage(32, 64)
        self.conv3 = self._conv_stage(64, 128)

        head_layers = [
            nn.Linear(128 * 16 * 16, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_features),
        ]
        self.fc = nn.Sequential(*head_layers)

    @staticmethod
    def _conv_stage(in_channels, out_channels):
        """Return one conv -> ReLU -> max-pool -> batch-norm stage."""
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.BatchNorm2d(out_channels),
        )

    def forward(self, x):
        """Run the conv stages, flatten per sample, and apply the dense head."""
        features = x
        for stage in (self.conv1, self.conv2, self.conv3):
            features = stage(features)
        flat = features.view(features.size(0), -1)
        return self.fc(flat)

def train_model(train_loader, val_loader, num_epochs=10, learning_rate=0.001):
    """Train a DeeperCNNModel with MSE loss and return it with its best weights.

    Args:
        train_loader: DataLoader over a TensorDataset; the number of model
            outputs is read from the second dimension of ``dataset.tensors[1]``.
        val_loader: DataLoader used to compute the per-epoch validation loss.
        num_epochs: Number of passes over ``train_loader``.
        learning_rate: Initial learning rate for AdamW.

    Returns:
        The model (wrapped in ``nn.DataParallel`` when more than one GPU is
        visible) carrying the weights of the epoch with the lowest validation
        loss. If no epoch improved on infinity (e.g. ``num_epochs == 0`` or a
        NaN loss), the model is returned with its current weights.
    """
    num_features = train_loader.dataset.tensors[1].shape[1]
    model = DeeperCNNModel(num_features).to(device)

    if torch.cuda.device_count() > 1:
        print("Using", torch.cuda.device_count(), "GPUs!")
        model = nn.DataParallel(model)

    criterion = nn.MSELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-5)
    # Halves the learning rate every 10 epochs, so it only has an effect when num_epochs > 10.
    scheduler = StepLR(optimizer, step_size=10, gamma=0.5)

    scaler = GradScaler()

    best_val_loss = float("inf")
    best_model_weights = None

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0
        for inputs, targets in tqdm(train_loader, desc=f'Epoch {epoch+1} Training'):
            inputs, targets = inputs.to(device), targets.to(device)  # Move to GPU here

            optimizer.zero_grad()

            # Mixed-precision forward pass
            with autocast():
                outputs = model(inputs)
                loss = criterion(outputs, targets)

            # Backward pass on the scaled loss (one optimizer step per batch; no accumulation)
            scaler.scale(loss).backward()

            # Updating model parameters
            scaler.step(optimizer)
            scaler.update()

            running_loss += loss.item()

        running_loss /= len(train_loader)

        model.eval()
        val_loss = 0

        with torch.no_grad():
            for inputs, targets in tqdm(val_loader, desc=f'Epoch {epoch+1} Validation'):
                inputs, targets = inputs.to(device), targets.to(device)  # Move to GPU here
                outputs = model(inputs)
                loss = criterion(outputs, targets)
                val_loss += loss.item()

        val_loss /= len(val_loader)
        scheduler.step()
        print(f'Epoch {epoch + 1}, Training Loss: {running_loss}, Validation Loss: {val_loss}')

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # state_dict() hands back references to the live parameter tensors,
            # which later epochs overwrite in place. Clone them so the snapshot
            # really is the best epoch and not simply the last one.
            best_model_weights = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }

    if best_model_weights is not None:
        model.load_state_dict(best_model_weights)
    return model

def save_model_to_bucket(model, save_path, bucket_name):
    """Serialize the model's state_dict and upload it to a GCS bucket.

    Args:
        model: The trained module; an ``nn.DataParallel`` wrapper is accepted.
        save_path: Object name to write inside the bucket.
        bucket_name: Name of the destination bucket.

    Errors are reported with ``print`` and not re-raised.
    """
    try:
        storage_client = storage.Client()
        bucket = storage_client.bucket(bucket_name)
        blob = storage.Blob(save_path, bucket)

        # nn.DataParallel prefixes every state_dict key with "module.". Save the
        # wrapped module's weights so the file loads into a plain, unwrapped model.
        unwrapped = model.module if isinstance(model, torch.nn.DataParallel) else model

        buffer = BytesIO()
        torch.save(unwrapped.state_dict(), buffer)
        # rewind=True seeks the buffer back to the start before uploading.
        blob.upload_from_file(buffer, rewind=True)
    except GoogleCloudError as e:
        print(f"Error saving model to bucket: {str(e)}")
    except Exception as e:
        print(f"Unexpected error saving model to bucket: {str(e)}")

# Location of the training inputs in Google Cloud Storage: the bucket name, the
# CSV object that load_data reads from that bucket, and the folder name that is
# forwarded to load_spectrogram.
spectrogram_bucket = "spectrogram-botify"
sample_set_file_path = "sample_feats.csv"
spectrogram_folder_path = "Sample_Spectrogram"

# load_data returns (None, None) on failure, so both results must be checked before use.
spectrograms, audio_features = load_data(sample_set_file_path, spectrogram_bucket, spectrogram_folder_path)


Loading spectrograms: 100%|██████████████| 25160/25160 [04:05<00:00, 102.28it/s]


In [None]:

# Train only when load_data succeeded (it returns (None, None) on failure).
if spectrograms is not None and audio_features is not None:
    spectrograms_tensor = torch.stack(spectrograms).float()  # Convert to float32
    audio_features_tensor = torch.from_numpy(audio_features).float()  # Convert to float32


    # NOTE(review): test_size=0.7 sends 70% of the samples to the validation split
    # and trains on the remaining 30% - confirm this is intended rather than 0.3.
    train_inputs, val_inputs, train_targets, val_targets = train_test_split(spectrograms_tensor, audio_features_tensor,
                                                                            test_size=0.7, random_state=42)

    train_dataset = TensorDataset(train_inputs, train_targets)
    val_dataset = TensorDataset(val_inputs, val_targets)

    # Using num_workers for faster data loading
    # NOTE(review): relies on `os` having been imported by an earlier cell - confirm.
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=os.cpu_count())
    val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, num_workers=os.cpu_count())

    model = train_model(train_loader, val_loader, num_epochs=10)

    # Upload the trained weights to the same bucket the training data came from.
    save_model_to_bucket(model, "model_weights.pth", spectrogram_bucket)
else:
    print("Unable to load data or spectrograms.")


In [14]:
import os
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from PIL import Image
import pandas as pd 
from tqdm import tqdm
from fuzzywuzzy import process

class DeeperCNNModel(nn.Module):
    """CNN that regresses ``num_features`` values from a 1-channel image.

    Layout: three conv stages (conv -> ReLU -> 2x2 max-pool -> batch-norm)
    taking the channels 1 -> 32 -> 64 -> 128, followed by a dense head with
    dropout. The head's first layer takes 128 * 16 * 16 inputs, i.e. 128x128
    images after the three poolings. Saved weights are loaded into this class
    by key, so the attribute names and layer order must not change.
    """

    def __init__(self, num_features):
        super().__init__()
        self.conv1 = self._conv_stage(1, 32)
        self.conv2 = self._conv_stage(32, 64)
        self.conv3 = self._conv_stage(64, 128)

        head_layers = [
            nn.Linear(128 * 16 * 16, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_features),
        ]
        self.fc = nn.Sequential(*head_layers)

    @staticmethod
    def _conv_stage(in_channels, out_channels):
        """Build a single conv -> ReLU -> max-pool -> batch-norm stage."""
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.BatchNorm2d(out_channels),
        )

    def forward(self, x):
        """Apply the conv stages, flatten each sample, then the dense head."""
        features = x
        for stage in (self.conv1, self.conv2, self.conv3):
            features = stage(features)
        flat = features.view(features.size(0), -1)
        return self.fc(flat)

def load_model(model_path, num_features):
    """Build a DeeperCNNModel, load saved weights into it, and return it in eval mode.

    Args:
        model_path: Path to a state_dict file written with ``torch.save``.
        num_features: Number of outputs the model was trained with.

    Returns:
        The model on CUDA when available (otherwise on CPU), switched to eval mode.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # map_location remaps tensors that were saved from a GPU, so the same file
    # loads on CPU-only machines as well.
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    state_dict = torch.load(model_path, map_location=device)

    # A checkpoint saved from an nn.DataParallel wrapper has every key prefixed
    # with "module."; strip it so the keys match the plain model built below.
    prefix = "module."
    if state_dict and all(key.startswith(prefix) for key in state_dict):
        state_dict = {key[len(prefix):]: value for key, value in state_dict.items()}

    model = DeeperCNNModel(num_features)
    model.load_state_dict(state_dict)
    model = model.to(device)
    model.eval()
    return model

def transform_spectrogram(img_path):
    """Open the image at ``img_path`` and convert it into a model-input tensor.

    The image is resized to 128x128, converted to grayscale, turned into a
    tensor, and normalized with mean 0.5 and std 0.5.
    """
    steps = [
        transforms.Resize((128, 128)),
        transforms.Grayscale(),
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),
    ]
    pipeline = transforms.Compose(steps)
    # The context manager closes the file as soon as the tensor has been produced.
    with Image.open(img_path) as image:
        return pipeline(image)

def predict(model, spectrogram_batch):
    """Run ``model`` on ``spectrogram_batch`` with gradients disabled.

    The batch is moved to CUDA first when CUDA is available; the model itself
    is used as given. Returns whatever the model returns.
    """
    batch = spectrogram_batch.to('cuda') if torch.cuda.is_available() else spectrogram_batch
    with torch.no_grad():
        return model(batch)

def append_predictions(model, spectrogram_dir, existing_data, score_threshold=80):
    """Predict audio features for local spectrograms and write them into a table.

    Every ``.png`` file in ``spectrogram_dir`` is fuzzy-matched by file name
    (without extension) against ``existing_data['title']``. When the best match
    scores above ``score_threshold``, the model's prediction is written into the
    audio-feature columns of the matching row(s).

    Args:
        model: Model passed to ``predict``.
        spectrogram_dir: Directory containing the spectrogram images.
        existing_data: DataFrame with a ``title`` column; it is modified in place.
        score_threshold: Minimum (exclusive) fuzzy-match score required to
            accept a title match. Defaults to 80.

    Returns:
        ``existing_data``, the same object that was passed in.
    """
    audio_features_columns = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
                              'acousticness', 'instrumentalness', 'liveness', 'valence',
                              'tempo', 'time_signature']

    # Sorted so that, if several files match the same title, which prediction
    # wins does not depend on the file system's listing order.
    spectrogram_files = sorted(os.listdir(spectrogram_dir))

    # The candidate titles do not change inside the loop, so build the list once.
    titles = existing_data['title'].tolist()
    if not titles:
        # Nothing to match against; leave the table untouched.
        return existing_data

    for filename in tqdm(spectrogram_files):
        if not filename.endswith(".png"):
            continue

        track_title = os.path.splitext(filename)[0]
        best_match, score = process.extractOne(track_title, titles)
        if score <= score_threshold:
            # No confident title match: skip the (comparatively costly) inference.
            continue

        img_path = os.path.join(spectrogram_dir, filename)
        # unsqueeze(0) adds the batch dimension the model expects.
        spectrogram_batch = transform_spectrogram(img_path).unsqueeze(0)
        prediction = predict(model, spectrogram_batch).cpu().numpy()

        existing_data.loc[existing_data['title'] == best_match, audio_features_columns] = prediction

    return existing_data

# Define the path of your model weights and the number of features
model_path = "model_weights.pth"
# Must equal the number of audio-feature columns that append_predictions writes (12).
num_features = 12

# Load your model
model = load_model(model_path, num_features)

# Table the predictions are appended to; append_predictions matches rows on its 'title' column.
existing_data = pd.read_csv("local_track_genres.csv")

# NOTE(review): absolute, machine-specific path - consider making this configurable.
data_folder = "/Users/gavinmason/RawC/Jupyter/Botify/Local_Spectrograms"

# append_predictions writes into existing_data in place and returns that same DataFrame.
updated_data = append_predictions(model, data_folder, existing_data)

# Save the updated data to a new CSV file
updated_data.to_csv('updated_data.csv', index=False)

100%|█████████████████████████████████████████| 322/322 [00:13<00:00, 23.98it/s]


In [15]:
# Preview the first rows to spot-check the appended audio-feature columns.
updated_data.head(25)

Unnamed: 0,title,artists,genres,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,Buildings,"yung lean, thaiboy digital, bladee, yung lean ...","glitchcore, psychedelic hip hop, underground h...",0.480036,0.308884,3.787433,-6.274174,0.360193,0.159972,0.123602,0.20643,0.052815,0.291859,87.16256,2.690958
1,NEKOBASU,yung lean,"cloud rap, psychedelic hip hop, underground hi...",0.573944,0.385504,4.534843,-7.543097,0.410341,0.20248,0.176231,0.2479,0.052234,0.350456,105.259933,3.246399
2,Motorola,yung lean,"cloud rap, psychedelic hip hop, underground hi...",0.573248,0.41117,4.619379,-7.40883,0.405689,0.197945,0.13901,0.240663,0.078189,0.310773,107.090019,3.326613
3,3D SPACESHIP,yung lean,"cloud rap, psychedelic hip hop, underground hi...",0.51762,0.30888,3.984193,-6.530579,0.397025,0.161591,0.148838,0.225835,0.063704,0.302073,91.642746,2.786531
4,DOUBLE CHECK,lucki,plugg,0.494345,0.338424,4.044555,-6.424217,0.350097,0.195185,0.143346,0.206431,0.064089,0.30257,91.551689,2.88341
5,Syrup Talk,lucki,plugg,0.495864,0.306836,3.848587,-6.423122,0.350561,0.168886,0.154568,0.231655,0.044021,0.307697,89.88929,2.719275
6,Roses,yung lean,"cloud rap, psychedelic hip hop, underground hi...",0.513047,0.363173,4.271848,-7.122589,0.425922,0.184886,0.128356,0.239957,0.075392,0.292424,98.486748,3.062793
7,Vendetta,yung lean,"cloud rap, psychedelic hip hop, underground hi...",0.494066,0.353564,3.909262,-5.994223,0.325604,0.178397,0.136184,0.215108,0.058645,0.291894,90.1642,2.742205
8,SWING THE SCYTHE,buckshot,['cloud rap'],0.450368,0.314717,3.677889,-6.116886,0.409005,0.144012,0.096468,0.209424,0.068561,0.263225,84.357559,2.654478
9,EpaR Featuring Vince Staples,"earl sweatshirt, vince staples","conscious hip hop, hip hop, rap, underground h...",0.505988,0.32219,4.272994,-6.992233,0.468449,0.168803,0.142506,0.232902,0.077919,0.302934,96.002884,3.043925


In [18]:
import pandas as pd 
# Summary statistics of the training targets; later cells compare the model's
# predictions against this frame feature by feature.
train_audio_features = pd.read_csv("sample_feats.csv")
train_audio_features.describe()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
count,25160.0,25160.0,25160.0,25160.0,25160.0,25160.0,25160.0,25160.0,25160.0,25160.0,25160.0,25160.0,25160.0
mean,0.547117,0.619098,5.320707,-9.222752,0.614467,0.087913,0.335482,0.240403,0.188655,0.478414,121.681221,249327.7,3.883108
std,0.192524,0.263442,3.547788,5.508935,0.486731,0.100195,0.349267,0.360415,0.15346,0.276309,29.885765,121375.1,0.453893
min,0.0,2e-05,0.0,-51.282,0.0,0.0,0.0,0.0,0.00892,0.0,0.0,30000.0,0.0
25%,0.415,0.442,2.0,-11.13025,0.0,0.0369,0.0118,1e-06,0.0955,0.243,98.11375,180792.8,4.0
50%,0.562,0.656,5.0,-7.858,1.0,0.0505,0.19,0.00109,0.125,0.468,120.254,222827.0,4.0
75%,0.695,0.842,8.0,-5.677,1.0,0.092525,0.644,0.55925,0.238,0.711,140.024,283751.0,4.0
max,0.985,1.0,11.0,4.14,1.0,0.965,0.996,1.0,0.991,0.998,219.929,3600000.0,5.0


In [23]:
# Audio-feature columns to compare. Defined in this cell so that it does not
# depend on a later cell having been executed first.
features = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
            'acousticness', 'instrumentalness', 'liveness', 'valence',
            'tempo', 'time_signature']

# Keep only rows that have a value in every feature column. The result is
# assigned instead of using inplace=True so re-running the cell is idempotent.
predicted_audio_features = pd.read_csv("updated_data.csv").dropna(subset=features)
predicted_audio_features.describe()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
count,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0,293.0
mean,0.530475,0.358738,4.233792,-6.854553,0.387561,0.18228,0.134482,0.223095,0.047198,0.313642,97.676743,3.002582
std,0.046286,0.044354,0.376312,0.680845,0.055103,0.02237,0.029093,0.023702,0.020433,0.031934,8.845578,0.281103
min,0.350982,0.205882,2.838337,-8.643972,0.229074,0.107104,0.032304,0.148107,0.005138,0.199424,64.501083,1.994905
25%,0.502637,0.328161,4.002285,-7.329586,0.351033,0.168473,0.113663,0.209838,0.030888,0.292943,92.731873,2.828175
50%,0.532744,0.356509,4.259284,-6.895003,0.386523,0.180218,0.133478,0.223561,0.046795,0.31309,98.240303,3.026268
75%,0.561054,0.391179,4.502435,-6.474004,0.423133,0.195185,0.155067,0.239644,0.061275,0.337081,103.653816,3.199687
max,0.666774,0.503202,5.320601,-4.450639,0.570381,0.245233,0.208369,0.294847,0.101439,0.385332,123.556786,3.746193


In [25]:
from scipy.stats import skew, kurtosis

# Defined before first use so the cell also runs on a fresh kernel.
features = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
            'acousticness', 'instrumentalness', 'liveness', 'valence',
            'tempo', 'time_signature']

# dropna returns a new frame, so the result must be assigned to take effect.
# NaNs would otherwise propagate into the skew/kurtosis values below.
predicted_audio_features = predicted_audio_features.dropna(subset=features)

# Compare the shape of each feature's distribution between the training
# targets and the model's predictions.
for feature in features:
    train_skew = skew(train_audio_features[feature])
    pred_skew = skew(predicted_audio_features[feature])
    train_kurt = kurtosis(train_audio_features[feature])
    pred_kurt = kurtosis(predicted_audio_features[feature])

    print(f"Feature: {feature}\n")
    print(f"Train skew: {train_skew}, Train kurtosis: {train_kurt}")
    print(f"Pred skew: {pred_skew}, Pred kurtosis: {pred_kurt}\n")
    print("\n------------------------\n")

Feature: danceability

Train skew: -0.2999731243348028, Train kurtosis: -0.5866678377143373
Pred skew: -0.4819718322973589, Pred kurtosis: 0.6104627875034843


------------------------

Feature: energy

Train skew: -0.5177195278893917, Train kurtosis: -0.6634022972932936
Pred skew: 0.009038418986178756, Pred kurtosis: 0.1587306116565812


------------------------

Feature: key

Train skew: -0.005292039851210374, Train kurtosis: -1.2855615171146424
Pred skew: -0.4377227528199734, Pred kurtosis: 0.5307110077572426


------------------------

Feature: loudness

Train skew: -1.8607475935469662, Train kurtosis: 4.861437857398033
Pred skew: 0.5218971534869467, Pred kurtosis: 0.5658487092956324


------------------------

Feature: mode

Train skew: -0.47036161352590716, Train kurtosis: -1.7787599525213047
Pred skew: 0.07301500943184512, Pred kurtosis: 0.15615864081312703


------------------------

Feature: speechiness

Train skew: 4.2517130857620895, Train kurtosis: 27.05968396591946
Pred sk

In [26]:
from scipy.stats import ks_2samp

# Two-sample Kolmogorov-Smirnov test per feature, comparing the training
# targets with the model's predictions. Relies on `features`,
# `train_audio_features` and `predicted_audio_features` from earlier cells.
for feature in features:
    statistic, pvalue = ks_2samp(train_audio_features[feature], predicted_audio_features[feature])
    print(f"Feature: {feature}\n")
    print(f"KS statistic: {statistic}, p-value: {pvalue}")
    print("\n------------------------\n")


Feature: danceability

KS statistic: 0.3876148282392008, p-value: 9.837739222947687e-40

------------------------

Feature: energy

KS statistic: 0.7336800110690895, p-value: 3.253426507549276e-160

------------------------

Feature: key

KS statistic: 0.5655162590818081, p-value: 4.372580068968264e-88

------------------------

Feature: loudness

KS statistic: 0.4621982452237421, p-value: 3.1339811600042675e-57

------------------------

Feature: mode

KS statistic: 0.6144674085850557, p-value: 7.634988206074349e-106

------------------------

Feature: speechiness

KS statistic: 0.8238282500529037, p-value: 2.1208548792935483e-217

------------------------

Feature: acousticness

KS statistic: 0.48796222944486345, p-value: 3.675313177255614e-64

------------------------

Feature: instrumentalness

KS statistic: 0.6743640699523052, p-value: 6.555275370637665e-131

------------------------

Feature: liveness

KS statistic: 0.8096714813588934, p-value: 5.0021571801814936e-207

----------