In [None]:
import sys
import os
import spotipy
import pandas as pd
from spotipy.oauth2 import SpotifyClientCredentials
import numpy as np
from src.preprocessing import get_features
import json
import seaborn as sns
import matplotlib.pyplot as plt
from src.random_generator_w_artists import get_random_df

In [None]:
# Before running this code, a new token needs to be generated in src/random_generator_w_artists.py

# Spotify API credentials are read from the environment so they never have to be
# written into (and committed with) the notebook. Export SPOTIPY_CLIENT_ID and
# SPOTIPY_CLIENT_SECRET before starting the kernel. When a variable is unset the
# value falls back to "", the placeholder this cell used before.
CLIENT_ID = os.environ.get("SPOTIPY_CLIENT_ID", "")
CLIENT_SECRET = os.environ.get("SPOTIPY_CLIENT_SECRET", "")

# Shared Spotify client, authenticated with the client-credentials flow.
sp = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials(client_id=CLIENT_ID, client_secret=CLIENT_SECRET))

In [None]:
# Build one DataFrame with a row per track: every playlist export (*.json) found in
# ./playlist_data plus the frame returned by get_random_df(), de-duplicated on TrackID.
all_dfs_as_list = []
playlist_dir = './playlist_data'

# os.listdir() returns entries in arbitrary order. Sorting makes the run reproducible:
# drop_duplicates(keep="first") below keeps the row from whichever playlist is read first.
for filename in sorted(os.listdir(playlist_dir)):
    if not filename.endswith(".json"):
        continue

    # The context manager closes the file handle (the bare open() call leaked it).
    with open(os.path.join(playlist_dir, filename)) as playlist_file:
        playlist = json.load(playlist_file)

    playlist_id = playlist['id']
    playlist_name = playlist['name']

    track_ids = []
    artists = []
    featurings = []
    for item in playlist['tracks']['items']:
        track = item['track']
        if track is None:
            # Some playlist entries carry no track object; skip them.
            continue
        track_ids.append(track['id'])
        # The first listed artist is treated as the main artist.
        artists.append(track['artists'][0]['name'])
        # Number of artists credited on the track (this count includes the main artist).
        featurings.append(len(track['artists']))

    # Scalars (playlist id/title, empty 'Features' placeholder) are broadcast to every row.
    temp_df = pd.DataFrame({'PlaylistID': playlist_id, 'PlaylistTitle': playlist_name, 'TrackID': track_ids, "MainArtist": artists, "NoFeaturing": featurings, 'Features': ''})
    all_dfs_as_list.append(temp_df)

# Add the tracks supplied by the project helper get_random_df().
random_df = get_random_df()
print('Size of random DF', random_df.shape)
all_dfs_as_list.append(random_df)

pd.set_option("display.max_rows", None, "display.max_columns", None) # Displays the entire dataframe in notebook, not just a preview

df = pd.concat(all_dfs_as_list)
print('Before duplicates removed', df.shape)
# Keep the first occurrence of each track; reassigning instead of inplace=True keeps the cell re-runnable.
df = df.drop_duplicates(subset='TrackID', keep="first")
print('After duplicates removed', df.shape)

In [None]:
## Add musical features to each song

sp = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials(client_id=CLIENT_ID, client_secret=CLIENT_SECRET))

# Collect one result per row (in row order) and assign the whole column once.
# The previous `df["Features"].iloc[i] = ...` was chained assignment: pandas warns
# about it, and with Copy-on-Write enabled the write never reaches `df`.
track_features_by_row = np.empty(len(df), dtype=object)
for i, track in enumerate(df['TrackID']):
    track_ft = 'spotify:audio-features:{}'.format(track)
    # audio_features() returns a list; a single id was requested, so take element 0.
    track_features = sp.audio_features(track_ft)[0]
    # The assigned value is still np.array(track_features), exactly as before.
    # NOTE(review): confirm get_features() in src/preprocessing still reads these cells correctly.
    track_features_by_row[i] = np.array(track_features)

# Positional assignment: the array has exactly len(df) entries, so no index alignment is involved.
df["Features"] = track_features_by_row


In [None]:
## Split each feature into a separate column
feature_columns = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'type', 'id', 'uri', 'track_href', 'analysis_url', 'duration_ms',
    'time_signature',
]


def _features_for_row(x):
    """Expand one row via the project helper get_features (row passed by keyword)."""
    return get_features(row=x)


# Row-wise apply; the result is written into the columns listed above.
df[feature_columns] = df.apply(_features_for_row, axis=1)

In [None]:
# Remove columns that are no longer needed: 'type' and the raw 'Features' cell
# (its contents were split into separate columns above).
redundant_columns = ['type', 'Features']
df = df.drop(redundant_columns, axis="columns")

In [None]:
## Labels added from label_data
# NOTE: these imports are kept in this cell so the names stay defined for any later
# cell that uses them; ideally they would live in the import cell at the top.
from os import listdir
from os.path import isfile, join

# Lower-cased artist name is the join key between tracks and label data.
df["Artist"] = df["MainArtist"].str.lower()
pd.set_option("display.max_rows", None, "display.max_columns", None) # Displays the entire dataframe in notebook, not just a preview

## Load the CSV files and save in 1 dataframe
# Sorted so the run is reproducible: groupby(...).first() below keeps the first row
# seen per artist, which depends on the order in which the files are read.
csv_files = sorted(f for f in listdir("label_data") if isfile(join("label_data", f)))
if not csv_files:
    raise FileNotFoundError("No label files found in label_data/")

# DataFrame.append() was removed in pandas 2.0 (and copied the whole frame on every
# iteration); read every file first and concatenate once.
label_frames = [pd.read_csv(join("label_data", csv_name)) for csv_name in csv_files]
all_results = pd.concat(label_frames, ignore_index=True)

# One row per (lower-cased) artist, indexed by artist name.
all_results["Artist"] = all_results["Artist"].str.lower()
all_results = all_results.groupby("Artist").first()

# Left join keeps every track; artists without label data get NaN in the label columns.
df_labels = df.join(all_results, on="Artist", how="left")
# True when the artist was matched to a label entry.
df_labels["BigLabel"] = df_labels["Label"].notna()

# Non-null counts per artist for every column.
df_overview = df_labels.groupby("Artist").count()
# The sorted result used to be discarded; keep it.
# NOTE(review): this requires a column literally named "index" — confirm it exists
# (it is not created in this notebook; presumably it comes from get_random_df()).
df_overview = df_overview.sort_values(by="index", ascending=False)

## Save final dataframe
#df_labels.to_pickle("dataframe_w_labels.pkl")
df = df_labels

## Feature data analysis

Histogram analysis to inspect the value distribution of each feature in the dataset

In [None]:
# To start from a saved frame instead, uncomment:
# df = pd.read_pickle("dataframe.pkl")

# One histogram per column that DataFrame.hist() can plot, 10 bins each.
hist_options = {"figsize": (15, 10), "bins": 10}
hist = df.hist(**hist_options)

Correlation analysis to find any strong relationships between the features

In [None]:
# Pearson correlation between the numeric features.
# Since pandas 2.0, DataFrame.corr() no longer drops text columns silently and raises
# on them instead, so restrict the frame to numeric and boolean columns explicitly
# (the same columns older pandas versions used implicitly).
numeric_df = df.select_dtypes(include=["number", "bool"])
corr = numeric_df.corr(method='pearson')

#Pearson - Visual heatmap
fig, ax = plt.subplots(figsize=(15,10)) #for size of figure
sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns, cmap='RdBu_r', annot=True, linewidth=0.5, ax=ax)
ax.set_title("Pearson correlation between features");