In [22]:
# Imports
import os
import numpy as np
import pandas as pd

#import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist

import warnings
# NOTE(review): called with no category/module filter, this suppresses every
# warning for the rest of the kernel session (deprecation notices included).
# Consider narrowing it to the specific warning categories that are known noise.
warnings.filterwarnings("ignore")

In [23]:
# Load the CSV inputs. Every file lives in the same directory, which is named
# once in DATA_DIR so the location can be changed in a single place. The path is
# relative to the notebook's working directory.
DATA_DIR = "Data"

data = pd.read_csv(os.path.join(DATA_DIR, "data.csv"))
genre_data = pd.read_csv(os.path.join(DATA_DIR, "data_by_genres.csv"))
year_data = pd.read_csv(os.path.join(DATA_DIR, "data_by_year.csv"))
artist_data = pd.read_csv(os.path.join(DATA_DIR, "data_by_artist.csv"))
genre_w_data = pd.read_csv(os.path.join(DATA_DIR, "data_w_genres.csv"))

In [24]:
# Preview the last 10 rows of `data` as a quick sanity check on the loaded table
data.tail(10)

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
170643,0.907,2020,0.00952,"['DJ Scheme', 'Cordae', 'Ski Mask The Slump Go...",0.917,228333,0.569,1,3C9D1X8NkG2Ak1RaGpRnnQ,0.0,7,0.0774,-10.456,1,Soda (feat. Take A Daytrip),66,2020-11-13,0.279,144.014
170644,0.466,2020,0.31,['Fleet Foxes'],0.562,253613,0.686,0,308prODCCD0O660tIktbUi,0.0225,7,0.125,-8.48,1,Sunblind,66,2020-09-22,0.0249,103.054
170645,0.169,2020,0.994,['Ólafur Arnalds'],0.281,190500,0.0333,0,13MOQ6oQqkrZEDkZOHukCw,0.959,6,0.0995,-31.46,1,We Contain Multitudes (from home),70,2020-08-26,0.0348,90.25
170646,0.522,2020,0.204,['Gunna'],0.598,230600,0.472,1,2f8y4CuG57UJEmkG3ujd0D,1.5e-05,0,0.108,-10.991,1,NASTY GIRL / ON CAMERA,66,2020-05-22,0.258,120.08
170647,0.0838,2020,0.974,['Najma Wallin'],0.175,133500,0.00759,0,6RuFOroO9VO0aMGEzirLHk,0.925,7,0.113,-35.072,1,Med slutna ögon,70,2020-02-21,0.0454,70.872
170648,0.608,2020,0.0846,"['Anuel AA', 'Daddy Yankee', 'KAROL G', 'Ozuna...",0.786,301714,0.808,0,0KkIkfsLEJbrcIhYsCL7L5,0.000289,7,0.0822,-3.702,1,China,72,2020-05-29,0.0881,105.029
170649,0.734,2020,0.206,['Ashnikko'],0.717,150654,0.753,0,0OStKKAuXlxA0fMH54Qs6E,0.0,7,0.101,-6.02,1,Halloweenie III: Seven Days,68,2020-10-23,0.0605,137.936
170650,0.637,2020,0.101,['MAMAMOO'],0.634,211280,0.858,0,4BZXVFYCb76Q0Klojq4piV,9e-06,4,0.258,-2.226,0,AYA,76,2020-11-03,0.0809,91.688
170651,0.195,2020,0.00998,['Eminem'],0.671,337147,0.623,1,5SiZJoLXp3WOl3J4C8IK0d,8e-06,2,0.643,-7.161,1,Darkness,70,2020-01-17,0.308,75.055
170652,0.642,2020,0.132,"['KEVVO', 'J Balvin']",0.856,189507,0.721,1,7HmnJHfs0BkFzX4x8j0hkl,0.00471,7,0.182,-4.928,1,Billetes Azules (with J Balvin),74,2020-10-16,0.108,94.991


In [25]:
# Summarize `data`: column names, non-null counts, and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170653 entries, 0 to 170652
Data columns (total 19 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   valence           170653 non-null  float64
 1   year              170653 non-null  int64  
 2   acousticness      170653 non-null  float64
 3   artists           170653 non-null  object 
 4   danceability      170653 non-null  float64
 5   duration_ms       170653 non-null  int64  
 6   energy            170653 non-null  float64
 7   explicit          170653 non-null  int64  
 8   id                170653 non-null  object 
 9   instrumentalness  170653 non-null  float64
 10  key               170653 non-null  int64  
 11  liveness          170653 non-null  float64
 12  loudness          170653 non-null  float64
 13  mode              170653 non-null  int64  
 14  name              170653 non-null  object 
 15  popularity        170653 non-null  int64  
 16  release_date      17

In [26]:
# Preview the first 10 rows of `genre_data`
genre_data.head(10)

Unnamed: 0,mode,genres,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key
0,1,21st century classical,0.979333,0.162883,160297.7,0.071317,0.606834,0.3616,-31.514333,0.040567,75.3365,0.103783,27.833333,6
1,1,432hz,0.49478,0.299333,1048887.0,0.450678,0.477762,0.131,-16.854,0.076817,120.285667,0.22175,52.5,5
2,1,8-bit,0.762,0.712,115177.0,0.818,0.876,0.126,-9.18,0.047,133.444,0.975,48.0,7
3,1,[],0.651417,0.529093,232880.9,0.419146,0.205309,0.218696,-12.288965,0.107872,112.857352,0.513604,20.859882,7
4,1,a cappella,0.676557,0.538961,190628.5,0.316434,0.003003,0.172254,-12.479387,0.082851,112.110362,0.448249,45.820071,7
5,1,abstract,0.45921,0.516167,343196.5,0.442417,0.849667,0.118067,-15.472083,0.046517,127.88575,0.307325,43.5,1
6,1,abstract beats,0.342147,0.623,229936.2,0.5278,0.333603,0.099653,-7.918,0.116373,112.4138,0.493507,58.933333,10
7,1,abstract hip hop,0.243854,0.694571,231849.2,0.646235,0.024231,0.168543,-7.349328,0.214258,108.244987,0.571391,39.790702,2
8,0,accordeon,0.323,0.588,164000.0,0.392,0.441,0.0794,-14.899,0.0727,109.131,0.709,39.0,2
9,1,accordion,0.446125,0.624812,167061.6,0.373437,0.193738,0.1603,-14.487063,0.078537,112.872438,0.658688,21.9375,2


In [27]:
# Summarize `genre_data`: column names, non-null counts, dtypes, and memory usage
genre_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2973 entries, 0 to 2972
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   mode              2973 non-null   int64  
 1   genres            2973 non-null   object 
 2   acousticness      2973 non-null   float64
 3   danceability      2973 non-null   float64
 4   duration_ms       2973 non-null   float64
 5   energy            2973 non-null   float64
 6   instrumentalness  2973 non-null   float64
 7   liveness          2973 non-null   float64
 8   loudness          2973 non-null   float64
 9   speechiness       2973 non-null   float64
 10  tempo             2973 non-null   float64
 11  valence           2973 non-null   float64
 12  popularity        2973 non-null   float64
 13  key               2973 non-null   int64  
dtypes: float64(11), int64(2), object(1)
memory usage: 325.3+ KB


In [28]:
# Drop column "release_date" from `data`; the coarser "year" column is kept instead.
# errors='ignore' keeps this cell idempotent: re-running it after the column has
# already been removed is a no-op instead of raising KeyError.
data = data.drop(columns=['release_date'], errors='ignore')

In [29]:
# Preview the first rows of `data` to confirm "release_date" is no longer a column
data.head()

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,speechiness,tempo
0,0.0594,1921,0.982,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,831667,0.211,0,4BJqT0PrAfrxzMOxytFOIz,0.878,10,0.665,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,0.0366,80.954
1,0.963,1921,0.732,['Dennis Day'],0.819,180533,0.341,0,7xPhfUan2yNtyFG0cUWkt8,0.0,7,0.16,-12.441,1,Clancy Lowered the Boom,5,0.415,60.936
2,0.0394,1921,0.961,['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,0.328,500062,0.166,0,1o6I8BglA6ylDMrIELygv1,0.913,3,0.101,-14.85,1,Gati Bali,5,0.0339,110.339
3,0.165,1921,0.967,['Frank Parker'],0.275,210000,0.309,0,3ftBPsC5vPBKxYSee08FDH,2.8e-05,5,0.381,-9.316,1,Danny Boy,3,0.0354,100.109
4,0.253,1921,0.957,['Phil Regan'],0.418,166693,0.193,0,4d6HGyGT8e121BsdKmw9v6,2e-06,3,0.229,-10.096,1,When Irish Eyes Are Smiling,2,0.038,101.665
