In [2]:
import pandas as pd
import numpy as np
from ast import literal_eval

In [3]:
# Read the raw song table from the archive folder
songs_csv = './archive/data.csv'
songs = pd.read_csv(songs_csv)

In [4]:
# Convert each stringified artist list into a real list of strings.
# Values that are already lists are passed through unchanged so this cell can
# be re-run safely: the column is overwritten in place, and literal_eval raises
# ValueError when handed a list instead of a string.
songs['artists'] = songs['artists'].apply(
    lambda x: x if isinstance(x, list) else literal_eval(x)
)

In [5]:
# The first name in each artist list is taken as the primary artist
songs['primary_artist'] = songs['artists'].map(lambda names: names[0])

In [6]:
# Every name after the first one is kept as a collaborator
songs['collaborators'] = songs['artists'].map(lambda names: names[1:])

In [7]:
# Flag a song as a collaboration when it lists at least one collaborator
collaborator_counts = songs['collaborators'].map(len)
songs['is_collab'] = collaborator_counts > 0

In [8]:
songs.head()

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,...,name,popularity,release_date,speechiness,tempo,valence,year,primary_artist,collaborators,is_collab
0,0.991,[Mamie Smith],0.598,168333,0.224,0,0cS0A1fUEUd1EW3FcF8AEI,0.000522,5,0.379,...,Keep A Song In Your Soul,12,1920,0.0936,149.976,0.634,1920,Mamie Smith,[],False
1,0.643,[Screamin' Jay Hawkins],0.852,150200,0.517,0,0hbkKFIJm7Z05H8Zl9w30f,0.0264,5,0.0809,...,I Put A Spell On You,7,1920-01-05,0.0534,86.889,0.95,1920,Screamin' Jay Hawkins,[],False
2,0.993,[Mamie Smith],0.647,163827,0.186,0,11m7laMUgmOKqI3oYzuhne,1.8e-05,0,0.519,...,Golfing Papa,4,1920,0.174,97.6,0.689,1920,Mamie Smith,[],False
3,0.000173,[Oscar Velazquez],0.73,422087,0.798,0,19Lc5SfJJ5O1oaxY0fpwfh,0.801,2,0.128,...,True House Music - Xavier Santos & Carlos Gomi...,17,1920-01-01,0.0425,127.997,0.0422,1920,Oscar Velazquez,[],False
4,0.295,[Mixe],0.704,165224,0.707,1,2hJjbsLCytGsnAHfdsLejp,0.000246,10,0.402,...,Xuniverxe,2,1920-10-01,0.0768,122.076,0.299,1920,Mixe,[],False


In [9]:
# Read the genre dataset from the archive folder
artist_genres_csv = './archive/data_w_genres.csv'
artists = pd.read_csv(artist_genres_csv)

In [10]:
# Find the most frequently labeled genres
# Flatten the per-row genre lists in a single pass. Summing a Series of lists
# concatenates them with `+`, re-copying the ever-growing list on every row
# (quadratic); the comprehension builds the same list, in the same order, once.
genre_lists = artists['genres'].apply(literal_eval)
genres = pd.Series(
    [label for labels in genre_lists for label in labels]
).value_counts()

In [11]:
# Keep only the first 30 entries of the frequency-sorted genre counts
genres = genres.iloc[:30]

In [12]:
# Create set of genres
# Values that are already sets are passed through unchanged so this cell can be
# re-run safely: the column is overwritten in place, and literal_eval raises
# ValueError when handed a set instead of a string.
artists['genres'] = artists['genres'].apply(
    lambda x: x if isinstance(x, set) else set(literal_eval(x))
)

In [13]:
# Display the genre frequency table currently held in `genres`
genres

rock                  601
pop                   582
dance pop             576
rap                   498
hip hop               487
pop rap               463
country rock          429
urban contemporary    374
pop rock              373
mellow gold           368
soft rock             368
adult standards       360
latin                 353
trap                  346
classic rock          337
pop dance             332
album rock            332
modern rock           329
funk                  323
folk rock             321
new wave pop          312
alternative rock      302
southern hip hop      301
gangster rap          299
country               292
quiet storm           290
r&b                   285
indie pop             285
soul                  285
brill building pop    281
dtype: int64

In [14]:
# Add in missing key genres and assign primary genre based on existence in top 30
def find_gen(sets, top_genres=None, extra_genres=('k-pop', 'classical', 'tango')):
    """Pick a single primary genre label for one collection of genre labels.

    Parameters
    ----------
    sets : sized collection of str
        The genre labels to classify (the notebook passes a ``set``).
    top_genres : iterable of str, optional
        Genre labels in priority order; the first one found in ``sets`` wins.
        When omitted, the index of the notebook-level ``genres`` Series is
        used, looked up at call time.
    extra_genres : iterable of str, optional
        Labels checked, in order, only after nothing in ``top_genres``
        matched. Defaults to ``('k-pop', 'classical', 'tango')``.

    Returns
    -------
    str
        ``'Unknown'`` when ``sets`` is empty; otherwise the first matching
        label from ``top_genres``, then from ``extra_genres``; ``'Other'``
        when nothing matches.
    """
    if len(sets) == 0:
        return 'Unknown'
    # Fall back to the notebook-global ranking only when the caller did not
    # supply one, so the function can also be used (and tested) without it.
    if top_genres is None:
        top_genres = genres.index
    for val in top_genres:
        if val in sets:
            return val
    # Extra genres rank below every top genre but above the 'Other' bucket.
    for val in extra_genres:
        if val in sets:
            return val
    return 'Other'

# Label every row of the artists table with one primary genre
artists['primary_genre'] = artists['genres'].apply(find_gen)

In [15]:
# Rename the artist column so it matches the key column used in the songs table
artists = artists.rename({'artists': 'primary_artist'}, axis='columns')

In [16]:
# Attach a genre to every song with a left join on the primary artist
genre_lookup = artists[['primary_artist', 'primary_genre']]
final_data = songs.merge(genre_lookup, on='primary_artist', how='left')

In [17]:
# Observe final counts for sanity
# dropna=False keeps a NaN row in the tally: songs whose primary artist found
# no match in the left join have no genre, and the default value_counts()
# would silently leave them out of this check.
final_data['primary_genre'].value_counts(dropna=False)

Other                 55960
Unknown               27396
rock                  23117
adult standards       13301
classical             11018
country rock           5471
pop                    5409
rap                    4777
mellow gold            3668
funk                   3203
tango                  2988
latin                  2961
dance pop              2048
country                1509
pop rock               1327
urban contemporary     1317
alternative rock       1287
soul                    925
new wave pop            900
folk rock               877
modern rock             724
soft rock               631
classic rock            601
pop dance               568
brill building pop      490
hip hop                 387
indie pop               317
pop rap                 305
album rock              211
trap                    173
k-pop                   171
quiet storm             156
gangster rap            142
southern hip hop         30
r&b                      12
Name: primary_genre,

In [18]:
# Count songs per primary artist within the 'Other' genre bucket
is_other = final_data['primary_genre'] == 'Other'
final_data.loc[is_other, 'primary_artist'].value_counts()

Johnny Cash                 508
Miles Davis                 495
Lata Mangeshkar             473
Lead Belly                  340
Bob Marley & The Wailers    274
                           ... 
Chancellor                    1
The Great Redneck Hope        1
Audyaroad                     1
Jon Hendricks                 1
Coralie Clement               1
Name: primary_artist, Length: 10500, dtype: int64

In [20]:
# Write the merged table to disk (without the index) for the visualization step
output_csv = 'data_master.csv'
final_data.to_csv(output_csv, index=False)