## Read in all the setlist datasets

In [172]:
import pandas as pd
import glob
import os
import fuzzy_pandas as fpd
import numpy as np


path = '/Users/jonzimmerman/Desktop/Data Projects/Setlists/data/setlists/ideal_setlists/'
all_files = glob.glob(os.path.join(path, "*.csv"))

ideal_setlists = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)
ideal_setlists.shape


(774, 9)

In [173]:
round2 = ['Florence + the Machine','Little Dragon', 'Passion Pit', 'Lord Huron']
round3 = ['Tame Impala']
ideal_setlists = ideal_setlists[ideal_setlists['Artist Name'].isin(round3)]
ideal_setlists['Artist Name'].unique()

array(['Tame Impala'], dtype=object)

In [174]:
ideal_setlists.head()

Unnamed: 0,Song Name,Closeness Score,Happy Score,Angry Score,Sad Score,Surprise Score,Fear Score,Prioritized Emotion,Artist Name
48,elephant,0.0,0.13,0.07,0.4,0.2,0.2,Happy,Tame Impala
49,solitude is bliss,0.0,0.39,0.0,0.18,0.18,0.25,Happy,Tame Impala
50,mind mischief,0.0,0.27,0.06,0.06,0.33,0.27,Happy,Tame Impala
51,cause im a man,0.0,0.25,0.04,0.33,0.08,0.29,Happy,Tame Impala
52,it is not meant to be,0.0,0.25,0.0,0.29,0.17,0.29,Happy,Tame Impala


In [175]:
ideal_setlists['Prioritized Emotion'].unique()

array(['Happy', 'Surprise', 'Fear', 'Angry', 'Sad', nan], dtype=object)

In [138]:
ideal_setlists[(ideal_setlists['Artist Name']=="Passion Pit") & (ideal_setlists['Prioritized Emotion'].isna())]

Unnamed: 0,Song Name,Closeness Score,Happy Score,Angry Score,Sad Score,Surprise Score,Fear Score,Prioritized Emotion,Artist Name
269,where we belong,0.13,0.14,0.11,0.25,0.18,0.32,,Passion Pit
270,the reeling,0.12,0.17,0.08,0.17,0.21,0.38,,Passion Pit
271,carried away,0.11,0.09,0.15,0.38,0.12,0.25,,Passion Pit
272,better things,0.19,0.04,0.12,0.24,0.29,0.31,,Passion Pit
273,my brother taught me how to swim,0.23,0.11,0.15,0.35,0.2,0.19,,Passion Pit
274,whole life story,0.16,0.03,0.1,0.41,0.21,0.24,,Passion Pit
275,constant conversations,0.23,0.2,0.03,0.31,0.11,0.34,,Passion Pit
276,to kingdom come,0.165,0.02,0.11,0.41,0.22,0.24,,Passion Pit
277,ill be alright,0.24,0.15,0.19,0.33,0.04,0.3,,Passion Pit
278,hideaway,0.17,0.21,0.12,0.21,0.12,0.33,,Passion Pit


## Read in all the album datasets

In [179]:
path = '/Users/jonzimmerman/Desktop/Data Projects/Setlists/data/albums/'
all_files = glob.glob(os.path.join(path, "*.xlsx"))

albums = pd.concat((pd.read_excel(f) for f in all_files), ignore_index=True)
albums.shape

(664, 3)

In [180]:
round2 = ['Little Dragon', 'Florence + the Machine', 'Lord Huron', 'Passion Pit']
round3 = ['Tame Impala']
albums = albums[albums['artist'].isin(round3)]
albums['artist'].value_counts()

artist
Tame Impala    48
Name: count, dtype: int64

## Join albums data onto setlists data

In [181]:
ideal_setlists

Unnamed: 0,Song Name,Closeness Score,Happy Score,Angry Score,Sad Score,Surprise Score,Fear Score,Prioritized Emotion,Artist Name
48,elephant,0.000,0.13,0.07,0.40,0.20,0.20,Happy,Tame Impala
49,solitude is bliss,0.000,0.39,0.00,0.18,0.18,0.25,Happy,Tame Impala
50,mind mischief,0.000,0.27,0.06,0.06,0.33,0.27,Happy,Tame Impala
51,cause im a man,0.000,0.25,0.04,0.33,0.08,0.29,Happy,Tame Impala
52,it is not meant to be,0.000,0.25,0.00,0.29,0.17,0.29,Happy,Tame Impala
...,...,...,...,...,...,...,...,...,...
628,the less i know the better,0.270,0.22,0.04,0.19,0.33,0.22,,Tame Impala
629,patience,0.320,0.09,0.05,0.23,0.23,0.41,,Tame Impala
630,lucidity,0.305,0.08,0.23,0.23,0.15,0.31,,Tame Impala
631,cause im a man,0.320,0.25,0.04,0.33,0.08,0.29,,Tame Impala


In [144]:
albums

Unnamed: 0,album,song,artist
0,Little Dragon,Twice,Little Dragon
1,Little Dragon,Turn Left,Little Dragon
2,Little Dragon,No Love,Little Dragon
3,Little Dragon,Recommendation,Little Dragon
4,Little Dragon,Constant Surprises,Little Dragon
...,...,...,...
611,Tremendous Sea of Love,I'm Perfect,Passion Pit
612,Tremendous Sea of Love,You Have the Right,Passion Pit
613,Tremendous Sea of Love,Inner Dialogue,Passion Pit
614,Tremendous Sea of Love,Undertow,Passion Pit


In [182]:
full_df = fpd.fuzzy_merge(ideal_setlists, albums,
            left_on=['Artist Name','Song Name'],
            right_on=['artist','song'],
            method='levenshtein',
            threshold=0.7,
            join = 'left-outer',
            keep = 'all'
            )

full_df = full_df.drop(columns=['artist', 'song'])
print('Before: ',ideal_setlists.shape[0])
print('After:', full_df.shape[0])

Before:  78
After: 78


## Label non-prioritized subsets as 'None'

In [183]:
full_df['Prioritized Emotion'] = full_df['Prioritized Emotion'].fillna('None')
full_df['Prioritized Emotion'].value_counts()

Prioritized Emotion
Happy       13
Surprise    13
Fear        13
Angry       13
Sad         13
None        13
Name: count, dtype: int64

In [184]:
missing_albums = full_df[full_df['album']==''][['Artist Name','Song Name']]
missing_albums[missing_albums['Artist Name'] == "Tame Impala"]

Unnamed: 0,Artist Name,Song Name
65,Tame Impala,sundown syndrome
66,Tame Impala,patience
67,Tame Impala,patience
68,Tame Impala,sundown syndrome
69,Tame Impala,forty one mosquitoes flying in formation
70,Tame Impala,half full glass of wine
71,Tame Impala,forty one mosquitoes flying in formation
72,Tame Impala,patience
73,Tame Impala,sundown syndrome
74,Tame Impala,forty one mosquitoes flying in formation


## Spruce up some of the artist-song combinations

In [153]:
full_df['album'] = np.where(full_df['Song Name']=='i lied', 'Long Lost', full_df['album'])
full_df['album'] = np.where(full_df['Song Name']=='we went wild', 'NA', full_df['album'])
full_df['album'] = np.where(full_df['Song Name']=='the problem with your daughter', 'NA', full_df['album'])
full_df['album'] = np.where(full_df['Song Name']=='your other life', 'Long Lost', full_df['album'])
full_df['album'] = np.where(full_df['Song Name']=='son of a gun', 'NA', full_df['album'])

full_df['album'] = np.where(full_df['Song Name']=='dont cry', 'Season High', full_df['album'])
full_df['album'] = np.where(full_df['Song Name']=='a new', 'Machine Dreams', full_df['album'])


full_df['album'] = np.where(full_df['Song Name']=='american blood', 'Gossamer', full_df['album'])
full_df['album'] = np.where(full_df['Song Name']=='better things', 'NA', full_df['album'])
full_df['album'] = np.where(full_df['Song Name']=='dreams', 'Manners', full_df['album'])
full_df['album'] = np.where(full_df['Song Name']=='cuddle fuddle', 'NA', full_df['album'])
full_df['album'] = np.where(full_df['Song Name']=='smile upon me', 'NA', full_df['album'])

full_df['album'] = np.where(full_df['Song Name']=='mermaids', 'Dance Fever', full_df['album'])
full_df['album'] = np.where(full_df['Song Name']=='st jude', 'How Big, How Blue, How Beautiful', full_df['album'])
full_df['album'] = np.where(full_df['Song Name']=='flakes', 'Lungs', full_df['album'])
full_df['album'] = np.where(full_df['Song Name']=='if i had a heart', 'NA', full_df['album'])
full_df['album'] = np.where(full_df['Song Name']=='only love can break your heart', 'NA', full_df['album'])
full_df['album'] = np.where(full_df['Song Name']=='falling', 'Lungs', full_df['album'])
full_df['album'] = np.where(full_df['Song Name']=='are you hurting the one you love', 'Lungs', full_df['album'])
full_df['album'] = np.where(full_df['Song Name']=='bedroom hymns', 'Ceremonials', full_df['album'])


## Join lyrics back on to full_df

In [185]:
path = '/Users/jonzimmerman/Desktop/Data Projects/Setlists/data/lyrics/'
lyrics = pd.read_csv(path + 'lyrics_cleaned_final_2024-05-29.csv')
lyrics.shape

(1150, 12)

In [186]:
lyrics = lyrics[['artist','title','cleaned_lyrics']]

full_df_lyrics = fpd.fuzzy_merge(
    full_df, 
    lyrics,
    left_on=['Artist Name','Song Name'],
    right_on=['artist','title'],
    method='levenshtein',
    threshold=0.6,
    join = 'left-outer'
)

full_df_lyrics = full_df_lyrics.drop(columns=['artist', 'title'])
print('Before:',full_df.shape)
print('After:',full_df_lyrics.shape)

Before: (78, 10)
After: (170, 11)


### Remove duplicates that were created in process

In [187]:
dup_df_lyrics = full_df_lyrics
full_df_lyrics = full_df_lyrics.drop_duplicates(keep='first')

print('Before: ',dup_df_lyrics.shape[0])
print('After: ',full_df_lyrics.shape[0])

Before:  170
After:  85


In [188]:
full_df_lyrics.head()

Unnamed: 0,Song Name,Closeness Score,Happy Score,Angry Score,Sad Score,Surprise Score,Fear Score,Prioritized Emotion,Artist Name,album,cleaned_lyrics
0,elephant,0.0,0.13,0.07,0.4,0.2,0.2,Happy,Tame Impala,Lonerism,well he feels like an elephant shaking his big...
2,solitude is bliss,0.0,0.39,0.0,0.18,0.18,0.25,Happy,Tame Impala,Innerspeaker,cracks in the pavement underneath my shoe i ca...
4,mind mischief,0.0,0.27,0.06,0.06,0.33,0.27,Happy,Tame Impala,Lonerism,feels like my life is ready to blow me and my ...
6,cause im a man,0.0,0.25,0.04,0.33,0.08,0.29,Happy,Tame Impala,Currents,like the brutal morning sun it dawns on me wha...
8,it is not meant to be,0.0,0.25,0.0,0.29,0.17,0.29,Happy,Tame Impala,Innerspeaker,i wanted her i wanted her but she doesnt like ...


In [189]:
#### Create matrix of counts
contingency_table = pd.crosstab(full_df_lyrics['Artist Name'], full_df_lyrics['Prioritized Emotion'])
contingency_table

Prioritized Emotion,Angry,Fear,Happy,None,Sad,Surprise
Artist Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Tame Impala,15,14,14,14,14,14


In [190]:
full_df_lyrics.to_csv('full_df_lyrics_round3.csv',index=False)

## Inspect Setlists with extra songs

In [167]:
full_df_lyrics[(full_df_lyrics['Artist Name']=="Florence + the Machine") & (full_df_lyrics['Prioritized Emotion']=="Angry")]

Unnamed: 0,Song Name,Closeness Score,Happy Score,Angry Score,Sad Score,Surprise Score,Fear Score,Prioritized Emotion,Artist Name,album,cleaned_lyrics
283,ship to wreck,0.0,0.11,0.03,0.51,0.18,0.17,Angry,Florence + the Machine,"How Big, How Blue, How Beautiful",dont touch the sleeping pills they mess with m...
284,100 years,0.0,0.09,0.04,0.39,0.15,0.33,Angry,Florence + the Machine,High as Hope,i believe in you and in our hearts we know the...
285,breaking down,0.0,0.07,0.04,0.21,0.5,0.18,Angry,Florence + the Machine,Ceremonials,all alone it was always there you see and even...
286,leave my body,0.0,0.02,0.06,0.33,0.27,0.31,Angry,Florence + the Machine,Ceremonials,im gonna be released from behind these lines a...
287,lover to lover,0.0,0.2,0.07,0.23,0.3,0.2,Angry,Florence + the Machine,Ceremonials,ive been losing sleep ive been keeping myself ...
288,rabbit heart (raise it up),0.0,0.07,0.07,0.23,0.37,0.27,Angry,Florence + the Machine,Lungs,the looking glass so shiny and new how quickly...
289,never let me go,0.01,0.15,0.05,0.2,0.27,0.34,Angry,Florence + the Machine,Ceremonials,looking up from underneath fractured moonlight...
290,drumming song,0.0,0.0,0.07,0.2,0.28,0.44,Angry,Florence + the Machine,Lungs,theres a drumming noise inside my head that st...
291,what the water gave me,0.01,0.14,0.05,0.33,0.19,0.29,Angry,Florence + the Machine,Ceremonials,time it took us to where the water was that’s ...
292,what the water gave me,0.01,0.14,0.05,0.33,0.19,0.29,Angry,Florence + the Machine,Ceremonials,time it took us to where the water was thats w...


In [62]:
full_df_lyrics.to_csv('full_df_lyrics.csv')

## Which songs don't have lyrics?

In [55]:
full_df_lyrics[full_df_lyrics['cleaned_lyrics']=='']['Song Name'].value_counts()

Series([], Name: count, dtype: int64)

In [170]:
song = lyrics[lyrics['title'].str.contains('if i had a heart', case = False, na = False)].head(1)

song = song.rename(
    columns={'title': 'Song Name', 
             'artist': 'Artist Name'}
)

insert = song['cleaned_lyrics'].values[0]
insert

'if i had a voice i would voice i would say id promise you the stars up above you promise you my gold diamond ring and we sing oooh ooh ohh oh oh ohhhh ohh oh oh and we sing oooh ooh ohh oh oh ohhhh ohh oh it seems that i had been held in some dreaming state a tourist in the waking world never quite awake and no kiss no gentle word can wake me from this slumber until i realized that it was you who held me under felt it in my fist in my feet in the hollows of my eyelids shaking through my skull through my spine and down through my ribs no more dreaming the dead as if death itself was undone no more calling like a crow for a boy for a body in the garden no calling like a crow so in love so in love no more dreaming like a girl so in love so in love no more dreaming like a girl so in love with with the wrong world and i could here the thunder and see the lightening crack and all around the world was waking i never could go back cause all the walls of dreaming they were torn wide open and f

### Add lyrics back in

In [171]:
full_df_lyrics[full_df_lyrics['Song Name']=="if i had a heart"]

Unnamed: 0,Song Name,Closeness Score,Happy Score,Angry Score,Sad Score,Surprise Score,Fear Score,Prioritized Emotion,Artist Name,album,cleaned_lyrics
452,if i had a heart,0.19,0.26,0.09,0.16,0.19,0.31,,Florence + the Machine,,
453,if i had a heart,0.02,0.26,0.09,0.16,0.19,0.31,Sad,Florence + the Machine,,
454,if i had a heart,0.0,0.26,0.09,0.16,0.19,0.31,Surprise,Florence + the Machine,,


In [53]:
full_df_lyrics['cleaned_lyrics'] = np.where(full_df_lyrics['Song Name']=="5/4", insert, full_df_lyrics['cleaned_lyrics'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_df_lyrics['cleaned_lyrics'] = np.where(full_df_lyrics['Song Name']=="5/4", insert, full_df_lyrics['cleaned_lyrics'])
