# Lyrics Cleaning

## Setup

In [1]:
import numpy as np
import pandas as pd
import re
import json

## List of albums to process

In [2]:
# Album titles we have lyrics for. Any album in the JSON whose title is not
# listed here is dropped further down. Titles must match the JSON keys exactly
# (including case and punctuation).
checklist = [
    "Marry Me",
    "Actor",
    "Strange Mercy",
    "St. Vincent (Deluxe Edition)",
    "MASSEDUCTION",
    "Love This Giant",
    "Daddy's Home",
    "american dream",
    "This Is Happening",
    "Sound of Silver",
    "LCD Soundsystem",
    "45:33",
    "Lamp Lit Prose",
    "Dirty Projectors",
    "Swing Lo Magellan",
    "Mount Wittenberg Orca",
    "Bitte Orca",
    "Rise Above",
    "The Getty Address",
    "Slaves' Graves and Ballads",
    "The Glad Fact",
    "Morning Better Last!",
    "Moon 2",
    "Infinite House",
    "Electric Balloon",
    "Ice Level",
    "Pang",
    "Desire, I Want To Turn Into You",
    "Al Mundo Azul",
    "Salt",
    "Mr Twin Sister",
    "In Heaven",
    "CINEMA",
    "Night Pass",
    "Pray for Rain",
    "Moon Tides",
    "Offering",
    "Host",
    "Static",
    "Cults",
    "Sleep Well Beast",
    "Trouble Will Find Me",
    "High Violet",
    "Boxer",
    "Alligator",
    "Preacher’s Daughter",
    "Inbred",
    "Golden Age",
    "Love Is Dead",
    "Every Open Eye",
    "The Bones of What You Believe (Special Edition)",
    "Dragon New Warm Mountain I Believe In You",
    "Two Hands",
    "U.F.O.F.",
    "Capacity",
    "Masterpiece (2023 Remaster)",
    "The Shape of Brat Pop to Come",
    "The Theatrical Death of Julie Delicious",
    "Cool It Down",
    "Mosquito (Deluxe)",
    "It's Blitz!",
    "Show Your Bones",
    "Fever To Tell",
    "Loss Of Life",
    "Little Dark Age",
    "MGMT",
    "Congratulations",
    "Oracular Spectacular",
    "A Moon Shaped Pool",
    "The King Of Limbs",
    "In Rainbows",
    "Hail To the Thief",
    "Amnesiac",
    "Kid A",
    "OK Computer",
    "The Bends",
    "Pablo Honey",
    "Formentera II",
    "Formentera",
    "Art of Doubt",
    "Pagans in Vegas",
    "Synthetica",
    "Fantasies",
    "Grow Up And Blow Away",
    "Live It Out",
    "Old World Underground, Where Are You Now?",
    "Hug Of Thunder",
    "Forgiveness Rock Record",
    "Broken Social Scene",
    "Bee Hives",
    "You Forgot It In People",
    "Feel Good Lost",
]

## Read Data

In [3]:
# Load the scraped artist -> albums -> tracklist -> lyrics structure.
# The `with` block closes the file on exit, so no explicit close() is needed.
# The encoding is pinned to UTF-8 (the standard JSON encoding) so that
# non-ASCII titles decode the same way on every platform instead of
# depending on the locale default.
with open('data_with_lyrics.json', encoding='utf-8') as f:
    data = json.load(f)

## Remove albums we didn't get lyrics for

### Create list of (artist, album) pairs we want to remove

In [4]:
# Collect an (artist, album) pair for every album in the JSON whose title
# does not appear in the checklist.
remove_albums = [
    (artist_name, album_title)
    for artist_name, artist_entry in data.items()
    for album_title in artist_entry['Albums']
    if album_title not in checklist
]

In [5]:
# Display the (artist, album) pairs queued for removal
remove_albums

[('St. Vincent', 'The Nowhere Inn'),
 ('St. Vincent', 'Nina Kraviz Presents MASSEDUCTION Rewired'),
 ('St. Vincent', 'MassEducation'),
 ('LCD Soundsystem', 'Electric Lady Sessions'),
 ('LCD Soundsystem',
  'the long goodbye (lcd soundsystem live at madison square garden)'),
 ('LCD Soundsystem', 'This Is Happening (Deluxe Edition)'),
 ('LCD Soundsystem', 'London Sessions'),
 ('LCD Soundsystem', 'Introns'),
 ('Dirty Projectors', 'Mount Wittenberg Orca (Expanded Edition)'),
 ('Dirty Projectors', '5EPs'),
 ('Dirty Projectors', 'Sing The Melody'),
 ('Dirty Projectors', 'Bitte Orca (Expanded Edition)'),
 ('Ava Luna', 'Live at Market Hotel'),
 ('Ava Luna', 'Services / 3rd Avenue Island'),
 ('Caroline Polachek', 'Desire, I Want To Turn Into You: Everasking Edition'),
 ('Pure Bathing Culture', 'Chalice'),
 ('Pure Bathing Culture', 'Hats'),
 ('Cults', 'Static (10th Anniversary Edition)'),
 ('Cults', 'Host B Sides & Remixes'),
 ('The National', 'Laugh Track'),
 ('The National', 'First Two Pages o

### Remove these albums from data

In [6]:
# Drop every flagged album from its artist's 'Albums' mapping
for artist_name, album_title in remove_albums:
    del data[artist_name]['Albums'][album_title]

### Check that it worked

In [7]:
# Sanity check: print the title of any album still present in the data that
# is missing from the checklist (printing nothing means none are left).
for artist_entry in data.values():
    for album_title in artist_entry['Albums']:
        if album_title in checklist:
            continue
        print(album_title)

## Loop over albums in checklist and process lyrics

In [8]:
# (artist, album, track) triples to delete afterwards: tracks without lyrics
# and tracks whose cleaned text is too long to be a song.
remove_songs = []

# Upper bound (exclusive) on the number of whitespace-separated words a cleaned
# lyric may contain; used to ignore the odd play that shows up in the lyrics.
# The original cell defined an unused value of 300 here while the length check
# hard-coded 1000. The value that was actually applied is kept so results do
# not change.
length_threshold = 1000

# Placeholder stored in the 'lyrics' field when no lyrics are available.
missing_lyrics_marker = 'is_instrumental_or_did_not_find'

# Ordered (pattern, replacement) pairs. Order matters: each substitution runs
# on the output of the previous one.
lyric_substitutions = [
    # Digits followed by the shortest same-line run of text ending in "lyrics"
    # (presumably the scraped page header -- confirm)
    (r'(?i)\d+(.*?lyrics)', ''),
    # "Embed" at the very end of the text when not preceded by whitespace
    (r'(?<!\s)Embed$', ''),
    (r'(?i)you might also like', ''),
    # Ticket promo text: "See <...> LiveGet tickets as low as $<digits>"
    (r'(?i)See\s(.*?)\sLiveGet\stickets\sas\slow\sas\s\$\d+', ''),
    (r'(?i)\[instrumental (.*?)\]', ''),
    (r'(?i)\[interlude\]', ''),
    # Any token of word characters that ends in a digit.
    # NOTE(review): this deletes the whole token, letters included (e.g.
    # "word12" disappears entirely), not just the digits -- confirm intended.
    (r'(?i)\w+(\d+)\b', ''),
    # A punctuation mark immediately followed by digits (both are removed)
    (r'[.,;:"?!()-]\d+', ''),
    # Three whitespace-pair removals, kept exactly as in the original cell.
    # NOTE(review): as written they are identical; they may originally have
    # targeted distinct Unicode space characters -- confirm against the .ipynb.
    (r'  ', ''),
    (r'  ', ''),
    (r'  ', ''),
    # Cleans out any [Verse] etc.
    (r'\[(.*?)\]', ''),
    # Replaces each pair of consecutive whitespace characters with one newline
    (r'\s\s', '\n'),
]


def clean_lyrics(raw_lyrics):
    """Return `raw_lyrics` with every entry of `lyric_substitutions` applied in order.

    Parameters
    ----------
    raw_lyrics : str
        Lyrics text as stored in the 'lyrics' field.

    Returns
    -------
    str
        The text after all substitutions have been applied.
    """
    cleaned = raw_lyrics
    for pattern, replacement in lyric_substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned


# Loop over artists in json
for artist in data.keys():
    # Loop over albums for artist
    for data_album in data[artist]['Albums'].keys():
        # Only process albums we have lyrics for
        if data_album not in checklist:
            continue
        tracklist = data[artist]['Albums'][data_album]['tracklist']
        for track in tracklist.keys():
            # Progress line; printed for every track, including ones that end
            # up queued for removal
            print(f'Cleaned {artist, data_album, track}')
            song_lyrics = tracklist[track]['lyrics']
            # Keep info on songs we don't have lyrics for to remove later
            if song_lyrics == missing_lyrics_marker:
                remove_songs.append((artist, data_album, track))
                continue
            song_lyrics = clean_lyrics(song_lyrics)
            # Check length of song because there are some weird texts that make it in here
            if len(song_lyrics.split()) < length_threshold:
                tracklist[track]['lyrics'] = song_lyrics
            else:
                remove_songs.append((artist, data_album, track))

Cleaned ('St. Vincent', "Daddy's Home", 'Pay Your Way In Pain')
Cleaned ('St. Vincent', "Daddy's Home", 'Down And Out Downtown')
Cleaned ('St. Vincent', "Daddy's Home", "Daddy's Home")
Cleaned ('St. Vincent', "Daddy's Home", 'Live In The Dream')
Cleaned ('St. Vincent', "Daddy's Home", 'The Melting Of The Sun')
Cleaned ('St. Vincent', "Daddy's Home", 'Humming - Interlude 1')
Cleaned ('St. Vincent', "Daddy's Home", 'The Laughing Man')
Cleaned ('St. Vincent', "Daddy's Home", 'Down')
Cleaned ('St. Vincent', "Daddy's Home", 'Humming - Interlude 2')
Cleaned ('St. Vincent', "Daddy's Home", 'Somebody Like Me')
Cleaned ('St. Vincent', "Daddy's Home", 'My Baby Wants A Baby')
Cleaned ('St. Vincent', "Daddy's Home", '…At The Holiday Party')
Cleaned ('St. Vincent', "Daddy's Home", 'Candy Darling')
Cleaned ('St. Vincent', "Daddy's Home", 'Humming - Interlude 3')
Cleaned ('St. Vincent', 'MASSEDUCTION', 'Hang On Me')
Cleaned ('St. Vincent', 'MASSEDUCTION', 'Pills')
Cleaned ('St. Vincent', 'MASSEDUCTIO

In [9]:
# Spot-check the cleaned lyrics of a single track
data['St. Vincent']['Albums']['Daddy\'s Home']['tracklist']['Pay Your Way In Pain']['lyrics']

'\n(Ow)\nOh-oh-oh\n\nYou got to pay your way in pain\nYou got to pray your way in shame (Yeah, ow)\n\nI went to the store, I was feelin\' kinda hungry\nBut I didn\'t have the money and the shelves were all empty\nSo I went to the bank to ch-ch-ch-check my checking\nThe man looked at my face, said, "We don\'t have a record"\nOh no, you thought we had forgotten?\nThe show is only gettin\' started\nThe road is feelin\' like a pothole\nSit down, stand up, head down, hands up, and\n\nPay your way in pain\nYou got to pray your way in shame (Uh-huh)\n\nDo you know what I want? (What do you want? What do you want?)\nYou know what I want (What do you want? What do you want?)\nYou know what I want (What do you want? What do you want?)\nKeep the rest, baby, ah, ah\nI wanna be loved\nPay, pain\nPray, shame\nSo I went to the park just to watch the little children\nThe mothers saw my heels and they said I wasn\'t welcome\nSo I, I went back home, I was feelin\' kinda queasy\nBut all the locks were ch

In [10]:
# Display the (artist, album, track) triples queued for removal
remove_songs

[('St. Vincent', "Daddy's Home", 'Humming - Interlude 2'),
 ('St. Vincent', 'MASSEDUCTION', 'Dancing With A Ghost'),
 ('St. Vincent', 'Marry Me', 'We Put a Pearl in the Ground'),
 ('LCD Soundsystem', 'american dream', 'pulse (v.1)'),
 ('LCD Soundsystem',
  '45:33',
  'North American Scum - Onanistic Dub Mix by James Murphy and Eric Broucek'),
 ('LCD Soundsystem', '45:33', 'Hippie Priest Bum-Out'),
 ('Dirty Projectors', 'Mount Wittenberg Orca', 'Ocean'),
 ('Dirty Projectors', "Slaves' Graves and Ballads", 'Hazard Lights (Reprise)'),
 ('Dirty Projectors', 'Morning Better Last!', 'The Softer Shell'),
 ('Dirty Projectors', 'Morning Better Last!', 'Hildegard Vs. Beach Boys'),
 ('Dirty Projectors', 'Morning Better Last!', 'How Does My Mind Work?'),
 ('Dirty Projectors', 'Morning Better Last!', 'The Disordered Sprawl'),
 ('Ava Luna', 'Moon 2', 'Moon'),
 ('Ava Luna', 'Infinite House', 'Infinite House'),
 ('Ava Luna', 'Ice Level', 'Stages'),
 ('Ava Luna', 'Ice Level', 'A Year of Mirth'),
 ('Ava

### Remove songs we don't have lyrics for, to get a clean corpus

In [11]:
# Drop every flagged track from its album's tracklist
for artist_name, album_title, track_title in remove_songs:
    del data[artist_name]['Albums'][album_title]['tracklist'][track_title]

## Check lyrics for remaining songs

In [15]:
# Loop over artists in json
for artist in data.keys():
    # Loop over albums for artist
    for data_album in data[artist]['Albums'].keys():
        # Get tracklist for albums we have lyrics for
        for track in data[artist]['Albums'][data_album]['tracklist'].keys():
            print(artist, data_album, track)
            print(data[artist]['Albums'][data_album]['tracklist'][track]['lyrics'])

St. Vincent Daddy's Home Pay Your Way In Pain

(Ow)
Oh-oh-oh

You got to pay your way in pain
You got to pray your way in shame (Yeah, ow)

I went to the store, I was feelin' kinda hungry
But I didn't have the money and the shelves were all empty
So I went to the bank to ch-ch-ch-check my checking
The man looked at my face, said, "We don't have a record"
Oh no, you thought we had forgotten?
The show is only gettin' started
The road is feelin' like a pothole
Sit down, stand up, head down, hands up, and

Pay your way in pain
You got to pray your way in shame (Uh-huh)

Do you know what I want? (What do you want? What do you want?)
You know what I want (What do you want? What do you want?)
You know what I want (What do you want? What do you want?)
Keep the rest, baby, ah, ah
I wanna be loved
Pay, pain
Pray, shame
So I went to the park just to watch the little children
The mothers saw my heels and they said I wasn't welcome
So I, I went back home, I was feelin' kinda queasy
But all the lock

## Some odd texts still remain
Create a manual list of the affected songs so they can be dropped.

In [13]:
# Hand-picked (artist, album, track) triples to delete from the data
manual_song_drops = [
    ("Metric", "Pagans in Vegas", "The Face Part II"),
    ("Ethel Cain", "Inbred", "God's Country (feat. Wicca Phase Springs Eternal)"),
    ("Ethel Cain", "Inbred", "Michelle Pfeiffer (feat. lil aaron)"),
    ("MGMT", "Loss Of Life", "Dancing In Babylon (feat. Christine and the Queens)"),
]

In [14]:
# Drop each manually listed track from its album's tracklist
for artist_name, album_title, track_title in manual_song_drops:
    del data[artist_name]['Albums'][album_title]['tracklist'][track_title]

## Count number of songs left

In [16]:
# Total number of tracks left: add up the tracklist sizes across every album
# of every artist.
song_count = sum(
    len(album_entry['tracklist'])
    for artist_entry in data.values()
    for album_entry in artist_entry['Albums'].values()
)
print(f'Number of songs: {song_count}')

Number of songs: 985


## Save cleaned data

In [17]:
# Write the cleaned structure to disk as JSON (overwrites any existing file)
with open('cleaned_data.json', 'w') as cleaned_file:
    json.dump(data, fp=cleaned_file)