In this notebook I combine all the songs that I scraped from the two lyric websites into one CSV for easier access.

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import regex as re
from nltk.corpus import stopwords

## First combining "The Great American Songbook" lyrics

In [2]:
songs = pd.read_csv('datasets/arlen_carmichael_hammerstein_porter.csv')

more_songs = pd.read_csv('datasets/berlin_gershwin_kern_mercer_rodgers_waller.csv')

american_songbook = pd.concat([songs, more_songs], axis=0)

In [3]:
american_songbook.head()

Unnamed: 0,song_writer,song_name,lyrics,song_link
0,Harold Arlen,Ac-Cent-Tchu-Ate The Positive Lyrics,"\n Gather 'round me, everyb...",https://www.lyricsfreak.com/h/harold+arlen/ + ...
1,Harold Arlen,As Long As I Live Lyrics,\n Maybe I can't live to lo...,https://www.lyricsfreak.com/h/harold+arlen/ + ...
2,Harold Arlen,Between The Devil And The Deep Blue Sea Lyrics,\n I don't want you\nBut I ...,https://www.lyricsfreak.com/h/harold+arlen/ + ...
3,Harold Arlen,Blues In The Night Lyrics,"\n My mama done tol' me, wh...",https://www.lyricsfreak.com/h/harold+arlen/ + ...
4,Harold Arlen,Come Rain Or Come Shine Lyrics,"\n I'm gonna love you, like...",https://www.lyricsfreak.com/h/harold+arlen/ + ...


In [4]:
american_songbook.song_writer.value_counts()

Irving Berlin        95
Johnny Mercer        69
Fats Waller          52
George Gershwin      51
Cole Porter          47
Hoagy Carmichael     40
Harold Arlen         37
Oscar Hammerstein    34
Richard Rodgers      11
Jerome Kern           7
Name: song_writer, dtype: int64

## Labeling the lyrics with the appropriate classification ID before combining with Disney.

In [5]:
american_songbook['american_songbook'] = 1

In [6]:
american_songbook['disney_renaissance'] = 0

In [7]:
american_songbook.head()

Unnamed: 0,song_writer,song_name,lyrics,song_link,american_songbook,disney_renaissance
0,Harold Arlen,Ac-Cent-Tchu-Ate The Positive Lyrics,"\n Gather 'round me, everyb...",https://www.lyricsfreak.com/h/harold+arlen/ + ...,1,0
1,Harold Arlen,As Long As I Live Lyrics,\n Maybe I can't live to lo...,https://www.lyricsfreak.com/h/harold+arlen/ + ...,1,0
2,Harold Arlen,Between The Devil And The Deep Blue Sea Lyrics,\n I don't want you\nBut I ...,https://www.lyricsfreak.com/h/harold+arlen/ + ...,1,0
3,Harold Arlen,Blues In The Night Lyrics,"\n My mama done tol' me, wh...",https://www.lyricsfreak.com/h/harold+arlen/ + ...,1,0
4,Harold Arlen,Come Rain Or Come Shine Lyrics,"\n I'm gonna love you, like...",https://www.lyricsfreak.com/h/harold+arlen/ + ...,1,0


## Combining Disney Lyrics

In [8]:
disney_renaissance = pd.read_csv('datasets/disney_renaissance.csv')

Again labeling the the data with the classification ID before combining.

In [9]:
disney_renaissance['disney_renaissance'] = 1

In [10]:
disney_renaissance['american_songbook'] = 0

In [11]:
disney_renaissance.head()

Unnamed: 0,song_writer,song_name,lyrics,song_link,disney_renaissance,american_songbook
0,Tarzan,Son Of Man Lyrics,\n Oh. the power to be stro...,https://www.lyricsfreak.com/d/disneys+tarzan/ ...,1,0
1,Tarzan,Strangers Like Me Lyrics,"\n Whatever you do, I'll do...",https://www.lyricsfreak.com/d/disneys+tarzan/ ...,1,0
2,Tarzan,Two Worlds Lyrics,\n Put your faith in what y...,https://www.lyricsfreak.com/d/disneys+tarzan/ ...,1,0
3,Tarzan,Two Worlds Finale Lyrics,\n Put your faith in what y...,https://www.lyricsfreak.com/d/disneys+tarzan/ ...,1,0
4,Tarzan,Two Worlds Reprise Lyrics,\n Every moment now the bon...,https://www.lyricsfreak.com/d/disneys+tarzan/ ...,1,0


In [12]:
disney = pd.read_csv('datasets/disney_songs.csv')

more_disney = pd.read_csv('datasets/more_disney_songs.csv')

In [13]:
disney_songs = pd.concat([disney, more_disney], axis=0)

In [14]:
disney_songs.head()

Unnamed: 0,song_writer,song_name,lyrics,song_link
0,101 Dalmations,Cruella De Vil,Cruella De Vil Cruella De Vil If she doesn't s...,http://www.disneyclips.com/lyrics/lyrics23.html
1,101 Dalmations,Dalmatian Plantation,We'll have a dalmatian plantation Where our po...,http://www.disneyclips.com/lyrics/lyrics88.html
2,101 Dalmations,Kanine Krunchies,Kanine Krunchies can't be beat They make each ...,http://www.disneyclips.com/lyrics/lyricsdalmat...
3,Alice In Wonderland,Alice in Wonderland,Alice in Wonderland How do you get to Wonderla...,http://www.disneyclips.com/lyrics/alicelyrics3...
4,Alice In Wonderland,All in the Golden Afternoon,Little bread-and-butterflies kiss the tulips A...,http://www.disneyclips.com/lyrics/alicelyrics2...


In [15]:
disney_songs['disney_renaissance'] = 0

In [16]:
disney_songs['american_songbook'] = 0

In [17]:
disney_songbook = pd.concat([disney_songs, disney_renaissance], axis=0)

In [18]:
disney_songbook.head()

Unnamed: 0,song_writer,song_name,lyrics,song_link,disney_renaissance,american_songbook
0,101 Dalmations,Cruella De Vil,Cruella De Vil Cruella De Vil If she doesn't s...,http://www.disneyclips.com/lyrics/lyrics23.html,0,0
1,101 Dalmations,Dalmatian Plantation,We'll have a dalmatian plantation Where our po...,http://www.disneyclips.com/lyrics/lyrics88.html,0,0
2,101 Dalmations,Kanine Krunchies,Kanine Krunchies can't be beat They make each ...,http://www.disneyclips.com/lyrics/lyricsdalmat...,0,0
3,Alice In Wonderland,Alice in Wonderland,Alice in Wonderland How do you get to Wonderla...,http://www.disneyclips.com/lyrics/alicelyrics3...,0,0
4,Alice In Wonderland,All in the Golden Afternoon,Little bread-and-butterflies kiss the tulips A...,http://www.disneyclips.com/lyrics/alicelyrics2...,0,0


In [20]:
disney_songbook['song_writer'].value_counts().sort_index()

101 Dalmations                     3
A Goofy Movie                      6
Aladdin                           24
Alice In Wonderland               12
Bambi                              3
Beauty and the Beast              21
Bedknobs and Broomsticks           6
Brave                              4
Brother Bear                       6
Cinderella                         6
Coco                              10
Dumbo                              6
Enchanted                          5
Frozen                            10
Frozen 2                          14
Hercules                           5
High School Musical               10
Lady and the Tramp                 5
Lion King                         23
Little Mermaid                    34
Mary Poppins                      17
Moana                              9
Mulan                              3
Oliver and Company                 5
Peter Pan                          6
Pinocchio                          5
Pocahontas                        14
R

## Combining all the lyrics together!!

In [21]:
all_lyrics = pd.concat([american_songbook,
           disney_songbook])

In [22]:
# Checking to see if the songbooks are as balanced as possible
all_lyrics['american_songbook'].value_counts(normalize=True)

1    0.542892
0    0.457108
Name: american_songbook, dtype: float64

## Figuring out the code to remove unnessary characters

In [23]:
all_lyrics.iloc[0]

song_writer                                                Harold Arlen
song_name                          Ac-Cent-Tchu-Ate The Positive Lyrics
lyrics                \n                    Gather 'round me, everyb...
song_link             https://www.lyricsfreak.com/h/harold+arlen/ + ...
american_songbook                                                     1
disney_renaissance                                                    0
Name: 0, dtype: object

In [24]:
all_lyrics.iloc[0]['lyrics']

"\n                    Gather 'round me, everybody\nGather 'round me, while I preach some\nFeel a sermon coming on here\nThe topic will be sin\nAnd that's what I'm agin'\nIf you wanna hear my story\nThen settle back and just sit tight\nWhile I start reviewing\nThe attitude of doing right\n\nYou got to ac-cent-tchu-ate the positive\nE-lim-i-nate the negative\nAnd latch on to the affirmative\nDon't mess with mister inbetween\n\nYou got to spread joy up to the maximum\nBring gloom down to the minimum\nAnd have faith, or pandemonium\nLiable to walk upon the scene\n\nTo illustrate my last remark\nJonah in the whale, noah in the ark,\nWhat did they do, just when everything looked so dark?\nMan, they said, we better\n\nAc-cent-tchu-ate the positive\nE-lim-i-nate the negative\nAnd latch on to the affirmative\nDon't mess with mister inbetween                "

In [25]:
example1 = BeautifulSoup(all_lyrics.iloc[0]['lyrics'])

# Print the raw review and then the output of get_text(), for comparison
print(example1.get_text())

Gather 'round me, everybody
Gather 'round me, while I preach some
Feel a sermon coming on here
The topic will be sin
And that's what I'm agin'
If you wanna hear my story
Then settle back and just sit tight
While I start reviewing
The attitude of doing right

You got to ac-cent-tchu-ate the positive
E-lim-i-nate the negative
And latch on to the affirmative
Don't mess with mister inbetween

You got to spread joy up to the maximum
Bring gloom down to the minimum
And have faith, or pandemonium
Liable to walk upon the scene

To illustrate my last remark
Jonah in the whale, noah in the ark,
What did they do, just when everything looked so dark?
Man, they said, we better

Ac-cent-tchu-ate the positive
E-lim-i-nate the negative
And latch on to the affirmative
Don't mess with mister inbetween                


In [26]:
# Use regular expressions to do a find-and-replace
letters_only = re.sub("[^a-zA-Z'-]",           
                      " ",                  
                      example1.get_text().rstrip())

In [27]:
letters_only

"Gather 'round me  everybody Gather 'round me  while I preach some Feel a sermon coming on here The topic will be sin And that's what I'm agin' If you wanna hear my story Then settle back and just sit tight While I start reviewing The attitude of doing right  You got to ac-cent-tchu-ate the positive E-lim-i-nate the negative And latch on to the affirmative Don't mess with mister inbetween  You got to spread joy up to the maximum Bring gloom down to the minimum And have faith  or pandemonium Liable to walk upon the scene  To illustrate my last remark Jonah in the whale  noah in the ark  What did they do  just when everything looked so dark  Man  they said  we better  Ac-cent-tchu-ate the positive E-lim-i-nate the negative And latch on to the affirmative Don't mess with mister inbetween"

In [28]:
songs['lyrics'][0] = letters_only

In [29]:
songs['lyrics'][0]

"Gather 'round me  everybody Gather 'round me  while I preach some Feel a sermon coming on here The topic will be sin And that's what I'm agin' If you wanna hear my story Then settle back and just sit tight While I start reviewing The attitude of doing right  You got to ac-cent-tchu-ate the positive E-lim-i-nate the negative And latch on to the affirmative Don't mess with mister inbetween  You got to spread joy up to the maximum Bring gloom down to the minimum And have faith  or pandemonium Liable to walk upon the scene  To illustrate my last remark Jonah in the whale  noah in the ark  What did they do  just when everything looked so dark  Man  they said  we better  Ac-cent-tchu-ate the positive E-lim-i-nate the negative And latch on to the affirmative Don't mess with mister inbetween"

In [31]:
lower_case = letters_only.lower()

In [32]:
lower_case

"gather 'round me  everybody gather 'round me  while i preach some feel a sermon coming on here the topic will be sin and that's what i'm agin' if you wanna hear my story then settle back and just sit tight while i start reviewing the attitude of doing right  you got to ac-cent-tchu-ate the positive e-lim-i-nate the negative and latch on to the affirmative don't mess with mister inbetween  you got to spread joy up to the maximum bring gloom down to the minimum and have faith  or pandemonium liable to walk upon the scene  to illustrate my last remark jonah in the whale  noah in the ark  what did they do  just when everything looked so dark  man  they said  we better  ac-cent-tchu-ate the positive e-lim-i-nate the negative and latch on to the affirmative don't mess with mister inbetween"

In [33]:
words = lower_case.split()

In [34]:
words[:10]

['gather',
 "'round",
 'me',
 'everybody',
 'gather',
 "'round",
 'me',
 'while',
 'i',
 'preach']

In [35]:
all_lyrics = all_lyrics.sort_values('song_writer')

In [36]:
all_lyrics.to_csv('datasets/all_lyrics.csv', index=False)

In [37]:
all_lyrics = pd.read_csv('datasets/all_lyrics.csv')

In [38]:
all_lyrics

Unnamed: 0,song_writer,song_name,lyrics,song_link,american_songbook,disney_renaissance
0,101 Dalmations,Kanine Krunchies,Kanine Krunchies can't be beat They make each ...,http://www.disneyclips.com/lyrics/lyricsdalmat...,0,0
1,101 Dalmations,Cruella De Vil,Cruella De Vil Cruella De Vil If she doesn't s...,http://www.disneyclips.com/lyrics/lyrics23.html,0,0
2,101 Dalmations,Dalmatian Plantation,We'll have a dalmatian plantation Where our po...,http://www.disneyclips.com/lyrics/lyrics88.html,0,0
3,A Goofy Movie,Stand Out,"Open up your eyes, take a look at me Get the p...",http://www.disneyclips.com/lyrics/lyricsgoofy....,0,0
4,A Goofy Movie,On the Open Road,[Goofy:] Do ya need a break from modern livin'...,http://www.disneyclips.com/lyrics/lyricsgoofy3...,0,0
...,...,...,...,...,...,...
811,Winnie the Pooh,The Backson Song,"[Owl:] It's a giant creature with a tail Here,...",http://www.disneyclips.com/lyrics/lyricswinnie...,0,0
812,Winnie the Pooh,So Long,"It's not complicated, or very hard to grasp B...",http://www.disneyclips.com/lyrics/lyricswinnie...,0,0
813,Winnie the Pooh,It's Gonna Be Great,[Tigger:] Gonna fix you up; by the time we're ...,http://www.disneyclips.com/lyrics/lyricswinnie...,0,0
814,Winnie the Pooh,Everything Is Honey,"Everything is honey, everywhere I see Everythi...",http://www.disneyclips.com/lyrics/lyricswinnie...,0,0


## Cleaning up the lyrics

In [39]:
# This function has been taken from Matt Brems NLP I lecture. Very little has been changed.
# Only difference is a change in the function name
def lyrics_to_words(raw_text):
    
    text = BeautifulSoup(raw_text).get_text()
    
    letters_only = re.sub("[^a-zA-Z'-]", " ", text)
    
    words = letters_only.lower().split()
    
#     stops = set(stopwords.words('english'))
    
#     meaningful_words = [w for w in words if w not in stops]

    return(" ".join(words))

In [40]:
total_songs = all_lyrics.shape[0]

In [41]:
# This code is heavily borrowed from NLP I lesson by Matt Brems.
# There have been some slight modifications made to fit this project.
# Initialize an empty list to hold the cleaned songs
clean_songs = []

print("Cleaning and parsing the lyrics...")

# Instantiate counter.
j = 0

# Looping through the lyrics...
for songs in all_lyrics['lyrics']:
    
    # Convert lyrics to separate words, then append to clean_songs.
    clean_songs.append(lyrics_to_words(songs))
    
    # If the index is divisible by 50, print a message.
    if (j + 1) % 50 == 0:
        print(f'Lyric {j + 1} of {total_songs}.')
    
    j += 1

Cleaning and parsing the lyrics...
Lyric 50 of 816.
Lyric 100 of 816.
Lyric 150 of 816.
Lyric 200 of 816.
Lyric 250 of 816.
Lyric 300 of 816.
Lyric 350 of 816.
Lyric 400 of 816.
Lyric 450 of 816.
Lyric 500 of 816.
Lyric 550 of 816.
Lyric 600 of 816.
Lyric 650 of 816.
Lyric 700 of 816.
Lyric 750 of 816.
Lyric 800 of 816.


In [42]:
# Creating a dataframe for the clean titles
clean_songs_df = pd.DataFrame(clean_songs, columns=['clean_lyrics'])

In [43]:
clean_songs_df

Unnamed: 0,clean_lyrics
0,kanine krunchies can't be beat they make each ...
1,cruella de vil cruella de vil if she doesn't s...
2,we'll have a dalmatian plantation where our po...
3,open up your eyes take a look at me get the pi...
4,goofy do ya need a break from modern livin' do...
...,...
811,owl it's a giant creature with a tail here i'l...
812,it's not complicated or very hard to grasp but...
813,tigger gonna fix you up by the time we're thro...
814,everything is honey everywhere i see everythin...


In [44]:
all_lyrics.shape

(816, 6)

In [45]:
clean_songs_df.shape

(816, 1)

# Removing all characters and stopwords

I was interested to see how differently the model scores would be if I removed the stopwords. 

In [46]:
# This function has been taken from Matt Brems NLP I lecture. Very little has been changed.
# Only difference is a change in the function name
def lyrics_to_words_stopwords(raw_text):
    
    text = BeautifulSoup(raw_text).get_text()
    
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    
    words = letters_only.lower().split()
    
    stops = set(stopwords.words('english'))
    
    meaningful_words = [w for w in words if w not in stops]

    return(" ".join(words))

In [47]:
# This code is heavily borrowed from NLP I lesson by Matt Brems.
# There have been some slight modifications made to fit this project.
# Initialize an empty list to hold the cleaned songs
clean_songs_stopwords = []

print("Cleaning and parsing the lyrics...")

# Instantiate counter.
j = 0

# Looping through the lyrics...
for songs in all_lyrics['lyrics']:
    
    # Convert lyrics to separate words, then append to clean_songs.
    clean_songs_stopwords.append(lyrics_to_words_stopwords(songs))
    
    # If the index is divisible by 50, print a message.
    if (j + 1) % 50 == 0:
        print(f'Lyric {j + 1} of {total_songs}.')
    
    j += 1

Cleaning and parsing the lyrics...
Lyric 50 of 816.
Lyric 100 of 816.
Lyric 150 of 816.
Lyric 200 of 816.
Lyric 250 of 816.
Lyric 300 of 816.
Lyric 350 of 816.
Lyric 400 of 816.
Lyric 450 of 816.
Lyric 500 of 816.
Lyric 550 of 816.
Lyric 600 of 816.
Lyric 650 of 816.
Lyric 700 of 816.
Lyric 750 of 816.
Lyric 800 of 816.


In [48]:
# Creating a dataframe for the clean titles
clean_songs_stopwords_df = pd.DataFrame(clean_songs_stopwords, columns=['clean_lyrics_with_stopwords'])

In [50]:
clean_songs_stopwords_df

Unnamed: 0,clean_lyrics_with_stopwords
0,kanine krunchies can t be beat they make each ...
1,cruella de vil cruella de vil if she doesn t s...
2,we ll have a dalmatian plantation where our po...
3,open up your eyes take a look at me get the pi...
4,goofy do ya need a break from modern livin do ...
...,...
811,owl it s a giant creature with a tail here i l...
812,it s not complicated or very hard to grasp but...
813,tigger gonna fix you up by the time we re thro...
814,everything is honey everywhere i see everythin...


In [51]:
all_lyrics_clean = pd.concat([all_lyrics, clean_songs_df, clean_songs_stopwords_df], axis=1, join='inner')

In [52]:
all_lyrics_clean.to_csv('datasets/all_lyrics_add_clean_lyrics_col', index=False)