# Meaning-Based Music Recommendation

#### Summary:
- Use natural language processing for music recommendation
- Identify and suggest music whose meaning is similar to that of the songs entered by the user

#### Key Questions:
- How do we obtain song 'meanings'?
- How do we measure the similarities between songs based on texts related to song 'meanings'?
- How do we produce a list of recommendations for a given user?

---
## Section 1: Gather song lyrics and Songmeanings.com comments/interpretations

Songmeanings.com contains <b>1,097,225 song lyrics</b> from <b>110,886 artists</b>, and <b>1,737,493 comments/interpretations</b>. To collect the lyrics and comments, we first need the URL of the lyrics/comments page for each of the ~1 million songs. We start from a small premade index file (sm_start_info.xlsx), which lists the number of pages under each letter of the artist directory, and use it to build the list of URLs of the A-Z artist directory pages. From there, we use BeautifulSoup to scrape the <b>URLs of all 110,886 artist pages</b>, which we then scrape in turn to obtain the <b>URLs of all 1,097,225 song pages</b>.

In [1]:
%matplotlib inline

# Imports
import numpy as np
import pandas as pd
import requests
import bs4
from bs4 import BeautifulSoup
from IPython.display import clear_output
from time import sleep

In [2]:
# Load the premade Songmeanings.com directory index from sm_start_info.xlsx.
# As the preview below shows, it has one row per directory letter with the
# columns `letter` and `pages`; the next cell uses `pages` as the last page
# number when building the directory URLs.
start_info = pd.read_excel('sm_start_info.xlsx')
# Preview the first rows
start_info.head()

Unnamed: 0,letter,pages
0,a,155
1,b,170
2,c,153
3,d,161
4,e,76


In [3]:
# Expand every (letter, page count) row of start_info into the URLs of that
# letter's artist-directory pages, numbered from 1 up to the page count.
directory_root = 'https://songmeanings.com/artist/directory/'
artist_dir_urls = []
for letter, max_page in start_info.values:
    artist_dir_urls.extend(
        directory_root + letter + '/?page=' + str(page)
        for page in np.arange(1, max_page + 1)
    )

![](artist_dir_page.png)

In [4]:
# Build list of URLs to artist pages.
# Result: artist_urls_df with column 0 = link title and column 1 = link href,
# either scraped from every directory page in artist_dir_urls or loaded from
# the previously saved artist_urls.csv.
SCRAPE_ARTIST_URLS = False # Set to False if scraping is already completed

if SCRAPE_ARTIST_URLS:
    artist_urls = []
    pages_without_table = []  # directory pages where no artist table was found
    for artists_page in artist_dir_urls:
        page = requests.get(artists_page)
        soup = BeautifulSoup(page.text, 'lxml')
        table = soup.find('table', summary='songs table')
        # find() returns None when nothing matches; skip such a page instead
        # of crashing the whole run with an AttributeError.
        table_body = table.find('tbody') if table is not None else None
        if table_body is None:
            pages_without_table.append(artists_page)
        else:
            for item in table_body.find_all('a', href=True):
                artist_urls.append([item['title'], item['href']])
        print('Scraping artist URLs: ' + artists_page)
        clear_output(wait=True)
        sleep(1)  # pause between requests
    # Report skipped pages so missing artists do not go unnoticed
    if pages_without_table:
        print('No artist table found on ' + str(len(pages_without_table))
              + ' page(s): ' + ', '.join(pages_without_table))
    # Save artist URL list
    artist_urls_df = pd.DataFrame(artist_urls)
    artist_urls_df.to_csv('artist_urls.csv', index=False, header=False)
else:
    artist_urls_df = pd.read_csv('artist_urls.csv', header=None)

In [5]:
# Preview the first rows: column 0 holds the link title scraped from the
# directory page (the artist name in the output below), column 1 the
# protocol-relative URL of that artist's song list.
artist_urls_df.head()

Unnamed: 0,0,1
0,A,//songmeanings.com/artist/view/songs/6024/
1,A B,//songmeanings.com/artist/view/songs/137439043...
2,A B & The Sea,//songmeanings.com/artist/view/songs/137438996...
3,A Backward Glance On A Travel Road,//songmeanings.com/artist/view/songs/137439060...
4,A Balladeer,//songmeanings.com/artist/view/songs/137438972...


![](artist_page.png)

In [6]:
# Build list of URLs to song pages.
# Result: song_urls_df with columns 0 = artist, 1 = song name, 2 = song URL,
# 3 = number of comments, loaded from song_urls.csv (which is (re)written
# first when SCRAPE_SONG_URLS is True).
SCRAPE_SONG_URLS = False # Set to False if scraping is already completed

if SCRAPE_SONG_URLS:
    CHECKPOINT_EVERY = 1000  # number of artists between intermediate saves
    song_urls = []  # rows of [artist, song name, song URL, number of comments]

    for count, artist in enumerate(artist_urls_df.values):
        page = requests.get('http:'+artist[1])
        soup = BeautifulSoup(page.text, 'lxml')
        table = soup.find('table', summary='songs table')
        # find() returns None when nothing matches; artists without a songs
        # table (or without a table body) simply contribute no rows.
        table_body = table.find('tbody') if table is not None else None
        if table_body is not None:
            for item in table_body.find_all('tr'):
                links = item.find_all('a')
                if len(links) < 2:
                    # A row needs both the song link and the comment-count link
                    continue
                # str() detaches the text from the parse tree; a raw
                # NavigableString keeps the whole tree alive in memory.
                song_name = str(links[0].contents[0])
                song_url = links[0]['href']
                num_comments = str(links[1].contents[0])
                song_urls.append([artist[0], song_name, song_url, num_comments])
        # f-string so a non-string artist name (e.g. a value parsed as NaN
        # when artist_urls.csv was read) cannot break the progress message
        print(f'Scraping song URLs for artist: {artist[0]}')
        clear_output(wait=True)
        sleep(1)  # pause between requests

        # Save at checkpoint
        if count % CHECKPOINT_EVERY == 0:
            pd.DataFrame(song_urls).to_csv('song_urls.csv', index=False, header=False)

    pd.DataFrame(song_urls).to_csv('song_urls.csv', index=False, header=False)

# Always load from the CSV so both paths yield the same frame: the scraped
# comment counts are text, and reading them back lets pandas parse them as
# numbers, which the later `>= 10` filter relies on.
song_urls_df = pd.read_csv('song_urls.csv', header=None)

In [7]:
# Preview the first rows: column 0 = artist, 1 = song name,
# 2 = protocol-relative song URL, 3 = number of comments.
song_urls_df.head()

Unnamed: 0,0,1,2,3
0,A,"""meditacao"" - (eng: Meditation)",//songmeanings.com/songs/view/91590/,0
1,A,2nd Coming,//songmeanings.com/songs/view/3530822107859283...,0
2,A,40,//songmeanings.com/songs/view/3530822107858486...,0
3,A,6 O'clock On A Tube Stop,//songmeanings.com/songs/view/93649/,1
4,A,8 Fingers,//songmeanings.com/songs/view/3530822107858486...,2


![](song_page.png)

In [8]:
# Functions to extract strings (exclude tagged items) from soup items
def extract_str(soup_item):
    """Join the plain-text pieces of a soup item with single spaces.

    Iterates over ``soup_item`` and keeps only the elements that are
    ``bs4.element.NavigableString`` instances; everything else is left out.
    Returns ``None`` when ``soup_item`` is ``None``.
    """
    if soup_item is None:
        return None
    text_pieces = [
        child for child in soup_item
        if isinstance(child, bs4.element.NavigableString)
    ]
    return ' '.join(text_pieces)

def strip_header_literals(string):
    """Remove the first newline and every tab character from ``string``.

    Later newlines are kept. Returns ``None`` when ``string`` is ``None``.
    """
    if string is None:
        return None
    without_first_newline = string.replace('\n', '', 1)
    return without_first_newline.replace('\t', '')

In [9]:
# Webscrape song lyrics and user comments/interpretations.
# Result: song_lyrics_df (song ID, artist, song name, lyrics) and
# song_comments_df (song ID, artist, song name, comment), either scraped from
# the song pages or loaded from the previously saved CSV files.
SCRAPE_TEXT = False # Set to False if scraping is already completed


def _save_scraped_text(lyrics_rows, comment_rows):
    """Write both row lists to their CSV files and return them as DataFrames."""
    lyrics_df = pd.DataFrame(lyrics_rows)
    lyrics_df.to_csv('song_lyrics.csv', index=False, header=False)

    comments_df = pd.DataFrame(comment_rows)
    comments_df.to_csv('song_comments.csv', index=False, header=False)
    return lyrics_df, comments_df


if SCRAPE_TEXT:
    MIN_COMMENTS = 10        # only scrape songs with at least this many comments
    CHECKPOINT_EVERY = 1000  # number of scraped songs between intermediate saves

    song_lyrics = []
    song_comments = []
    song_ID_counter = 1

    # Go through the songs that have MIN_COMMENTS or more comments
    commented_songs = song_urls_df[song_urls_df[3] >= MIN_COMMENTS]
    for song in commented_songs.values:

        # scrape song page
        page = requests.get('http:'+song[2])
        soup = BeautifulSoup(page.text, 'lxml')

        # lyrics
        lyrics_box = soup.find('div', {'class': "holder lyric-box"})
        lyrics = strip_header_literals(extract_str(lyrics_box))
        # save song lyrics (song ID, artist, song name, lyrics)
        song_lyrics.append([song_ID_counter, song[0], song[1], lyrics])

        # comments
        raw_comments = soup.find('ul', {'class': "comments-list"})
        if raw_comments is not None:
            # Keep only the even-indexed <li> items; per the original note,
            # the odd-indexed ones are replies to comments.
            for raw_comment in raw_comments.find_all('li')[::2]:
                comment = strip_header_literals(
                    extract_str(raw_comment.find('div', {'class': 'text'})))
                # save song comment (song ID, artist, song name, comment)
                song_comments.append([song_ID_counter, song[0], song[1], comment])

        # increase song ID counter
        song_ID_counter += 1

        # Save at checkpoint. A modulo test avoids both scanning an array for
        # every song and a checkpoint range whose length was computed with a
        # different threshold (> 10) than the loop's filter (>= 10).
        if song_ID_counter % CHECKPOINT_EVERY == 0:
            song_lyrics_df, song_comments_df = _save_scraped_text(song_lyrics, song_comments)

        # update progress; f-string so names that were read back from CSV as
        # non-strings (e.g. a numeric-looking title) cannot break the message
        print(f'Scraping ARTIST: {song[0]}, SONG: {song[1]}')
        clear_output(wait=True)
        sleep(1)  # pause between requests

    song_lyrics_df, song_comments_df = _save_scraped_text(song_lyrics, song_comments)
else:
    song_lyrics_df = pd.read_csv('song_lyrics.csv', header=None)
    song_comments_df = pd.read_csv('song_comments.csv', header=None)