In [13]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import lyricsgenius
import numpy as np
import os
from textblob import TextBlob
import indicoio
import nltk

In [2]:
API_KEY = "0AGGE0X9UYCDMkHxZWHojX0uBIaoHNZCQbfJO8hFx0g7nj9OJYEPJl2NzdBDdgtJ"

In [3]:
SAD_PLAYLIST = "https://open.spotify.com/user/31obw73wcndofulfje4bekzfyccy/playlist/635J1gAVD4vsiqwsyhcfZX?si=FZVQod51TrmQt6CbSavs6A"
HAPPY_PLAYLIST = "https://open.spotify.com/user/31obw73wcndofulfje4bekzfyccy/playlist/6OfKVgmdjZbPfjT8IAJDeY?si=AB1B4YZYQBe-HTUlbGG8Kg"
CLASSICAL_PLAYLIST = "https://open.spotify.com/user/31obw73wcndofulfje4bekzfyccy/playlist/4Qy40eL0wxsmkU7SSgg1QX?si=VlOSwHi3RbKyirDSj4tUvg"

# Data Scraping
pull song titles from spotify, then query them on the genius api

In [14]:
def scrape_titles(query):
    # Navigating to the playlist page
    page = requests.get(query)
    soup = BeautifulSoup(page.content, 'html.parser')

    # Based on the HTML classes on Spotify for Safari and Chrome
    song_titles = pd.Series(soup.find_all(class_ = "track-name"))
    song_titles = song_titles.apply(str)
    html_cleaner = lambda title: re.sub("<.*?>", "", title)
    song_titles = song_titles.apply(html_cleaner)
    return song_titles

def scrape_artists(query):
    page = requests.get(query)
    soup = BeautifulSoup(page.content, 'html.parser')
    artists = pd.Series(soup.find_all(class_ = "artists-albums"))
    artists = artists.apply(lambda title: re.sub("<.*?>", "", str(title)))
    return artists

def clean_artists(artist_name):
    x = artist_name.split("•")[0]
    return re.sub('\s+', ' ', x).strip()

def query_song(title, artist):
    genius = lyricsgenius.Genius(API_KEY)
    song = genius.search_song(title, artist) 
    if song:
        return song.lyrics
    elif artist == "":
        return "no lyrics found"
    else:
        return query_song(title, "")

def playlist_df(url):
    titles = scrape_titles(url)
    artists = scrape_artists(url).apply(clean_artists)
    df = pd.DataFrame({"song": titles, "artist": artists})
    df["lyrics"] = df.apply(lambda x: query_song(x.song, x.artist), axis=1)
    return df

In [5]:
# sad = playlist_df(SAD_PLAYLIST)
# happy = playlist_df(HAPPY_PLAYLIST)

In [6]:
# sad.to_csv("sad.csv")
# happy.to_csv("happy.csv")

In [5]:
# Run this cell to refresh lyrics 
sad = pd.read_csv("sad.csv").iloc[:, 1:4]
happy = pd.read_csv("happy.csv").iloc[:, 1:4]

In [6]:
sad.head()

Unnamed: 0,song,artist,lyrics
0,Night Changes,One Direction,"[Verse 1: Zayn & Liam]\nGoing out tonight, ch..."
1,This Town,Niall Horan,[Verse 1]\nWaking up to kiss you and nobody’s ...
2,Silence,"Marshmello, Khalid","[Verse 1]\nYeah, I'd rather be a lover than a ..."
3,Shouldn't Come Back,Demi Lovato,[Verse 1]\nSee you calling again\nI don't wann...
4,Stargazing,"Kygo, Justin Jesso",[Verse 1: Justin Jesso]\nYou're saying it's ho...


In [7]:
happy.head()

Unnamed: 0,song,artist,lyrics
0,Body Like A Back Road,Sam Hunt,"[Verse 1]\nGot a girl from the south side, got..."
1,Sugar,Maroon 5,[Directed by David Dobkin]\n\n[Verse 1]\nI'm h...
2,All Star,Smash Mouth,[Verse 1]\nSomebody once told me the world is ...
3,Live While We're Young,One Direction,"[Intro]\n\n[Verse 1: Liam & Zayn]\nHey girl, I..."
4,CAN'T STOP THE FEELING! (Original Song from Dr...,Justin Timberlake,no lyrics found


# Data Cleaning
remove punctuation, and lyrical words (not recognized under stopwords)  
detect language, convert to english if necessary  
remove stopwords once all lyrics are in english  

In [21]:
def english(lyric):
    blob = TextBlob(lyric)
    language = blob.detect_language()
    return language == 'en'

def translator(lyric):
    blob = TextBlob(lyic)
    if not english(lyric):
        result = blob.translate(to='en')
        return str(result)
    else:
        return lyric

In [77]:
def standardize(lyrics):
    punc_cleaner = lambda lyric: re.sub(r"[^\w\s']+", '', lyric)
    lyrics = punc_cleaner(lyrics)
    lyrics = str.lower(lyrics)
    return lyrics

def remove_musical_words(lyrics):
    r = "(chorus|hook|intro|verse|bridge|outro|part|ft|feat)"
    lyrics = re.sub(r, "", lyrics)
    return lyrics

In [23]:
def clean(lyric):
    lyric = standardize(lyric)
    lyric = remove_musical_words(lyric)
    return lyric

In [24]:
def drop_records(df):
    df = df.loc[df["lyrics"] != "no lyrics found", :]
    english_indexes = df["lyrics"].apply(english)
    df = df.loc[english_indexes]
    #df = df.loc[df["cleaned"].apply(english), :]
    return df

In [25]:
print("Shape before is {0}".format(happy.shape))
happy["cleaned"] = happy["lyrics"].apply(clean)
happy = drop_records(happy)
print("Shape after is {0}".format(happy.shape))
happy.tail()

Shape before is (30, 4)
Shape after is (24, 4)


Unnamed: 0,song,artist,lyrics,cleaned
22,DJ Got Us Fallin' In Love (feat. Pitbull),"Usher, Pitbull","[Intro: Usher]\nUsher, Usher, Usher\nYeah, man...",usher\nusher usher usher\nyeah man\n\n 1 ushe...
23,Starships,Nicki Minaj,"[Verse 1]\n(RedOne..., Uh)\nLet's go to the be...",1\nredone uh\nlets go to the beach each\nlets...
24,Super Bass,Nicki Minaj,[Verse 1: Nicki Minaj]\nThis one is for the bo...,1 nicki minaj\nthis one is for the boys with ...
25,I Gotta Feeling,The Black Eyed Peas,[Hook]\nI got a feeling that tonight's gonna b...,\ni got a feeling that tonights gonna be a goo...
26,We Are Never Ever Getting Back Together,Taylor Swift,[Verse 1]\nI remember when we broke up the fir...,1\ni remember when we broke up the first time...


In [27]:
print("Shape before is {0}".format(sad.shape))
sad["cleaned"] = sad["lyrics"].apply(clean)
sad = drop_records(sad)
print("Shape after is {0}".format(sad.shape))
sad.tail()

Shape before is (30, 4)
Shape after is (29, 4)


Unnamed: 0,song,artist,lyrics,cleaned
25,Harder To Breathe,Maroon 5,[Verse 1]\nHow dare you say that my behavior's...,1\nhow dare you say that my behaviors unaccep...
26,I Don’t Know Why,Imagine Dragons,[Verse 1]\nWe could be strangers in the night\...,1\nwe could be strangers in the night\nwe cou...
27,Dance with Me Tonight,Olly Murs,"[Intro]\nLadies and Gentlemen, we’ve got a spe...",\nladies and gentlemen weve got a special trea...
28,Sunday Morning,Maroon 5,"[Intro]\nYeah\n\n[Verse 1]\nSunday morning, ra...",\nyeah\n\n 1\nsunday morning rain is falling\n...
29,She Will Be Loved - Radio Mix,Maroon 5,IT MAY NOT BE UNNECESSARY to inform the reader...,it may not be unnecessary to inform the reader...


In [28]:
def load_data():
    happy = pd.read_csv("happy.csv").iloc[:, 1:4]
    #happy["cleaned"] = happy["lyrics"].apply(clean)
    happy = drop_records(happy)
    sad = pd.read_csv("sad.csv").iloc[:, 1:4]
    #sad["cleaned"] = sad["lyrics"].apply(clean)
    sad = drop_records(sad)
    return happy, sad

In [29]:
happy, sad = load_data()

# Making Meaning of the Words
extracting noun phrases  
sentiment extracting  
topic modeling

In [30]:
INDICO_KEY = "345e9dbbeafeed418903dac43945c766"

In [31]:
def add_features(df):
    indicoio.config.api_key = INDICO_KEY
    return df

Topic Modeling

In [32]:
documents = happy["lyrics"]

In [33]:
from sklearn.feature_extraction import text 
addendum = ["yeah", "baby", "tonight", "baby", "gonna", "bout", "'bout", "like",
           "chorus", "hook", "verse", "intro", "bridge", "outro", "pt", "part",
           "wanna", "love", "oh", "ooh"]
stoppy = text.ENGLISH_STOP_WORDS.union(addendum)

In [45]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(max_df=0.6, min_df=2, 
                             stop_words=stoppy, 
                             ngram_range = (1, 4),
                             token_pattern = "[a-zA-Z]{3,15}"
                             )  
doc_term_matrix = count_vect.fit_transform(documents.values.astype('U'))
doc_term_matrix

<24x1744 sparse matrix of type '<class 'numpy.int64'>'
	with 4195 stored elements in Compressed Sparse Row format>

In [46]:
from sklearn.decomposition import LatentDirichletAllocation

LDA = LatentDirichletAllocation(n_components=5, random_state=42)  
model = LDA.fit(doc_term_matrix)  

In [47]:
import random

for i in range(10):  
    random_id = random.randint(0,len(count_vect.get_feature_names()))
    print(count_vect.get_feature_names()[random_id])

soon
turning
circumstances
closed
plastic
swear
cylinder
radio
physical
voice


In [48]:
LDA.components_[0]  

array([1.19999975, 4.20000041, 1.19999617, ..., 0.20000007, 0.20000011,
       0.20000016])

In [49]:
for i,topic in enumerate(LDA.components_):  
    print(f'Top 10 words for topic #{i}:')
    print([count_vect.get_feature_names()[i] for i in topic.argsort()[-10:]])
    print('\n')

Top 10 words for topic #0:
['long', 'wayne', 'turns', 'roof', 'hand', 'cops', 'park', 'face', 'night', 'car']


Top 10 words for topic #1:
['beat', 'time', 'hear', 'alright', 'song', 'body', 'young', 'crazy', 'live', 'let']


Top 10 words for topic #2:
['time', 'west', 'right', 'east', 'heart', 'high', 'set', 'way', 'let', 'life']


Top 10 words for topic #3:
['destiny', 'beat', 'took', 'body', 'beauty', 'eyes', 'aye', 'shut', 'dance', 'said']


Top 10 words for topic #4:
['rock', 'think', 'come', 'bass', 'cause', 'good', 'got', 'let', 'night', 'boom']




In [50]:
g = enumerate(LDA.components_)
for i, topic in g:
    print([count_vect.get_feature_names()[i] for i in topic.argsort()[-20:]])

['time', 'hall', 'black', 'right', 'city', 'red', 'got', 'door', 'open', 'look', 'long', 'wayne', 'turns', 'roof', 'hand', 'cops', 'park', 'face', 'night', 'car']
['whoa', 'mon', 'stop let', 'let live', 'party', 'stop', 'waitin', 'hell', 'weird', 'want', 'beat', 'time', 'hear', 'alright', 'song', 'body', 'young', 'crazy', 'live', 'let']
['worth', 'stay', 'south', 'air', 'big', 'gone', 'real', 'make', 'check', 'mind', 'time', 'west', 'right', 'east', 'heart', 'high', 'set', 'way', 'let', 'life']
['bound', 'slow just', 'slow', 'ain', 'got', 'said don', 'woman', 'dare', 'look', 'holding', 'destiny', 'beat', 'took', 'body', 'beauty', 'eyes', 'aye', 'shut', 'dance', 'said']
['let let', 'make', 'danced', 'stop', 'yes', 'need', 'friday', 'life', 'say', 'little', 'rock', 'think', 'come', 'bass', 'cause', 'good', 'got', 'let', 'night', 'boom']


In [56]:
# method to build a topic corpus
def give_topic_models(documents):
    from sklearn.feature_extraction import text 
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.decomposition import LatentDirichletAllocation
    from nltk.corpus import stopwords
    addendum = ["yeah", "baby", "tonight", "baby", "gonna", "bout", "'bout", "like",
               "chorus", "hook", "verse", "intro", "bridge", "outro", "pt", "part",
               "wanna", "love", "oh", "ooh"]
    
    stoppy = stopwords.words('english') + addendum
    count_vect = CountVectorizer(max_df=0.6, min_df=2, stop_words=stoppy, ngram_range = (1, 4), token_pattern = "[a-zA-Z]{3,15}")  
    doc_term_matrix = count_vect.fit_transform(documents.values.astype('U'))
    LDA = LatentDirichletAllocation(n_components=5, random_state=42)  
    model = LDA.fit(doc_term_matrix)  
    it, result = enumerate(LDA.components_), []
    for i, topic in it:
        words = [count_vect.get_feature_names()[i] for i in topic.argsort()[-20:]]
        result.append(words)
    return result

clearly, an issue is that the models are including arbitrary words, so it might make sense to only keep nouns

looks like there's a lot of garbage filler words to get rid of

In [54]:
give_topic_models(happy["lyrics"])[0]

['time',
 'hall',
 'black',
 'right',
 'city',
 'red',
 'got',
 'door',
 'open',
 'look',
 'long',
 'wayne',
 'turns',
 'roof',
 'hand',
 'cops',
 'park',
 'face',
 'night',
 'car']

In [55]:
give_topic_models(sad["lyrics"])[0]

['want',
 'right',
 'away',
 'august',
 'july',
 'really',
 'june',
 'time',
 'come',
 'feel',
 'april',
 'young',
 'make',
 'let',
 'bad',
 'girl',
 'night',
 'won',
 'way',
 'just']

In [84]:
def extract_nouns(original):
    original = standardize(original).split(" ")
    original = [word for word in original if word not in stoppy]
    original = [word for word in original if len(word) > 3]
    original = " ".join(original)
    blob = TextBlob(original)
    result = blob.noun_phrases
    result = [str(g) for g in result]
    #result = " ".join(result)
    return result

In [85]:
happy["lyrics"].apply(extract_nouns)

0     [braids hair, cadillac seats chorus body road ...
1     [david dobkin verse, i 'm, i need, knees prech...
2     [world roll, i ai n't sharpest tool, dumb fing...
3     [intro verse liam zayn hey girl, celebration c...
5                                  [na na, good day na]
6     [screenplay hamm, hard copy script, scene numb...
9     [intro high high, high high hopes, n't dime vi...
10    ['d flight, 'd time zone, rainbows idea, crazy...
11    [zayn harry oh, night baby i 'll, baby i 'll y...
12                            [lyrics song, check song]
13    [days torture, 's california dime, 's time, n'...
14    [impossible hierarchy helper nepal corns ma us...
15    [straight heart, doors past guards, prechorus ...
16    [intro chorus oh, n't dare look, shut dance, w...
17    [adam levine travie mccoy, heart 's stereo, st...
18    [nicki minaj yeah, young money nicki minaj jus...
19    ['s stranger bed, head glitter room, pink flam...
20    [verbs ai n't talkin, dine dine, lean kiss

In [86]:
sad["lyrics"].apply(extract_nouns)

0     [zayn liam, n't kind dress, wo n't, chorus har...
1     [kiss nobodys, funny things, stars prechorus, ...
2     [yeah lover fighter 'cause life i 've, feeling...
3     [oh i, ohoh prechorus, sorry sorry times, n't ...
4     [justin jesso, 's hopeless hope, 've meteoric,...
5     [minute stone, cold sober i, ya i, rest chorus...
6     [goodbye plane, heart chorus, goodbye times, c...
7     [n't strength, chorus wake, middle night, i wa...
8     [ta 'cause, 'cause edge nervous breakdown 'cau...
9     [oh yeah verse, wine loosen, 's fly, nasty loo...
10    [i 'll, fast prechorus, sleep 'cause, wake i s...
11    [insides feeling hollow, hard pill swallow yea...
12    [intro oh yeah oh yeah verse, wo n't bend, i '...
13    [oh oh words mend things, night sleep, night i...
14    [black town i, 's stupid, chorus i 'm corner, ...
15    [516pm mst octoberoctober 6jesse powell jesse ...
16    [oh eyes eyes, 're shinin', hair hair, day yea...
17    [eyes relax, heavy thoughts, time verse, g

In [92]:
import collections
import seaborn as sns
import matplotlib.pyplot as plt

a = happy["lyrics"].apply(standardize).apply(extract_nouns)

a = "".join(happy["lyrics"].tolist())

wordcount = {}
# split this
for word in a.split():
    word = standardize(word)
    if word not in stoppy:
        if word not in wordcount:
            wordcount[word] = 1
        else:
            wordcount[word] += 1

word_counter = collections.Counter(wordcount)
for word, count in word_counter.most_common(25):
    print(word, ": ", count)

 :  1319
life :  376
bruce :  317
it's :  201
vicki :  186
moment :  177
just :  174
int :  166
night :  165
selina :  148
alfred :  146
penguin :  139
know :  137
batman :  137
way :  130
got :  119
don't :  118
i'm :  111
let :  105
tiptree :  102
time :  95
right :  95
car :  95
set :  89
good :  84
