In [1]:
import numpy as np
import pandas as pd
import re, nltk, spacy, gensim
import requests
from bs4 import BeautifulSoup

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
url = 'https://delwatergap.net/Lyrics-1'
page = requests.get(url).content

In [3]:
bs = BeautifulSoup(page)

In [4]:
a = bs.find('div',attrs={'class':'page_content clearfix'}).find_all('a')[1:]
song_titles = [x['href'] for x in a]

In [5]:
song_titles.remove('High-Tops-Stripped')

In [6]:
song_titles

['Be-My-Own-Bonus-Track',
 'Chastain',
 'Cut-The-Rope',
 'Deirdre-Pt-I',
 'Don-t-Let-Me',
 'Don-t-Read-The-Mirror',
 'Don-t-Say-Nothing',
 'High-Tops',
 'Homeless',
 'I-Am-Drunk-And-She-Is-Insane',
 'In-The-Yard',
 'Laid-Down-My-Arms',
 'Lamplight',
 'Let-s-Pretend',
 'Lost-My-Cat-Put-in-a-Cage',
 'Love-Song-For-Lady-Earth-Sasha',
 'Rockman-s-Pier',
 'Theory-of-Emotion',
 'To-Philly',
 'Still-in-Love',
 'Vanessa']

In [7]:
song_dicts = [{'title':song} for song in song_titles]

In [8]:
songs_df = pd.DataFrame(song_dicts)

In [9]:
songs_df['url'] = 'https://delwatergap.net/'+songs_df['title'] 

In [10]:
def get_lyrics(url):
    return ([x.get_text().replace('\n',' ').replace('\u2028', ' ') 
             for x in BeautifulSoup(requests.get(url).content).find_all('h2')])

In [11]:
songs_df['lyrics'] = songs_df['url'].map(lambda x: get_lyrics(x))

In [12]:
songs_df.set_index('title',inplace=True)

In [13]:
songs_df.loc['Sasha']=None

In [14]:
songs_df

Unnamed: 0_level_0,url,lyrics
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Be-My-Own-Bonus-Track,https://delwatergap.net/Be-My-Own-Bonus-Track,[We never sleep at night But we’ll spend the d...
Chastain,https://delwatergap.net/Chastain,"[ Man, I just wanna live a day as a blonde ..."
Cut-The-Rope,https://delwatergap.net/Cut-The-Rope,[How does it feel When you break a heart Does ...
Deirdre-Pt-I,https://delwatergap.net/Deirdre-Pt-I,[I once met a girl at a bar I know She drank h...
Don-t-Let-Me,https://delwatergap.net/Don-t-Let-Me,[Lord don't let me wake at night Picturing the...
Don-t-Read-The-Mirror,https://delwatergap.net/Don-t-Read-The-Mirror,"[ Sophie was a tall girl, held me from acros..."
Don-t-Say-Nothing,https://delwatergap.net/Don-t-Say-Nothing,[When you’re tired I’ll take it slow till you ...
High-Tops,https://delwatergap.net/High-Tops,[Don’t you know that you’re the only queen Of ...
Homeless,https://delwatergap.net/Homeless,[You’re in bed but I can’t sleep The homeless ...
I-Am-Drunk-And-She-Is-Insane,https://delwatergap.net/I-Am-Drunk-And-She-Is-...,[Well the further I go it’s the older I get An...


In [15]:
# Split Sasha and Love Song for Lady Earth
songs_df.loc['Sasha']['lyrics'] = [songs_df.loc['Love-Song-For-Lady-Earth-Sasha']['lyrics'][0].split('We got')[1]]

In [16]:
songs_df.loc['Love-Song-For-Lady-Earth']=None
songs_df.loc['Love-Song-For-Lady-Earth']['lyrics'] = [songs_df.loc['Love-Song-For-Lady-Earth-Sasha']['lyrics'][0].split('We got')[0]]

In [17]:
songs_df

Unnamed: 0_level_0,url,lyrics
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Be-My-Own-Bonus-Track,https://delwatergap.net/Be-My-Own-Bonus-Track,[We never sleep at night But we’ll spend the d...
Chastain,https://delwatergap.net/Chastain,"[ Man, I just wanna live a day as a blonde ..."
Cut-The-Rope,https://delwatergap.net/Cut-The-Rope,[How does it feel When you break a heart Does ...
Deirdre-Pt-I,https://delwatergap.net/Deirdre-Pt-I,[I once met a girl at a bar I know She drank h...
Don-t-Let-Me,https://delwatergap.net/Don-t-Let-Me,[Lord don't let me wake at night Picturing the...
Don-t-Read-The-Mirror,https://delwatergap.net/Don-t-Read-The-Mirror,"[ Sophie was a tall girl, held me from acros..."
Don-t-Say-Nothing,https://delwatergap.net/Don-t-Say-Nothing,[When you’re tired I’ll take it slow till you ...
High-Tops,https://delwatergap.net/High-Tops,[Don’t you know that you’re the only queen Of ...
Homeless,https://delwatergap.net/Homeless,[You’re in bed but I can’t sleep The homeless ...
I-Am-Drunk-And-She-Is-Insane,https://delwatergap.net/I-Am-Drunk-And-She-Is-...,[Well the further I go it’s the older I get An...


In [18]:
songs_df.loc['Sasha']['url']=songs_df.loc['Love-Song-For-Lady-Earth-Sasha']['url']
songs_df.loc['Love-Song-For-Lady-Earth']['url']=songs_df.loc['Love-Song-For-Lady-Earth-Sasha']['url']

In [19]:
songs_df.drop(index='Love-Song-For-Lady-Earth-Sasha',inplace=True)

In [20]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

songs_df['data_words'] = [x[:-2] for x in list(sent_to_words(songs_df['lyrics']))]

In [21]:
songs_df

Unnamed: 0_level_0,url,lyrics,data_words
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Be-My-Own-Bonus-Track,https://delwatergap.net/Be-My-Own-Bonus-Track,[We never sleep at night But we’ll spend the d...,"[we, never, sleep, at, night, but, we, ll, spe..."
Chastain,https://delwatergap.net/Chastain,"[ Man, I just wanna live a day as a blonde ...","[man, just, wanna, live, day, as, blonde, ll, ..."
Cut-The-Rope,https://delwatergap.net/Cut-The-Rope,[How does it feel When you break a heart Does ...,"[how, does, it, feel, when, you, break, heart,..."
Deirdre-Pt-I,https://delwatergap.net/Deirdre-Pt-I,[I once met a girl at a bar I know She drank h...,"[once, met, girl, at, bar, know, she, drank, h..."
Don-t-Let-Me,https://delwatergap.net/Don-t-Let-Me,[Lord don't let me wake at night Picturing the...,"[lord, don, let, me, wake, at, night, picturin..."
Don-t-Read-The-Mirror,https://delwatergap.net/Don-t-Read-The-Mirror,"[ Sophie was a tall girl, held me from acros...","[sophie, was, tall, girl, held, me, from, acro..."
Don-t-Say-Nothing,https://delwatergap.net/Don-t-Say-Nothing,[When you’re tired I’ll take it slow till you ...,"[when, you, re, tired, ll, take, it, slow, til..."
High-Tops,https://delwatergap.net/High-Tops,[Don’t you know that you’re the only queen Of ...,"[don, you, know, that, you, re, the, only, que..."
Homeless,https://delwatergap.net/Homeless,[You’re in bed but I can’t sleep The homeless ...,"[you, re, in, bed, but, can, sleep, the, homel..."
I-Am-Drunk-And-She-Is-Insane,https://delwatergap.net/I-Am-Drunk-And-She-Is-...,[Well the further I go it’s the older I get An...,"[well, the, further, go, it, the, older, get, ..."


In [22]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append(" ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_postags]))
    return texts_out

In [23]:
# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# Run in terminal: python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only Noun, Adj, Verb, Adverb
songs_df['data_lemmatized'] = lemmatization(songs_df['data_words'], allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [24]:
songs_df

Unnamed: 0_level_0,url,lyrics,data_words,data_lemmatized
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Be-My-Own-Bonus-Track,https://delwatergap.net/Be-My-Own-Bonus-Track,[We never sleep at night But we’ll spend the d...,"[we, never, sleep, at, night, but, we, ll, spe...",never sleep night will spend day bed walk stai...
Chastain,https://delwatergap.net/Chastain,"[ Man, I just wanna live a day as a blonde ...","[man, just, wanna, live, day, as, blonde, ll, ...",just day will look get mistake man job take se...
Cut-The-Rope,https://delwatergap.net/Cut-The-Rope,[How does it feel When you break a heart Does ...,"[how, does, it, feel, when, you, break, heart,...",how feel when break heart start peel weak part...
Deirdre-Pt-I,https://delwatergap.net/Deirdre-Pt-I,[I once met a girl at a bar I know She drank h...,"[once, met, girl, at, bar, know, she, drank, h...",once meet girl bar know drink sleep dinner coa...
Don-t-Let-Me,https://delwatergap.net/Don-t-Let-Me,[Lord don't let me wake at night Picturing the...,"[lord, don, let, me, wake, at, night, picturin...",let wake night picture hand may hold re far pa...
Don-t-Read-The-Mirror,https://delwatergap.net/Don-t-Read-The-Mirror,"[ Sophie was a tall girl, held me from acros...","[sophie, was, tall, girl, held, me, from, acro...",tall girl hold room ask name tell difference c...
Don-t-Say-Nothing,https://delwatergap.net/Don-t-Say-Nothing,[When you’re tired I’ll take it slow till you ...,"[when, you, re, tired, ll, take, it, slow, til...",when re tired will take slow fall place start ...
High-Tops,https://delwatergap.net/High-Tops,[Don’t you know that you’re the only queen Of ...,"[don, you, know, that, you, re, the, only, que...",know re only queen horse town re only mean thi...
Homeless,https://delwatergap.net/Homeless,[You’re in bed but I can’t sleep The homeless ...,"[you, re, in, bed, but, can, sleep, the, homel...",re bed can sleep homeless man street juice scr...
I-Am-Drunk-And-She-Is-Insane,https://delwatergap.net/I-Am-Drunk-And-She-Is-...,[Well the further I go it’s the older I get An...,"[well, the, further, go, it, the, older, get, ...",further go old find alone when drink hard when...


In [25]:
vectorizer = CountVectorizer(analyzer='word',       
                             min_df=.1,                        # minimum reqd occurences of a word 
                             max_df=.9,
                             stop_words='english',             # remove stop words
                             lowercase=True,                   # convert all words to lowercase
                             token_pattern='[a-zA-Z0-9]{3,}',  # num chars > 3
                             # max_features=50000,             # max number of uniq words
                            )

data_vectorized = vectorizer.fit_transform(songs_df['data_lemmatized'])

In [26]:
# Build LDA Model
lda_model = LatentDirichletAllocation(n_components=2,           # Number of topics
                                      max_iter=10,               # Max learning iterations
                                      learning_method='online',   
                                      random_state=100,          # Random state
                                      batch_size=128,            # n docs in each learning iter
                                      evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                      n_jobs = -1,               # Use all available CPUs
                                     )
lda_output = lda_model.fit_transform(data_vectorized)

print(lda_model)  # Model attributes

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=2, n_jobs=-1,
                          perp_tol=0.1, random_state=100, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)


In [27]:
# Define Search Param
search_params = {'n_components': [2, 3, 4,], 
                 'learning_decay': [.6, .7, .8],
                 'max_iter':[6,7,8],
                 'batch_size':[2,3,4,5]}

# Init the Model
lda = LatentDirichletAllocation()

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
model.fit(data_vectorized)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=LatentDirichletAllocation(batch_size=128,
                                                 doc_topic_prior=None,
                                                 evaluate_every=-1,
                                                 learning_decay=0.7,
                                                 learning_method='batch',
                                                 learning_offset=10.0,
                                                 max_doc_update_iter=100,
                                                 max_iter=10,
                                                 mean_change_tol=0.001,
                                                 n_components=10, n_jobs=None,
                                                 perp_tol=0.1,
                                                 random_state=None,
                                                 topic_word_prior=None,
                                   

In [28]:
# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

Best Model's Params:  {'batch_size': 2, 'learning_decay': 0.8, 'max_iter': 6, 'n_components': 2}
Best Log Likelihood Score:  -1809.7441447494964
Model Perplexity:  93.09770608275869


In [29]:
# Create Document - Topic Matrix
lda_output = best_lda_model.transform(data_vectorized)

# column names
topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]

# index names
docnames = songs_df.index

# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

# Styling
def color_green(val):
    color = 'green' if val > .1 else 'black'
    return 'color: {col}'.format(col=color)

def make_bold(val):
    weight = 700 if val > .1 else 400
    return 'font-weight: {weight}'.format(weight=weight)

# Apply Style
df_document_topics = df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)
df_document_topics

Unnamed: 0_level_0,Topic0,Topic1,dominant_topic
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Be-My-Own-Bonus-Track,0.97,0.03,0
Chastain,0.86,0.14,0
Cut-The-Rope,0.69,0.31,0
Deirdre-Pt-I,0.3,0.7,1
Don-t-Let-Me,0.09,0.91,1
Don-t-Read-The-Mirror,0.99,0.01,0
Don-t-Say-Nothing,0.98,0.02,0
High-Tops,0.01,0.99,1
Homeless,0.82,0.18,0
I-Am-Drunk-And-She-Is-Insane,0.04,0.96,1


In [30]:
df_topic_distribution = df_document_topic['dominant_topic'].value_counts().reset_index(name="Num Documents")
df_topic_distribution.columns = ['Topic Num', 'Num Documents']
df_topic_distribution

Unnamed: 0,Topic Num,Num Documents
0,0,14
1,1,8


In [31]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(best_lda_model, data_vectorized, vectorizer, mds='tsne')
panel


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [32]:
pyLDAvis.save_html(panel,'/Users/schlinkertc/code/NLP_practice/topics.htm')

In [33]:
df = songs_df.merge(df_document_topic,
              left_on=songs_df.index,
              right_on='title')

In [34]:
df['Topic'] = df['dominant_topic'].map({0:1,1:2})

In [35]:
df['Liklihood'] = df['Topic0'].map(lambda x: x if x>.5 else 1-x)

In [36]:
df.columns

Index(['title', 'url', 'lyrics', 'data_words', 'data_lemmatized', 'Topic0',
       'Topic1', 'dominant_topic', 'Topic', 'Liklihood'],
      dtype='object')

In [41]:
unwanted_columns = ['url','data_words','Topic0',
                    'Topic1', 'dominant_topic','lyrics',
                   'data_lemmatized']
summary = df.drop(columns=unwanted_columns).set_index('Topic').sort_index()

KeyError: "['url' 'data_words' 'Topic0' 'Topic1' 'dominant_topic' 'lyrics'\n 'data_lemmatized'] not found in axis"

In [44]:
df

Unnamed: 0_level_0,title,Liklihood
Topic,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Be-My-Own-Bonus-Track,0.97
1,Vanessa,0.71
1,Still-in-Love,0.98
1,To-Philly,0.98
1,Lost-My-Cat-Put-in-a-Cage,0.63
1,Lamplight,0.99
1,Laid-Down-My-Arms,0.99
1,Sasha,0.98
1,In-The-Yard,0.99
1,Don-t-Say-Nothing,0.98
