In [5]:
# Import all the things
import nltk
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import os
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import wordnet

import matplotlib.pyplot as plt
import seaborn as sns

from nltk.tokenize import RegexpTokenizer

from spacy.lang.en import English
import spacy
from spacy import displacy
from collections import Counter

# Gensim provides the topic models (LDA / LSI) applied to the Dylan lyrics below:
from gensim import models, corpora

In [3]:
# Extra tokens filtered out on top of NLTK's English stop-word list.
stop_chords = [
    "i'm", "like", "well", "well,", "got",
    "", "know", "ain't", "get", "em",
    "oh,", "s", "t", "d", "ll",
]
# Combined stop-word list: NLTK's English entries followed by the extras above.
stop_words = [*stopwords.words("english"), *stop_chords]

In [2]:
def read_file(filename):
    """Parse one song HTML file and return its extracted song information.

    Args:
        filename: Path of the HTML file to read.

    Returns:
        The dict produced by ``get_song_information`` for the parsed page.
    """
    # The context manager closes the file as soon as parsing is done; the
    # bare ``open(filename)`` used previously left the handle open.
    with open(filename) as handle:
        song = BeautifulSoup(handle, 'html.parser')
    return get_song_information(song)

def get_song_information(song):
    """Extract the title, album name and cleaned lyrics from a parsed page.

    Args:
        song: Parsed document exposing ``title`` and ``find`` (a
            BeautifulSoup object when called from ``read_file``).

    Returns:
        dict with the keys ``'song'``, ``'album'`` and ``'lyrics'``.
    """
    title = song.title.text
    # The album is the text of the <a class="recordlink"> element.
    album = song.find('a', {'class' : 'recordlink'}).text.strip()
    # The lyrics are the text of the <pre class="verse"> element.
    raw_lyrics = song.find('pre', {'class' : 'verse'}).text.strip()
    return {
        'song': title,
        'album': album,
        'lyrics': clean_up_lyrics(raw_lyrics),
    }

def clean_up_lyrics(lyrics):
    """Normalise raw lyric text: collapse whitespace and lower-case it.

    Args:
        lyrics: Raw lyric text (str).

    Returns:
        The text with every run of whitespace (newlines included) replaced
        by a single space and converted to lower case. Stop words are not
        removed here.
    """
    # Raw string: "\s" inside a plain literal is an invalid escape sequence.
    # The pattern already matches "\n", so a separate newline replacement
    # step is unnecessary.
    lyrics = re.sub(r"\s+", " ", lyrics)
    return lyrics.lower()

def toLower(lyrics):
    """Return ``lyrics`` converted to lower case.

    Args:
        lyrics: Text to convert (str).

    Returns:
        The lower-cased text.
    """
    # str.lower() replaces the hand-rolled per-character loop; it also
    # applies context-sensitive rules (e.g. Greek final sigma) that a
    # character-by-character conversion cannot.
    return lyrics.lower()

def removeStopWords(words):
    """Drop every stop word from a space-separated string.

    Args:
        words: Text whose tokens are separated by single spaces.

    Returns:
        The tokens that are not in the module-level ``stop_words`` list,
        re-joined with single spaces.
    """
    kept = [token for token in words.split(" ") if token not in stop_words]
    return " ".join(kept)

In [3]:
def get_all_song_paths(root='./chords/'):
    """Collect the paths of all song pages below ``root``.

    Args:
        root: Directory that is walked recursively. Defaults to
            ``'./chords/'``, the location used when called without arguments.

    Returns:
        list of paths (in ``os.walk`` order) to files whose name ends in
        ``.htm`` and does not contain ``index``.
    """
    all_files = []
    for dirpath, _dirnames, filenames in os.walk(root):
        for filename in filenames:
            # Logical `and` / `not in` instead of bitwise `&` on booleans.
            if filename.endswith('.htm') and "index" not in filename:
                # os.path.join avoids the doubled separator that
                # os.sep.join produced when dirpath ends with a separator.
                all_files.append(os.path.join(dirpath, filename))
    return all_files

In [6]:
# Parse every song page found on disk into a song-info dict.
all_files = get_all_song_paths()
all_song_info = [read_file(path) for path in all_files]

In [7]:
# One row per song, with the columns in a fixed order.
song_columns = ["song", "album", "lyrics"]
df = pd.DataFrame(all_song_info, columns=song_columns)

# Persist the corpus so it can be reloaded without re-parsing the HTML files.
df.to_pickle('../quora_sentiment/dylan/dylan_corpus.pkl')

In [None]:
# Split each song's lyrics into runs of word characters.
tokenizer = RegexpTokenizer(r'\w+')
df['tokens'] = df['lyrics'].map(tokenizer.tokenize)

# Era label -> album names belonging to that era.
# NOTE(review): names are matched exactly against the scraped album text;
# spelling/capitalisation presumably mirrors the source pages - confirm.
dylan_eras = {
    '1': [
        "Bob Dylan",
        "Freewheelin'",
        "The Times They Are A-changin'",
        "Another Side Of Bob Dylan",
    ],
    '2': [
        "Highway 61 Revisited",
        "Bringing It All Back Home",
        "Blonde on Blonde",
    ],
    '3': [
        "Nashville Skyline",
        "Self Portrait",
        "New Morning",
        "Pat Garret & Billy The Kid",
        "John Wesley Harding",
    ],
    '4': [
        "Planet Waves",
        "Dylan",
        "Blood on the Tracks",
        "Desire",
    ],
}


def get_dylan_era(album):
    """Return the era label whose album list contains ``album``.

    Args:
        album: Album name; it must match an entry of ``dylan_eras`` exactly.

    Returns:
        The era key (``'1'``..``'4'``) of the first era listing the album,
        or ``None`` when no era lists it.
    """
    matching_eras = (era for era, albums in dylan_eras.items() if album in albums)
    return next(matching_eras, None)

# Label every song with the era of its album (None when the album is unlisted).
df['era'] = df['album'].apply(get_dylan_era)
df.head()

In [None]:
# Short fragments that are dropped from the word list below.
non_words = ['i','s','ll','a','ve','d','t']

# Every token of every song, minus the fragments above.
all_words = [
    token
    for song_tokens in df['tokens']
    for token in song_tokens
    if token not in non_words
]

# Number of tokens per song (fragments included).
sentence_lengths = list(map(len, df['tokens']))

# Sorted list of the distinct words.
VOCAB = sorted(set(all_words))

In [None]:
# Frequency of every word across the corpus; show the 30 most frequent.
count_all_words = Counter()
count_all_words.update(all_words)
count_all_words.most_common(30)

In [None]:
# Number of topics requested from the topic models.
NUM_TOPICS = 10

# Lyrics of every song, in the same order as all_song_info.
data = [song_entry['lyrics'] for song_entry in all_song_info]

def clean_text(text):
    """Tokenise ``text`` and keep only the tokens used for topic modelling.

    Args:
        text: Raw text (str); it is lower-cased before tokenising.

    Returns:
        list of tokens that are not in ``stop_words`` and that begin with at
        least three characters from ``[a-zA-Z-]``.
    """
    tokenised_text = word_tokenize(text.lower())
    # Raw string: "\-" inside a plain literal is an invalid escape sequence.
    # re.match anchors only at the start of the token, so a token may
    # continue with other characters after the first three.
    cleaned_text = [
        t for t in tokenised_text
        if t not in stop_words and re.match(r'[a-zA-Z\-][a-zA-Z\-]{2,}', t)
    ]
    return cleaned_text

In [None]:
# Cleaned token list for every song, in the same order as data.
tokenised_data = [clean_text(lyric_text) for lyric_text in data]

In [None]:
# Vocabulary built from the cleaned tokens, and each song as a bag of words.
dictionary = corpora.Dictionary(tokenised_data)

corpus = [dictionary.doc2bow(text) for text in tokenised_data]

# LDA training is randomised; a fixed seed makes the learned topics (and the
# similarity results derived from them) reproducible across kernel restarts.
LDA_RANDOM_STATE = 42

lda_model = models.LdaModel(
    corpus=corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    random_state=LDA_RANDOM_STATE,
)

lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

In [None]:
# Show the ten highest-weighted terms of every LSI topic.
for topic_id in range(NUM_TOPICS):
    topic_terms = lsi_model.print_topic(topic_id, 10)
    print("Topic #%s" % topic_id, topic_terms)

In [None]:
# We can perform similarity queries with gensim.
# Importing the class directly keeps the name `similarities` free for the
# ranked results below, instead of rebinding the imported gensim module.
from gensim.similarities import MatrixSimilarity

# Unseen lyric to match against the corpus in LDA topic space:
sent = """
Now, boys, don't start to ramble round,
On this road of sin or you're sorrow bound.
And you'll get lost, you'll curse the day
You started rollin' down that lost highway.

I'm a rolling stone, all alone and lost,
For a life of sin, I have paid the cost.
When I pass by, you'll curse the day
You started rollin' down that lost highway.
"""

bow = dictionary.doc2bow(clean_text(sent))

lda_index = MatrixSimilarity(lda_model[corpus])

# Similarity of the query to the indexed corpus documents, in corpus order.
doc_scores = lda_index[lda_model[bow]]

# (document index, score) pairs, best match first.
similarities = sorted(enumerate(doc_scores), key=lambda item: -item[1])

document_id, similarity = similarities[0]

# Show the start of the best-matching song's lyrics.
print(data[document_id][:1000])

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Fit TF-IDF on the lyrics and print the inverse document frequency per term.
vectorizer = TfidfVectorizer()
res = vectorizer.fit_transform(df['lyrics'])
idf = vectorizer.idf_

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

print(dict(zip(feature_names, idf)))