In [1]:
import os
import re
import string
import random
import pickle

#### Set `loadFromPickle` to True to load the pickled corpus, or to False to rebuild the corpus from the lyrics folder and re-save the pickle

In [2]:
# Configuration

# Tokenizer: a word is a run of ASCII letters delimited by word boundaries.
tokenPattern = "\\b[a-zA-Z]+\\b"
regex = re.compile(tokenPattern)

# Locations on disk: raw lyric files and the cached (pickled) corpus.
pathToLyrics = './lyrics/'
pathToCorpus = './pickles/corpus.pkl'

# True  -> read the corpus from pathToCorpus
# False -> rebuild the corpus from pathToLyrics and write it to pathToCorpus
loadFromPickle = True

In [3]:
# Build `corpus`: a list of stanzas, each a string of lowercase, punctuation-free
# words with one '\n' per lyric line.
if loadFromPickle:
    # NOTE(review): unpickling can execute arbitrary code; only load a corpus
    # pickle that you produced yourself or otherwise trust.
    with open(pathToCorpus, "rb") as corpusFile:
        corpus = pickle.load(corpusFile)
else:
    corpus = []
    # Build the punctuation-deleting table once instead of once per line.
    punctuationTable = str.maketrans('', '', string.punctuation)

    # loop through artist subfolders in lyrics folder
    for subdir, dirs, files in os.walk(pathToLyrics):
        if subdir == pathToLyrics:
            continue  # files sitting directly in the lyrics root are ignored

        for file in files:
            try:
                with open(os.path.join(subdir, file), "r") as fh:
                    s = fh.read()
            except (OSError, UnicodeDecodeError):
                continue  # skip files that cannot be opened or decoded

            # Drop the song title (first line). partition() yields "" when the
            # file has no newline at all, instead of raising ValueError.
            _, _, lyrics = s.partition('\n')

            stanza = ""
            for line in lyrics.splitlines():
                # preprocess words and rebuild line
                line = line.translate(punctuationTable).lower()
                words = regex.findall(line)
                stanza += ' '.join(words) + '\n'
                if not words:  # this is a blank line, meaning end of stanza
                    # Runs of blank lines would otherwise add word-less stanzas.
                    if stanza.strip():
                        corpus.append(stanza)
                    stanza = ""
            if stanza != "":  # add song's last stanza
                corpus.append(stanza)

    # save corpus (create the target folder first if it does not exist yet)
    corpusDir = os.path.dirname(pathToCorpus)
    if corpusDir:
        os.makedirs(corpusDir, exist_ok=True)
    with open(pathToCorpus, "wb") as corpusFile:
        pickle.dump(corpus, corpusFile)

In [4]:
def buildMarkov(corpus, markov):
    """
    Populates states and transitions of the bigram markov model with corpus data (a list of lyric stanzas)

    Args:
        corpus: iterable of stanza strings; words are whitespace-separated and
            lines are separated by '\\n'.
        markov: dict mapping a state (a tuple of the two preceding tokens) to
            the list of tokens observed after it. It is updated in place, and
            duplicates are kept so frequent transitions are sampled more often.

    Every stanza is framed by two "__START__" tokens and one "__END__" token;
    newlines become "__NL__" tokens. Stanzas with no tokens add nothing.
    """

    for stanza in corpus:
        tokens = stanza.replace("\n", " __NL__ ").split()
        if not tokens:
            continue  # empty stanza: nothing to learn from

        # The stanza's final newline becomes the END tag. If the stanza does not
        # end with a newline, append END instead of overwriting the last word.
        if tokens[-1] == "__NL__":
            tokens[-1] = "__END__"
        else:
            tokens.append("__END__")

        words = ["__START__", "__START__"] + tokens
        for i in range(2, len(words)):
            state = (words[i-2], words[i-1])
            markov.setdefault(state, []).append(words[i])

In [5]:
def generate(markov, maxWords=50):
    """
    Generates a stanza of lyrics using the bigram markov model

    Args:
        markov: dict mapping a state (tuple of the two preceding tokens) to a
            list of candidate next tokens, as filled in by buildMarkov.
        maxWords: maximum number of tokens to emit; "__NL__" line breaks count
            towards this limit.

    Returns:
        The generated text, prefixed with a single space, with every "__NL__"
        token replaced by a newline character.

    Raises:
        KeyError: if the current state has no entry in markov (for example,
            when the model is empty).
    """

    words = ["__START__", "__START__"]
    while len(words) < (maxWords + 2):
        state = (words[-2], words[-1])
        nxt = random.choice(markov[state])
        if nxt == "__END__":
            break  # never store the END tag, so nothing has to be sliced off later
        words.append(nxt)
    # Only the two START tags are dropped. Slicing off the last element as well
    # would lose a real word whenever the maxWords limit ends the stanza.
    return " " + " ".join(words[2:]).replace("__NL__", "\n")

In [6]:
# Train the model: start from an empty transition table, which buildMarkov fills in place.
markov = dict()
buildMarkov(corpus, markov)

#### Run the cell below to generate a stanza of modern pop lyrics

In [7]:
# Sample one stanza from the trained model and print it.
print(generate(markov=markov))

 jigga jigga 
 thats all they know 
 and im like baby baby 
 you tried to find out how you sigh in my socks 
 music in me loves the best of me 
 burns like the way she dances 
 i fall 

