In [None]:
"""

Description: Natural language processing of Boogie Woogie musician biographies from Wikipedia. The objective is to
build an LSI model from the articles of roughly 90 'Boogie Woogie' musicians, then feed in non-Boogie-Woogie
musicians and see which Boogie Woogie artist each one most closely aligns with.

Info on the model here: https://en.wikipedia.org/wiki/Latent_semantic_analysis


"""

In [76]:
import os
import sys
import _sqlite3
from collections import OrderedDict

import nltk
import pandas as pd
import requests
from bs4 import BeautifulSoup
from gensim import corpora, models, similarities
from gensim.models.ldamodel import LdaModel
from gensim.parsing.preprocessing import STOPWORDS
from gensim.similarities.docsim import Similarity
from nltk import sent_tokenize,word_tokenize
from nltk.corpus import PlaintextCorpusReader

"""
Question 1

Write a function that takes the file name of a page with a list of musicians as an input, 
and returns a list of URLs that point to articles on each musician in the list.
 
"""


def get_musicians(url="https://en.wikipedia.org/wiki/List_of_boogie_woogie_musicians",
                  last_artist_href="/wiki/ZZ_Top"):
    """Return the Wikipedia article URL of every musician on a list page.

    Parameters
    ----------
    url : str
        Address of the Wikipedia list page to scrape.
    last_artist_href : str or None
        Relative href of the last musician on the page. Scanning stops once
        it has been collected, so the navigation/footer links that follow the
        list are not returned. Pass ``None`` to scan every list item.

    Returns
    -------
    list of str
        Absolute article URLs, in page order.

    Raises
    ------
    requests.HTTPError
        If the list page cannot be fetched successfully.
    """
    base_url = "https://en.wikipedia.org"

    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')

    url_list = []
    # class_=False keeps only <li> elements that carry no class attribute.
    for tag in soup.find_all("li", class_=False):
        anchor = tag.find('a')
        href_val = anchor.get('href') if anchor is not None else None
        # Skip list items without a link, and links that are not article
        # paths: get_text() derives its file name from the part after "wiki/".
        if not href_val or not href_val.startswith("/wiki/"):
            continue

        url_list.append(base_url + href_val)

        # The marker is the last *valid* artist, so it is collected before
        # stopping (previously the loop broke out before appending it).
        if href_val == last_artist_href:
            break

    return url_list

In [84]:
"""
Question 2

Write a function that scrapes the text on a musician’s page and returns it as a text 
string. The input is the name of the file that contains the musician's page.

Example of use: get_text('carolinedahl.html')
should output the text of the page.
'Caroline Dahl is an American pianist and composer...'
"""

def get_text(artist_page, out_dir='artists/'):
    """Download a musician's Wikipedia article and save its paragraph text.

    The text of every ``<p>`` element is concatenated, newlines are removed,
    and the result is written to ``<out_dir>/<article title>.txt``.

    Parameters
    ----------
    artist_page : str
        Full article URL; it must contain ``wiki/`` followed by the title.
    out_dir : str
        Directory that receives the text file. Created if it is missing.

    Returns
    -------
    str
        The extracted text (the same string that was written to disk).

    Raises
    ------
    IndexError
        If ``artist_page`` does not contain ``wiki/``.
    requests.HTTPError
        If the article cannot be fetched successfully.
    """
    artist_name = artist_page.split("wiki/", 1)[1]
    # A "/" inside a title would otherwise be interpreted as a sub-directory.
    filename = "%s.txt" % artist_name.replace("/", "_")

    response = requests.get(artist_page, timeout=30)
    # Fail loudly instead of storing an error page as if it were a biography.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')

    text = "".join(tag.get_text() for tag in soup.find_all('p'))
    text = text.replace('\n', '')

    os.makedirs(out_dir, exist_ok=True)
    # Explicit UTF-8 so the file decodes with PlaintextCorpusReader's default
    # encoding regardless of the platform's locale; `with` closes the handle.
    with open(os.path.join(out_dir, filename), 'w', encoding='utf-8') as f:
        f.write(text)

    return text
        


In [85]:
# Create the artist text files.
# get_musicians() scrapes the list page for one article URL per musician;
# get_text() then issues one HTTP request per URL and writes the article text
# to a .txt file under artists/. Re-running this cell repeats every request.
url_list = get_musicians()

for artist in url_list:
    get_text(artist)

In [158]:
## Create dict of artist:plaintextcorpus objects based on text docs -- must use txt files for PlaintextCorpusReader!

pairs = OrderedDict()
# Sorted because os.listdir() returns entries in arbitrary order; a fixed
# order keeps the document indices reproducible between runs.
files = sorted(i for i in os.listdir("artists/") if i.endswith("txt"))
for file in files:
    # Pass the file name inside a list: a bare string is interpreted by the
    # reader as a regular expression, so names containing characters such as
    # "(" or ")" would fail to match their own file.
    artist_data = PlaintextCorpusReader('artists/', [file])
    pairs[file] = artist_data
    

In [159]:
## Add plaintextcorpus objects to documents list as raw and new_pairs dict as raw objects, excluding faulty keys 
from collections import OrderedDict 

documents = []      # raw text of every readable artist file, in `pairs` order
faulty_keys = []    # file names whose text could not be read
new_pairs = OrderedDict()   # file name -> raw text, same order as `documents`

for key, val in pairs.items():
    try:
        # Read once and reuse, instead of loading the file twice.
        raw_text = val.raw()
    except Exception:
        # Best effort: remember the unreadable file and carry on. Catching
        # Exception (not a bare except) lets Ctrl-C / kernel interrupts through.
        faulty_keys.append(key)
        continue
    new_pairs[key] = raw_text
    documents.append(raw_text)

if faulty_keys:
    # Surface the skipped files so a shrunken corpus does not go unnoticed.
    print("Skipped %d unreadable file(s): %s" % (len(faulty_keys), faulty_keys))



In [None]:
# Retrieve the samples data.
# NOTE(review): the files under artists/samples were presumably produced by
# calling get_text() on each URL in sample_list and moving the results into
# that folder -- confirm, since get_text() writes to artists/ by default.

sample_list = ['https://en.wikipedia.org/wiki/Melvin_Sparks','https://en.wikipedia.org/wiki/Roosevelt_Sykes',
               'https://en.wikipedia.org/wiki/Billy_Eckstine','https://en.wikipedia.org/wiki/Jimmy_Witherspoon']
samples_dict = OrderedDict()   # "artists/samples/<name>.txt" -> file contents
# Sorted because os.listdir() order is arbitrary; keeps the result rows stable.
sample_files = sorted(i for i in os.listdir("artists/samples") if i.endswith("txt"))

for sample_file in sample_files:
    # Keys keep the "artists/samples/" prefix, as before. A separate name is
    # used for the handle so the loop variable is no longer overwritten.
    sample_path = "artists/samples/" + sample_file
    # UTF-8 matches the default encoding PlaintextCorpusReader uses for the corpus.
    with open(sample_path, "r", encoding="utf-8") as handle:
        samples_dict[sample_path] = handle.read()

In [224]:
"""
Question 3

Use the Learning Sample in the samples folder to construct an LSI model. 
Then, use the LSI model to find the most similar musician from the learning sample 
(folder 'samples') for each musician in the test sample (folder 'test'). Then, save in a CSV
the name of a musician from the test sample, and the name of the most similar musician 
from the learning sample.

Example of output (don't forget the header):

TestName,SampleName
MelvinSparks,HenryGraymusician
RooseveltSykes,RooseveltSykes
BillyEckstine,FatsDomino
JimmyWitherspoon,MoonMullican
"""

def _tokenize(document):
    """Lower-case and whitespace-split a document, keeping alphanumeric non-stopwords."""
    return [word for word in document.lower().split()
            if word not in STOPWORDS and word.isalnum()]


def _artist_name(path):
    """Return a file's base name without its directory or final extension."""
    # splitext removes only the last extension, so dots inside a name
    # (e.g. "Dr._John.txt") are preserved.
    return os.path.splitext(os.path.basename(path))[0]


def build_corp_list(corpus_docs=None, sample_docs=None, num_topics=2,
                    output_path='matches.csv'):
    """Match each sample document to its most similar corpus document via LSI.

    An LSI model is trained on the corpus documents; every sample document is
    projected into the same topic space and paired with the corpus document
    that has the highest similarity score.

    Parameters
    ----------
    corpus_docs : mapping of file name -> raw text, optional
        Documents the model is built from. Defaults to the notebook-level
        ``new_pairs``.
    sample_docs : mapping of file path -> raw text, optional
        Documents to match against the corpus. Defaults to the notebook-level
        ``samples_dict``.
    num_topics : int
        Number of latent topics for the LSI model.
    output_path : str
        Destination of the CSV copy of the result.

    Returns
    -------
    pandas.DataFrame
        One row per sample with columns ``'Artist Match'`` (closest corpus
        artist) and ``'Sample'`` (sample artist). The same table is written
        to ``output_path`` without the index.

    Raises
    ------
    ValueError
        If there are no corpus documents to train on.
    """
    if corpus_docs is None:
        corpus_docs = new_pairs
    if sample_docs is None:
        sample_docs = samples_dict
    if not corpus_docs:
        raise ValueError("corpus is empty: no artist documents to build the LSI model from")

    # Position i in `corpus` corresponds to corpus_names[i].
    corpus_names = list(corpus_docs.keys())

    # Create the corpus and the LSI model.
    texts = [_tokenize(document) for document in corpus_docs.values()]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=num_topics)

    # The index depends only on the corpus, so build it once rather than
    # once per sample.
    index = similarities.MatrixSimilarity(lsi[corpus])

    matches_list = []
    for sample_path, sample_text in sample_docs.items():
        # Same tokenizer as the corpus; doc2bow ignores tokens that are not
        # in the dictionary.
        vec_lsi = lsi[dictionary.doc2bow(_tokenize(sample_text))]
        sims = index[vec_lsi]

        # Position of the highest similarity; the first one wins on ties.
        best_idx = max(enumerate(sims), key=lambda item: item[1])[0]

        matches_list.append([_artist_name(corpus_names[best_idx]),
                             _artist_name(sample_path)])

    # Write the list of pairs as a CSV.
    df = pd.DataFrame(matches_list, columns=['Artist Match', 'Sample'])
    df.to_csv(output_path, index=False)

    return df

In [229]:
# Execute the function above to get the answer; the returned DataFrame is shown as the cell output.
# Sanity check: the Roosevelt Sykes sample should be matched to the Roosevelt Sykes corpus document (i.e. himself).

build_corp_list()

Unnamed: 0,Artist Match,Sample
0,Piano_Red,Billy_Eckstine
1,Albert_Ammons,Jimmy_Witherspoon
2,Huey_Piano_Smith,Melvin_Sparks
3,Roosevelt_Sykes,Roosevelt_Sykes
