In [9]:
import urllib2
from bs4 import BeautifulSoup as BS
import pandas as pd
import os.path

In [2]:
def generate_song_list():
    """Return the song page links found on one random listing page.

    Fetches https://www.tanzmusik-online.de/random, locates the
    ``div.songlist`` container and collects the ``href`` of the first anchor
    of every ``div.songTitle`` entry inside it.

    Returns:
        list: the ``href`` values in page order. Empty when the page has no
        ``songlist`` container; entries without an anchor or without an
        ``href`` are skipped.
    """
    # get main webpage
    music = "https://www.tanzmusik-online.de/random"
    music_page = urllib2.urlopen(music)
    try:
        music_html = music_page.read()
    finally:
        # always release the connection, even if reading fails
        music_page.close()
    # name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed
    music_soup = BS(music_html, "html.parser")

    # get the songs listed
    song_list = music_soup.find("div", class_="songlist")
    if song_list is None:
        # page layout differs from what is expected: nothing to collect
        return []

    # get all links to song titles
    links = []
    for song in song_list.find_all("div", class_="songTitle"):
        anchor = song.find("a")
        if anchor is None:
            continue
        link = anchor.get("href")
        # an anchor without href cannot be followed later on
        if link:
            links.append(link)
    return links

In [3]:
def get_label_and_youtube_link(song):
    """Return the YouTube link and the dance labels of a confirmed song.

    Args:
        song: URL of the song's page on tanzmusik-online.

    Returns:
        tuple or None: ``(link, labels)`` where ``link`` is the ``href`` of
        the first ``span.modelink`` anchor containing "youtube" and ``labels``
        is the list of anchor texts found inside every ``div.dances``.
        ``None`` when the page carries no "confirmed by" marker (a notice is
        printed in that case) or when no YouTube link is present.
    """
    # get the song's webpage
    # make sure language is set to english (for string comparisons)
    song_request = urllib2.Request(song, headers={"Accept-Language": "en"})
    song_page = urllib2.urlopen(song_request)
    try:
        song_html = song_page.read()
    finally:
        # always release the connection, even if reading fails
        song_page.close()
    # name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed
    song_soup = BS(song_html, "html.parser")

    # check whether song has been verified (only add songs where GT is correct)
    confirmed = False
    for pl in song_soup.find_all("div", class_="pull-right"):
        span = pl.find("span")
        # get_text() instead of .string: .string is None as soon as the span
        # contains nested tags, which would make the "in" test raise
        # the confirmation may come from a visitor or from an expert
        if span is not None and "confirmed by" in span.get_text():
            confirmed = True
            break

    if not confirmed:
        print("no style confirmation for", song)
        return None

    # get the links to associated pages
    for mode in song_soup.find_all("span", class_="modelink"):
        anchor = mode.find("a")
        link = anchor.get("href") if anchor is not None else None
        # only continue if associated page is youtube
        if not link or "youtube" not in link:
            continue
        labels = []
        # get the assigned dances, might either be multiple divs
        # or multiple links in a single div
        for ad in song_soup.find_all("div", class_="dances"):
            for dance in ad.find_all("a"):
                label = dance.string
                # skip anchors without a single text node
                if label is not None:
                    labels.append(label)
        # since we expect only one youtube link, we can return here
        return link, labels

    # confirmed, but no youtube link was found: no relevant info to return
    return None

In [4]:
# sanity check: song page whose dance style was confirmed by a visitor;
# the cell displays the returned (youtube link, labels) tuple
get_label_and_youtube_link('https://www.tanzmusik-online.de/music/2am-club/title/not-your-boyfriend')

('http://youtube.com/watch?v=EdmoxnTpW9o', [u'Disco Fox'])

In [5]:
# sanity check: song page whose dance style was confirmed by an expert;
# the cell displays the returned (youtube link, labels) tuple
get_label_and_youtube_link('https://www.tanzmusik-online.de/music/ray-collins-hot-club/title/sweet-little-love')

('http://youtube.com/watch?v=-F0KzoXG9v8', [u'Jive'])

In [6]:
# sanity check: song page without a style confirmation;
# the function prints a notice and returns nothing
get_label_and_youtube_link('https://www.tanzmusik-online.de/music/woodhouse-jazzband/title/almost-like-being-in-love')

('no style confirmation for', 'https://www.tanzmusik-online.de/music/woodhouse-jazzband/title/almost-like-being-in-love')


In [4]:
# sample 100 random listing pages and gather the song page links;
# collecting into a set keeps every link only once
links = set()
for page_number in range(100):
    links.update(generate_song_list())

In [6]:
# build one [youtube link, dance label] row per label of every usable song
song_list = []
for song in links:
    result = get_label_and_youtube_link(song)
    # nothing is returned for songs without relevant information
    if result is None:
        continue
    youtube_link, dance_labels = result
    # a song with several labels contributes several rows
    song_list.extend([youtube_link, dance_label] for dance_label in dance_labels)
labels = pd.DataFrame(song_list, columns=["youtube_link", "label"])

('no style confirmation for', 'https://www.tanzmusik-online.de/music/katie-melua/title/the-cry-of-the-lone-wolf')
('no style confirmation for', 'https://www.tanzmusik-online.de/music/toten-hosen/title/altes-fieber')
('no style confirmation for', 'https://www.tanzmusik-online.de/music/scott-bradlee-postmodern-jukebox/title/hollaback-girl')
('no style confirmation for', 'https://www.tanzmusik-online.de/music/zarah-leander/title/ich-weiss-es-wird-einmal-ein-wunder-geschehn')
('no style confirmation for', 'https://www.tanzmusik-online.de/music/scott-bradlee-postmodern-jukebox/title/teenage-dirtback')
('no style confirmation for', 'https://www.tanzmusik-online.de/music/cornell-hurd-band/title/happy-hour-in-hell')
('no style confirmation for', 'https://www.tanzmusik-online.de/music/volbeat/title/the-bliss')
('no style confirmation for', 'https://www.tanzmusik-online.de/music/the-corrs/title/road-to-eden-1')
('no style confirmation for', 'https://www.tanzmusik-online.de/music/cat-ballou/title

In [18]:
# put id and youtube link into separate data frame to be able to add relevant information later
song_info = pd.DataFrame(labels["youtube_link"].unique(), columns=["youtube_link"])
# take the id from the "v" query parameter only: splitting on "=" would keep
# the name of a following parameter (e.g. "abc&t" for "...watch?v=abc&t=10")
# and pick the wrong field when "v" is not the first parameter
song_info["id"] = song_info["youtube_link"].str.extract(r"[?&]v=([^&#]+)", expand=False)

In [19]:
# instead of link, save id with labels, only
# the guard keeps the cell re-runnable: once the link column has been
# dropped there is nothing left to convert
if "youtube_link" in labels.columns:
    # take the id from the "v" query parameter only: splitting on "=" would
    # keep the name of a following parameter (e.g. "abc&t" for "...?v=abc&t=10")
    labels["id"] = labels["youtube_link"].str.extract(r"[?&]v=([^&#]+)", expand=False)
    labels = labels.drop("youtube_link", axis=1)

In [21]:
# if lists have previously been generated, add the old and the new lists
# each file is checked on its own so that a missing song_info_list.csv
# neither crashes the cell nor causes existing labels to be overwritten
if os.path.isfile("labels_list.csv"):
    old_labels = pd.read_csv("labels_list.csv")
    # the saved index comes back as an unnamed column
    if "Unnamed: 0" in old_labels.columns:
        old_labels = old_labels.drop("Unnamed: 0", axis=1)
    labels = pd.concat([old_labels, labels], ignore_index=True)
if os.path.isfile("song_info_list.csv"):
    old_info = pd.read_csv("song_info_list.csv")
    if "Unnamed: 0" in old_info.columns:
        old_info = old_info.drop("Unnamed: 0", axis=1)
    song_info = pd.concat([old_info, song_info], ignore_index=True)

# songs are sampled at random, so repeated runs find the same songs again;
# keep every row only once and give the result a clean, unique index
labels = labels.drop_duplicates().reset_index(drop=True)
# keep the first (oldest) row per link so information added to it earlier survives
song_info = song_info.drop_duplicates(subset=["youtube_link"]).reset_index(drop=True)

# save lists
labels.to_csv("labels_list.csv")
song_info.to_csv("song_info_list.csv")

