# Preprocessing pipeline
In this notebook, we will clean and agglomerate the data acquired from several platforms to be used conveniently for analysis.

In [93]:
import pandas as pd
import os
import glob
import urllib
import requests
import time
import json
from pandas.io.json import json_normalize
from IPython.display import clear_output
import numpy as np
import bandsInTownHelper as bandsInTownHelper

import pycountry
import country_demonyms

In [3]:
#Load the total_x events data into several DataFrame.

total_eventsch = pd.read_csv(os.path.join('./total_eventsch.csv'))
total_bands_in_town = pd.read_csv(os.path.join('./total_bands_in_town.csv'))
total_residentadvisor = pd.read_csv(os.path.join('./total_residentadvisor.csv'))
total_routedesfestivals = pd.read_csv(os.path.join('./total_routedesfestivals.csv'))

## Unifying the representation
The data gathered on several platforms came in a handful of shapes that we should now normalize while retaining as much information. We will discards platforms only'ids.

#### Events.ch
à compléter

In [4]:
total_eventsch['Date'] = pd.to_datetime(total_eventsch['Date'])
total_eventsch['Date'] = total_eventsch['Date'].apply( lambda x: x.date() )

#### BandsInTown
à compléter

In [5]:
total_bands_in_town.drop(['artist_url', 'event_id', 'event_url', 'event_venue.region', 'event_venue.url', 'event_venue.id'], 1, inplace=True)

#Convert time column to datetime objects
total_bands_in_town['event_datetime'] = pd.to_datetime(total_bands_in_town['event_datetime'])
#Remove time from dates
total_bands_in_town['event_datetime'] = total_bands_in_town['event_datetime'].apply( lambda x: x.date() )

In [6]:
total_bands_in_town.head(10)

Unnamed: 0,artist_name,event_datetime,event_venue.city,event_venue.latitude,event_venue.longitude,event_venue.name
0,Groombridge,2006-01-06,Langenthal,47.21206,7.789998,Rock in Church
1,Painhead,2006-01-07,Rorschach,47.477928,9.49519,Hafenbuffet
2,shEver,2006-01-14,Zug,47.18222,8.52076,Industrie 45
3,Painhead,2006-01-15,Gossau (Sankt Gallen),47.414415,9.25495,The Office
4,Mando Diao,2006-01-21,Laax,46.8,9.25,PALACE CLUB AT RIDERS PALACE
5,Foo Fighters,2006-01-25,Winterthur,47.495655,8.74848,Eishalle Duetwag
6,Groombridge,2006-01-27,Burgdorf,47.05,7.616667,Gymfest
7,Groombridge,2007-01-01,Berne,46.948432,7.440461,ONO
8,Painhead,2007-01-06,Sommeri,47.566667,9.283333,Löwenarena
9,shEver,2007-01-13,Zurich,47.38662,8.53438,Werk21


##### Venues location
Bands in Town data associates coordinates to each venue, which could be interesting for representing our findings on maps. We will extract those in a separate DataFrame, and then drop the coordinates from the events DataFrame. For simplicity, we will leave in the venue and city columns. We will come back to the venues DataFrame later on.

In [7]:
#Better to drop the coordinates from the event frame for clarity
venues = total_bands_in_town[['event_venue.name', 'event_venue.city', 'event_venue.latitude', 'event_venue.longitude']].copy()
total_bands_in_town.drop(['event_venue.latitude', 'event_venue.longitude'], 1, inplace=True)

venues.drop_duplicates(subset=['event_venue.longitude', 'event_venue.latitude'], inplace=True)
venues.drop_duplicates('event_venue.name', inplace=True)
venues.set_index('event_venue.name', drop=True, append=False, inplace=True)
del venues.index.name
venues = venues.sort_index()

venues_ra = total_residentadvisor[['club_name', 'club_adress']].copy()
venues_ra.drop_duplicates('club_name', inplace=True)
venues_ra.set_index('club_name', drop=True, append=False, inplace=True)
del venues_ra.index.name

total_bands_in_town.columns = ['Artist', 'Date', 'City', 'Venue']

#### ResidentAdvisor
à compléter

In [8]:
#Extract cities from addresses of clubs in ResidentAdvisor data
for i, address in zip(total_residentadvisor.index, total_residentadvisor.club_adress) :
    total_residentadvisor.set_value(i, 'City', address.rsplit(None, 1)[-1])
    

#Manually fill wrong entries
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['City'] == '2' ].index.tolist(), 'City', 'Zurich')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['City'] == '1009' ].index.tolist(), 'City', 'Pully')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['City'] == '4001' ].index.tolist(), 'City', 'Basel')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['City'] == '1227' ].index.tolist(), 'City', 'Geneva')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['City'] == 'BL' ].index.tolist(), 'City', 'Münchenstein')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['City'] == '40/42' ].index.tolist(), 'City', 'Wetzikon')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['City'] == '22' ].index.tolist(), 'City', 'Bern')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['City'] == '/' ].index.tolist(), 'City', 'Murten')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['City'] == '253' ].index.tolist(), 'City', 'Les Diablerets')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['City'] == '14' ].index.tolist(), 'City', 'Basel')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['City'] == '10' ].index.tolist(), 'City', 'Basel')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['City'] == '-' ].index.tolist(), 'City', 'Biel/Bienne')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['City'] == '(GR)' ].index.tolist(), 'City', 'Klosters')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['City'] == 'ZH' ].index.tolist(), 'City', 'Zurich')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['City'] == '4058' ].index.tolist(), 'City', 'Basel')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['City'] == '1' ].index.tolist(), 'City', 'Baden')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['City'] == '6020' ].index.tolist(), 'City', 'Emmenbrücke')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['City'] == '8143' ].index.tolist(), 'City', 'Zurich')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['City'] == '2,6612,Ascona,(Ti),CH' ].index.tolist(), 'City', 'Ascona')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['City'] == 'ZG' ].index.tolist(), 'City', 'Baar ')

total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['club_adress'] == 'Rue des Grands-Vergers, 1957 Ardon, CH' ].index.tolist(), 'City', 'Ardon')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['club_adress'] == 'Av. de Tivoli 3, Fribourg, 1700, CH' ].index.tolist(), 'City', 'Fribourg')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['club_adress'] == 'Köniz, 3098, 9 Schulhausgässli, CH' ].index.tolist(), 'City', 'Köniz')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['club_adress'] == 'Unterer Graben 17, 9000 St. Gallen, CH' ].index.tolist(), 'City', 'St. Gallen')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['club_adress'] == 'Viale Castagnola 6, 6900 Lugano, CH' ].index.tolist(), 'City', 'Lugano')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['club_adress'] == 'Via Alla Foce 1, 6982 Agno, Ticino, CH' ].index.tolist(), 'City', 'Agno')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['club_adress'] == 'Via Pioda 12, 6900 Lugano, CH' ].index.tolist(), 'City', 'Lugano')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['club_adress'] == 'Ancienne-Pointe 16, 1920 Martigny, CH' ].index.tolist(), 'City', 'Martigny')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['club_adress'] == 'Freilager-Platz 9, 4142 Münchenstein/Basel, CH' ].index.tolist(), 'City', 'Basel')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['club_adress'] == 'Via Industria 4; 6814, Lamone Ticino' ].index.tolist(), 'City', 'Lamone')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['club_adress'] == 'Ponte Capriasca, Ticino' ].index.tolist(), 'City', 'Ponte Capriasca')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['club_adress'] == 'Case postale 352 Crans, Valais' ].index.tolist(), 'City', 'Crans-Montana')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['club_adress'] == 'Place centrale, 1997 Nendaz, Valais' ].index.tolist(), 'City', 'Nendaz')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['club_adress'] == 'Place de la Gare, 1957 Ardon, Valais, Suisse' ].index.tolist(), 'City', 'Ardon')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['club_adress'] == 'Rue du Commerce 122, 2300 La Chaux-de-Fonds, Suisse' ].index.tolist(), 'City', 'La Chaux-de-Fonds')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['club_adress'] == 'Place Centrale, 1870 Monthey, Valais, Suisse' ].index.tolist(), 'City', 'Monthey')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['club_adress'] == 'Chemin des Batailles, 1214 VERNIER (GENEVE), Suisse' ].index.tolist(), 'City', 'Vernier')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['club_adress'] == 'Rue de Vevey 34, 1630 Bulle, SWITZERLAND' ].index.tolist(), 'City', 'Bulle')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['club_adress'] == 'Sant Gallen, SWITZERLAND' ].index.tolist(), 'City', 'St. Gallen')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['club_adress'] == 'Steinberggasse 16' ].index.tolist(), 'City', 'Winterthur')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['club_adress'] == 'St. Annagasse 16' ].index.tolist(), 'City', 'Zurich')

total_residentadvisor.drop(total_residentadvisor.loc[total_residentadvisor['City'] == 'Liechtenstein' ].index.tolist(), inplace=True)

total_residentadvisor.drop(['club_adress'], 1, inplace=True)

#Convert events date to datetime objects
for i, date in zip(total_residentadvisor.index, total_residentadvisor.date) :
    total_residentadvisor.set_value(i, 'date', pd.to_datetime(date[5:]))
#Remove hour from dates
total_residentadvisor['date'] = total_residentadvisor['date'].apply( lambda x: x.date() )        

total_residentadvisor.columns = ['Venue', 'Date', 'Artist', 'City']

In [9]:
total_residentadvisor.head(10)

Unnamed: 0,Venue,Date,Artist,City
0,2. Akt Restaurant & Bar,2015-09-19,Affani,Zurich
1,2. Akt Restaurant & Bar,2015-09-19,Mark Faermont,Zurich
2,2. Akt Restaurant & Bar,2015-09-12,Mucho Stylez,Zurich
3,2. Akt Restaurant & Bar,2015-07-03,Mucho Stylez,Zurich
4,2. Akt Restaurant & Bar,2015-06-13,Mucho Stylez,Zurich
5,2. Akt Restaurant & Bar,2015-03-21,Mark Faermont,Zurich
6,2. Akt Restaurant & Bar,2015-01-17,Carlos Russo,Zurich
7,2. Akt Restaurant & Bar,2015-01-17,Mark Faermont,Zurich
8,2. Akt Restaurant & Bar,2014-11-29,Tonka,Zurich
9,2. Akt Restaurant & Bar,2014-11-29,Mark Faermont,Zurich


#### RouteDesFestivals
à compléter

In [10]:
#make the three time columns into a single date column

total_routedesfestivals.month.unique()
for i, month in zip(total_routedesfestivals.index, total_routedesfestivals.month) :
    if month == 'Jan.':
        total_routedesfestivals.set_value(i, 'month', 1.0)
    if month == 'Fev.':
        total_routedesfestivals.set_value(i, 'month', 2.0)
    if month == 'Mar.':
        total_routedesfestivals.set_value(i, 'month', 3.0)
    if month == 'Avr.':
        total_routedesfestivals.set_value(i, 'month', 4.0)    
    if month == 'Mai':
        total_routedesfestivals.set_value(i, 'month', 5.0)   
    if month == 'Juin':
        total_routedesfestivals.set_value(i, 'month', 6.0)      
    if month == 'Juil.':
        total_routedesfestivals.set_value(i, 'month', 7.0)
    if month == 'Aout':
        total_routedesfestivals.set_value(i, 'month', 8.0)    
    if month == 'Sep.':
        total_routedesfestivals.set_value(i, 'month', 9.0)
    if month == 'Oct.':
        total_routedesfestivals.set_value(i, 'month', 10.0)        
    if month == 'Nov.':
        total_routedesfestivals.set_value(i, 'month', 11.0)   
    if month == 'Dec.':
        total_routedesfestivals.set_value(i, 'month', 12.0)
        
total_routedesfestivals.dropna(0, inplace=True)    

total_routedesfestivals.month.apply(lambda x: pd.to_numeric(x))
total_routedesfestivals['Date'] = pd.to_datetime(total_routedesfestivals.year*10000 + total_routedesfestivals.month*100 + total_routedesfestivals.day, format="%Y%m%d")
total_routedesfestivals.drop(['day', 'month', 'year'], 1, inplace=True)

#We add this line to avoid type error on the datetimeindex when concatenating all frames
total_routedesfestivals['Date'] = pd.to_datetime(total_routedesfestivals['Date'])
#Remove hour from dates
total_routedesfestivals['Date'] = total_routedesfestivals['Date'].apply( lambda x: x.date() )

total_routedesfestivals.columns = ['Venue', 'Artist', 'City', 'Date']

In [11]:
total_routedesfestivals.head(10)

Unnamed: 0,Venue,Artist,City,Date
0,6 HOURS OF SYMPHONIA,SYNMETALIUM,Lausanne,2017-04-01
1,6 HOURS OF SYMPHONIA,EVENMORE,Lausanne,2017-04-01
2,6 HOURS OF SYMPHONIA,SECHEM,Lausanne,2017-04-01
3,6 HOURS OF SYMPHONIA,BEYOND FORGIVENESS,Lausanne,2017-04-01
4,ANTIGEL,ZERO,Geneve,2017-01-27
5,ANTIGEL,MAY B (MAGUY MARIN),Geneve,2017-01-28
6,ANTIGEL,MAY B (MAGUY MARIN),Geneve,2017-01-29
7,ANTIGEL,TRENTEMOLLER,Geneve,2017-02-02
8,ANTIGEL,THE NOTWIST,Geneve,2017-02-03
9,ANTIGEL,HENRI DES,Geneve,2017-02-05


#### Putting it all together

In [217]:
total_events = pd.concat([total_eventsch, total_bands_in_town, total_routedesfestivals, total_residentadvisor])

total_events.set_index('Date', drop=True, append=False, inplace=True)
del total_events.index.name

#TypeError: can't compare datetime.datetime to datetime.date
total_events = total_events.sort_index()

total_events.shape

(187447, 4)

In [218]:
#Write the DataFrame to a csv file
filename = 'total_events.csv'
pd.DataFrame(total_events, columns=list(total_events.columns)).to_csv(filename, index=False, encoding="utf-8")
print('Total events data saved to file')

Total events data saved to file


## Genres and origins

Divide the data we have after calling the MusicGraph API into two subsets : one which has value filled in nicely (~35%) which  we'll call clean, another one with 'assumed' correct artist names but missing genre and origin information, and a third where information is missing, and rows may contain more than one artist in their name. The last subset may require extra handling care with regard to the events frame.
BIG PART of exploratory data analysis


In [356]:
#Get Spotify genre data
total_spotify    = pd.read_csv(os.path.join('./total_artists_Spotify.csv'))
total_spotify    = total_spotify.loc[pd.isnull(total_spotify['genre']) == False]
#Get MusicGraph genre and origin data
total_musicgraph = pd.read_csv(os.path.join('./total_artists_MusicGraph.csv'))


musicgraph_missing = total_musicgraph.loc[total_musicgraph['no_result'] == 1]
musicgraph_several = musicgraph_missing[musicgraph_missing.name.str.contains(',')]
musicgraph_missing = musicgraph_missing.select(lambda x: x not in musicgraph_several.index)

First, we will try to fill in the missing genre value with the data acquired from Spotify. To do so, we first have to clean Spotify data, which gives us very specific genres (720 !) instead of global names such as MusicGenre. Some origin information may also be included in the specific genres, which we should look for carefully.


In [357]:
#Create a dict of Country adjective to Country name
country_dict = {}
for key, value in country_demonyms.COUNTRY_DEMONYMS.items():
    country_dict[value.lower()] = key.lower().title()
    
country_dict['persian'] = 'Iran'
country_dict['breton'] = 'France'
country_dict['argentine'] = 'Argentina'
country_dict['fado'] = 'Portugal'
country_dict['quebecois'] = 'Canada'
country_dict['americana'] = 'United States'
country_dict['j-ambient'] = 'Japan'
country_dict['k-pop'] = 'Korea'
country_dict['uk'] = 'United Kingdom'
country_dict['k-indie'] = 'Korea'
country_dict['j-reggae'] = 'Japan'
country_dict['j-metal'] = 'Japan'
country_dict['j-core'] = 'Japan'
country_dict['j-punk'] = 'Japan'
country_dict['sertanejo'] = 'Brasil'
country_dict['japanoise'] = 'Japan'
country_dict['magyar'] = 'Hungary'
country_dict['j-rock'] = 'Japan'
country_dict['francais'] = 'France'
country_dict['chalga'] = 'Bulgaria'
country_dict['napoletana'] = 'Italy'
country_dict['bhangra'] = 'India'
country_dict['carnatic'] = 'India'
country_dict['forro'] = 'Brasil'
country_dict['entehno'] = 'Greece'
country_dict['bay'] = 'United States'
country_dict['schlager'] = 'Germany'
country_dict['coast'] = 'United States'
country_dict['j-dance'] = 'Japan'
country_dict['k-hop'] = 'Korea'
country_dict['francoton'] = 'France'
country_dict['corsican'] = 'France'
country_dict['british'] = 'United Kingdom'
country_dict['c-pop'] = 'China'


In [358]:
total_spotify = total_spotify.select(lambda x: (x in musicgraph_several.index) or (x in musicgraph_missing.index))
print('Total number of genres from Spotify :', total_spotify.genre.unique().size)
print('Total helpful lines from Spotify :', total_spotify.name.size)


i=0
for index, genre in zip(total_spotify.index, total_spotify.genre) :
    for word in genre.split() :
        if word in country_dict :
            if index in musicgraph_missing.index :
                musicgraph_missing.set_value(index, 'origin', country_dict[word])
            elif index in musicgraph_several.index :
                musicgraph_several.set_value(index, 'origin', country_dict[word])
            i+=1
print('With Spotify, we get the origin of', i, 'more artists.')


Total number of genres from Spotify : 466
Total helpful lines from Spotify : 1067
With Spotify, we get the origin of 183 more artists.


In [359]:
genre_dict = {}

# We will have to simplify as we don't have so much time for thorough classification of genres
Electronica = ['house','aggrotech','danspunk', 'brostep', 'abstract', 'chillwave','drone', 'chill', 'beats', 'experimental','electropunk',  'turbo', 'balearic','dance-punk', 'ebm','edm', 'j-dance', 'chillstep','darkpsy', 'darkstep', 'chalga', 'japanoise', 'lounge', 'psytrance', 'tekno','indietronica', 'electronica',  'techno','disco', 'j-ambient',   'noise', 'bass', 'electroclash', 'wave', 'trance', 'ambient', 'dancehall', 'beat', 'dance', 'dub', 'electro', 'eurodance', 'dubstep', 'electronic', 'psych', 'industrial', 'microhouse', 'electrofox', ]
for key in Electronica:
    genre_dict[key] = 'Electronica/Dance'
Rock = ['rock','rock-and-roll','neo-progressive','tribute','post-screamo', 'boogie-woogie', 'hardstyle', 'speedcore', 'neo-psychedelic', 'ostrock','neo-rockabilly', 'britpop', 'j-punk','grunge','breakcore', 'goregrind','orgcore','j-rock', 'alternative', 'j-core', 'j-metal', 'k-indie', 'screamocore', 'grindcore', 'nerdcore',  'doomcore', 'sludge',   'core','deathcore',  'gamecore', 'metalcore','post-punk', 'garage','thrash','post-metal', 'psychobilly', 'edge', 'mathcore',  'punk', 'emo', 'indie', 'metal', 'hardcore', 'swing', 'djent', 'doom', 'glam', 'oi', 'nwobhm']
for key in Rock:
    genre_dict[key] = 'Rock'
Pop = ['pop','popgaze', 'idol','etherpop','anti-folk',  'chanson','c-pop', 'k-pop', 'europop', 'neo-synthpop', 'synthpop', 'folk-pop', 'freak', 'eurovision', 'futurepop']
for key in Pop:
    genre_dict[key] = 'Pop'
Reggae = ['reggae', 'ska', 'reggaeton', 'euroska','j-reggae' ]
for key in Reggae:
    genre_dict[key] = 'Reggae/Ska'
Jazz = ['jazz', 'bebop', 'ragtime', 'afrobeat']
for key in Jazz:
    genre_dict[key] = 'Jazz'
World = ['rai','accordeon', 'entehno',  'african','schlager','corsican','breton', 'asian', 'british',  'arab','armenian', 'kurdish',  'balkan', 'world', 'napoletana','bhangra', 'polka', 'folkmusik', 'andean', 'panpipe', 'maghreb','magyar',  'fado','traditional', 'quebecois', 'carnatic', 'native', 'klezmer', 'world', 'celtic', 'bangla', 'pagode', 'flamenco', 'throat', 'medieval', 'capoeira']
for key in World:
    genre_dict[key] = 'World'
RB = ['r&b', 'funk', 'funky', 'soul']
for key in RB:
    genre_dict[key] = 'Soul/R&B'
Country = ['bluegrass', 'country', 'barbershop', 'americana', 'bluegrass', 'cajun']
for key in Country:
    genre_dict[key] = 'Country'
Latin = ['forro' ,'nu-cumbia', 'sertanejo', 'salsa','tango','merengue', 'bachata', 'rumba', 'nova', 'latin', 'cumbia']
for key in Latin:
    genre_dict[key] = 'Latin'
Rap = ['hop', 'rap', 'trap', 'k-hop', 'francoton']
for key in Rap:
    genre_dict[key] = 'Rap/Hip Hop'
Blues = ['blues', 'blues-rock']
for key in Blues:
    genre_dict[key] = 'Blues'
Classical = ['cello','cappella',  'concert', 'opera', 'choral', 'clarinet', 'classical', 'violin', 'harpsichord', 'string', 'brass', 'orchestral', 'baroque', 'harp', 'early']
for key in Classical:
    genre_dict[key] = 'Classical/Opera'
Soundtracks = ['movie', 'tunes', 'hollywood', 'soundtrack' ]
for key in Soundtracks:
    genre_dict[key] = 'Soundtracks'
Gospel = ['gospel', 'christian', 'liturgical', 'christmas', 'ccm', 'worship']
for key in Gospel:
    genre_dict[key] = 'Christian/Gospel'
NewAge = ['age', 'kirtan', 'didgeridoo']
for key in NewAge:
    genre_dict[key] = 'New Age'

i=0
for index, genre in zip(total_spotify.index, total_spotify.genre) :
    for word in genre.split() :
        if word in genre_dict :
            total_spotify.set_value(index, 'genre', genre_dict[word])
            if index in musicgraph_missing.index :
                musicgraph_missing.set_value(index, 'genre', genre_dict[word])
            elif index in musicgraph_several.index :
                musicgraph_several.set_value(index, 'genre', genre_dict[word])
            i+=1

In [360]:
genres = total_musicgraph.genre.unique()
i = 0
for genre in total_spotify.genre :
    if genre not in genres :
        i+=1
print(i/total_spotify.size*100 , '% of events which subgenre were not parsed')


2.19306466729 % of events which subgenre were not parsed


In [365]:
for index in musicgraph_missing.index :
    if not pd.isnull(musicgraph_missing.loc[index].genre) :
        total_musicgraph.set_value(index, 'genre', musicgraph_missing.loc[index].genre)
        if not pd.isnull(musicgraph_missing.loc[index].origin) :
            total_musicgraph.set_value(index, 'origin', musicgraph_missing.loc[index].origin)
            musicgraph_missing.drop(index, inplace=True)
            
for index in musicgraph_several.index :
    if not pd.isnull(musicgraph_several.loc[index].genre) :
        total_musicgraph.set_value(index, 'genre', musicgraph_several.loc[index].genre)
        if not pd.isnull(musicgraph_several.loc[index].origin) :
            total_musicgraph.set_value(index, 'origin', musicgraph_several.loc[index].origin)
            musicgraph_several.drop(index, inplace=True)

In [370]:
#Intermediary save
filename = 'total_artists_processed.csv'
pd.DataFrame(total_musicgraph, columns=list(total_musicgraph.columns)).to_csv(filename, index=False, encoding="utf-8")
print('Total processed artists data saved to file')

Total processed artists data saved to file


### Artists with unresolved names

In [367]:
musicgraph_missing
musicgraph_several

Unnamed: 0,ambigous_result,genre,name,no_result,origin
602,0,Rock,"THE GOOD, THE BAD AND THE QUEEN",1,
4911,0,,"Police, Adjective",1,
7627,0,,"MASSIVEDRUM - DJ, PRODUCER & REMIXER",1,
10247,0,,"Until You See, Until You Believe",1,
12122,0,,"DieDrums, DJ",1,
12306,0,,"Naturally 7, Tom Freund",1,
12638,0,,"Carnifex, Beneath The Massacre, Molotov Soluti...",1,
13030,0,,"Heaven Shall Burn, Unearth, Rise to Remain, Na...",1,
13032,0,,"FU** ART, LET'S DANCE!",1,
14245,0,,"Ruffpack, Dancehall Soldiers, DJ Peak",1,


In [228]:
artist_dict = {}

# We will have to simplify as we don't have so much time for thorough classification of genres
Electronica = []
    artist_dict[key] = 'Electronica/Dance'

Rock = []
for key in Rock:
    artist_dict[key] = 'Rock'

Pop = []
for key in Pop:
    artist_dict[key] = 'Pop'

Reggae = []
for key in Reggae:
    artist_dict[key] = 'Reggae/Ska'

Jazz = []
for key in Jazz:
    artist_dict[key] = 'Jazz'

World = []
for key in World:
    artist_dict[key] = 'World'

RB = []
for key in RB:
    artist_dict[key] = 'Soul/R&B'

Country = []
for key in Country:
    artist_dict[key] = 'Country'

Latin = []
for key in Latin:
    artist_dict[key] = 'Latin'

Rap = []
for key in Rap:
    artist_dict[key] = 'Rap/Hip Hop'

Blues = []
for key in Blues:
    artist_dict[key] = 'Blues'

Classical = []
for key in Classical:
    artist_dict[key] = 'Classical/Opera'

Soundtracks = []
for key in Soundtracks:
    artist_dict[key] = 'Soundtracks'

Gospel = []
for key in Gospel:
    artist_dict[key] = 'Christian/Gospel'

NewAge = []
for key in NewAge:
    artist_dict[key] = 'New Age'

i=0
for index, genre in zip(musicgraph_several.index, musicgraph_several.name) :
    for word in genre.split() :
        if word in artist_dict :
            musicgraph_several.set_value(index, 'genre', genre_dict[word])
            i+=1
    print(i)


2177


In [369]:
for name in musicgraph_several.name :
    if 'Jazz' in name :
        print(name)
            

In Zusammenarbeit mit der Hochschule für Musik, Abteilung Jazz
Swiss Jazz Orchestra, Latin Night
Flying Contrary Fellows, Carles Peris (fl, sax), Jacques Widmer (dr), Max E. Keller, (p), hosted by Jazz am Mittwoch
Gabriel Beurle, Jazzkollektiv Basel
Marianne Racine Quartet, Marianne Racine (voc, p), Daniel Baschnagel (tp), Patrick Sommer (b), Pius Baschnagel (dr), Reihe: Jazzapero
Jazz Passengers, Jörg Hauser (p), Sabine Siegrist (ts), Rougi Rothenbühler (ss, as, ts), Peter Tico Keller (b), Hannes Hänggli (dr)
Racine & Dreifuss Slowmotion, Rumpel & Racine, Marianne Racine (voc, p), Marco Dreifuss (p), Jojo Kunz (b), Reihe: Jazzapero
Punkt 3, Jazzkollektiv Basel
Der Wawawa, Benedikt Reising (sax), Marco Müller (b), Rico Baumann (dr), Reihe: Jazz am Mittwoch
Lisette Spinnler & Christoph Stiefel, hosted by Jazz im Seefeld
Swiss Jazz Orchestra, tribute night to Dave Holland
Gilad Atzmon Orient House Ensemble (ISR), Gilad Atzmon (sax), Frank Harris (p), Eddie Hicks (dr), Aaron Stavi (b), 50