# Preprocessing pipeline
In this notebook, we will clean and agglomerate the data acquired from several platforms to be used conveniently for analysis.

In [17]:
import pandas as pd
import os
import glob
import urllib
import requests
import time
import json
from pandas.io.json import json_normalize
from IPython.display import clear_output
import numpy as np
import bandsInTownHelper as bandsInTownHelper

import pycountry
import country_demonyms

In [18]:
#Load the total_x events data into several DataFrame.

total_eventsch = pd.read_csv(os.path.join('./total_eventsch.csv'))
total_bands_in_town = pd.read_csv(os.path.join('./total_bands_in_town.csv'))
total_residentadvisor = pd.read_csv(os.path.join('./total_residentadvisor.csv'))
total_routedesfestivals = pd.read_csv(os.path.join('./total_routedesfestivals.csv'))

## Unifying the representation
The data gathered on several platforms came in a handful of shapes that we should now normalize while retaining as much information. We will discards platforms only'ids.

#### Events.ch
à compléter

In [19]:
total_eventsch = pd.read_csv(os.path.join('./total_eventsch.csv'))
total_eventsch['Date'] = pd.to_datetime(total_eventsch['Date'])
total_eventsch['Date'] = total_eventsch['Date'].apply( lambda x: x.date() )

total_eventsch_classic = total_eventsch.loc[total_eventsch.Genre == 'Classic']
total_eventsch = total_eventsch.select(lambda x: x not in total_eventsch_classic.index)
#musicgraph_several = musicgraph_missing[musicgraph_missing.name.str.contains(',')]

df_columns=['Artist', 'Date', 'Genre', 'Venue', 'City']
total_eventsch_processed=pd.DataFrame(columns=df_columns)

for index, artist in zip(total_eventsch.index, total_eventsch.Artist) :
    for name in artist.split(',') :
        name=name.strip()
        if  (len(name) != 0 and name[0].isupper()) :
            new_row = pd.DataFrame([[name, total_eventsch.loc[index].Date, total_eventsch.loc[index].Genre, total_eventsch.loc[index].Venue, total_eventsch.loc[index].City]], columns = df_columns)
            total_eventsch_processed = total_eventsch_processed.append(new_row)
            
total_eventsch_processed.reset_index(drop = True, inplace=True)

In [20]:
#Find origin based on artist name

#Create a dict of Country adjective to Country name
country_dict = {}
for key, value in country_demonyms.COUNTRY_DEMONYMS.items():
    country_dict[value.lower()] = key.lower().title()

country_name = []
country_alpha2 = []
country_alpha3 = []
for country in list(pycountry.countries) :
    if ' ' not in country.name :
        country_name.append(country.name)
    country_alpha2.append(country.alpha_2)
    country_alpha3.append(country.alpha_3)
country_alpha2.remove('DJ')
country_alpha2.remove('MC') 
#not sure this is true
#country_dict['(de)'] = 'Germany'
country_dict['schweizer'] = 'Switzerland'



for index, genre in zip(total_eventsch_processed.index, total_eventsch_processed.Artist) :
    for word in genre.split() :
        if word in country_name :
            total_eventsch_processed.set_value(index, 'origin', word)
        elif (word[1:-1] in country_alpha2 and '(' in word and ')' in word) :
            total_eventsch_processed.set_value(index, 'origin', pycountry.countries.lookup(word[1:-1]).name)
        elif (word[1:-1] in country_alpha3 and '(' in word and ')' in word) :
            total_eventsch_processed.set_value(index, 'origin', pycountry.countries.lookup(word[1:-1]).name)
        elif word.lower() in country_dict :
            total_eventsch_processed.set_value(index, 'origin', country_dict[word.lower()])

total_eventsch_processed.Artist.replace(' \(.*\) .*','', regex=True, inplace=True)
total_eventsch_processed.Artist.replace(' \(.*\)','', regex=True, inplace=True)
total_eventsch_processed.Artist.replace(' \(.*','', regex=True, inplace=True)

In [21]:
total_eventsch_parsed = pd.concat([total_eventsch_processed, total_eventsch_classic])
total_eventsch_parsed

#Saved parsed events to csv
filename = 'total_eventsch_parsed.csv'
pd.DataFrame(total_eventsch_parsed, columns=list(total_eventsch_parsed.columns)).to_csv(filename, index=True, encoding="utf-8")
print('Total parsed event data from Events.ch saved to file')

Total parsed event data from Events.ch saved to file


In [47]:
total_eventsch_parsed.head()
total_eventsch_parsed.rename(columns={'Genre': 'genre'}, inplace=True)
total_eventsch_parsed['Latitude'] = np.nan
total_eventsch_parsed['Longitude'] = np.nan
total_eventsch_parsed['Adress'] = np.nan
total_eventsch_parsed.head()

Unnamed: 0,Artist,City,Date,genre,Venue,origin,Latitude,Longitude,Adress
0,DJs Patric Pleasure,Basel,2017-01-20,"Hip Hop, R'n'B",Balz,,,,
1,Ramon Ramones,Basel,2017-01-20,"Hip Hop, R'n'B",Balz,,,,
2,Bülent Ceylan,Zürich,2017-01-19,"Ragga, Reggae, African Music, Dancehall",Vior Club,Germany,,,
3,Raffi Lusso,Zürich,2017-01-19,"Ragga, Reggae, African Music, Dancehall",Vior Club,,,,
4,Miguel M,Zürich,2017-01-19,"Ragga, Reggae, African Music, Dancehall",Vior Club,,,,


In [22]:
total_eventsch_artists = total_eventsch_processed.copy()
total_eventsch_artists.drop('Date', axis=1, inplace = True)
total_eventsch_artists.drop('Venue', axis=1, inplace = True)
total_eventsch_artists.drop('City', axis=1, inplace = True)
total_eventsch_artists.columns = ['name', 'genre', 'origin']
total_eventsch_artists["ambigous_result"] = np.nan
total_eventsch_artists["no_result"] = np.nan
total_eventsch_artists.drop_duplicates('name', inplace= True)
total_eventsch_artists.name.size


19179

#### BandsInTown
à compléter

In [34]:
total_bands_in_town = pd.read_csv(os.path.join('./total_bands_in_town.csv'))
total_bands_in_town.drop(['artist_url', 'event_id', 'event_url', 'event_venue.region', 'event_venue.url', 'event_venue.id'], 1, inplace=True)

#Convert time column to datetime objects
total_bands_in_town['event_datetime'] = pd.to_datetime(total_bands_in_town['event_datetime'])
#Remove time from dates
total_bands_in_town['event_datetime'] = total_bands_in_town['event_datetime'].apply( lambda x: x.date() )

In [35]:
total_bands_in_town.head(10)

Unnamed: 0,artist_name,event_datetime,event_venue.city,event_venue.latitude,event_venue.longitude,event_venue.name
0,Groombridge,2006-01-06,Langenthal,47.21206,7.789998,Rock in Church
1,Painhead,2006-01-07,Rorschach,47.477928,9.49519,Hafenbuffet
2,shEver,2006-01-14,Zug,47.18222,8.52076,Industrie 45
3,Painhead,2006-01-15,Gossau (Sankt Gallen),47.414415,9.25495,The Office
4,Mando Diao,2006-01-21,Laax,46.8,9.25,PALACE CLUB AT RIDERS PALACE
5,Foo Fighters,2006-01-25,Winterthur,47.495655,8.74848,Eishalle Duetwag
6,Groombridge,2006-01-27,Burgdorf,47.05,7.616667,Gymfest
7,Groombridge,2007-01-01,Berne,46.948432,7.440461,ONO
8,Painhead,2007-01-06,Sommeri,47.566667,9.283333,Löwenarena
9,shEver,2007-01-13,Zurich,47.38662,8.53438,Werk21


In [36]:
total_bands_in_town.columns = ['Artist', 'Date', 'City', 'Latitude', 'Longitude', 'Venue']
total_bands_in_town['genre'] = np.nan
total_bands_in_town['origin'] = np.nan
total_bands_in_town['Adress'] = np.nan
total_bands_in_town

Unnamed: 0,Artist,Date,City,Latitude,Longitude,Venue,genre,origin,Adress
0,Groombridge,2006-01-06,Langenthal,47.212060,7.789998,Rock in Church,,,
1,Painhead,2006-01-07,Rorschach,47.477928,9.495190,Hafenbuffet,,,
2,shEver,2006-01-14,Zug,47.182220,8.520760,Industrie 45,,,
3,Painhead,2006-01-15,Gossau (Sankt Gallen),47.414415,9.254950,The Office,,,
4,Mando Diao,2006-01-21,Laax,46.800000,9.250000,PALACE CLUB AT RIDERS PALACE,,,
5,Foo Fighters,2006-01-25,Winterthur,47.495655,8.748480,Eishalle Duetwag,,,
6,Groombridge,2006-01-27,Burgdorf,47.050000,7.616667,Gymfest,,,
7,Groombridge,2007-01-01,Berne,46.948432,7.440461,ONO,,,
8,Painhead,2007-01-06,Sommeri,47.566667,9.283333,Löwenarena,,,
9,shEver,2007-01-13,Zurich,47.386620,8.534380,Werk21,,,


##### Venues location
Bands in Town data associates coordinates to each venue, which could be interesting for representing our findings on maps. We will extract those in a separate DataFrame, and then drop the coordinates from the events DataFrame. For simplicity, we will leave in the venue and city columns. We will come back to the venues DataFrame later on.

In [9]:
#Better to drop the coordinates from the event frame for clarity. We'll add them back later
venues = total_bands_in_town[['event_venue.name', 'event_venue.city', 'event_venue.latitude', 'event_venue.longitude']].copy()
total_bands_in_town.drop(['event_venue.latitude', 'event_venue.longitude'], 1, inplace=True)

venues.drop_duplicates(subset=['event_venue.longitude', 'event_venue.latitude'], inplace=True)
venues.drop_duplicates('event_venue.name', inplace=True)
venues.set_index('event_venue.name', drop=True, append=False, inplace=True)
del venues.index.name
venues = venues.sort_index()

venues_ra = total_residentadvisor[['club_name', 'club_adress']].copy()
venues_ra.drop_duplicates('club_name', inplace=True)
venues_ra.set_index('club_name', drop=True, append=False, inplace=True)
del venues_ra.index.name

total_bands_in_town.columns = ['Artist', 'Date', 'City', 'Venue']

#### ResidentAdvisor
à compléter

In [28]:
total_residentadvisor.head(10)

Unnamed: 0,club_adress,club_name,date,artiste
0,Selnaustrasse 2,2. Akt Restaurant & Bar,"Sat, 19 Sep 2015",Affani
1,Selnaustrasse 2,2. Akt Restaurant & Bar,"Sat, 19 Sep 2015",Mark Faermont
2,Selnaustrasse 2,2. Akt Restaurant & Bar,"Sat, 12 Sep 2015",Mucho Stylez
3,Selnaustrasse 2,2. Akt Restaurant & Bar,"Fri, 3 Jul 2015",Mucho Stylez
4,Selnaustrasse 2,2. Akt Restaurant & Bar,"Sat, 13 Jun 2015",Mucho Stylez
5,Selnaustrasse 2,2. Akt Restaurant & Bar,"Sat, 21 Mar 2015",Mark Faermont
6,Selnaustrasse 2,2. Akt Restaurant & Bar,"Sat, 17 Jan 2015",Carlos Russo
7,Selnaustrasse 2,2. Akt Restaurant & Bar,"Sat, 17 Jan 2015",Mark Faermont
8,Selnaustrasse 2,2. Akt Restaurant & Bar,"Sat, 29 Nov 2014",Tonka
9,Selnaustrasse 2,2. Akt Restaurant & Bar,"Sat, 29 Nov 2014",Mark Faermont


In [38]:
total_residentadvisor = pd.read_csv(os.path.join('./total_residentadvisor.csv'))

#Extract cities from addresses of clubs in ResidentAdvisor data
for i, address in zip(total_residentadvisor.index, total_residentadvisor.club_adress) :
    total_residentadvisor.set_value(i, 'City', address.rsplit(None, 1)[-1])
    

#Manually fill wrong entries
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['City'] == '2' ].index.tolist(), 'City', 'Zurich')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['City'] == '1009' ].index.tolist(), 'City', 'Pully')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['City'] == '4001' ].index.tolist(), 'City', 'Basel')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['City'] == '1227' ].index.tolist(), 'City', 'Geneva')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['City'] == 'BL' ].index.tolist(), 'City', 'Münchenstein')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['City'] == '40/42' ].index.tolist(), 'City', 'Wetzikon')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['City'] == '22' ].index.tolist(), 'City', 'Bern')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['City'] == '/' ].index.tolist(), 'City', 'Murten')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['City'] == '253' ].index.tolist(), 'City', 'Les Diablerets')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['City'] == '14' ].index.tolist(), 'City', 'Basel')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['City'] == '10' ].index.tolist(), 'City', 'Basel')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['City'] == '-' ].index.tolist(), 'City', 'Biel/Bienne')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['City'] == '(GR)' ].index.tolist(), 'City', 'Klosters')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['City'] == 'ZH' ].index.tolist(), 'City', 'Zurich')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['City'] == '4058' ].index.tolist(), 'City', 'Basel')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['City'] == '1' ].index.tolist(), 'City', 'Baden')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['City'] == '6020' ].index.tolist(), 'City', 'Emmenbrücke')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['City'] == '8143' ].index.tolist(), 'City', 'Zurich')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['City'] == '2,6612,Ascona,(Ti),CH' ].index.tolist(), 'City', 'Ascona')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['City'] == 'ZG' ].index.tolist(), 'City', 'Baar ')

total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['club_adress'] == 'Rue des Grands-Vergers, 1957 Ardon, CH' ].index.tolist(), 'City', 'Ardon')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['club_adress'] == 'Av. de Tivoli 3, Fribourg, 1700, CH' ].index.tolist(), 'City', 'Fribourg')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['club_adress'] == 'Köniz, 3098, 9 Schulhausgässli, CH' ].index.tolist(), 'City', 'Köniz')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['club_adress'] == 'Unterer Graben 17, 9000 St. Gallen, CH' ].index.tolist(), 'City', 'St. Gallen')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['club_adress'] == 'Viale Castagnola 6, 6900 Lugano, CH' ].index.tolist(), 'City', 'Lugano')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['club_adress'] == 'Via Alla Foce 1, 6982 Agno, Ticino, CH' ].index.tolist(), 'City', 'Agno')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['club_adress'] == 'Via Pioda 12, 6900 Lugano, CH' ].index.tolist(), 'City', 'Lugano')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['club_adress'] == 'Ancienne-Pointe 16, 1920 Martigny, CH' ].index.tolist(), 'City', 'Martigny')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['club_adress'] == 'Freilager-Platz 9, 4142 Münchenstein/Basel, CH' ].index.tolist(), 'City', 'Basel')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['club_adress'] == 'Via Industria 4; 6814, Lamone Ticino' ].index.tolist(), 'City', 'Lamone')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['club_adress'] == 'Ponte Capriasca, Ticino' ].index.tolist(), 'City', 'Ponte Capriasca')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['club_adress'] == 'Case postale 352 Crans, Valais' ].index.tolist(), 'City', 'Crans-Montana')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['club_adress'] == 'Place centrale, 1997 Nendaz, Valais' ].index.tolist(), 'City', 'Nendaz')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['club_adress'] == 'Place de la Gare, 1957 Ardon, Valais, Suisse' ].index.tolist(), 'City', 'Ardon')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['club_adress'] == 'Rue du Commerce 122, 2300 La Chaux-de-Fonds, Suisse' ].index.tolist(), 'City', 'La Chaux-de-Fonds')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['club_adress'] == 'Place Centrale, 1870 Monthey, Valais, Suisse' ].index.tolist(), 'City', 'Monthey')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['club_adress'] == 'Chemin des Batailles, 1214 VERNIER (GENEVE), Suisse' ].index.tolist(), 'City', 'Vernier')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['club_adress'] == 'Rue de Vevey 34, 1630 Bulle, SWITZERLAND' ].index.tolist(), 'City', 'Bulle')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['club_adress'] == 'Sant Gallen, SWITZERLAND' ].index.tolist(), 'City', 'St. Gallen')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['club_adress'] == 'Steinberggasse 16' ].index.tolist(), 'City', 'Winterthur')
total_residentadvisor.set_value(total_residentadvisor.loc[total_residentadvisor['club_adress'] == 'St. Annagasse 16' ].index.tolist(), 'City', 'Zurich')

total_residentadvisor.drop(total_residentadvisor.loc[total_residentadvisor['City'] == 'Liechtenstein' ].index.tolist(), inplace=True)

#total_residentadvisor.drop(['club_adress'], 1, inplace=True)

#Convert events date to datetime objects
for i, date in zip(total_residentadvisor.index, total_residentadvisor.date) :
    total_residentadvisor.set_value(i, 'date', pd.to_datetime(date[5:]))
#Remove hour from dates
total_residentadvisor['date'] = total_residentadvisor['date'].apply( lambda x: x.date() )        

total_residentadvisor.columns = ['Adress', 'Venue', 'Date', 'Artist', 'City']
total_residentadvisor['genre'] = np.nan
total_residentadvisor['origin'] = np.nan
total_residentadvisor['Latitude'] = np.nan
total_residentadvisor['Longitude'] = np.nan

In [62]:
total_residentadvisor.Adress.unique()

array(['Selnaustrasse 2', 'Pfingstweidstrasse 102, 8005 Zürich',
       'Nüschelerstrasse 31, 8001 Zürich',
       'Alexander-Schönistrasse 17, 2502, Biel/Bienne',
       'Heinrichstrasse 262, 8005 Zürich', 'Mühlenplatz 11, Bern',
       'Chemin du Stand 5; Pully, 1009',
       'Pfingstweidstrasse 6, 8005 Zürich',
       'Place de la Gare, 1957 Ardon, Valais, Suisse',
       'Binningerstrasse 14; 4051, Basel',
       'Seidenhofstrasse 5; 6002, Luzern',
       'Niederdorfstrasse 64, 8001 Zürich', 'Steinberggasse 16',
       'Rue de la Rôtisserie 10; 1204, Genève',
       'Limmatstrasse 275, 8005 Zürich', 'Sihlquai 268; 8005, Zürich',
       'Kanonengasse 16; 8004, Zürich', 'Zollstrasse 80, 8005 Zürich',
       "Esplanade de l'ancienne gare, 1700 Fribourg",
       'Rue du Commerce 122, 2300 La Chaux-de-Fonds, Suisse',
       'Binningerstrasse 14', 'Rheingasse 8, 4058 Basel',
       'Lehenmattstrasse 353, 4052 Basel',
       'Via Angelo Maspoli, 24 CH-6850 Mendrisio',
       'Rue Henry-Gr

#### RouteDesFestivals
à compléter

In [40]:
total_routedesfestivals = pd.read_csv(os.path.join('./total_routedesfestivals.csv'))
#make the three time columns into a single date column

total_routedesfestivals.month.unique()
for i, month in zip(total_routedesfestivals.index, total_routedesfestivals.month) :
    if month == 'Jan.':
        total_routedesfestivals.set_value(i, 'month', 1.0)
    if month == 'Fev.':
        total_routedesfestivals.set_value(i, 'month', 2.0)
    if month == 'Mar.':
        total_routedesfestivals.set_value(i, 'month', 3.0)
    if month == 'Avr.':
        total_routedesfestivals.set_value(i, 'month', 4.0)    
    if month == 'Mai':
        total_routedesfestivals.set_value(i, 'month', 5.0)   
    if month == 'Juin':
        total_routedesfestivals.set_value(i, 'month', 6.0)      
    if month == 'Juil.':
        total_routedesfestivals.set_value(i, 'month', 7.0)
    if month == 'Aout':
        total_routedesfestivals.set_value(i, 'month', 8.0)    
    if month == 'Sep.':
        total_routedesfestivals.set_value(i, 'month', 9.0)
    if month == 'Oct.':
        total_routedesfestivals.set_value(i, 'month', 10.0)        
    if month == 'Nov.':
        total_routedesfestivals.set_value(i, 'month', 11.0)   
    if month == 'Dec.':
        total_routedesfestivals.set_value(i, 'month', 12.0)
        
total_routedesfestivals.dropna(0, inplace=True)    

total_routedesfestivals.month.apply(lambda x: pd.to_numeric(x))
total_routedesfestivals['Date'] = pd.to_datetime(total_routedesfestivals.year*10000 + total_routedesfestivals.month*100 + total_routedesfestivals.day, format="%Y%m%d")
total_routedesfestivals.drop(['day', 'month', 'year'], 1, inplace=True)

#We add this line to avoid type error on the datetimeindex when concatenating all frames
total_routedesfestivals['Date'] = pd.to_datetime(total_routedesfestivals['Date'])
#Remove hour from dates
total_routedesfestivals['Date'] = total_routedesfestivals['Date'].apply( lambda x: x.date() )

total_routedesfestivals.columns = ['Venue', 'Artist', 'City', 'Date']
total_routedesfestivals['genre'] = np.nan
total_routedesfestivals['origin'] = np.nan
total_routedesfestivals['Latitude'] = np.nan
total_routedesfestivals['Longitude'] = np.nan
total_routedesfestivals['Adress'] = np.nan

In [41]:
total_routedesfestivals.head(10)

Unnamed: 0,Venue,Artist,City,Date,genre,origin,Latitude,Longitude,Adress
0,6 HOURS OF SYMPHONIA,SYNMETALIUM,Lausanne,2017-04-01,,,,,
1,6 HOURS OF SYMPHONIA,EVENMORE,Lausanne,2017-04-01,,,,,
2,6 HOURS OF SYMPHONIA,SECHEM,Lausanne,2017-04-01,,,,,
3,6 HOURS OF SYMPHONIA,BEYOND FORGIVENESS,Lausanne,2017-04-01,,,,,
4,ANTIGEL,ZERO,Geneve,2017-01-27,,,,,
5,ANTIGEL,MAY B (MAGUY MARIN),Geneve,2017-01-28,,,,,
6,ANTIGEL,MAY B (MAGUY MARIN),Geneve,2017-01-29,,,,,
7,ANTIGEL,TRENTEMOLLER,Geneve,2017-02-02,,,,,
8,ANTIGEL,THE NOTWIST,Geneve,2017-02-03,,,,,
9,ANTIGEL,HENRI DES,Geneve,2017-02-05,,,,,


#### Putting it all together

In [55]:
total_events = pd.DataFrame(columns=total_eventsch_parsed.columns)
total_events = pd.concat([total_eventsch_parsed, total_bands_in_town, total_routedesfestivals, total_residentadvisor])

#otal_events.set_index('Date', drop=True, append=False, inplace=True)
#del total_events.index.name

#TypeError: can't compare datetime.datetime to datetime.date
total_events.reset_index(drop = True, inplace=True)


In [57]:
total_events.index.size

204815

In [65]:
#Write the DataFrame to a csv file
filename = 'total_events.csv'
pd.DataFrame(total_events, columns=list(total_events.columns)).to_csv(filename, index=False, encoding="utf-8")
print('Total events data saved to file')

Total events data saved to file


## Genres and origins

Divide the data we have after calling the MusicGraph API into two subsets : one which has value filled in nicely (~35%) which  we'll call clean, another one with 'assumed' correct artist names but missing genre and origin information, and a third where information is missing, and rows may contain more than one artist in their name. The last subset may require extra handling care with regard to the events frame.
BIG PART of exploratory data analysis


In [60]:
#Get Spotify genre data
total_spotify    = pd.read_csv(os.path.join('./total_artists_Spotify.csv'))
total_spotify    = total_spotify.loc[pd.isnull(total_spotify['genre']) == False]
#Get MusicGraph genre and origin data
total_musicgraph = pd.read_csv(os.path.join('./total_artists_MusicGraph.csv'))

total_eventsch_artist_del = pd.DataFrame(pd.read_csv(os.path.join('./total_eventsch.csv')))
total_eventsch_artist_del

total_eventsch_artist_del.drop(['Date'], 1, inplace=True)
total_eventsch_artist_del.drop(['Genre'], 1, inplace=True)
total_eventsch_artist_del.drop(['Venue'], 1, inplace=True)
total_eventsch_artist_del.drop(['City'], 1, inplace=True)
total_eventsch_artist_del.drop_duplicates(['Artist'], inplace=True)
total_eventsch_artist_del.set_index('Artist', drop=True, append=False, inplace=True)
del total_eventsch_artist_del.index.name


total_musicgraph.set_index('name', drop=True, append=False, inplace=True)
del total_musicgraph.index.name

print(total_musicgraph.index.size)

for index in total_eventsch_artist_del.index :
        if index in total_musicgraph.index:
            total_musicgraph.drop(index, inplace=True)
            
total_eventsch_artists.set_index('name', drop=True, append=False, inplace=True)
del total_eventsch_artists.index.name

print(total_musicgraph.index.size)


print(total_musicgraph.index.size)

total_musicgraph

62000
47457
47457


Unnamed: 0,ambigous_result,genre,no_result,origin
Colin Dale,0,Electronica/Dance,0,England
Pino Arduini,0,Electronica/Dance,0,
Jeff Mills,0,Electronica/Dance,0,United States of America
Paul Van Dyk,0,Electronica/Dance,0,Germany
DJ Hell,0,Electronica/Dance,0,Germany
Max Durante,0,Electronica/Dance,0,Italy
Willow,0,Electronica/Dance,0,United States of America
St.Paul,0,,1,
Patrick Zigon,0,Electronica/Dance,0,
Oliver Klein,0,Electronica/Dance,0,Germany


In [61]:
total_musicgraph.index.size

47457

In [63]:
print('genres missing before', (pd.isnull(total_musicgraph.genre)).sum())
print('origins missing before', (pd.isnull(total_musicgraph.origin)).sum())
print('rows in eventsch before', total_eventsch_artists.index.size)
i=0
for index in total_eventsch_artists.index :
    if index in total_musicgraph.index:
        if pd.isnull(total_musicgraph.loc[index].genre):
            total_musicgraph.set_value(index, 'genre', total_eventsch_artists.loc[index].genre)

        if ((pd.isnull(total_eventsch_artists.loc[index].origin) == False) and (pd.isnull(total_musicgraph.loc[index].origin))) :
            total_musicgraph.set_value(index, 'origin', total_eventsch_artists.loc[index].origin)
                     
        total_eventsch_artists.drop(index, inplace=True)    

print('genres missing after', (pd.isnull(total_musicgraph.genre)).sum())
print('origins missing after', (pd.isnull(total_musicgraph.origin)).sum())
print('rows in eventsch after', total_eventsch_artists.index.size)

genres missing before 29769
origins missing before 34056
rows in eventsch before 19179
180
genres missing after 28862
origins missing after 33876
rows in eventsch after 16057


### Events.ch OK 

In [74]:
total_eventsch_artists.index.name = 'name'
total_eventsch_artists.reset_index()
#Write the DataFrame to a csv file
filename = 'total_eventsch_artists.csv'
pd.DataFrame(total_eventsch_artists, columns=list(total_eventsch_artists.columns)).to_csv(filename, index=True, encoding="utf-8")
print('Total events data saved to file')
#OK

Total events data saved to file


In [513]:
total_musicgraph2 = total_musicgraph2.copy()
total_musicgraph.index.size

47457

In [66]:
musicgraph_missing = total_musicgraph.loc[(pd.isnull(total_musicgraph.genre) | pd.isnull(total_musicgraph.origin))]
musicgraph_several = musicgraph_missing[musicgraph_missing.index.str.contains(',')]
musicgraph_missing = musicgraph_missing.select(lambda x: x not in musicgraph_several.index)


First, we will try to fill in the missing genre value with the data acquired from Spotify. To do so, we first have to clean Spotify data, which gives us very specific genres instead of global names such as MusicGenre. Some origin information may also be included in the specific genres, which we should look for carefully.


In [67]:
total_spotify.set_index('name', drop=True, append=False, inplace=True)
del total_spotify.index.name
total_spotify

Unnamed: 0,ambigous_result,genre,no_result,origin
Jeff Mills,0,acid house,0,
Paul Van Dyk,0,disco house,0,
DJ Hell,0,electroclash,0,
Willow,0,float house,0,
Patrick Zigon,0,german techno,0,
Taucher,0,bubble trance,0,
Mando Diao,0,garage rock,0,
Foo Fighters,0,alternative metal,0,
Agnès,0,minimal tech house,0,
Mirko Loko,0,minimal tech house,0,


In [68]:
country_dict['persian'] = 'Iran'
country_dict['breton'] = 'France'
country_dict['argentine'] = 'Argentina'
country_dict['fado'] = 'Portugal'
country_dict['quebecois'] = 'Canada'
country_dict['americana'] = 'United States'
country_dict['j-ambient'] = 'Japan'
country_dict['k-pop'] = 'Korea'
country_dict['uk'] = 'United Kingdom'
country_dict['k-indie'] = 'Korea'
country_dict['j-reggae'] = 'Japan'
country_dict['j-metal'] = 'Japan'
country_dict['j-core'] = 'Japan'
country_dict['j-punk'] = 'Japan'
country_dict['sertanejo'] = 'Brasil'
country_dict['japanoise'] = 'Japan'
country_dict['magyar'] = 'Hungary'
country_dict['j-rock'] = 'Japan'
country_dict['francais'] = 'France'
country_dict['chalga'] = 'Bulgaria'
country_dict['napoletana'] = 'Italy'
country_dict['bhangra'] = 'India'
country_dict['carnatic'] = 'India'
country_dict['forro'] = 'Brasil'
country_dict['entehno'] = 'Greece'
country_dict['bay'] = 'United States'
country_dict['schlager'] = 'Germany'
country_dict['coast'] = 'United States'
country_dict['j-dance'] = 'Japan'
country_dict['k-hop'] = 'Korea'
country_dict['francoton'] = 'France'
country_dict['corsican'] = 'France'
country_dict['british'] = 'United Kingdom'
country_dict['c-pop'] = 'China'


In [69]:
total_spotify = total_spotify.select(lambda x: (x in musicgraph_several.index) or (x in musicgraph_missing.index))
print('Total number of genres from Spotify :', total_spotify.genre.unique().size)
print('Total lines from Spotify with genre :', total_spotify.index.size)

Total number of genres from Spotify : 823
Total lines from Spotify with genre : 3595


In [70]:
print('origins missing before', (pd.isnull(musicgraph_missing.origin)).sum())
print('origins missing before', (pd.isnull(musicgraph_several.origin)).sum())


for index, genre in zip(total_spotify.index, total_spotify.genre) :
    for word in genre.split() :
        if word in country_dict :
            if index in musicgraph_missing.index :
                musicgraph_missing.set_value(index, 'origin', country_dict[word])
            elif index in musicgraph_several.index :
                musicgraph_several.set_value(index, 'origin', country_dict[word])
            

print('origins missing after', (pd.isnull(musicgraph_missing.origin)).sum())
print('origins missing after', (pd.isnull(musicgraph_several.origin)).sum())

origins missing before 33757
origins missing before 119
origins missing after 33393
origins missing after 119


In [71]:
genre_dict = {}

# We will have to simplify as we don't have so much time for thorough classification of genres
Electronica = ['house','aggrotech','danspunk', 'brostep', 'abstract', 'chillwave','drone', 'chill', 'beats', 'experimental','electropunk',  'turbo', 'balearic','dance-punk', 'ebm','edm', 'j-dance', 'chillstep','darkpsy', 'darkstep', 'chalga', 'japanoise', 'lounge', 'psytrance', 'tekno','indietronica', 'electronica',  'techno','disco', 'j-ambient',   'noise', 'bass', 'electroclash', 'wave', 'trance', 'ambient', 'dancehall', 'beat', 'dance', 'dub', 'electro', 'eurodance', 'dubstep', 'electronic', 'psych', 'industrial', 'microhouse', 'electrofox', ]
for key in Electronica:
    genre_dict[key] = 'Electronica/Dance'
Rock = ['rock','rock-and-roll','neo-progressive','tribute','post-screamo', 'hardstyle', 'speedcore', 'neo-psychedelic', 'ostrock','neo-rockabilly', 'britpop', 'j-punk','grunge','breakcore', 'goregrind','orgcore','j-rock', 'alternative', 'j-core', 'j-metal', 'k-indie', 'screamocore', 'grindcore', 'nerdcore',  'doomcore', 'sludge',   'core','deathcore',  'gamecore', 'metalcore','post-punk', 'garage','thrash','post-metal', 'psychobilly', 'edge', 'mathcore',  'punk', 'emo', 'indie', 'metal', 'hardcore',  'djent', 'doom', 'glam', 'oi', 'nwobhm']
for key in Rock:
    genre_dict[key] = 'Rock'
Pop = ['pop','popgaze', 'idol','etherpop','anti-folk',  'chanson','c-pop', 'k-pop', 'europop', 'neo-synthpop', 'synthpop', 'folk-pop', 'freak', 'eurovision', 'futurepop']
for key in Pop:
    genre_dict[key] = 'Pop'
Reggae = ['reggae', 'ska', 'reggaeton', 'euroska','j-reggae' ]
for key in Reggae:
    genre_dict[key] = 'Reggae/Ska'
Jazz = ['jazz', 'bebop', 'ragtime', 'afrobeat']
for key in Jazz:
    genre_dict[key] = 'Jazz'
World = ['rai','accordeon', 'entehno',  'african','schlager','corsican','breton', 'asian', 'british',  'arab','armenian', 'kurdish',  'balkan', 'world', 'napoletana','bhangra', 'polka', 'folkmusik', 'andean', 'panpipe', 'maghreb','magyar',  'fado','traditional', 'quebecois', 'carnatic', 'native', 'klezmer', 'world', 'celtic', 'bangla', 'pagode', 'flamenco', 'throat', 'medieval', 'capoeira']
for key in World:
    genre_dict[key] = 'World'
RB = ['r&b', 'funk', 'funky', 'soul']
for key in RB:
    genre_dict[key] = 'Soul/R&B'
Country = ['bluegrass', 'country', 'barbershop', 'americana', 'bluegrass', 'cajun']
for key in Country:
    genre_dict[key] = 'Country'
Latin = ['forro' ,'nu-cumbia', 'sertanejo', 'salsa','tango','merengue', 'bachata', 'rumba', 'nova', 'latin', 'cumbia']
for key in Latin:
    genre_dict[key] = 'Latin'
Rap = ['hop', 'rap', 'trap', 'k-hop', 'francoton']
for key in Rap:
    genre_dict[key] = 'Rap/Hip Hop'
Blues = ['blues', 'blues-rock', 'swing', 'boogie-woogie']
for key in Blues:
    genre_dict[key] = 'Blues'
Classical = ['cello','cappella',  'concert', 'opera', 'choral', 'clarinet', 'classical', 'violin', 'harpsichord', 'string', 'brass', 'orchestral', 'baroque', 'harp', 'early']
for key in Classical:
    genre_dict[key] = 'Classical/Opera'
Soundtracks = ['movie', 'tunes', 'hollywood', 'soundtrack' ]
for key in Soundtracks:
    genre_dict[key] = 'Soundtracks'
Gospel = ['gospel', 'christian', 'liturgical', 'christmas', 'ccm', 'worship']
for key in Gospel:
    genre_dict[key] = 'Christian/Gospel'
NewAge = ['age', 'kirtan', 'didgeridoo']
for key in NewAge:
    genre_dict[key] = 'New Age'

print('genres missing before', (pd.isnull(musicgraph_missing.genre)).sum())
print('genres missing before', (pd.isnull(musicgraph_several.genre)).sum())    
    
for index, genre in zip(total_spotify.index, total_spotify.genre) :
    for word in genre.split() :
        if word in genre_dict :
            total_spotify.set_value(index, 'genre', genre_dict[word])
            if index in musicgraph_missing.index :
                musicgraph_missing.set_value(index, 'genre', genre_dict[word])
            elif index in musicgraph_several.index :
                musicgraph_several.set_value(index, 'genre', genre_dict[word])
            
print('genres missing after', (pd.isnull(musicgraph_missing.genre)).sum())
print('genres missing after', (pd.isnull(musicgraph_several.genre)).sum())

genres missing before 28744
genres missing before 118
genres missing after 27478
genres missing after 116


In [72]:
genres = total_musicgraph.genre.unique()
i = 0
for genre in total_spotify.genre :
    if genre not in genres :
        i+=1
print(i/total_spotify.size*100 , '% of artists which subgenre were not parsed')


3.09457579972 % of artists which subgenre were not parsed


In [73]:
print('genres missing before', (pd.isnull(total_musicgraph.genre)).sum())
print('origins missing before', (pd.isnull(total_musicgraph.origin)).sum())

for index in musicgraph_missing.index :
    if  (pd.isnull(musicgraph_missing.loc[index].genre)==False) :
        total_musicgraph.set_value(index, 'genre', musicgraph_missing.loc[index].genre)
        if (pd.isnull(musicgraph_missing.loc[index].origin)==False) :
            total_musicgraph.set_value(index, 'origin', musicgraph_missing.loc[index].origin)
            musicgraph_missing.drop(index, inplace=True)
    elif (pd.isnull(musicgraph_missing.loc[index].origin)==False) :
            total_musicgraph.set_value(index, 'origin', musicgraph_missing.loc[index].origin)   
            
            
for index in musicgraph_several.index :
    if (pd.isnull(musicgraph_several.loc[index].genre)==False) :
        total_musicgraph.set_value(index, 'genre', musicgraph_several.loc[index].genre)
        if (pd.isnull(musicgraph_several.loc[index].origin)==False) :
            total_musicgraph.set_value(index, 'origin', musicgraph_several.loc[index].origin)
            musicgraph_several.drop(index, inplace=True)
    elif (pd.isnull(musicgraph_several.loc[index].origin)==False) :
            total_musicgraph.set_value(index, 'origin', musicgraph_several.loc[index].origin)
            
print('genres missing after', (pd.isnull(total_musicgraph.genre)).sum())
print('origins missing after', (pd.isnull(total_musicgraph.origin)).sum())

genres missing before 28862
origins missing before 33876
genres missing before 27594
origins missing before 33512


In [534]:
#Intermediary save
total_musicgraph.index.name = 'name'
total_musicgraph.reset_index()
filename = 'total_musicgraph_processed.csv'
pd.DataFrame(total_musicgraph, columns=list(total_musicgraph.columns)).to_csv(filename, index=True, encoding="utf-8")
print('Total processed artists data saved to file')

Total processed artists data saved to file


### Final preprocessing step
#### Events.ch

In [96]:
total_eventsch_artists_wiki = pd.read_csv(os.path.join('./total_eventsch_artists_wiki.csv'))
total_artists_MusicGraph_eventsCH = pd.read_csv(os.path.join('./total_artists_MusicGraph_eventsCH_CYRIL.csv'))

total_eventsch_artists_wiki.set_index('name', drop=True, append=False, inplace=True)
del total_eventsch_artists_wiki.index.name

total_artists_MusicGraph_eventsCH.set_index('name', drop=True, append=False, inplace=True)
del total_artists_MusicGraph_eventsCH.index.name

In [104]:
#Safeguard total_events_artists
total_eventsch_artists = total_eventsch_artists2.copy()
del total_eventsch_artists.index.name

In [106]:
#total_eventsch_artists_wiki
# Wikipedia gets us ~350 origins

print('origins missing before', (pd.isnull(total_eventsch_artists.origin)).sum())
for index in total_eventsch_artists_wiki.index :
        if index in total_eventsch_artists.index :
            if pd.isnull(total_eventsch_artists.loc[index].origin):
                if (pd.isnull(total_eventsch_artists_wiki.loc[index].origin)==False):
                    total_eventsch_artists.set_value(index, 'origin', total_eventsch_artists_wiki.loc[index].origin)
                
print('origins missing after', (pd.isnull(total_eventsch_artists.origin)).sum())

origins missing before 14893
origins missing after 14548


In [107]:
#Drop duplicate rows
total_artists_MusicGraph_eventsCH = total_artists_MusicGraph_eventsCH[~total_artists_MusicGraph_eventsCH.index.duplicated(keep='first')]
print(total_eventsch_artists.index.size)
print(total_artists_MusicGraph_eventsCH.index.size)

16057
16057


In [108]:
#total_artists_MusicGraph_eventsCH
# MusicGraph gets us ~1700 origins 

print('origins missing before', (pd.isnull(total_eventsch_artists.origin)).sum())
for index in total_artists_MusicGraph_eventsCH.index :
        if index in total_eventsch_artists.index :
            if pd.isnull(total_eventsch_artists.loc[index].origin):
                if (pd.isnull(total_artists_MusicGraph_eventsCH.loc[index].origin)==False):
                    total_eventsch_artists.set_value(index, 'origin', total_artists_MusicGraph_eventsCH.loc[index].origin)
                
print('origins missing after', (pd.isnull(total_eventsch_artists.origin)).sum())

origins missing before 14548
origins missing after 12806


In [130]:
#Safeguard total_events
total_events = total_events2.copy()
total_events.index.size

204815

In [132]:
#Update total_events

print('origins missing after', (pd.isnull(total_events.origin)).sum())
print('genres missing after', (pd.isnull(total_events.genre)).sum())


for index, artist in zip(total_events.index, total_events.Artist) :
    if artist in total_eventsch_artists.index :
        if ((pd.isnull(total_events.loc[index].origin)) & (pd.isnull(total_eventsch_artists.loc[artist].origin) == False)) :
            total_events.set_value(index, 'origin', total_eventsch_artists.loc[artist].origin)
        if ((pd.isnull(total_events.loc[index].genre)) & (pd.isnull(total_eventsch_artists.loc[artist].genre) == False)) :
            total_events.set_value(index, 'origin', total_eventsch_artists.loc[artist].genre)
            
print('origins missing after', (pd.isnull(total_events.origin)).sum())
print('genres missing after', (pd.isnull(total_events.genre)).sum())

origins missing after 202063
genres missing after 170200
origins missing after 191068
genres missing after 170200


####  Other artists

In [116]:
total_musicgraph_processed_wiki    = pd.read_csv(os.path.join('./total_musicgraph_processed_wiki.csv'))

#Get Spotify genre data
total_spotify    = pd.read_csv(os.path.join('./total_artists_Spotify_processed_accents.csv'))
total_spotify    = total_spotify.loc[pd.isnull(total_spotify['genre']) == False]
#Get MusicGraph genre and origin data

total_musicgraph_processed_wiki.set_index('name', drop=True, append=False, inplace=True)
del total_musicgraph_processed_wiki.index.name

total_spotify.set_index('name', drop=True, append=False, inplace=True)
del total_spotify.index.name

In [118]:
#Safeguard total_musicgraph
total_musicgraph = total_musicgraph2.copy()
total_musicgraph.index.size

47457

In [120]:
#total_musicgraph_processed_wiki
# Wikipedia gets us 240 genre and ~900 origins

print('genres missing before', (pd.isnull(total_musicgraph.genre)).sum())
print('origins missing before', (pd.isnull(total_musicgraph.origin)).sum())
for index in total_musicgraph_processed_wiki.index :
        if index in total_musicgraph.index :
            if pd.isnull(total_musicgraph.loc[index].origin):
                if (pd.isnull(total_musicgraph_processed_wiki.loc[index].origin)==False):
                    total_musicgraph.set_value(index, 'origin', total_musicgraph_processed_wiki.loc[index].origin)
            if pd.isnull(total_musicgraph.loc[index].genre):
                if (pd.isnull(total_musicgraph_processed_wiki.loc[index].genre)==False):
                    total_musicgraph.set_value(index, 'genre', total_musicgraph_processed_wiki.loc[index].genre)
                
print('genres missing before', (pd.isnull(total_musicgraph.genre)).sum())
print('origins missing before', (pd.isnull(total_musicgraph.origin)).sum())

genres missing before 27594
origins missing before 32687
genres missing before 27350
origins missing before 32687


#### Spotify

In [125]:
print('origins missing before', (pd.isnull(total_spotify.origin)).sum())

for index, genre in zip(total_spotify.index, total_spotify.genre) :
    for word in genre.split() :
        if word in country_dict :
            if pd.isnull(total_spotify.loc[index].origin): 
                total_spotify.set_value(index, 'origin', country_dict[word])            

print('origins missing after', (pd.isnull(total_spotify.origin)).sum())

#clean spotify genre names
print('nb of genres before', total_spotify.genre.unique().size)
    
for index, genre in zip(total_spotify.index, total_spotify.genre) :
    for word in genre.split() :
        if word in genre_dict :
            total_spotify.set_value(index, 'genre', genre_dict[word])
            
print('nb of genres after', total_spotify.genre.unique().size)

origins missing before 7156
origins missing after 7143
nb of genres before ['Electronica/Dance' 'Rock' 'Rock, Indie, Punk, Heavy Metal, Gothic' 'Pop'
 'electronica/dance' 'Reggae/Ska' 'pixie' 'Jazz' 'pop' 'World' 'Soul/R&B'
 'Alternative/Indie' 'Latin' 'Rap/Hip Hop' 'blues' 'rap/hip hop'
 'Soundtracks' 'world' 'Vocals' 'Blues' 'jazz' 'cuban rumba' 'rock'
 'Classical/Opera' 'alternative/indie' 'belgian rock' 'ectofolk'
 'alternative dance' "Hip Hop, R'n'B" 'New Age' 'progressive post-hardcore'
 'dance pop' 'Christian/Gospel' 'Country' 'reggae/ska' 'jam band'
 'Seasonal' 'soul/r&b' 'Folk' "Children's" 'Electronic'
 'Comedy/Spoken Word' 'doomcore' 'chanson' 'axe' 'bossa nova'
 'christian/gospel' 'brazilian indie' 'microhouse' 'Instrumental' 'mashup'
 'neo mellow' 'wonky' 'country' 'Jazz, Blues, Soul' 'latin' 'vocaloid'
 'flamenco' 'big room' 'comedy/spoken word' 'bolero' 'adult standards'
 'dwn trap' 'hoerspiel' 'folk' 'Ragga, Reggae, African Music, Dancehall'
 'french indietronica' 'vapo

In [127]:
#Drop duplicate rows
print(total_spotify.index.size)
total_spotify = total_spotify[~total_spotify.index.duplicated(keep='first')]
print(total_spotify.index.size)


20118
20117


In [133]:
#Update total_events with Spotify

print('origins missing before', (pd.isnull(total_events.origin)).sum())
print('genres missing before', (pd.isnull(total_events.genre)).sum())


for index, artist in zip(total_events.index, total_events.Artist) :
    if artist in total_spotify.index :
        if ((pd.isnull(total_events.loc[index].origin)) & (pd.isnull(total_spotify.loc[artist].origin) == False)) :
            total_events.set_value(index, 'origin', total_spotify.loc[artist].origin)
        if ((pd.isnull(total_events.loc[index].genre)) & (pd.isnull(total_spotify.loc[artist].genre) == False)) :
            total_events.set_value(index, 'genre', total_spotify.loc[artist].genre)
            
print('origins missing after', (pd.isnull(total_events.origin)).sum())
print('genres missing after', (pd.isnull(total_events.genre)).sum())

origins missing before 191068
genres missing before 170200
origins missing after 140427
genres missing after 84575


In [136]:
#Update total_events with total_musicgraph

#Update total_events

print('origins missing before', (pd.isnull(total_events.origin)).sum())
print('genres missing before', (pd.isnull(total_events.genre)).sum())


for index, artist in zip(total_events.index, total_events.Artist) :
    if artist in total_musicgraph.index :
        if ((pd.isnull(total_events.loc[index].origin)) & (pd.isnull(total_musicgraph.loc[artist].origin) == False)) :
            total_events.set_value(index, 'origin', total_musicgraph.loc[artist].origin)
        if ((pd.isnull(total_events.loc[index].genre)) & (pd.isnull(total_musicgraph.loc[artist].genre) == False)) :
            total_events.set_value(index, 'genre', total_musicgraph.loc[artist].genre)
            
print('origins missing after', (pd.isnull(total_events.origin)).sum())
print('genres missing after', (pd.isnull(total_events.genre)).sum())

origins missing before 140427
genres missing before 84575
origins missing after 135150
genres missing after 84118


## Cleaning genres and origins names

In [141]:
#Safeguard total_events
total_events = total_events2.copy()
total_events.index.size

204815

In [235]:
total_events.origin = total_events.origin.str.strip()
total_events.replace({'origin': {'United States of America': 'USA', 'U.S.':'USA', 'Hong Kong':'China', 'Arabic': np.nan, 'Alma mater': np.nan,
                                'Congo, The Democratic Republic of the': 'Congo', 'United States':'USA', 'Soviet Union (now Russia)':'Russia',
                                'Alberta': 'Canada', 'Germany / Switzerland' : 'Switzerland', 'West Germany':'Germany', 'US':'USA', 
                                'Czechia':'Czech Republic', ')': np.nan, 'Russian Federation':'Russia', 'Czechoslovakia': 'Czech Republic',
                                'Hesse':'Germany', 'ɑːˈliː/' : 'Switzerland', 'California':'USA', 'Glasperlenspiel' : 'Germany', 'History':'Switzerland',
                                'New York City':'USA', '(age\xa026)':'Ghana', 'British Columbia':'Canada', 'China Television (CTV)':'China',
                                'Surrey': 'UK', 'North London':'UK', 'England / United Kingdom':'UK', 'United Kingdom':'UK', 'Latin Continuum': np.nan,
                                'Ancient Germanic': 'Germany', 'Alabama':'USA', 'Manchester':'UK', 'okeh':'Switzerland', '/ɔːˈɡʌstᵻn/':'Switzerland',
                                'Peter':'Switzerland', 'Greek':'Greece', 'Telesistema Mexicano':'Mexico', 'Massachusetts':'USA', '1998 Nagano':'Switzerland',
                                'West Africa': 'Guinea', 'Japanese':'Japan', 'Quebec':'Canada', 'Native American':'USA', '/ˈpeɪlᵻn/':np.nan,
                                '(age\xa045)':np.nan, 'Soviet Union':'Russia', '1976 Montréal':'Canada', 'Ontario':'Canada','Anathoth':np.nan, 'Wales':'UK',
                                'MTV':np.nan, 'Korea, Republic of':'South Korea', 'Republic of Venice':'Italy', 'Africa':'Ivory Coast', 'Europe':np.nan, 'Bohemia':np.nan,
                                '(unknown)':np.nan, 'Kosovo)':'Kosovo', 'European Union':np.nan, '"':np.nan, 'Ring name(s)':np.nan, 'meaning "man".':np.nan, '2006':'Switzerland',
                                'Okinawa':'Japan', 'Guernsey':'UK', 'Hebrew via Greek and Latin.': np.nan, 'British America':'Canada', 'Hebrew: יוֹסֵף': 'Israel',
                                'Famous warrior':np.nan, '2':np.nan, 'Syndication':np.nan, 'Venezuela, Bolivarian Republic of':'Venezuela', 'North Carolina State University': 'USA',
                                'Democratic Republic of the Congo' : 'Congo', 'Stockholm':'Sweden', 'French':'France', '[citation needed]':'Germany', '[1]':'Switzerland',
                                'Germanic':'Germany', 'Jewish':'Israel', '流逝':'China', '(age\xa050)':np.nan, 'Jersey':'UK', 'Eisenach':'Germany',
                                'Austria-Hungary': 'Austria', 'Ma\'ale Adumim':np.nan, 'Lanarkshire':'UK', 'Sabah':np.nan, 'Sweden and Norway':'Sweden',
                                'Nigeria / Germany':'Nigeria', 'Krasnoyarsk':'Russia', 'North Carolina':'USA', 'CAN':'Canada', 'Rouen':'France', 'Sweden / Europe':'Sweden',
                                'Turkish':'Turkey', 'West London': 'UK', 'South Yorkshire':'UK', 'Κύριλλος (Kyrillos)':'Greece', 'German Empire':'Germany',
                                'Arizona':'USA','Nickelodeon':'USA', 'Persian Empire':'Iran', 'Kingdom of Hungary':'Hungary', 'South America':'Brasil', 
                                'Third Reich':'Germany', 'Oklahoma':'USA', 'Portuguese':'Portugal', 'Illinois - USA' : 'USA', 'Latin and Greek':'Greece',
                                'French Polynesia':'France', 'East Germany':'Germany', 'Vancouver':'Canada', 'Tel Aviv':'Israel', 'Old Norse':'Iceland',
                                'Pennsylvania':'USA', 'Austro-Hungarian Empire':'Austria', 'Indiana':'USA', 'Siam':'Thailand', 'Paris':'France', 'Los Angeles':'USA',
                                'Oregon':'USA', 'South London':'UK', 'Norwegian':'Norway', 'United States of America / Sweden / Finland / Slovenia / Austria':'USA',
                                'United States of America / Canada':'USA','United States of America / Sweden / Finland / Slovenia / Austria':'USA', 'North Africa':'Algeria',
                                'Somalia / United States of America':'Somalia', 'Navajo':'USA', 'Hamburg':'Germany', 'México':'Mexico', 'Côte d\'Ivoire':'Ivory Coast',
                                'Pembury':'UK', 'Réunion':'France', 'SFR Yugoslavia':'Serbia', 'Holland':'Netherlands', 'United States of America / England':'UK',
                                'Tuva / Asia':'Russia', 'Kerala':'India', 'New Jersey':'USA', 'French Cameroon':'Cameroon', 'Prague':'Czech Republic', 'English':'UK',
                                'Gibraltar':'UK', 'Taiwan, Province Of China':'Taiwan', 'Danish':'Danemark', 'Wisconsin':'USA', 'Wales / England':'UK',
                                '.':np.nan, '(age\xa063)':np.nan, '(age\xa035)':np.nan, '(age\xa049)':np.nan, 'D.C.':'USA', 'Curaçao':'Netherlands',
                                'France / Guadeloupe':'France', 'Saint Barthélemy':'France', 'Germany / United States of America':'Germany', 'Italian':'Italy',
                                'Normandy':'France', 'Cape Verde / Portugal':'Cape Verde', 'Northern Ireland':'UK', 'Kingdom of France':'France', 'USSR':'Russia',
                                'Dutch':'Netherlands', 'South Africa / Africa' : 'South Africa', 'Florida':'USA', 'Georgia Russian Empire':'Georgia', 'Siberia':'Russia',
                                'Papal States of Italy':'Italy', 'US.':'USA', 'FL':'USA', 'Berlin\nGerman Empire':'Germany', '':np.nan,
                                'Geneva':'Switzerland', 'Korea':'South Korea', 'Ireland / Czech Republic':'Ireland', 'Yugoslavia':'Serbia',
                                '(age\xa021)':np.nan, 'Canada / United States of America': ' Canada', 'Great Britain':'UK', 'New York': 'USA', 'Texas\nUnited States':'USA',
                                'Bagnols-sur-Cèze\nGard\nFrance':'France', 'Schweizer Fernsehen':'Switzerland', '(Finland)':'Finland', 'Greek via Latin':'Greece',
                                'Austria / Germany':'Austria', 'U.K.':'UK', 'Discovery Channel': np.nan, 'Newfoundland and Labrador':np.nan, 'Brazil / Latin Continuum': 'Brazil',
                                'Newfoundland':'Canada', 'Germany / Poland' : 'Germany', 'Barbados':'Jamaica', 'Zürich':'Switzerland',
                                'Louisiana':'USA', 'Brazil / United States of America':'Brazil', 'meaning \"from France\"':'France',
                                'Canada[1]':'Canada', '\u200e (Mikha\'el)': np.nan,'leader of elves': np.nan,'(age\xa032)': np.nan, 'Antonia':'USA',
                                'Pacifica': np.nan, 'Srivilliputhur': np.nan, 'Sassari': np.nan, 'Sîngerei': np.nan,'Tree of heaven': np.nan, 'holy\"': np.nan,
                                'Die Blechtrommel': 'Germany', '1913': np.nan, '/ˈdʒoʊ.əkɪm/': np.nan, 'Middle East': np.nan, '1997\xa0(1997-02-03)': np.nan,
                                'Zadig ou la Destinée': 'France', 'West Indies': 'UK', '2007': np.nan, '1962': np.nan, 'GMA Network': 'USA', 'Feng County': 'China',
                                'Nepali': 'Nepal', 'British Columbia / Canada':'Canada', 'Germany / England / United States of America': 'Germany', 'Sky Sports F1': 'UK', 
                                 'Televen':'Venezuela', 'Anthony':np.nan, 'Indian Diaspora':'India', 'Slavic':'Russia', 'blossom':np.nan, 'Sahel':'Burkina Faso',
                                '/moʊlˈjɛər/' : np.nan, '691':np.nan, '10 Downing Street':'UK', 'UNITEDSTATESOFAMERICA':'USA', 'United States of America / Denmark':'USA',
                                '/ˈhɒrəs/': np.nan, '1997': np.nan, 'Pacifica': np.nan, '(age\xa038)': np.nan, '/ˈmɔːr/': np.nan, 'Minnesota':'USA', 'ABC':np.nan,
                                'Serbian': 'Serbia', '2016':np.nan, '[3]':np.nan, 'Caazapá': np.nan, 'Italy / United States of America': 'Italy', '[clarification needed]': np.nan,
                                'Electorate of Cologne' : 'Germany', 'Grampian Television': np.nan, 'Heyfield':'UK', 'Scandinavia':'Sweden', 'Caldillo de congrio':np.nan,
                                'England / United States of America': 'England', 'BET':np.nan, 'MTV2':np.nan,'CBeebies':np.nan,'space rock':np.nan,'/ˈtæmərleɪn/':np.nan,
                                'Saxony':'UK', 'Corsica':'France', 'NBC':np.nan, 'April 1976':np.nan, 'U.S.[1]': 'USA', 'Disney Channel Latin America':np.nan, 'Gypsy':'Romania',
                                'Caribbean':np.nan, '(age\xa067)':np.nan, 'DR':np.nan, '1994':np.nan, '(MBA)':np.nan, 'Latin':np.nan, 'shrine':np.nan,
                                'Ayodhya':np.nan, 'September 1993':np.nan, 'September 1979':np.nan, 'Canal J':np.nan, '(1977)':np.nan, 'Latin':np.nan,
                                'E!':np.nan, 'HBO':np.nan, 'Old-Slavic native':np.nan, 'Sky1':np.nan, 'À rebours':np.nan, 'Free State of the Three Leagues':np.nan,
                                '/ˈθeɪliːz/':np.nan, 'Fox':np.nan, 'Life of Pi':np.nan, '\"ruler of the spear\"':np.nan, '(age\xa037)':np.nan, '2005':np.nan, '(age\xa042)':np.nan,
                                'Venezuela / United States of America' : 'Venezuela', '1986 Königssee':'Germany', 'United States of America / Germany':'USA', 'United Kingdoms':'UK',
                                'Texas':'USA', '2012 London':'UK', 'Hebrew':'Israel', '119 AD':np.nan, 'Wollo province Ambassel Region at ‘Egua’':np.nan, 'The Threepenny Opera':np.nan, 'Starz':np.nan,
                                'Sudan / France' : 'Sudan', '(age\xa037)': np.nan, 'Latin America':np.nan, 'Channel 4':np.nan, 'Der Process[1]':np.nan, '(1963)':np.nan,
                                'Sebastian Trüg':np.nan, 'La Planète des Singes':np.nan, '\"Der Sandmann\"':np.nan, 'Missouri':'USA', 'FPR Yugoslavia':'Serbia', '/ˈsɒfəkliːz/':np.nan,
                                'Das Glasperlenspiel': 'Germany', 'Norway / United States of America' :'Norway', 'De Principatibus / Il Principe': np.nan,
                                'Mediterranean':np.nan, 'De Principatibus / Il Principe':np.nan, 'Bongo Soe':np.nan, 'In office':np.nan, 'Maryland': 'USA',
                                'Die Wahlverwandtschaften': np.nan, '3 January 1953':np.nan, 'Bear':np.nan, 'PBS Kids':np.nan, '– February 2014':np.nan,
                                'Le Petit Prince':np.nan, 'October 2004':np.nan, 'BBC HD':np.nan,'(age\xa034)':np.nan, '3 January 1953':np.nan, 
                                'Netherlands Antilles': 'Netherlands', 'United Kingdom / England': 'UK }}, inplace=True)
total_events.replace({'origin': {'Brasil': 'Brazil'}}, inplace=True)
total_events.origin.unique()

array([nan, 'Germany', 'Switzerland', 'UK', 'Mexico', 'Sweden', 'England',
       'USA', 'Hungary', 'Albania', 'Australia', 'Poland', 'Finland',
       'Greece', 'Japan', 'China', 'France', 'Spain', 'Argentina', 'Cuba',
       'Trinidad and Tobago', 'Netherlands', 'Romania', 'Jamaica',
       'Russia', 'Turkey', 'Italy', 'Canada', 'India', 'Belgium',
       'Ukraine', 'Azerbaijan', 'Ireland', 'Slovakia', 'Senegal', 'Congo',
       'Denmark', 'Scotland', 'Czech Republic', 'Austria', 'Lebanon',
       'Iceland', 'New Zealand', 'South Africa', 'Estonia', 'Slovenia',
       'Chile', 'Norway', 'Israel', 'Jordan', 'Paraguay', 'Serbia',
       'Colombia', 'South Korea', 'Georgia', 'Puerto Rico', 'Bangladesh',
       'Ghana', 'Latvia', 'Belarus', 'Guadeloupe', 'Mali',
       'Dominican Republic', 'Peru', 'Brazil', 'Gambia', 'Malaysia',
       'Portugal', 'Luxembourg', 'Guinea', 'Greenland', 'Rwanda',
       'Palestine', 'Fiji', 'Andorra', 'Liechtenstein', 'Cameroon',
       'Croatia', 'Egypt',

In [236]:
for index, origin in zip(total_events.index, total_events.origin):
    if origin == 'Rock, Indie, Punk, Heavy Metal, Gothic':
        total_events.set_value(index, 'genre', origin)
    if origin == 'Hip Hop, R\'n\'B':
        total_events.set_value(index, 'genre', origin)
    if origin == 'Jazz, Blues, Soul':
        total_events.set_value(index, 'genre', origin)
    if origin == 'Ragga, Reggae, African Music, Dancehall':
        total_events.set_value(index, 'genre', origin)        
    if origin == 'Electronic':
        total_events.set_value(index, 'genre', origin)        
    if origin == 'Pop':
        total_events.set_value(index, 'genre', origin)          
        
total_events.replace({'origin': {'Rock, Indie, Punk, Heavy Metal, Gothic': np.nan, 'Hip Hop, R\'n\'B':np.nan, 'Jazz, Blues, Soul':np.nan,
                                  'Ragga, Reggae, African Music, Dancehall':np.nan, 'Electronic' : np.nan, 'Pop':np.nan}}, inplace=True)
total_events.origin.unique()

array([nan, 'Germany', 'Switzerland', 'UK', 'Mexico', 'Sweden', 'England',
       'USA', 'Hungary', 'Albania', 'Australia', 'Poland', 'Finland',
       'Greece', 'Japan', 'China', 'France', 'Spain', 'Argentina', 'Cuba',
       'Trinidad and Tobago', 'Netherlands', 'Romania', 'Jamaica',
       'Russia', 'Turkey', 'Italy', 'Canada', 'India', 'Belgium',
       'Ukraine', 'Azerbaijan', 'Ireland', 'Slovakia', 'Senegal', 'Congo',
       'Denmark', 'Scotland', 'Czech Republic', 'Austria', 'Lebanon',
       'Iceland', 'New Zealand', 'South Africa', 'Estonia', 'Slovenia',
       'Chile', 'Norway', 'Israel', 'Jordan', 'Paraguay', 'Serbia',
       'Colombia', 'South Korea', 'Georgia', 'Puerto Rico', 'Bangladesh',
       'Ghana', 'Latvia', 'Belarus', 'Guadeloupe', 'Mali',
       'Dominican Republic', 'Peru', 'Brazil', 'Gambia', 'Malaysia',
       'Portugal', 'Luxembourg', 'Guinea', 'Greenland', 'Rwanda',
       'Palestine', 'Fiji', 'Andorra', 'Liechtenstein', 'Cameroon',
       'Croatia', 'Egypt',

In [249]:
total_events.replace({'genre': {'Hip Hop, R\'n\'B' :'Rap/Hip Hop',
                                'Classic' : 'Classical',
                               "Hip Hop, R\'n\'B"  :'Rap/Hip Hop', 'Ragga, Reggae, African Music, Dancehall' : 'Reggae/Ska',
                               'Jazz, Blues, Soul': 'Jazz/Blues', 'Rock, Indie, Punk, Heavy Metal, Gothic': 'Rock',
                               'Electronic' : 'Electro',  'Comedy/Spoken Word' : 'Other',
                               'Alternative/Indie' : 'Rock', 'Electronica/Dance' : 'Electro', 
                               'pixie' : 'Electro', 'electronica/dance' : 'Electro', 'alternative/indie' : 'Rock',
                               'folk' : 'Folk', 'World' : 'Folk', 'Soul/R&B' : 'Jazz/Blues',
                                "Children's" : 'Other', 'christian/gospel' : 'Devotional', 'Christian/Gospel' : 'Devotional',
                               'Classical/Opera': 'Classical', 'Metalcore': 'Rock', 'Hip hop':'Rap/Hip Hop', 'lithumania':'Other',
                                'Instrumental' : 'Jazz', 'New Age' : 'Devotional', 'big room' : 'Electro', 'Vocals':'Jazz', 'groove room':'Electro',
                                'discofox' : 'Electro', 'Seasonal':'Other', 'deep big room' : 'Electro', 'beatdown' : 'Electro', 'commons' : 'Other',
                                'c64': 'Other', 'Progressive rock': 'Rock', 'tracestep' : 'Electro', 'Rock Metal' : 'Rock', 'comedy' : 'Other',
                               'Noise rock': 'Rock', 'Tribute act' : 'Rock', 'reggae/ska' :'Reggae/Ska', 'coupe decale' :'Other',
                               'voidgaze' : 'Electro', 'Soundtracks' : 'Other', 'Industrial' : 'Electro', 'kabarett' : 'Other', 'bmore' : 'Other',
                               'Americana' : 'Country', 'stomp and whittle' : 'Other', 'Electro house' : 'Electro', 'ukulele' : 'Folk',
                               'oshare kei' : 'Other', 'mashup' : 'Electro', 'lo star' : 'Electro', 'EMI Music' : 'Electro', 'catstep' : 'Electro', 'fake' : 'Other',
                                'classical/opera': 'Classical', 'Experimental' : 'Electro', 'consort' : 'Other', 'Operatic pop' : 'Pop',
                               'demoscene' : 'Other', 'soul/r&b' : 'Jazz/Blues', 'Electronica' : 'Electro', 'Ethio-jazz': 'Jazz/Blues', 'Chali 2na' : 'Other',
                                  'mandible' : 'Other', 'comic':'Other', 'Concord Jazz' : 'Jazz/Blues', 'Kel tamashek':'Other',
                               'synthwave' : 'Electro', 'Indie pop' : 'Pop', 'vapor twitch' : 'Electro', 'Art rock' : 'Rock', 'neo mellow':'Other',
                               "children's" :'Other', 'Hard rock' : 'Rock', 'Rapcore' : 'Rap/Hip Hop', 'bassline' : 'Electro', 'Post-hardcore' : 'Rock',
                              'moombahton' :'Other', 'Technical death metal': 'Rock', 'www.willeandthebandits.com' :'Other',
                                'necrogrind': 'Rock', 'Indie rock': 'Rock', 'pop' : 'Pop', 'Heavy metal': 'Rock', 'karneval':'Other',
                                'Avant-Garde' : 'Electro', 'Son' :'Other', 'Rap' : 'Rap/Hip Hop', 'Alternative rock' : 'Rock',
                                'scratch' : 'Electro', 'Folk metal' : 'Rock', 'healing' : 'Devotional', 'mpb':'Other', 'jam band':'Other', 'zapstep' : 'Electro',
                                'Dance-punk' : 'Electro', 'kizomba' : 'Pop', 'Hardcore punk': 'Rock', 'Garage rock': 'Rock',
                                'Lounge music' : 'Electro', 'World Tropical' : 'Folk', 'African hip hop' : 'Rap/Hip Hop', 'jump up' :'Other',
                                'Punk rock': 'Rock', 'Cast Recordings/Cabaret' :'Other', 'Genres' :'Other',
                                'Psychedelic folk' : 'Folk', 'cellist': 'Classical', 'Drone metal': 'Rock', 'Melodic death metal': 'Rock',
                                'hauntology' : 'Electro', 'mod revival' :'Other', 'Alternative hip hop' : 'Rap/Hip Hop', 'blaskapelle':'Other',
                                'Post-punk': 'Rock', 'sega' : 'Electro', 'electroacoustic improvisation' : 'Jazz/Blues', 'House' : 'Electro',
                                'comedy/spoken word':'Other', 'classical music': 'Classical', 'azontobeats' : 'Electro',
                                'instrumental': 'Jazz/Blues', 'christelijk' :'Other', 'austropop' : 'Pop', 'Alternative metal': 'Rock',
                                'musical theatre':'Other', 'grime':'Other', 'Disco' : 'Electro', 'Psychedelic rock': 'Rock',
                                'Thrash metal': 'Rock', 'pipe band' : 'Folk', 'Celtic punk': 'Rock', 'timba' : 'Folk', 'arabesk' : 'Folk',
                                'hatecore' : 'Rock', 'liedermacher' : 'Classical', 'Alt-country' : 'Country', 'hoerspiel' :'Other',
                                'dark cabaret':'Other', 'Funk' : 'Jazz/Blues', 'axe':'Other', 'neue deutsche welle':'Other', 'Film scores':'Classical',
                                'adult standards':'Other', 'Trip hop':'Rap/Hip Hop', 'Rumpelstilz':'Other', 'deep full on':'Other',
                                'Galant music':'Other', 'kuduro':'Folk', 'full on':'Other', 'Electropop': 'Pop', 'cabaret':'Other',
                                'World Hindustani':'Folk', 'contemporary post-bop':'Jazz/Blues', 'alt-pop':'Pop',
                                'Indie music': 'Rock', 'europop':'Pop', 'patriciakopatchinskaja.com':'Other',
                                'Shock rock': 'Rock', 'New Weird America':'Country', 'Death metal': 'Rock', 'French hip hop' : 'Rap/Hip Hop',
                                'http://www.cillianvallely.com':'Other', 'http://www.bricecatherin.org':'Other',
                                'folk rock': 'Rock', 'Desert rock': 'Rock', 'new jack smooth':'Other', 'Irish folk' : 'Folk',
                                'Singer-songwriter' : 'Folk', 'escape room':'Other', 'accordion' : 'Folk', 'Reggae' : 'Ska/Reggae',
                                'deep freestyle' : 'Electro', 'Glam rock': 'Rock', 'Psychobilly': 'Rock', 'ectofolk' : 'Pop',
                                'modern downshift' : 'Electro', 'Electric Light Orchestra' : 'Electro', 'chamame' :'Other',
                                'big band' : 'Classical', 'Southern rock': 'Rock', 'new tribe' : 'Electro', 'neo honky tonk':'Other', 'Indie': 'Rock',
                                'soundtracks':'Other', 'louvor':'Other', 'Neo soul' : 'Jazz/Blues', 'aussietronica' : 'Electro', 'austindie': 'Rock',
                                'Mundart':'Other', 'fluxwork':'Other', 'Electric blues' : 'Jazz/Blues', 'Afrobeat' : 'Pop', 'bolero':'Folk',
                                'http://www.saez.mu':'Other', 'acousmatic':'Electro', 'Electronic rock': 'Rock', 'Ranchera':'Country',
                                'Symphonic black metal': 'Rock', 'Folk rock': 'Rock', 'kindermusik':'Other',
                                'Ethiopian music' : 'Folk', 'reading':'Other', 'motown':'Other', 'Neo-progressive rock': 'Rock',
                                'seasonal':'Other', 'breakbeat' : 'Electro', 'neurofunk' : 'Jazz/Blues', 'Eurodance' : 'Electro', 'deep liquid' : 'Electro',
                                'Latin pop' : 'Latin', 'Indie folk' : 'Folk', 'French house' : 'Electro',
                                'progressive post-hardcore' : 'Rock', 'vocaloid':'Other', 'Rhythm and blues' : 'Jazz/Blues',
                                'fallen angel':'Other', 'New wave' : 'Electro', 'schranz':'Other', 'footwork':'Other', 'crunk':'Other',
                                'Detroit techno' : 'Electro', 'gabba':'Other', 'house' : 'Electro', 'ambeat' : 'Electro', 'Dubstep' : 'Electro',
                                'sampler' : 'Electro', 'wonky':'Other' }}, inplace=True)


total_events.replace({'genre': {'Jazz' : 'Jazz/Blues', 'Blues':'Jazz/Blues', 'Ska/Reggae':'Reggae/Ska' }}, inplace=True)

In [250]:
total_events.genre.unique()

array(['Rap/Hip Hop', 'Reggae/Ska', 'Jazz/Blues', 'Rock', 'Pop', 'Electro',
       'Classical', nan, 'Other', 'Country', 'Folk', 'Latin', 'Devotional'], dtype=object)

In [228]:
total_events.loc[total_events['origin']=='Gypsy'] 

Unnamed: 0,Adress,Artist,City,Date,Latitude,Longitude,Venue,genre,origin
61700,,The Gypsies,Geneve,2016-03-17,46.209614,6.150724,THEATRE DU LEMAN,World,Gypsy
68658,,Taraf de Haïdouks,Vevey,2014-04-03,46.472145,6.851925,ROCKING CHAIR,World,Gypsy
169269,,TARAF DE HAIDOUKS,Nyon,2006-07-18,,,PALEO FESTIVAL,World,Gypsy
169271,,KOCANI ORKESTAR,Nyon,2006-07-18,,,PALEO FESTIVAL,World,Gypsy
169294,,KOCANI ORKESTAR,Nyon,2006-07-19,,,PALEO FESTIVAL,World,Gypsy


In [251]:
#Write the DataFrame to a csv file
filename = 'total_events_preprocessed.csv'
pd.DataFrame(total_events, columns=list(total_events.columns)).to_csv(filename, index=False, encoding="utf-8")
print('Total events data saved to file')


Total events data saved to file


In [190]:
np.nan

nan