# Preprocessing Artist data
In this notebook, second step on our preprocessing pipeline, we'll extract artists from our events dataset, and then try to fill missing values of either musical genre or country of origin, with the help of other web platforms. Obviously we may not succeed in obtaining 100% of it, but the more the better.

- MusicGraph
- Spotify
- Wikipedia

In [41]:
import pandas as pd
import os
import glob
import urllib
import requests
import time
import json
from pandas.io.json import json_normalize
from IPython.display import clear_output
import numpy as np
import bandsInTownHelper as bandsInTownHelper

import pycountry
import Scripts.country_demonyms

## Genres and origins

Divide the data we have after calling the MusicGraph API into two subsets : one which has value filled in nicely (~35%) which  we'll call clean, another one with 'assumed' correct artist names but missing genre and origin information, and a third where information is missing, and rows may contain more than one artist in their name. The last subset may require extra handling care with regard to the events frame.
BIG PART of exploratory data analysis

In [6]:
# Get total artists set after attempting to update genres and origins through MusicGraph API
total_musicgraph = pd.read_csv(os.path.join('Artists/total_artists_MusicGraph.csv'))


# Get genre data acquired from Spotify, discard rows for which nothing was found
total_spotify    = pd.read_csv(os.path.join('Artists/total_artists_Spotify.csv'))
total_spotify    = total_spotify.loc[pd.isnull(total_spotify['genre']) == False]

# Get original Events.ch set to delete duplicates in the total_musicgraph set
# and the specific artists set from Events.ch
total_eventsch_artist_del = pd.read_csv(os.path.join('Artists/total_eventsch.csv'))
total_eventsch_artists = pd.read_csv(os.path.join('Artists/total_eventsch_artists.csv'))

print('Number of entries in the total_musicgraph set', total_musicgraph.index.size)
print('Number of entries in the total_eventsch set', total_eventsch_artists.index.size)

Number of entries in the total_musicgraph set 62000
Number of entries in the total_eventsch set 16057


### Events.ch

In [7]:
# In the following, we will set the Artist names as index for simplicity
# Prepare deletion of duplicates
total_eventsch_artist_del.drop(['Date'], 1, inplace=True)
total_eventsch_artist_del.drop(['Genre'], 1, inplace=True)
total_eventsch_artist_del.drop(['Venue'], 1, inplace=True)
total_eventsch_artist_del.drop(['City'], 1, inplace=True)
total_eventsch_artist_del.drop_duplicates(['Artist'], inplace=True)
total_eventsch_artist_del.set_index('Artist', drop=True, append=False, inplace=True)
del total_eventsch_artist_del.index.name

# Set artist name as index for total_musicgraph set
total_musicgraph.set_index('name', drop=True, append=False, inplace=True)
del total_musicgraph.index.name

# Drop duplicates
for index in total_eventsch_artist_del.index :
        if index in total_musicgraph.index:
            total_musicgraph.drop(index, inplace=True)

# Set artist name as index for the total_eventsch_artists set
total_eventsch_artists.set_index('name', drop=True, append=False, inplace=True)
del total_eventsch_artists.index.name

print('Number of entries in the total_musicgraph set after deleting duplicates from Events.ch artists', total_musicgraph.index.size)

Number of entries in the total_musicgraph set after deleting duplicates from Events.ch artists 47457


In [11]:
print('Missing genres  :',(pd.isnull(total_musicgraph.genre).sum())+(pd.isnull(total_eventsch_artists.genre).sum()))
print('Missing origins :',(pd.isnull(total_musicgraph.origin).sum())+(pd.isnull(total_eventsch_artists.origin).sum()))

Missing genres  : 29769
Missing origins : 48949


#### Filling genres and available origins for artists parsed from Events.ch also present in the total_musicgraph set

In [12]:
# We iterate over the artists we parsed from the Events.ch data to see if they also appear
# in the rest of the artists set, in which case we can update their genre if needed since
# Events.ch has genre for every artist.

# Obviously if an artist from events.ch is found in the total_musicgraph set, then we will
# update total_musicgraph and remove that artist from total_eventsch as it would become 
# redundant in the end.

print('genres missing before', (pd.isnull(total_musicgraph.genre)).sum())
print('origins missing before', (pd.isnull(total_musicgraph.origin)).sum())
print('rows in eventsch before', total_eventsch_artists.index.size)
i=0
for index in total_eventsch_artists.index :
    if index in total_musicgraph.index:
        if pd.isnull(total_musicgraph.loc[index].genre):
            total_musicgraph.set_value(index, 'genre', total_eventsch_artists.loc[index].genre)

        if ((pd.isnull(total_eventsch_artists.loc[index].origin) == False) and (pd.isnull(total_musicgraph.loc[index].origin))) :
            total_musicgraph.set_value(index, 'origin', total_eventsch_artists.loc[index].origin)
                     
        total_eventsch_artists.drop(index, inplace=True)    

print('genres missing after', (pd.isnull(total_musicgraph.genre)).sum())
print('origins missing after', (pd.isnull(total_musicgraph.origin)).sum())
print('rows in eventsch after', total_eventsch_artists.index.size)

genres missing before 29769
origins missing before 34056
rows in eventsch before 16057
genres missing after 29769
origins missing after 34056
rows in eventsch after 16057


In [None]:
# Export unique artists from Events.ch for further processing with the music intelligence platforms
total_eventsch_artists.index.name = 'name'
total_eventsch_artists.reset_index()
#Write the DataFrame to a csv file
filename = 'Artists/total_eventsch_artists.csv'
pd.DataFrame(total_eventsch_artists, columns=list(total_eventsch_artists.columns)).to_csv(filename, index=True, encoding="utf-8")
print('Total events data saved to file')

### Rest of the artists set

In [13]:
# Separate missing rows from filled ones
musicgraph_missing = total_musicgraph.loc[(pd.isnull(total_musicgraph.genre) | pd.isnull(total_musicgraph.origin))]

#### Using Spotify
First, we will try to fill in the missing genre values with data acquired from Spotify. To do so, we first have to clean Spotify data, which gives us very specific genres instead of global names such as MusicGenre. Because the complexity of our classification is bound by the simpler model, we have to drop the specificity of Spotify. Some origin information may also be included in the specific genres, which we should look for carefully before simplifying them.

In [14]:
total_spotify.set_index('name', drop=True, append=False, inplace=True)
del total_spotify.index.name
total_spotify.head(10)

Unnamed: 0,ambigous_result,genre,no_result,origin
Jeff Mills,0,acid house,0,
Paul Van Dyk,0,disco house,0,
DJ Hell,0,electroclash,0,
Willow,0,float house,0,
Patrick Zigon,0,german techno,0,
Taucher,0,bubble trance,0,
Mando Diao,0,garage rock,0,
Foo Fighters,0,alternative metal,0,
Agnès,0,minimal tech house,0,
Mirko Loko,0,minimal tech house,0,


In [15]:
#Create a dict of Country adjective to Country name
country_dict = {}
for key, value in country_demonyms.COUNTRY_DEMONYMS.items():
    country_dict[value.lower()] = key.lower().title()

country_name = []
country_alpha2 = []
country_alpha3 = []
for country in list(pycountry.countries) :
    if ' ' not in country.name :
        country_name.append(country.name)
    country_alpha2.append(country.alpha_2)
    country_alpha3.append(country.alpha_3)
country_alpha2.remove('DJ')
country_alpha2.remove('MC') 

# Add specific words grabbed through exploratory analysis
country_dict['persian'] = 'Iran'
country_dict['breton'] = 'France'
country_dict['argentine'] = 'Argentina'
country_dict['fado'] = 'Portugal'
country_dict['quebecois'] = 'Canada'
country_dict['americana'] = 'United States'
country_dict['j-ambient'] = 'Japan'
country_dict['k-pop'] = 'Korea'
country_dict['uk'] = 'United Kingdom'
country_dict['k-indie'] = 'Korea'
country_dict['j-reggae'] = 'Japan'
country_dict['j-metal'] = 'Japan'
country_dict['j-core'] = 'Japan'
country_dict['j-punk'] = 'Japan'
country_dict['sertanejo'] = 'Brasil'
country_dict['japanoise'] = 'Japan'
country_dict['magyar'] = 'Hungary'
country_dict['j-rock'] = 'Japan'
country_dict['francais'] = 'France'
country_dict['chalga'] = 'Bulgaria'
country_dict['napoletana'] = 'Italy'
country_dict['bhangra'] = 'India'
country_dict['carnatic'] = 'India'
country_dict['forro'] = 'Brasil'
country_dict['entehno'] = 'Greece'
country_dict['bay'] = 'United States'
country_dict['schlager'] = 'Germany'
country_dict['coast'] = 'United States'
country_dict['j-dance'] = 'Japan'
country_dict['k-hop'] = 'Korea'
country_dict['francoton'] = 'France'
country_dict['corsican'] = 'France'
country_dict['british'] = 'United Kingdom'
country_dict['c-pop'] = 'China'
country_dict['schweizer'] = 'Switzerland'

In [16]:
total_spotify = total_spotify.select(lambda x: x in musicgraph_missing.index)
print('Total number of genres from Spotify :', total_spotify.genre.unique().size)
print('Total helpful lines from Spotify with genre :', total_spotify.index.size)

Total number of genres from Spotify : 837
Total helpful lines from Spotify with genre : 3754


#### Parsing and updating origin information
We get roughly 400 origins more

In [17]:
print('origins missing before', (pd.isnull(musicgraph_missing.origin)).sum())

for index, genre in zip(total_spotify.index, total_spotify.genre) :
    for word in genre.split() :
        if word in country_dict :
            if index in musicgraph_missing.index :
                musicgraph_missing.set_value(index, 'origin', country_dict[word])

print('origins missing after', (pd.isnull(musicgraph_missing.origin)).sum())

origins missing before 34056
origins missing after 33680


#### Simplifying genres
We identified a set of words from exploratory analysis to accelerate the process of replacement.

In [18]:
genre_dict = {}

Electronica = ['house','aggrotech','danspunk', 'brostep', 'abstract', 'chillwave','drone', 'chill', 'beats', 'experimental','electropunk',  'turbo', 'balearic','dance-punk', 'ebm','edm', 'j-dance', 'chillstep','darkpsy', 'darkstep', 'chalga', 'japanoise', 'lounge', 'psytrance', 'tekno','indietronica', 'electronica',  'techno','disco', 'j-ambient',   'noise', 'bass', 'electroclash', 'wave', 'trance', 'ambient', 'dancehall', 'beat', 'dance', 'dub', 'electro', 'eurodance', 'dubstep', 'electronic', 'psych', 'industrial', 'microhouse', 'electrofox', ]
for key in Electronica:
    genre_dict[key] = 'Electronica/Dance'
Rock = ['rock','rock-and-roll','neo-progressive','tribute','post-screamo', 'hardstyle', 'speedcore', 'neo-psychedelic', 'ostrock','neo-rockabilly', 'britpop', 'j-punk','grunge','breakcore', 'goregrind','orgcore','j-rock', 'alternative', 'j-core', 'j-metal', 'k-indie', 'screamocore', 'grindcore', 'nerdcore',  'doomcore', 'sludge',   'core','deathcore',  'gamecore', 'metalcore','post-punk', 'garage','thrash','post-metal', 'psychobilly', 'edge', 'mathcore',  'punk', 'emo', 'indie', 'metal', 'hardcore',  'djent', 'doom', 'glam', 'oi', 'nwobhm']
for key in Rock:
    genre_dict[key] = 'Rock'
Pop = ['pop','popgaze', 'idol','etherpop','anti-folk',  'chanson','c-pop', 'k-pop', 'europop', 'neo-synthpop', 'synthpop', 'folk-pop', 'freak', 'eurovision', 'futurepop']
for key in Pop:
    genre_dict[key] = 'Pop'
Reggae = ['reggae', 'ska', 'reggaeton', 'euroska','j-reggae' ]
for key in Reggae:
    genre_dict[key] = 'Reggae/Ska'
Jazz = ['jazz', 'bebop', 'ragtime', 'afrobeat']
for key in Jazz:
    genre_dict[key] = 'Jazz'
World = ['rai','accordeon', 'entehno',  'african','schlager','corsican','breton', 'asian', 'british',  'arab','armenian', 'kurdish',  'balkan', 'world', 'napoletana','bhangra', 'polka', 'folkmusik', 'andean', 'panpipe', 'maghreb','magyar',  'fado','traditional', 'quebecois', 'carnatic', 'native', 'klezmer', 'world', 'celtic', 'bangla', 'pagode', 'flamenco', 'throat', 'medieval', 'capoeira']
for key in World:
    genre_dict[key] = 'World'
RB = ['r&b', 'funk', 'funky', 'soul']
for key in RB:
    genre_dict[key] = 'Soul/R&B'
Country = ['bluegrass', 'country', 'barbershop', 'americana', 'bluegrass', 'cajun']
for key in Country:
    genre_dict[key] = 'Country'
Latin = ['forro' ,'nu-cumbia', 'sertanejo', 'salsa','tango','merengue', 'bachata', 'rumba', 'nova', 'latin', 'cumbia']
for key in Latin:
    genre_dict[key] = 'Latin'
Rap = ['hop', 'rap', 'trap', 'k-hop', 'francoton']
for key in Rap:
    genre_dict[key] = 'Rap/Hip Hop'
Blues = ['blues', 'blues-rock', 'swing', 'boogie-woogie']
for key in Blues:
    genre_dict[key] = 'Blues'
Classical = ['cello','cappella',  'concert', 'opera', 'choral', 'clarinet', 'classical', 'violin', 'harpsichord', 'string', 'brass', 'orchestral', 'baroque', 'harp', 'early']
for key in Classical:
    genre_dict[key] = 'Classical/Opera'
Soundtracks = ['movie', 'tunes', 'hollywood', 'soundtrack' ]
for key in Soundtracks:
    genre_dict[key] = 'Soundtracks'
Gospel = ['gospel', 'christian', 'liturgical', 'christmas', 'ccm', 'worship']
for key in Gospel:
    genre_dict[key] = 'Christian/Gospel'
NewAge = ['age', 'kirtan', 'didgeridoo']
for key in NewAge:
    genre_dict[key] = 'New Age'

####  Updating genres

In [19]:
print('genres missing before', (pd.isnull(musicgraph_missing.genre)).sum())
    
for index, genre in zip(total_spotify.index, total_spotify.genre) :
    for word in genre.split() :
        if word in genre_dict :
            total_spotify.set_value(index, 'genre', genre_dict[word])
            if index in musicgraph_missing.index :
                musicgraph_missing.set_value(index, 'genre', genre_dict[word])

print('genres missing after', (pd.isnull(musicgraph_missing.genre)).sum())

genres missing before 29769
genres missing after 28372


#### Updating the total_musicgraph

In [20]:
print('genres missing before', (pd.isnull(total_musicgraph.genre)).sum())
print('origins missing before', (pd.isnull(total_musicgraph.origin)).sum())

for index in musicgraph_missing.index :
    if  (pd.isnull(musicgraph_missing.loc[index].genre)==False) :
        total_musicgraph.set_value(index, 'genre', musicgraph_missing.loc[index].genre)
        if (pd.isnull(musicgraph_missing.loc[index].origin)==False) :
            total_musicgraph.set_value(index, 'origin', musicgraph_missing.loc[index].origin)
            musicgraph_missing.drop(index, inplace=True)
    elif (pd.isnull(musicgraph_missing.loc[index].origin)==False) :
            total_musicgraph.set_value(index, 'origin', musicgraph_missing.loc[index].origin)   

print('genres missing after', (pd.isnull(total_musicgraph.genre)).sum())
print('origins missing after', (pd.isnull(total_musicgraph.origin)).sum())

genres missing before 29769
origins missing before 34056


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


genres missing after 28372
origins missing after 33680


### Second data updating step

In [21]:
# Load Wikipedia data and MusicGraph data obtained for Events.ch artists, and set indices
total_eventsch_artists_wiki = pd.read_csv(os.path.join('Artists/total_eventsch_artists_wiki.csv'))
total_artists_MusicGraph_eventsCH = pd.read_csv(os.path.join('Artists/total_eventsch_artists_MusicGraph.csv'))

total_eventsch_artists_wiki.set_index('name', drop=True, append=False, inplace=True)
del total_eventsch_artists_wiki.index.name

total_artists_MusicGraph_eventsCH.set_index('name', drop=True, append=False, inplace=True)
del total_artists_MusicGraph_eventsCH.index.name

#### Update origins of Events.ch artists with Wikipedia data
We get a mere 350 more origins after scraping Wikipedia

In [22]:
print('origins missing before', (pd.isnull(total_eventsch_artists.origin)).sum())
for index in total_eventsch_artists_wiki.index :
        if index in total_eventsch_artists.index :
            if pd.isnull(total_eventsch_artists.loc[index].origin):
                if (pd.isnull(total_eventsch_artists_wiki.loc[index].origin)==False):
                    total_eventsch_artists.set_value(index, 'origin', total_eventsch_artists_wiki.loc[index].origin)
                
print('origins missing after', (pd.isnull(total_eventsch_artists.origin)).sum())

origins missing before 14893
origins missing after 14548


In [23]:
# Maintnance : drop duplicate rows
total_artists_MusicGraph_eventsCH = total_artists_MusicGraph_eventsCH[~total_artists_MusicGraph_eventsCH.index.duplicated(keep='first')]
print(total_eventsch_artists.index.size)
print(total_artists_MusicGraph_eventsCH.index.size)

16057
16057


#### Update origins of Events.ch artists with MusicGraph data
MusicGraph gets ~us 1700 more origins

In [24]:
#total_artists_MusicGraph_eventsCH
# MusicGraph gets us ~1700 origins 

print('origins missing before', (pd.isnull(total_eventsch_artists.origin)).sum())
for index in total_artists_MusicGraph_eventsCH.index :
        if index in total_eventsch_artists.index :
            if pd.isnull(total_eventsch_artists.loc[index].origin):
                if (pd.isnull(total_artists_MusicGraph_eventsCH.loc[index].origin)==False):
                    total_eventsch_artists.set_value(index, 'origin', total_artists_MusicGraph_eventsCH.loc[index].origin)
                
print('origins missing after', (pd.isnull(total_eventsch_artists.origin)).sum())

origins missing before 14548
origins missing after 12806


### Updating total_events with Events.ch artists' informations

In [30]:
# Loading total_events
total_events = pd.read_csv(os.path.join('Events/total_events.csv'))
total_events.rename(columns={'Genre': 'genre'}, inplace=True)

In [31]:
#Update total_events with Events.ch artist data

print('origins missing before', (pd.isnull(total_events.origin)).sum())
print('genres missing before', (pd.isnull(total_events.genre)).sum())


for index, artist in zip(total_events.index, total_events.Artist) :
    if artist in total_eventsch_artists.index :
        if ((pd.isnull(total_events.loc[index].origin)) & (pd.isnull(total_eventsch_artists.loc[artist].origin) == False)) :
            total_events.set_value(index, 'origin', total_eventsch_artists.loc[artist].origin)
        if ((pd.isnull(total_events.loc[index].genre)) & (pd.isnull(total_eventsch_artists.loc[artist].genre) == False)) :
            total_events.set_value(index, 'origin', total_eventsch_artists.loc[artist].genre)
            
print('origins missing after', (pd.isnull(total_events.origin)).sum())
print('genres missing after', (pd.isnull(total_events.genre)).sum())

origins missing before 202062
genres missing before 170200
origins missing after 191068
genres missing after 170200


### Other artists

In [32]:
# Load Wikipedia data
total_musicgraph_processed_wiki    = pd.read_csv(os.path.join('Artists/total_musicgraph_processed_wiki.csv'))

# Get new Spotify genre data after some minor fixes
total_spotify    = pd.read_csv(os.path.join('Artists/total_artists_Spotify_processed_accents.csv'))
total_spotify    = total_spotify.loc[pd.isnull(total_spotify['genre']) == False]

# Set indices
total_musicgraph_processed_wiki.set_index('name', drop=True, append=False, inplace=True)
del total_musicgraph_processed_wiki.index.name

total_spotify.set_index('name', drop=True, append=False, inplace=True)
del total_spotify.index.name

#### Updating total_musicgraph with Wikipedia data

In [33]:
# Wikipedia gets us 240 genre and ~900 origins

print('genres missing before', (pd.isnull(total_musicgraph.genre)).sum())
print('origins missing before', (pd.isnull(total_musicgraph.origin)).sum())
for index in total_musicgraph_processed_wiki.index :
        if index in total_musicgraph.index :
            if pd.isnull(total_musicgraph.loc[index].origin):
                if (pd.isnull(total_musicgraph_processed_wiki.loc[index].origin)==False):
                    total_musicgraph.set_value(index, 'origin', total_musicgraph_processed_wiki.loc[index].origin)
            if pd.isnull(total_musicgraph.loc[index].genre):
                if (pd.isnull(total_musicgraph_processed_wiki.loc[index].genre)==False):
                    total_musicgraph.set_value(index, 'genre', total_musicgraph_processed_wiki.loc[index].genre)
                
print('genres missing after', (pd.isnull(total_musicgraph.genre)).sum())
print('origins missing after', (pd.isnull(total_musicgraph.origin)).sum())

genres missing before 28372
origins missing before 33680
genres missing before 27350
origins missing before 32687


#### Updating total_musicgraph with Spotify data

In [34]:
print('origins missing before', (pd.isnull(total_spotify.origin)).sum())

# Extract origins likewise
for index, genre in zip(total_spotify.index, total_spotify.genre) :
    for word in genre.split() :
        if word in country_dict :
            if pd.isnull(total_spotify.loc[index].origin): 
                total_spotify.set_value(index, 'origin', country_dict[word])            

print('origins missing after', (pd.isnull(total_spotify.origin)).sum())

# Clean spotify genre names
print('nb of genres before', total_spotify.genre.unique().size)
    
for index, genre in zip(total_spotify.index, total_spotify.genre) :
    for word in genre.split() :
        if word in genre_dict :
            total_spotify.set_value(index, 'genre', genre_dict[word])
            
print('nb of genres after', total_spotify.genre.unique().size)

origins missing before 7156
origins missing after 7143
nb of genres before 192
nb of genres after 136


In [35]:
# Maintenance : drop duplicate rows
print(total_spotify.index.size)
total_spotify = total_spotify[~total_spotify.index.duplicated(keep='first')]
print(total_spotify.index.size)


20118
20117


### Updating total_events with Spotify

In [36]:
print('origins missing before', (pd.isnull(total_events.origin)).sum())
print('genres missing before', (pd.isnull(total_events.genre)).sum())


for index, artist in zip(total_events.index, total_events.Artist) :
    if artist in total_spotify.index :
        if ((pd.isnull(total_events.loc[index].origin)) & (pd.isnull(total_spotify.loc[artist].origin) == False)) :
            total_events.set_value(index, 'origin', total_spotify.loc[artist].origin)
        if ((pd.isnull(total_events.loc[index].genre)) & (pd.isnull(total_spotify.loc[artist].genre) == False)) :
            total_events.set_value(index, 'genre', total_spotify.loc[artist].genre)
            
print('origins missing after', (pd.isnull(total_events.origin)).sum())
print('genres missing after', (pd.isnull(total_events.genre)).sum())

origins missing before 191068
genres missing before 170200
origins missing after 140427
genres missing after 84575


### Updating total_events with MusicGraph

In [37]:
print('origins missing before', (pd.isnull(total_events.origin)).sum())
print('genres missing before', (pd.isnull(total_events.genre)).sum())


for index, artist in zip(total_events.index, total_events.Artist) :
    if artist in total_musicgraph.index :
        if ((pd.isnull(total_events.loc[index].origin)) & (pd.isnull(total_musicgraph.loc[artist].origin) == False)) :
            total_events.set_value(index, 'origin', total_musicgraph.loc[artist].origin)
        if ((pd.isnull(total_events.loc[index].genre)) & (pd.isnull(total_musicgraph.loc[artist].genre) == False)) :
            total_events.set_value(index, 'genre', total_musicgraph.loc[artist].genre)
            
print('origins missing after', (pd.isnull(total_events.origin)).sum())
print('genres missing after', (pd.isnull(total_events.genre)).sum())

origins missing before 140427
genres missing before 84575
origins missing after 135150
genres missing after 84118


## Final step : cleaning up genres and origins names
Because of the diversity of sources used to gather and enrich data, we end up with a messy categorization of artists' genres and origins. Therefore, this maintenance step is dramatically important for the coherence of our analysis and visualisations.

#### Origins cleaning
After a bit of exploration, we identify the origin names that divert from the country names we're after (most often scrapping errors from Wikipedia) and either match them to country names, or simply ditch them.

In [38]:
total_events.origin = total_events.origin.str.strip()
total_events.replace({'origin': {'United States of America': 'USA', 'U.S.':'USA', 'Hong Kong':'China', 'Arabic': np.nan, 'Alma mater': np.nan,
                                'Congo, The Democratic Republic of the': 'Congo', 'United States':'USA', 'Soviet Union (now Russia)':'Russia',
                                'Alberta': 'Canada', 'Germany / Switzerland' : 'Switzerland', 'West Germany':'Germany', 'US':'USA', 
                                'Czechia':'Czech Republic', ')': np.nan, 'Russian Federation':'Russia', 'Czechoslovakia': 'Czech Republic',
                                'Hesse':'Germany', 'ɑːˈliː/' : 'Switzerland', 'California':'USA', 'Glasperlenspiel' : 'Germany', 'History':'Switzerland',
                                'New York City':'USA', '(age\xa026)':'Ghana', 'British Columbia':'Canada', 'China Television (CTV)':'China',
                                'Surrey': 'UK', 'North London':'UK', 'England / United Kingdom':'UK', 'United Kingdom':'UK', 'Latin Continuum': np.nan,
                                'Ancient Germanic': 'Germany', 'Alabama':'USA', 'Manchester':'UK', 'okeh':'Switzerland', '/ɔːˈɡʌstᵻn/':'Switzerland',
                                'Peter':'Switzerland', 'Greek':'Greece', 'Telesistema Mexicano':'Mexico', 'Massachusetts':'USA', '1998 Nagano':'Switzerland',
                                'West Africa': 'Guinea', 'Japanese':'Japan', 'Quebec':'Canada', 'Native American':'USA', '/ˈpeɪlᵻn/':np.nan,
                                '(age\xa045)':np.nan, 'Soviet Union':'Russia', '1976 Montréal':'Canada', 'Ontario':'Canada','Anathoth':np.nan, 'Wales':'UK',
                                'MTV':np.nan, 'Korea, Republic of':'South Korea', 'Republic of Venice':'Italy', 'Africa':'Ivory Coast', 'Europe':np.nan, 'Bohemia':np.nan,
                                '(unknown)':np.nan, 'Kosovo)':'Kosovo', 'European Union':np.nan, '"':np.nan, 'Ring name(s)':np.nan, 'meaning "man".':np.nan, '2006':'Switzerland',
                                'Okinawa':'Japan', 'Guernsey':'UK', 'Hebrew via Greek and Latin.': np.nan, 'British America':'Canada', 'Hebrew: יוֹסֵף': 'Israel',
                                'Famous warrior':np.nan, '2':np.nan, 'Syndication':np.nan, 'Venezuela, Bolivarian Republic of':'Venezuela', 'North Carolina State University': 'USA',
                                'Democratic Republic of the Congo' : 'Congo', 'Stockholm':'Sweden', 'French':'France', '[citation needed]':'Germany', '[1]':'Switzerland',
                                'Germanic':'Germany', 'Jewish':'Israel', '流逝':'China', '(age\xa050)':np.nan, 'Jersey':'UK', 'Eisenach':'Germany',
                                'Austria-Hungary': 'Austria', 'Ma\'ale Adumim':np.nan, 'Lanarkshire':'UK', 'Sabah':np.nan, 'Sweden and Norway':'Sweden',
                                'Nigeria / Germany':'Nigeria', 'Krasnoyarsk':'Russia', 'North Carolina':'USA', 'CAN':'Canada', 'Rouen':'France', 'Sweden / Europe':'Sweden',
                                'Turkish':'Turkey', 'West London': 'UK', 'South Yorkshire':'UK', 'Κύριλλος (Kyrillos)':'Greece', 'German Empire':'Germany',
                                'Arizona':'USA','Nickelodeon':'USA', 'Persian Empire':'Iran', 'Kingdom of Hungary':'Hungary', 'South America':'Brasil', 
                                'Third Reich':'Germany', 'Oklahoma':'USA', 'Portuguese':'Portugal', 'Illinois - USA' : 'USA', 'Latin and Greek':'Greece',
                                'French Polynesia':'France', 'East Germany':'Germany', 'Vancouver':'Canada', 'Tel Aviv':'Israel', 'Old Norse':'Iceland',
                                'Pennsylvania':'USA', 'Austro-Hungarian Empire':'Austria', 'Indiana':'USA', 'Siam':'Thailand', 'Paris':'France', 'Los Angeles':'USA',
                                'Oregon':'USA', 'South London':'UK', 'Norwegian':'Norway', 'United States of America / Sweden / Finland / Slovenia / Austria':'USA',
                                'United States of America / Canada':'USA','United States of America / Sweden / Finland / Slovenia / Austria':'USA', 'North Africa':'Algeria',
                                'Somalia / United States of America':'Somalia', 'Navajo':'USA', 'Hamburg':'Germany', 'México':'Mexico', 'Côte d\'Ivoire':'Ivory Coast',
                                'Pembury':'UK', 'Réunion':'France', 'SFR Yugoslavia':'Serbia', 'Holland':'Netherlands', 'United States of America / England':'UK',
                                'Tuva / Asia':'Russia', 'Kerala':'India', 'New Jersey':'USA', 'French Cameroon':'Cameroon', 'Prague':'Czech Republic', 'English':'UK',
                                'Gibraltar':'UK', 'Taiwan, Province Of China':'Taiwan', 'Danish':'Danemark', 'Wisconsin':'USA', 'Wales / England':'UK',
                                '.':np.nan, '(age\xa063)':np.nan, '(age\xa035)':np.nan, '(age\xa049)':np.nan, 'D.C.':'USA', 'Curaçao':'Netherlands',
                                'France / Guadeloupe':'France', 'Saint Barthélemy':'France', 'Germany / United States of America':'Germany', 'Italian':'Italy',
                                'Normandy':'France', 'Cape Verde / Portugal':'Cape Verde', 'Northern Ireland':'UK', 'Kingdom of France':'France', 'USSR':'Russia',
                                'Dutch':'Netherlands', 'South Africa / Africa' : 'South Africa', 'Florida':'USA', 'Georgia Russian Empire':'Georgia', 'Siberia':'Russia',
                                'Papal States of Italy':'Italy', 'US.':'USA', 'FL':'USA', 'Berlin\nGerman Empire':'Germany', '':np.nan,
                                'Geneva':'Switzerland', 'Korea':'South Korea', 'Ireland / Czech Republic':'Ireland', 'Yugoslavia':'Serbia',
                                '(age\xa021)':np.nan, 'Canada / United States of America': ' Canada', 'Great Britain':'UK', 'New York': 'USA', 'Texas\nUnited States':'USA',
                                'Bagnols-sur-Cèze\nGard\nFrance':'France', 'Schweizer Fernsehen':'Switzerland', '(Finland)':'Finland', 'Greek via Latin':'Greece',
                                'Austria / Germany':'Austria', 'U.K.':'UK', 'Discovery Channel': np.nan, 'Newfoundland and Labrador':np.nan, 'Brazil / Latin Continuum': 'Brazil',
                                'Newfoundland':'Canada', 'Germany / Poland' : 'Germany', 'Barbados':'Jamaica', 'Zürich':'Switzerland',
                                'Louisiana':'USA', 'Brazil / United States of America':'Brazil', 'meaning \"from France\"':'France',
                                'Canada[1]':'Canada', '\u200e (Mikha\'el)': np.nan,'leader of elves': np.nan,'(age\xa032)': np.nan, 'Antonia':'USA',
                                'Pacifica': np.nan, 'Srivilliputhur': np.nan, 'Sassari': np.nan, 'Sîngerei': np.nan,'Tree of heaven': np.nan, 'holy\"': np.nan,
                                'Die Blechtrommel': 'Germany', '1913': np.nan, '/ˈdʒoʊ.əkɪm/': np.nan, 'Middle East': np.nan, '1997\xa0(1997-02-03)': np.nan,
                                'Zadig ou la Destinée': 'France', 'West Indies': 'UK', '2007': np.nan, '1962': np.nan, 'GMA Network': 'USA', 'Feng County': 'China',
                                'Nepali': 'Nepal', 'British Columbia / Canada':'Canada', 'Germany / England / United States of America': 'Germany', 'Sky Sports F1': 'UK', 
                                 'Televen':'Venezuela', 'Anthony':np.nan, 'Indian Diaspora':'India', 'Slavic':'Russia', 'blossom':np.nan, 'Sahel':'Burkina Faso',
                                '/moʊlˈjɛər/' : np.nan, '691':np.nan, '10 Downing Street':'UK', 'UNITEDSTATESOFAMERICA':'USA', 'United States of America / Denmark':'USA',
                                '/ˈhɒrəs/': np.nan, '1997': np.nan, 'Pacifica': np.nan, '(age\xa038)': np.nan, '/ˈmɔːr/': np.nan, 'Minnesota':'USA', 'ABC':np.nan,
                                'Serbian': 'Serbia', '2016':np.nan, '[3]':np.nan, 'Caazapá': np.nan, 'Italy / United States of America': 'Italy', '[clarification needed]': np.nan,
                                'Electorate of Cologne' : 'Germany', 'Grampian Television': np.nan, 'Heyfield':'UK', 'Scandinavia':'Sweden', 'Caldillo de congrio':np.nan,
                                'England / United States of America': 'England', 'BET':np.nan, 'MTV2':np.nan,'CBeebies':np.nan,'space rock':np.nan,'/ˈtæmərleɪn/':np.nan,
                                'Saxony':'UK', 'Corsica':'France', 'NBC':np.nan, 'April 1976':np.nan, 'U.S.[1]': 'USA', 'Disney Channel Latin America':np.nan, 'Gypsy':'Romania',
                                'Caribbean':np.nan, '(age\xa067)':np.nan, 'DR':np.nan, '1994':np.nan, '(MBA)':np.nan, 'Latin':np.nan, 'shrine':np.nan,
                                'Ayodhya':np.nan, 'September 1993':np.nan, 'September 1979':np.nan, 'Canal J':np.nan, '(1977)':np.nan, 'Latin':np.nan,
                                'E!':np.nan, 'HBO':np.nan, 'Old-Slavic native':np.nan, 'Sky1':np.nan, 'À rebours':np.nan, 'Free State of the Three Leagues':np.nan,
                                '/ˈθeɪliːz/':np.nan, 'Fox':np.nan, 'Life of Pi':np.nan, '\"ruler of the spear\"':np.nan, '(age\xa037)':np.nan, '2005':np.nan, '(age\xa042)':np.nan,
                                'Venezuela / United States of America' : 'Venezuela', '1986 Königssee':'Germany', 'United States of America / Germany':'USA', 'United Kingdoms':'UK',
                                'Texas':'USA', '2012 London':'UK', 'Hebrew':'Israel', '119 AD':np.nan, 'Wollo province Ambassel Region at ‘Egua’':np.nan, 'The Threepenny Opera':np.nan, 'Starz':np.nan,
                                'Sudan / France' : 'Sudan', '(age\xa037)': np.nan, 'Latin America':np.nan, 'Channel 4':np.nan, 'Der Process[1]':np.nan, '(1963)':np.nan,
                                'Sebastian Trüg':np.nan, 'La Planète des Singes':np.nan, '\"Der Sandmann\"':np.nan, 'Missouri':'USA', 'FPR Yugoslavia':'Serbia', '/ˈsɒfəkliːz/':np.nan,
                                'Das Glasperlenspiel': 'Germany', 'Norway / United States of America' :'Norway', 'De Principatibus / Il Principe': np.nan,
                                'Mediterranean':np.nan, 'De Principatibus / Il Principe':np.nan, 'Bongo Soe':np.nan, 'In office':np.nan, 'Maryland': 'USA',
                                'Die Wahlverwandtschaften': np.nan, '3 January 1953':np.nan, 'Bear':np.nan, 'PBS Kids':np.nan, '– February 2014':np.nan,
                                'Le Petit Prince':np.nan, 'October 2004':np.nan, 'BBC HD':np.nan,'(age\xa034)':np.nan, '3 January 1953':np.nan, 
                                'Netherlands Antilles': 'Netherlands', 'United Kingdom / England': 'UK' }}, inplace=True)

# Done separately to avoid conflicts
total_events.replace({'origin': {'Brasil': 'Brazil', 'England': 'UK'}}, inplace=True)
                              
total_events.origin.unique()

array([nan, 'Germany', 'USA', 'Italy', 'UK', 'Switzerland', 'Sweden',
       'Jamaica', 'France', 'Egypt', 'Canada', 'New Zealand', 'Colombia',
       'Finland', 'Mali', 'Australia', 'Belgium', 'Greece', 'Argentina',
       'Russia', 'Brazil', 'Iceland', 'South Africa', 'Cuba', 'Benin',
       'Senegal', 'Guinea', 'Spain', 'India', 'Poland', 'Netherlands',
       'Romania', 'Serbia', 'Hungary', 'Macedonia', 'Malaysia', 'Chile',
       'Mexico', 'Belarus', 'Israel', 'Japan',
       'Rock, Indie, Punk, Heavy Metal, Gothic', 'Denmark', 'Indonesia',
       'Austria', 'Ireland', 'Burkina Faso', 'Tunisia', 'Bahia', 'Pop',
       'Scotland', "Hip Hop, R'n'B", 'Jazz, Blues, Soul', 'Algeria',
       'Ivory Coast', 'Niger', 'Norway', 'Croatia', 'Taiwan', 'Portugal',
       'Cape Verde', 'Czech Republic', 'Cameroon', 'South Korea',
       'Lithuania', 'Nigeria', 'Ragga, Reggae, African Music, Dancehall',
       'Turkey', 'Lebanon', 'Slovenia', 'Bangladesh', 'Latvia',
       'Faroe Islands', 'Esto

Fixing a minor issue where genre was set as origin

In [39]:
for index, origin in zip(total_events.index, total_events.origin):
    if origin == 'Rock, Indie, Punk, Heavy Metal, Gothic':
        total_events.set_value(index, 'genre', origin)
    if origin == 'Hip Hop, R\'n\'B':
        total_events.set_value(index, 'genre', origin)
    if origin == 'Jazz, Blues, Soul':
        total_events.set_value(index, 'genre', origin)
    if origin == 'Ragga, Reggae, African Music, Dancehall':
        total_events.set_value(index, 'genre', origin)        
    if origin == 'Electronic':
        total_events.set_value(index, 'genre', origin)        
    if origin == 'Pop':
        total_events.set_value(index, 'genre', origin)          
        
total_events.replace({'origin': {'Rock, Indie, Punk, Heavy Metal, Gothic': np.nan, 'Hip Hop, R\'n\'B':np.nan, 'Jazz, Blues, Soul':np.nan,
                                  'Ragga, Reggae, African Music, Dancehall':np.nan, 'Electronic' : np.nan, 'Pop':np.nan}}, inplace=True)
total_events.origin.unique()

array([nan, 'Germany', 'USA', 'Italy', 'UK', 'Switzerland', 'Sweden',
       'Jamaica', 'France', 'Egypt', 'Canada', 'New Zealand', 'Colombia',
       'Finland', 'Mali', 'Australia', 'Belgium', 'Greece', 'Argentina',
       'Russia', 'Brazil', 'Iceland', 'South Africa', 'Cuba', 'Benin',
       'Senegal', 'Guinea', 'Spain', 'India', 'Poland', 'Netherlands',
       'Romania', 'Serbia', 'Hungary', 'Macedonia', 'Malaysia', 'Chile',
       'Mexico', 'Belarus', 'Israel', 'Japan', 'Denmark', 'Indonesia',
       'Austria', 'Ireland', 'Burkina Faso', 'Tunisia', 'Bahia',
       'Scotland', 'Algeria', 'Ivory Coast', 'Niger', 'Norway', 'Croatia',
       'Taiwan', 'Portugal', 'Cape Verde', 'Czech Republic', 'Cameroon',
       'South Korea', 'Lithuania', 'Nigeria', 'Turkey', 'Lebanon',
       'Slovenia', 'Bangladesh', 'Latvia', 'Faroe Islands', 'Estonia',
       'Puerto Rico', 'Zaire', 'Peru', 'China', 'Moldova', 'Angola',
       'Philippines', 'Morocco', 'Ukraine', 'Andorra', 'Venezuela',
       'S

#### Genre cleaning
We proceed the same way to get artists genre down to the simple yet most complex achievable categorization, given the simplest of our sources

In [40]:
total_events.replace({'genre': {'Hip Hop, R\'n\'B' :'Rap/Hip Hop',
                                'Classic' : 'Classical',
                               "Hip Hop, R\'n\'B"  :'Rap/Hip Hop', 'Ragga, Reggae, African Music, Dancehall' : 'Reggae/Ska',
                               'Jazz, Blues, Soul': 'Jazz/Blues', 'Rock, Indie, Punk, Heavy Metal, Gothic': 'Rock',
                               'Electronic' : 'Electro',  'Comedy/Spoken Word' : 'Other',
                               'Alternative/Indie' : 'Rock', 'Electronica/Dance' : 'Electro', 
                               'pixie' : 'Electro', 'electronica/dance' : 'Electro', 'alternative/indie' : 'Rock',
                               'folk' : 'Folk', 'World' : 'Folk', 'Soul/R&B' : 'Jazz/Blues',
                                "Children's" : 'Other', 'christian/gospel' : 'Devotional', 'Christian/Gospel' : 'Devotional',
                               'Classical/Opera': 'Classical', 'Metalcore': 'Rock', 'Hip hop':'Rap/Hip Hop', 'lithumania':'Other',
                                'Instrumental' : 'Jazz', 'New Age' : 'Devotional', 'big room' : 'Electro', 'Vocals':'Jazz', 'groove room':'Electro',
                                'discofox' : 'Electro', 'Seasonal':'Other', 'deep big room' : 'Electro', 'beatdown' : 'Electro', 'commons' : 'Other',
                                'c64': 'Other', 'Progressive rock': 'Rock', 'tracestep' : 'Electro', 'Rock Metal' : 'Rock', 'comedy' : 'Other',
                               'Noise rock': 'Rock', 'Tribute act' : 'Rock', 'reggae/ska' :'Reggae/Ska', 'coupe decale' :'Other',
                               'voidgaze' : 'Electro', 'Soundtracks' : 'Other', 'Industrial' : 'Electro', 'kabarett' : 'Other', 'bmore' : 'Other',
                               'Americana' : 'Country', 'stomp and whittle' : 'Other', 'Electro house' : 'Electro', 'ukulele' : 'Folk',
                               'oshare kei' : 'Other', 'mashup' : 'Electro', 'lo star' : 'Electro', 'EMI Music' : 'Electro', 'catstep' : 'Electro', 'fake' : 'Other',
                                'classical/opera': 'Classical', 'Experimental' : 'Electro', 'consort' : 'Other', 'Operatic pop' : 'Pop',
                               'demoscene' : 'Other', 'soul/r&b' : 'Jazz/Blues', 'Electronica' : 'Electro', 'Ethio-jazz': 'Jazz/Blues', 'Chali 2na' : 'Other',
                                  'mandible' : 'Other', 'comic':'Other', 'Concord Jazz' : 'Jazz/Blues', 'Kel tamashek':'Other',
                               'synthwave' : 'Electro', 'Indie pop' : 'Pop', 'vapor twitch' : 'Electro', 'Art rock' : 'Rock', 'neo mellow':'Other',
                               "children's" :'Other', 'Hard rock' : 'Rock', 'Rapcore' : 'Rap/Hip Hop', 'bassline' : 'Electro', 'Post-hardcore' : 'Rock',
                              'moombahton' :'Other', 'Technical death metal': 'Rock', 'www.willeandthebandits.com' :'Other',
                                'necrogrind': 'Rock', 'Indie rock': 'Rock', 'pop' : 'Pop', 'Heavy metal': 'Rock', 'karneval':'Other',
                                'Avant-Garde' : 'Electro', 'Son' :'Other', 'Rap' : 'Rap/Hip Hop', 'Alternative rock' : 'Rock',
                                'scratch' : 'Electro', 'Folk metal' : 'Rock', 'healing' : 'Devotional', 'mpb':'Other', 'jam band':'Other', 'zapstep' : 'Electro',
                                'Dance-punk' : 'Electro', 'kizomba' : 'Pop', 'Hardcore punk': 'Rock', 'Garage rock': 'Rock',
                                'Lounge music' : 'Electro', 'World Tropical' : 'Folk', 'African hip hop' : 'Rap/Hip Hop', 'jump up' :'Other',
                                'Punk rock': 'Rock', 'Cast Recordings/Cabaret' :'Other', 'Genres' :'Other',
                                'Psychedelic folk' : 'Folk', 'cellist': 'Classical', 'Drone metal': 'Rock', 'Melodic death metal': 'Rock',
                                'hauntology' : 'Electro', 'mod revival' :'Other', 'Alternative hip hop' : 'Rap/Hip Hop', 'blaskapelle':'Other',
                                'Post-punk': 'Rock', 'sega' : 'Electro', 'electroacoustic improvisation' : 'Jazz/Blues', 'House' : 'Electro',
                                'comedy/spoken word':'Other', 'classical music': 'Classical', 'azontobeats' : 'Electro',
                                'instrumental': 'Jazz/Blues', 'christelijk' :'Other', 'austropop' : 'Pop', 'Alternative metal': 'Rock',
                                'musical theatre':'Other', 'grime':'Other', 'Disco' : 'Electro', 'Psychedelic rock': 'Rock',
                                'Thrash metal': 'Rock', 'pipe band' : 'Folk', 'Celtic punk': 'Rock', 'timba' : 'Folk', 'arabesk' : 'Folk',
                                'hatecore' : 'Rock', 'liedermacher' : 'Classical', 'Alt-country' : 'Country', 'hoerspiel' :'Other',
                                'dark cabaret':'Other', 'Funk' : 'Jazz/Blues', 'axe':'Other', 'neue deutsche welle':'Other', 'Film scores':'Classical',
                                'adult standards':'Other', 'Trip hop':'Rap/Hip Hop', 'Rumpelstilz':'Other', 'deep full on':'Other',
                                'Galant music':'Other', 'kuduro':'Folk', 'full on':'Other', 'Electropop': 'Pop', 'cabaret':'Other',
                                'World Hindustani':'Folk', 'contemporary post-bop':'Jazz/Blues', 'alt-pop':'Pop',
                                'Indie music': 'Rock', 'europop':'Pop', 'patriciakopatchinskaja.com':'Other',
                                'Shock rock': 'Rock', 'New Weird America':'Country', 'Death metal': 'Rock', 'French hip hop' : 'Rap/Hip Hop',
                                'http://www.cillianvallely.com':'Other', 'http://www.bricecatherin.org':'Other',
                                'folk rock': 'Rock', 'Desert rock': 'Rock', 'new jack smooth':'Other', 'Irish folk' : 'Folk',
                                'Singer-songwriter' : 'Folk', 'escape room':'Other', 'accordion' : 'Folk', 'Reggae' : 'Ska/Reggae',
                                'deep freestyle' : 'Electro', 'Glam rock': 'Rock', 'Psychobilly': 'Rock', 'ectofolk' : 'Pop',
                                'modern downshift' : 'Electro', 'Electric Light Orchestra' : 'Electro', 'chamame' :'Other',
                                'big band' : 'Classical', 'Southern rock': 'Rock', 'new tribe' : 'Electro', 'neo honky tonk':'Other', 'Indie': 'Rock',
                                'soundtracks':'Other', 'louvor':'Other', 'Neo soul' : 'Jazz/Blues', 'aussietronica' : 'Electro', 'austindie': 'Rock',
                                'Mundart':'Other', 'fluxwork':'Other', 'Electric blues' : 'Jazz/Blues', 'Afrobeat' : 'Pop', 'bolero':'Folk',
                                'http://www.saez.mu':'Other', 'acousmatic':'Electro', 'Electronic rock': 'Rock', 'Ranchera':'Country',
                                'Symphonic black metal': 'Rock', 'Folk rock': 'Rock', 'kindermusik':'Other',
                                'Ethiopian music' : 'Folk', 'reading':'Other', 'motown':'Other', 'Neo-progressive rock': 'Rock',
                                'seasonal':'Other', 'breakbeat' : 'Electro', 'neurofunk' : 'Jazz/Blues', 'Eurodance' : 'Electro', 'deep liquid' : 'Electro',
                                'Latin pop' : 'Latin', 'Indie folk' : 'Folk', 'French house' : 'Electro',
                                'progressive post-hardcore' : 'Rock', 'vocaloid':'Other', 'Rhythm and blues' : 'Jazz/Blues',
                                'fallen angel':'Other', 'New wave' : 'Electro', 'schranz':'Other', 'footwork':'Other', 'crunk':'Other',
                                'Detroit techno' : 'Electro', 'gabba':'Other', 'house' : 'Electro', 'ambeat' : 'Electro', 'Dubstep' : 'Electro',
                                'sampler' : 'Electro', 'wonky':'Other' }}, inplace=True)

# Done separately to avoid conflicts
total_events.replace({'genre': {'Jazz' : 'Jazz/Blues', 'Blues':'Jazz/Blues', 'Ska/Reggae':'Reggae/Ska' }}, inplace=True)
total_events.replace({'genre': {'Country' : 'Folk', 'Devotional':'Folk', 'Latin':'Folk' }}, inplace=True)

total_events.genre.unique()

array(['Electro', nan, 'Rock', 'Pop', 'Reggae/Ska', 'Jazz/Blues', 'Folk',
       'Rap/Hip Hop', 'Other', 'Classical'], dtype=object)

### Exporting the whole updated events dataset
If we need insight on artist data later on, it will be easy to extract from the event dataset.

In [None]:
#Write the DataFrame to a csv file
filename = 'Events/total_events_preprocessed.csv'
pd.DataFrame(total_events, columns=list(total_events.columns)).to_csv(filename, index=False, encoding="utf-8")
print('Total events data saved to file')