# Preprocessing pipeline
In this notebook, we clean and aggregate the event data acquired from several platforms so that it can be used conveniently for analysis.

In [75]:
import pandas as pd
import os
import glob
import urllib
import requests
import time
import json
from pandas.io.json import json_normalize
from IPython.display import clear_output
import numpy as np
import bandsInTownHelper as bandsInTownHelper

In [135]:
# Read each platform's events CSV (located in the working directory) into its own DataFrame.

total_eventsch = pd.read_csv('./total_eventsch.csv')
total_bands_in_town = pd.read_csv('./total_bands_in_town.csv')
total_residentadvisor = pd.read_csv('./total_residentadvisor.csv')
total_routedesfestivals = pd.read_csv('./total_routedesfestivals.csv')

### Unifying the representation
The data gathered from the different platforms came in a handful of shapes, which we now normalize while retaining as much information as possible. We will discard platform-specific IDs.

In [136]:
# Discard the Bands in Town URL / id / region columns; they are platform-specific.
# `axis` is passed by keyword: the positional form `drop(labels, 1)` was removed in pandas 2.0.
total_bands_in_town.drop(
    ['artist_url', 'event_id', 'event_url', 'event_venue.region', 'event_venue.url', 'event_venue.id'],
    axis=1,
    inplace=True,
)

# Parse the timestamps, then keep only the calendar date (the time of day is discarded).
# The vectorised `.dt.date` accessor replaces a per-row `apply(lambda x: x.date())`.
total_bands_in_town['event_datetime'] = pd.to_datetime(total_bands_in_town['event_datetime']).dt.date
total_bands_in_town.head(10)

Unnamed: 0,artist_name,event_datetime,event_venue.city,event_venue.latitude,event_venue.longitude,event_venue.name
0,Groombridge,2006-01-06,Langenthal,47.21206,7.789998,Rock in Church
1,Painhead,2006-01-07,Rorschach,47.477928,9.49519,Hafenbuffet
2,shEver,2006-01-14,Zug,47.18222,8.52076,Industrie 45
3,Painhead,2006-01-15,Gossau (Sankt Gallen),47.414415,9.25495,The Office
4,Mando Diao,2006-01-21,Laax,46.8,9.25,PALACE CLUB AT RIDERS PALACE
5,Foo Fighters,2006-01-25,Winterthur,47.495655,8.74848,Eishalle Duetwag
6,Groombridge,2006-01-27,Burgdorf,47.05,7.616667,Gymfest
7,Groombridge,2007-01-01,Berne,46.948432,7.440461,ONO
8,Painhead,2007-01-06,Sommeri,47.566667,9.283333,Löwenarena
9,shEver,2007-01-13,Zurich,47.38662,8.53438,Werk21


### Venues location
Bands in Town data associates coordinates with each venue, which could be interesting for representing our findings on maps. We will extract those into a separate DataFrame, and then drop the coordinates from the events DataFrame. For simplicity, we will leave the venue and city columns in place.

In [137]:
# Build a lookup table, indexed by venue name, holding the city and coordinates of each venue.
venues = total_bands_in_town[['event_venue.name', 'event_venue.city', 'event_venue.latitude', 'event_venue.longitude']].copy()

# The coordinates now live in `venues`, so remove them from the events table.
# `axis` is passed by keyword: the positional form `drop(labels, 1)` was removed in pandas 2.0.
total_bands_in_town.drop(['event_venue.latitude', 'event_venue.longitude'], axis=1, inplace=True)

# De-duplicate on the coordinates first: venue names are not cleaned (a venue name is
# sometimes "X @ Y"), so the name alone is not a reliable key. Then keep a single row
# per name (the first occurrence) so that the name can serve as a unique index.
venues.drop_duplicates(subset=['event_venue.longitude', 'event_venue.latitude'], inplace=True)
venues.drop_duplicates(subset='event_venue.name', inplace=True)
venues.set_index('event_venue.name', inplace=True)

# Clear the index label. Assigning None works on every pandas version, whereas
# `del venues.index.name` raises AttributeError on recent versions (Index.name is a property).
venues.index.name = None
venues = venues.sort_index()

In [138]:
# Build a lookup table, indexed by club name, holding the address of each Resident Advisor club.
# When a club appears several times, the first address encountered is kept.
venues_ra = total_residentadvisor[['club_name', 'club_adress']].drop_duplicates(subset='club_name')
venues_ra = venues_ra.set_index('club_name')

# Clear the index label. Assigning None works on every pandas version, whereas
# `del venues_ra.index.name` raises AttributeError on recent versions (Index.name is a property).
venues_ra.index.name = None

# Preview only, rather than dumping the whole table into the notebook.
venues_ra.head(10)

Unnamed: 0,club_adress
2. Akt Restaurant & Bar,Selnaustrasse 2
25 Hours Hotel Zürich,"Pfingstweidstrasse 102, 8005 Zürich"
2B Lounge,"Nüschelerstrasse 31, 8001 Zürich"
3monkeys,"Alexander-Schönistrasse 17, 2502, Biel/Bienne"
4. Akt,"Heinrichstrasse 262, 8005 Zürich"
5ème Etage,"Mühlenplatz 11, Bern"
Abraxas,"Chemin du Stand 5; Pully, 1009"
Acanto,"Pfingstweidstrasse 6, 8005 Zürich"
Acapella Bar,"Place de la Gare, 1957 Ardon, Valais, Suisse"
Acqua Lounge Basilea,"Binningerstrasse 14; 4051, Basel"


In [139]:
# Stack the Bands in Town venue table on top of the Resident Advisor one (row-wise).
# TODO: venues listed by both platforms are not reconciled yet, so the same name
# can appear twice in the resulting index.
venues_total = pd.concat([venues, venues_ra], axis=0)
        
    

In [140]:
# Rename the Bands in Town columns to the shared Artist / Date / City / Venue schema.
# An explicit old -> new mapping is used instead of assigning a positional list of names:
# it cannot silently mislabel a column if the column order changes, and re-running the
# cell is harmless because names that are already renamed are simply ignored.
total_bands_in_town = total_bands_in_town.rename(columns={
    'artist_name': 'Artist',
    'event_datetime': 'Date',
    'event_venue.city': 'City',
    'event_venue.name': 'Venue',
})

# Preview only, rather than dumping the whole table into the notebook.
total_bands_in_town.head(10)

Unnamed: 0,Artist,Date,City,Venue
0,Groombridge,2006-01-06,Langenthal,Rock in Church
1,Painhead,2006-01-07,Rorschach,Hafenbuffet
2,shEver,2006-01-14,Zug,Industrie 45
3,Painhead,2006-01-15,Gossau (Sankt Gallen),The Office
4,Mando Diao,2006-01-21,Laax,PALACE CLUB AT RIDERS PALACE
5,Foo Fighters,2006-01-25,Winterthur,Eishalle Duetwag
6,Groombridge,2006-01-27,Burgdorf,Gymfest
7,Groombridge,2007-01-01,Berne,ONO
8,Painhead,2007-01-06,Sommeri,Löwenarena
9,shEver,2007-01-13,Zurich,Werk21


In [141]:
# RouteDesFestivals: merge the separate day / month / year columns into a single Date column.

# Month labels as they are spelled in the `month` column, mapped to their month number.
FRENCH_MONTH_NUMBERS = {
    'Jan.': 1, 'Fev.': 2, 'Mar.': 3, 'Avr.': 4, 'Mai': 5, 'Juin': 6,
    'Juil.': 7, 'Aout': 8, 'Sep.': 9, 'Oct.': 10, 'Nov.': 11, 'Dec.': 12,
}

# Translate the labels in a single pass. This replaces a row-by-row loop built on
# `DataFrame.set_value`, which was removed in pandas 1.0. Values that are not in the
# mapping (missing values, or months that are already numeric) are passed through unchanged.
total_routedesfestivals['month'] = total_routedesfestivals['month'].map(
    lambda label: FRENCH_MONTH_NUMBERS.get(label, label)
)

# Discard every row that has a missing value in any column.
# `axis` is passed by keyword: the positional form `dropna(0)` was removed in pandas 2.0.
total_routedesfestivals.dropna(axis=0, inplace=True)

# Make the month column numeric and keep the result (it was previously computed and thrown away).
# An unrecognised month label raises a ValueError here, naming the offending value.
total_routedesfestivals['month'] = pd.to_numeric(total_routedesfestivals['month'])

# Let pandas assemble the dates from the year / month / day columns, then keep plain
# `datetime.date` objects (no time of day).
total_routedesfestivals['Date'] = pd.to_datetime(total_routedesfestivals[['year', 'month', 'day']]).dt.date
total_routedesfestivals.drop(['day', 'month', 'year'], axis=1, inplace=True)

# NOTE(review): the names are assigned by position, which assumes the remaining columns are,
# in order, the festival/venue, the artist, the city and the new Date -- confirm against the CSV header.
total_routedesfestivals.columns = ['Venue', 'Artist', 'City', 'Date']

# Preview only, rather than dumping the whole table into the notebook.
total_routedesfestivals.head(10)

Unnamed: 0,Venue,Artist,City,Date
0,6 HOURS OF SYMPHONIA,SYNMETALIUM,Lausanne,2017-04-01
1,6 HOURS OF SYMPHONIA,EVENMORE,Lausanne,2017-04-01
2,6 HOURS OF SYMPHONIA,SECHEM,Lausanne,2017-04-01
3,6 HOURS OF SYMPHONIA,BEYOND FORGIVENESS,Lausanne,2017-04-01
4,ANTIGEL,ZERO,Geneve,2017-01-27
5,ANTIGEL,MAY B (MAGUY MARIN),Geneve,2017-01-28
6,ANTIGEL,MAY B (MAGUY MARIN),Geneve,2017-01-29
7,ANTIGEL,TRENTEMOLLER,Geneve,2017-02-02
8,ANTIGEL,THE NOTWIST,Geneve,2017-02-03
9,ANTIGEL,HENRI DES,Geneve,2017-02-05


#### Unifying all

In [142]:
# Index each per-platform event table by its 'Date' column (the column itself is removed).
# NOTE(review): total_residentadvisor is not re-indexed here; an earlier attempt that
# indexed it by 'event_venue.name' had been left commented out.
total_eventsch = total_eventsch.set_index('Date')
total_bands_in_town = total_bands_in_town.set_index('Date')
total_routedesfestivals = total_routedesfestivals.set_index('Date')

In [144]:
# Stack the three date-indexed event tables into a single one. Columns that a platform
# does not provide (e.g. Genre) are filled with NaN for that platform's rows.
total_events = pd.concat([total_eventsch, total_bands_in_town, total_routedesfestivals])

# Clear the index label. Assigning None works on every pandas version, whereas
# `del total_events.index.name` raises AttributeError on recent versions (Index.name is a property).
total_events.index.name = None

# Preview only, rather than dumping the whole table into the notebook.
total_events.head(10)

Unnamed: 0,Artist,City,Genre,Venue
2017-01-20,"DJs Patric Pleasure, Ramon Ramones",Basel,"Hip Hop, R'n'B",Balz
2017-01-19,"Bülent Ceylan (DE), Raffi Lusso, Miguel M",Zürich,"Ragga, Reggae, African Music, Dancehall",Vior Club
2017-01-19,"Nicole Johänntgen, Laurent Coulondre, Nicole J...",Zürich,"Jazz, Blues, Soul",Moods
2017-01-19,"Kool Savas (D), Vega & Bosca (D)",Basel,"Hip Hop, R'n'B",Kaserne
2017-01-19,"Menic And The Bruncrana Firecrackers, Menic (v...",Bern,"Rock, Indie, Punk, Heavy Metal, Gothic",Mahogany Hall
2017-01-19,Charles Davis & Captured Moments,Winterthur,"Jazz, Blues, Soul",Esse Musicbar
2017-01-19,Tin Soldiers,Frauenfeld,"Rock, Indie, Punk, Heavy Metal, Gothic",Eisenwerk
2017-01-19,Anna Mae,Unterägeri,"Rock, Indie, Punk, Heavy Metal, Gothic",Seminar Hotel
2017-01-19,"Gigi Moto, Gigi Moto (voc), JP von Dach (g)",Zürich,"Hip Hop, R'n'B",Lebewohlfabrik
2017-01-18,"Michel Wyss (voc), Ali Salvioni (perc), Mischa...",Bern,Pop,Ono - Das Kulturlokal


In [145]:
# Write the unified events table to a CSV file.
# The event dates are stored in the (unnamed) index, so the index has to be written as well,
# under an explicit 'Date' header: with `index=False` the dates were silently left out of the file.
# NOTE(review): the file now has a leading 'Date' column -- check any reader that relies on column positions.
filename = 'total_events.csv'
total_events.to_csv(filename, index=True, index_label='Date', encoding="utf-8")
print('Total events data saved to file')

Total events data saved to file
