Here, we retrieve the festivals happening in Switzerland, using data from the
[BandsInTown API](http://www.bandsintown.com/api/overview).

## 1. Scraping
Data that will be received and stored:
- 'id'
- 'bandsintown url'
- 'datetime'
- 'artists'
    - per artist:
        - 'name'
        - 'bandsintown url'
- 'venue'
    - 'id'
    - 'name'
    - 'bandsintown url'
    - 'city'
    - 'region'
    - 'latitude'
    - 'longitude'

Limitations
* The API limits the number of returned results, so a separate request is made for every day.
* The API limits the radius to 150 miles, so several cities are queried. Because their search areas overlap, duplicated events are removed afterwards.

Cities: Geneva, Lausanne, Lucerne, Zurich, Bern, Basel, Locarno



In [1]:
import requests
import glob
import pandas as pd
import time
from scipy import stats
import urllib3
import json
% matplotlib inline
from pandas.io.json import json_normalize
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import clear_output

In [2]:
# Search endpoint shared by every request; location and date are appended per call.
base_url = 'http://api.bandsintown.com/events/search.json?api_version=2.0&app_id=epfl&radius=50'

# One query location per city, each formatted as '<city>,Switzerland'.
cities = [
    city_name + ',Switzerland'
    for city_name in (
        'Geneva',
        'Lausanne',
        'Lucerne',
        'Zurich',
        'Bern',
        'Basel',
        'Locarno',
    )
]

# Year bounds passed to range() by the scraping loop, so ending_year itself
# is not scraped.
starting_year, ending_year = 2010, 2017


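We will now loop over every day of every month in the selected years and, for each day, over every city. The results are saved to one CSV file per month.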

In [15]:
def bandsInTownRequest(date_request, city_request, timeout=30):
    """Query the BandsInTown search endpoint for one location and date range.

    Parameters
    ----------
    date_request : str
        Value sent as the ``date`` query parameter.
    city_request : str
        Value sent as the ``location`` query parameter.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout a
        single stalled connection would block the whole scrape indefinitely.

    Returns
    -------
    The decoded JSON body of the response.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or times out.
    ValueError
        If the response body is not valid JSON.
    """
    request_url = base_url + '&location=' + city_request + '&date=' + date_request
    # Log the URL before sending so that a failing request can be identified.
    print(request_url)
    return requests.get(request_url, timeout=timeout).json()
    
def fillPandasJson(data_json):
    """Flatten a decoded event search response into one row per artist.

    Parameters
    ----------
    data_json : list or dict
        Decoded JSON payload. The expected shape is a list of event dicts,
        each holding an ``'artists'`` list and a ``'venue'`` dict. A payload
        containing ``'errors'`` is treated as a failed request.

    Returns
    -------
    pandas.DataFrame
        One row per (event, artist) pair. Artist fields are prefixed with
        ``artist_`` and event/venue fields with ``event_`` (for example
        ``event_venue.city``). Only rows whose ``event_venue.country`` equals
        ``'Switzerland'`` are kept. An empty frame is returned when the
        payload is empty or its first event has no ``'artists'`` key.

    Raises
    ------
    RuntimeError
        If the payload contains an ``'errors'`` entry (e.g. server overload).
    """
    # Nothing was returned for this date/location.
    if not data_json:
        return pd.DataFrame()

    # The server reports problems through an 'errors' entry; surface the
    # details instead of raising an anonymous exception.
    if 'errors' in data_json:
        details = data_json['errors'] if isinstance(data_json, dict) else data_json
        raise RuntimeError('BandsInTown API returned errors: %r' % (details,))

    # Only the first event is inspected, as a cheap check of the payload shape.
    if 'artists' not in data_json[0]:
        return pd.DataFrame()

    # pandas >= 1.0 exposes json_normalize at top level; fall back to the
    # pandas.io.json location on older versions.
    normalize = getattr(pd, 'json_normalize', None) or pd.io.json.json_normalize
    df_city = normalize(
        data_json,
        record_path='artists',
        record_prefix='artist_',
        meta=[
            'id',
            'datetime',
            'url',
            ['venue', 'name'],
            ['venue', 'url'],
            ['venue', 'id'],
            ['venue', 'city'],
            ['venue', 'region'],
            ['venue', 'country'],
            ['venue', 'latitude'],
            ['venue', 'longitude'],
        ],
        meta_prefix='event_',
    )

    # The search radius can reach across the border; keep Swiss venues only.
    if len(df_city) > 0:
        df_city = df_city[df_city['event_venue.country'] == 'Switzerland']
    return df_city


In [None]:
import calendar

# Seconds to wait before retrying once the server reports an overload.
RETRY_DELAY_SECONDS = 20

# Scrape day by day, city by city, and write one CSV file per month.
# NOTE: range() excludes ending_year, so the last year scraped is ending_year - 1.
for year in range(starting_year, ending_year):
    for month in range(1, 13):
        # Non-empty per-request frames for this month; they are concatenated
        # once at the end rather than growing a DataFrame inside the loop.
        month_frames = []

        # Only query days that exist (no 30 February, no 31 April, ...).
        days_in_month = calendar.monthrange(year, month)[1]
        for day in range(1, days_in_month + 1):
            date = '%d-%02d-%02d' % (year, month, day)
            # Same date on both sides of the separator: a single-day window.
            date_request = date + '-' + date

            for city_request in cities:
                print('Now processing:', date, city_request)

                data_json = bandsInTownRequest(date_request, city_request)

                try:
                    df_city = fillPandasJson(data_json)
                # Error in the server response, attributed to overloading:
                # wait, then retry the request once.
                except Exception:
                    print('Servers overloading, waiting a bit...')
                    time.sleep(RETRY_DELAY_SECONDS)
                    data_json = bandsInTownRequest(date_request, city_request)
                    df_city = fillPandasJson(data_json)

                if len(df_city) > 0:
                    month_frames.append(df_city)
            # Keep the notebook output short: wipe the log after each day.
            clear_output(wait=True)

        # A month without any event would otherwise fail on the column drop
        # below and leave an empty CSV that cannot be read back.
        if not month_frames:
            print(month, '/', year, ' no events found, nothing saved')
            continue

        # Overlapping city radii return the same event several times.
        df = pd.concat(month_frames).drop_duplicates()

        # Save to file, without the columns that are not needed downstream.
        df = df.drop(['artist_mbid', 'event_venue.country'], axis=1, errors='ignore')
        filename = 'bands_in_town' + '%02d' % month + '-' + str(year) + '.csv'
        df.to_csv(filename, index=False, encoding="utf-8")
        print(month, '/', year, ' saved to file')
        

3 / 2010  saved to file


We open each monthly file, drop possible duplicates that may occur between the end of one month and the beginning of the next, and store the combined result in a single file.

In [None]:
# Collect every monthly export written by the scraping step and stack them.
all_files = glob.glob("bands_in_town*.csv")
monthly_tables = [pd.read_csv(csv_path) for csv_path in all_files]
df = pd.concat(monthly_tables)

# Rows repeated across monthly files are collapsed to a single occurrence.
df = df.drop_duplicates()

# Write the combined table to a single file, without the index column.
df.to_csv('total_bands_in_town.csv', index=False, encoding="utf-8")

In [None]:
# Display the last rows of df as a quick visual check.
df.tail()

In [None]:
# Scratch check: prints 's' when the first element of data_json (as last
# assigned elsewhere in this notebook) contains an 'artists' key.
if 'artists' in data_json[0]:
    print('s')