# Venue Location
In this notebook, we will find the location of all the venues and include these information in the events

We will use Google Places API


In [10]:
import pandas as pd
import os
import GooglePlaceHelper
import numpy as np
from collections import Counter
import math
from googleplaces import GooglePlaces, types, lang


# Preprocess venue

In [11]:
total_events = pd.read_csv(os.path.join('./total_events_preprocessed.csv'))

def removeSpecialCharacters(input_string):
    input_string = str(input_string)
    while('-' in input_string or '+' in input_string or '=' in input_string or '.' in input_string or '\'' in input_string or '?' in input_string):
        input_string = input_string.replace('-','')
        input_string = input_string.replace('+','')
        input_string = input_string.replace('=','')
        input_string = input_string.replace('.','')
        input_string = input_string.replace('\'','')
        input_string = input_string.replace('?','')
    return input_string

def removeWords(input_string):
    input_string = str(input_string)
    while('(live)' in input_string ):
        input_string = input_string.replace('(live)','')
    return input_string

def removeSpaces(input_string):
    input_string = str(input_string)
    input_string = input_string.strip()
    return input_string

def replaceVenue_at(input_string):
    input_string = str(input_string)
    while '@' in input_string:
        position_venue_beginning = input_string.find('@')
        input_string = input_string[position_venue_beginning+1:]
        input_string = removeSpaces(input_string)
        input_string = replaceVenue_at(input_string)
    return input_string

def cleanVenueSpecificName(row, specific_name, specific_city, new_name):
    if specific_name in str(row['Venue']) and specific_city in str(row['City']):
            return new_name
    return row['Venue']


def correctVenueName(df, correct_name, incorrect_name):
    # This function corrects the venue with @incorrect_name to @correct_name in @df
    # If enough latitude, longitude, adress are present in the data to be modified, 
    #these data are assigned to the newly modified data, using argmax.
    
    df_tocorrect = df[df['Venue'].str.contains(incorrect_name,na=False,case = False) == True]
    df_coord = df_tocorrect[df_tocorrect['Latitude'].notnull()]
    df_adress = df_tocorrect[df_tocorrect['Adress'].notnull()]
    

    if len(df_coord)>math.floor(len(df_tocorrect)*2/3):

        #latitude = np.mean(df_coord['Latitude'])
        b = Counter(df_coord['Latitude'])
        latitude = b.most_common()[0][0]
        #longitude = np.mean(df_coord['Longitude'])
        b = Counter(df_coord['Longitude'])
        longitude = b.most_common()[0][0]

        df.loc[df['Venue'].str.contains(incorrect_name,na=False,case = False) == True, 'Latitude'] = latitude
        df.loc[df['Venue'].str.contains(incorrect_name,na=False,case = False) == True, 'Longitude'] = longitude
    else:
        df.loc[df['Venue'].str.contains(incorrect_name,na=False,case = False) == True, 'Latitude'] = np.nan
        df.loc[df['Venue'].str.contains(incorrect_name,na=False,case = False) == True, 'Longitude'] = np.nan
    if len(df_adress)>0:
        df.loc[df['Venue'].str.contains(incorrect_name,na=False,case = False) == True, 'Adress'] = df_adress.iloc[0]['Adress']
    
    df.loc[df['Venue'].str.contains(incorrect_name,na=False,case = False) == True, 'Venue'] = correct_name

    return df

def correctVenueNameCity(df, correct_name, incorrect_name, city):
    # This function corrects the venue with @incorrect_name to @correct_name in @df, using @city as aditional condition
    # If enough latitude, longitude, adress are present in the data to be modified, 
    #these data are assigned to the newly modified data, using argmax.

    df_tocorrect = df[(df['Venue'].str.contains(incorrect_name,na=False,case = False) == True) & 
                     (total_events['City'].str.contains(city,na=False,case = False) == True)]
    df_coord = df_tocorrect[df_tocorrect['Latitude'].notnull()]
    df_adress = df_tocorrect[df_tocorrect['Adress'].notnull()]

    if len(df_coord)>math.floor(len(df_tocorrect)*2/3):
        
        # We take the argmax
        #latitude = np.mean(df_coord['Latitude'])
        #longitude = np.mean(df_coord['Longitude'])
        b = Counter(df_coord['Latitude'])
        latitude = b.most_common()[0][0]
        b = Counter(df_coord['Longitude'])
        longitude = b.most_common()[0][0]

        df.loc[(df['Venue'].str.contains(incorrect_name,na=False,case = False)) & 
               (df['City'].str.contains(city,na=False,case = False)), 'Latitude'] = latitude
        df.loc[(df['Venue'].str.contains(incorrect_name,na=False,case = False)) & 
               (df['City'].str.contains(city,na=False,case = False)), 'Longitude'] = longitude
    else:
        df.loc[(df['Venue'].str.contains(incorrect_name,na=False,case = False)) & 
               (df['City'].str.contains(city,na=False,case = False)), 'Latitude'] = np.nan
        df.loc[(df['Venue'].str.contains(incorrect_name,na=False,case = False)) & 
               (df['City'].str.contains(city,na=False,case = False)), 'Longitude'] = np.nan    
    if len(df_adress)>0:
        df.loc[(df['Venue'].str.contains(incorrect_name,na=False,case = False)) & 
               (df['City'].str.contains(city,na=False,case = False)), 'Adress'] = df_adress.iloc[0]['Adress']

    
    df.loc[(df['Venue'].str.contains(incorrect_name,na=False,case = False)) & (df['City'].str.contains(city,na=False,case = False)), 'Venue'] = correct_name

    return df


## Clean city

In [12]:
total_events['City'] = total_events['City'].apply(lambda x: removeSpaces(x))
total_events['City'] = total_events['City'].apply(lambda x: str(x).lower())
total_events['City'] = total_events['City'].apply(lambda x: str(x).capitalize())

total_events.loc[total_events['City'].str.contains('Genev',na=False,case = False) == True, 'City'] = 'Geneva'
total_events.loc[total_events['City'].str.contains('Genèv',na=False,case = False) == True, 'City'] = 'Geneva'
total_events.loc[total_events['City'].str.contains('genf',na=False,case = False) == True, 'City'] = 'Geneva'
total_events.loc[total_events['City'].str.contains('Zurich',na=False,case = False) == True, 'City'] = 'Zürich'
total_events.loc[total_events['City'].str.contains('Yverd',na=False,case = False) == True, 'City'] = 'Yverdon-les-bains'
total_events.loc[total_events['City'].str.contains('base',na=False,case = False) == True, 'City'] = 'Basel'
total_events.loc[total_events['City'].str.contains('bale',na=False,case = False) == True, 'City'] = 'Basel'


## Clean venue names

In [13]:
# Remove spaces 
total_events['Venue'] = total_events['Venue'].apply(lambda x: removeSpaces(x))

# Remove special characters in Venue name
total_events['Venue'] = total_events['Venue'].apply(lambda x: removeSpecialCharacters(x))
total_events['Venue'] = total_events['Venue'].apply(lambda x: replaceVenue_at(x))
total_events['Venue'] = total_events['Venue'].apply(lambda x: GooglePlaceHelper.remove_accents(x))


# Put all the venue name in lower case:
total_events['Venue'] = total_events['Venue'].apply(lambda x: str(x).lower())
# Capitalize first letter only:
total_events['Venue'] = total_events['Venue'].apply(lambda x: str(x).capitalize())


In [14]:
total_events = correctVenueName(total_events, 'Paleo Festival','Paleo')
total_events = correctVenueName(total_events, 'D! Club','D!')
total_events = correctVenueName(total_events, 'Palladium','palladium')
total_events = correctVenueName(total_events, 'La parenthese','paranthese')
total_events = correctVenueName(total_events, 'La parenthese','parenthese')
total_events = correctVenueName(total_events, 'Zurich openair Festival','zurich openair')
total_events = correctVenueName(total_events, 'Zurich openair Festival','zurich open air')
total_events = correctVenueName(total_events, 'Caprice Festival','caprice')
total_events = correctVenueName(total_events, 'Caribana Festival','caribana')
total_events = correctVenueName(total_events, 'Balelec','ecole polytechnique')
total_events = correctVenueName(total_events, 'Le Romandie','romandie')
total_events = correctVenueName(total_events, 'Kofmehl','Kofmehl')
total_events = correctVenueName(total_events, 'Meh suff Festival','Meh suf')
total_events = correctVenueName(total_events, 'Montreux Jazz Festival','Montreux jazz')
total_events = correctVenueName(total_events, 'Music summit Festival','Music summit')
total_events = correctVenueName(total_events, 'One FM','One fm')
total_events = correctVenueName(total_events, 'Oxa','Oxa')
total_events = correctVenueName(total_events, 'Rabadan','Rabadan')
total_events = correctVenueName(total_events, 'One FM','One fm')
total_events = correctVenueNameCity(total_events, 'Mad Club','mad','Lausanne')
total_events = correctVenueNameCity(total_events, 'Mad Club','mad','Genev')
total_events = correctVenueName(total_events, 'Balelec','balalec')

total_events.loc[(total_events['Venue'].str.contains('balelec',na=False,case = False)) & 
               (total_events['City'].str.contains('lausanne',na=False,case = False) == False), 'Longitude'] = np.nan
total_events.loc[(total_events['Venue'].str.contains('balelec',na=False,case = False)) & 
               (total_events['City'].str.contains('lausanne',na=False,case = False) == False), 'Latitude'] = np.nan
total_events.loc[(total_events['Venue'].str.contains('balelec',na=False,case = False)), 'City'] = 'Lausanne'
total_events = correctVenueNameCity(total_events, 'Balelec','Balelec','Lausanne')

total_events = correctVenueNameCity(total_events, 'L\' Usine a gaz','usine','nyon')
total_events = correctVenueNameCity(total_events, 'Bypass Club','bypass','Genev')
total_events = correctVenueNameCity(total_events, 'gare d','gare d','Will')
total_events = correctVenueNameCity(total_events, 'Globull','globu','Bulle')
total_events = correctVenueNameCity(total_events, 'GreenField festival','greenfield','Interlaken')
total_events = correctVenueNameCity(total_events, 'Gurten Festival','gurten','bern')
total_events = correctVenueNameCity(total_events, 'Hallenstadion','hallenstadion','zürich')
total_events = correctVenueNameCity(total_events, 'Hive club','hive','zürich')
total_events = correctVenueNameCity(total_events, 'Kaufleuten','Kaufleuten','zürich')
total_events = correctVenueNameCity(total_events, 'Maag Halle','Maag','zürich')
total_events = correctVenueNameCity(total_events, 'Planet 105','planet 105','zürich')
total_events = correctVenueNameCity(total_events, 'Street parade','streetparad','zürich')
total_events = correctVenueNameCity(total_events, 'AMR','amr','Genev')
total_events = correctVenueNameCity(total_events, 'Kammgarn','Kammgarn','Schaffhausen')
total_events = correctVenueNameCity(total_events, 'Kasern','kasern','Basel')
total_events = correctVenueNameCity(total_events, 'Kiff','Kiff','aarau')
total_events = correctVenueNameCity(total_events, 'Kulturfabrik','Kulturfabrik','Solothurn')
total_events.loc[(total_events['Venue'].str.contains('mica',na=False,case = False)) & 
               (total_events['City'].str.contains('lausanne',na=False,case = False)), 'Longitude'] = np.nan
total_events.loc[(total_events['Venue'].str.contains('mica',na=False,case = False)) & 
               (total_events['City'].str.contains('lausanne',na=False,case = False)), 'Latitude'] = np.nan
total_events = correctVenueNameCity(total_events, 'mica Club','mica','lausanne')
total_events = correctVenueNameCity(total_events, 'Parterre','parter','Basel')
total_events = correctVenueNameCity(total_events, 'Picadilly','picad','Brugg')
total_events = correctVenueNameCity(total_events, 'Pont rouge','pont','Pont rouge')
total_events = correctVenueNameCity(total_events, 'Kiff','Kiff','aarau')
total_events.loc[(total_events['Venue'].str.contains('usine',na=False,case = False)) & 
                 (total_events['Venue'].str.contains('Kugler',na=False,case = False) == False) &
                 (total_events['Venue'].str.contains('Theatre',na=False,case = False) == False) & 
                 (total_events['City'].str.contains('genev',na=False,case = False)), 'Venue'] = 'L\' Usine'


## Extract unique venue from events

In [81]:
# Keep only unique venues and their parameters
total_venues = total_events.drop_duplicates(subset='Venue')
total_venues.drop(['Artist','Date','genre','origin'], axis = 1, inplace = True)
total_venues.reset_index(inplace = True, drop = True)
print('Total number of unique venues in Switzerland :', len(total_venues))
total_venues_na = total_venues[total_venues['Latitude'].isnull()]
print('Total number of unique venues for which we don\'t have the coordinates :', len(total_venues_na))
total_venues.loc[total_venues['City'].str.contains('Ken'), 'City'] = 'Luzern'
total_venues.loc[total_venues['City'].str.contains('Rothis'), 'Adress'] = 'Rothis'
total_venues.loc[total_venues['Venue'].str.contains('Cry der – club daltitude'), 'Adress'] = 'Crans'

total_venues.loc[total_venues['Venue'].str.contains('Cry der – club daltitude'), 'Venue'] = 'Cry der club daltitude'
total_venues.loc[total_venues['Venue'].str.contains('Dancing schonbrunnen'), 'Adress'] = 'Munchenbuchsee'
total_venues.loc[total_venues['Venue'].str.contains('Dancing schonbrunnen'), 'City'] = 'Munchenbuchsee'

total_venues.loc[total_venues['Venue'].str.contains('Mir'), 'Adress'] = 'Oslostrasse 12, Dreispitz'
total_venues.loc[total_venues['Venue'].str.contains('Planet e'), 'Adress'] = 'Ohmweg 10'
total_venues.loc[total_venues['Venue'].str.contains('Provi buerglen'), 'Adress'] = 'Industriestrasse'
total_venues.loc[total_venues['Venue'].str.contains('Tresor club sihlbrugg'), 'Adress'] = 'Industrie Sihlbrugg'
total_venues.loc[total_venues['Venue'].str.contains('Villa foresta'), 'Adress'] = 'Via Villa Foresta'












Total number of unique venues in Switzerland : 22535
Total number of unique venues for which we don't have the coordinates : 2212


## Extract Latitude and Longitude for all venues using Google Places
For api restriction, this is done in three times, with 3 different IP adresses

In [82]:
api_key = 'AIzaSyAARtrlCcy_KoZhwzHo7K60Gq66fNneTFc'
total_venues1 = total_venues[:8500]
total_venues2 = total_venues[8500:15000]
total_venues3 = total_venues[22529:]

In [None]:
not_found_cter = GooglePlaceHelper.getDataGooglePlace(total_venues1,api_key, 1)
percentage_not_found_cter = round(100*not_found_cter/len(total_venues),2)
print('\n Percentage of data not found : ', percentage_not_found_cter)

0 / 8499
To find : Balz  in  Basel, Switzerland
Found : Balz
Not exactly found up to here:  0
1 / 8499
To find : Vior club  in  Zürich, Switzerland
Found : Vior
2 / 8499
To find : Moods  in  Zürich, Switzerland
Found : Moods
3 / 8499
4 / 8499
To find : Mahogany hall  in  Bern, Switzerland
Found : Mahogany Hall
5 / 8499
To find : Esse musicbar  in  Winterthur, Switzerland
Found : ESSE
6 / 8499
To find : Eisenwerk  in  Frauenfeld, Switzerland
Found : Eisenwerk Genossenschaft Frauenfeld
7 / 8499
To find : Seminar hotel  in  Unterägeri, Switzerland
Found : SeminarHotel am Aegerisee
8 / 8499
To find : Lebewohlfabrik  in  Zürich, Switzerland
Found : Lebewohlfabrik
9 / 8499
To find : Ono  das kulturlokal  in  Bern, Switzerland
Found : Theater ONO
10 / 8499
To find : Villa strauli  in  Winterthur, Switzerland
Found : Villa Sträuli
11 / 8499
To find : Bar 59  in  Luzern, Switzerland
Found : Bar 59
12 / 8499
To find : Madeleine  in  Luzern, Switzerland
Found : Madeleine Gastro GmbH
13 / 8499
To 

In [10]:
not_found_cter = GooglePlaceHelper.getDataGooglePlace(total_venues2,api_key, 2)
percentage_not_found_cter = round(100*not_found_cter/len(total_venues),2)
print('\n Percentage of data not found : ', percentage_not_found_cter)

Not exactly found up to here:  0
14951 / 14999
14952 / 14999
14953 / 14999
14954 / 14999
14955 / 14999
14956 / 14999
14957 / 14999
14958 / 14999
14959 / 14999
14960 / 14999
14961 / 14999
14962 / 14999
14963 / 14999
14964 / 14999
14965 / 14999
14966 / 14999
14967 / 14999
14968 / 14999
14969 / 14999
14970 / 14999
14971 / 14999
14972 / 14999
14973 / 14999
14974 / 14999
14975 / 14999
14976 / 14999
14977 / 14999
14978 / 14999
14979 / 14999
14980 / 14999
14981 / 14999
14982 / 14999
14983 / 14999
14984 / 14999
14985 / 14999
14986 / 14999
14987 / 14999
14988 / 14999
14989 / 14999
14990 / 14999
14991 / 14999
14992 / 14999
14993 / 14999
14994 / 14999
14995 / 14999
14996 / 14999
14997 / 14999
14998 / 14999
14999 / 14999

 Percentage of data not found :  0.0


In [83]:
not_found_cter = GooglePlaceHelper.getDataGooglePlace(total_venues3,api_key, 3)
percentage_not_found_cter = round(100*not_found_cter/len(total_venues),2)
print('\n Percentage of data not found : ', percentage_not_found_cter)

22529 / 22534
To find : Villa foresta  in  Via Villa Foresta, Pietro, Switzerland
Found : Conwatec S.a g.l.
22530 / 22534
To find : Villa underground  in  Auf dem Wolf 4, 4053 Basel, Basel, Switzerland
Found : Villa Wenkenhof
22531 / 22534
To find : Viscose eventbar  in  Emmenweidstrasse 20, 6020, Emmenbrücke, Switzerland
Found : VISCOSE Bar Lounge Event
22532 / 22534
To find : Xellent club  in  Rue Centrale 17, 3963 Crans-Montana, Crans-montana, Switzerland
Found : Crans-Montana
22533 / 22534
To find : Zapoff  in  Rue de la Vigie 5, 1003 Lausanne, Lausanne, Switzerland
Found : U Bar
22534 / 22534
To find : Zenka  in  Rue de Genève 10, 1003 Lausanne, Lausanne, Switzerland
('Not found',)
Finally, Found : themata

 Percentage of data not found :  0.0


## Merge the latitude/longitude aquired before with the total list of events

Now in the list of events, the latitude and longitude is provided for every event.

In [None]:
total_venues = pd.read_csv(os.path.join('./GooglePlaceData/total_venue_GooglePlace.csv'))

total_venues.drop(['Adress'], axis = 1, inplace = True)
total_events.drop(['Adress','City','Latitude','Longitude'], axis = 1, inplace = True)
df_main = total_events.merge(total_venues,on='Venue',right_index=False,how='left')

In [None]:
filename = 'total_eventsFinal.csv'
folder = 'FinalResults'
destinationFileName = os.path.join(folder, filename)
pd.DataFrame(df_main, columns=list(df_main.columns)).to_csv(destinationFileName, index=False, encoding="utf-8")
print('Total events data geo saved to file')