# Venue data of cities from Foursquare API

In [1]:
import json
import os
from collections import Counter

import numpy as np
import pandas as pd
import requests
pd.options.display.max_columns = 500

In [2]:
# os.getenv('PATH')

In [3]:
# os.getenv('PWD')

## Foursquare API test with Berlin

In [4]:
# Run `source CREDENTIALS` in the shell before starting `jupyter notebook`, so that the Foursquare credentials are available as environment variables.

In [5]:
# Foursquare credentials are read from environment variables; os.getenv returns
# None (it does not raise) when a variable is not set.
client_id = os.getenv('FOURSQUARE_CLIENT_ID')
client_secret = os.getenv('FOURSQUARE_CLIENT_SECRET')
# API version date; this is the value the requests in this notebook send as `v`.
version = '20180323'

In [6]:
test_limit = 100

In [7]:
# Foursquare API call example - WORKS

url = 'https://api.foursquare.com/v2/venues/explore'

# Query parameters for the request. Alternative filters that were tried:
# ll='40.7243,-74.0018' (coordinates instead of `near`) and query='coffee'.
params = dict(
    client_id=client_id,
    client_secret=client_secret,
    v=version,  # reuse the version defined above instead of repeating the literal
    near='Berlin',
    limit=test_limit,
)

# The timeout keeps the cell from hanging indefinitely on a stalled connection.
resp = requests.get(url=url, params=params, timeout=30)
# Stop here with the HTTP error instead of failing later on a missing key.
resp.raise_for_status()
data = resp.json()

In [8]:
#resp.json()

In [9]:
venues = resp.json()

In [10]:
n_results = venues['response']['totalResults']
n_results

217

In [11]:
v_name = venues['response']['groups'][0]['items'][11]['venue']['name']
v_name

'Park am Gleisdreieck - Ostpark'

In [12]:
v_cat_name = venues['response']['groups'][0]['items'][11]['venue']['categories'][0]['name']
v_cat_name

'Park'

In [13]:
len(range(test_limit))

100

In [14]:
# Walk over the items the API actually returned (capped at test_limit) rather
# than indexing 0..test_limit-1, which raises IndexError whenever the response
# holds fewer venues than were requested.
items = venues['response']['groups'][0]['items'][:test_limit]

# (venue name, name of the venue's first listed category) pairs.
v_list = []
for item in items:
    v_name = item['venue']['name']
    v_cat_name = item['venue']['categories'][0]['name']
    v_list.append((v_name, v_cat_name))

In [15]:
v_list[0:9]

[('Tiergarten', 'Park'),
 ('Landwehrkanal', 'Canal'),
 ('Urban Nation', 'Art Gallery'),
 ('Tempelhofer Feld', 'Park'),
 ('Café Komine', 'Café'),
 ('Treptower Park', 'Park'),
 ('Classic Remise Berlin', 'Museum'),
 ('Das Stue', 'Hotel'),
 ('Viktoria-Luise-Platz', 'Plaza')]

In [16]:
# Venue names in response order. Slicing the returned items (instead of
# indexing range(test_limit)) avoids an IndexError on short responses.
v_name_list = [
    item['venue']['name']
    for item in venues['response']['groups'][0]['items'][:test_limit]
]

In [17]:
v_name_list[0:9]

['Tiergarten',
 'Landwehrkanal',
 'Urban Nation',
 'Tempelhofer Feld',
 'Café Komine',
 'Treptower Park',
 'Classic Remise Berlin',
 'Das Stue',
 'Viktoria-Luise-Platz']

In [18]:
# Name of each venue's first listed category, in response order. Slicing the
# returned items (instead of indexing range(test_limit)) avoids an IndexError
# on short responses.
v_cat_name_list = [
    item['venue']['categories'][0]['name']
    for item in venues['response']['groups'][0]['items'][:test_limit]
]

In [19]:
v_cat_name_list[0:9]

['Park',
 'Canal',
 'Art Gallery',
 'Park',
 'Café',
 'Park',
 'Museum',
 'Hotel',
 'Plaza']

In [20]:
# Tally every category in a single pass; Counter replaces one full
# list.count() scan per distinct category. `count` stays a list of
# [category, occurrences] pairs.
count = [[category, n] for category, n in Counter(v_cat_name_list).items()]

In [21]:
sorted(count)[0:9]

[['Art Gallery', 2],
 ['Art Museum', 1],
 ['Arts & Crafts Store', 1],
 ['Austrian Restaurant', 1],
 ['Bakery', 5],
 ['Bar', 3],
 ['Beer Bar', 2],
 ['Beer Garden', 1],
 ['Beer Store', 1]]

## Create a test DataFrame for Berlin

In [22]:
dict_test = {'city': 'berlin', 
             'venue_name': v_name_list, 
             'venue_category': v_cat_name_list}

In [23]:
#dict_test

In [24]:
pd.DataFrame(dict_test).head()

Unnamed: 0,city,venue_name,venue_category
0,berlin,Tiergarten,Park
1,berlin,Landwehrkanal,Canal
2,berlin,Urban Nation,Art Gallery
3,berlin,Tempelhofer Feld,Park
4,berlin,Café Komine,Café


## Create a test function for Berlin

In [25]:
def venues_of_a_city(city, lim=100):
    '''Collect the venues of one city from the Foursquare API.

    Parameters
    ----------
    city : str
        Place name sent as the ``near`` parameter of the request.
    lim : int, default 100
        Sent as the ``limit`` parameter; also caps the number of rows returned.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns ``city``, ``venue_name`` and
        ``venue_category`` (the name of the venue's first listed category).
        The frame has fewer than ``lim`` rows when the response contains
        fewer venues.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    '''

    # Foursquare API
    client_id = os.getenv('FOURSQUARE_CLIENT_ID')
    client_secret = os.getenv('FOURSQUARE_CLIENT_SECRET')
    version = '20180323'

    url = 'https://api.foursquare.com/v2/venues/explore'

    params = dict(
        client_id=client_id,
        client_secret=client_secret,
        v=version,
        near=city,
        limit=lim,
    )

    # The timeout keeps the call from hanging indefinitely on a stalled connection.
    resp = requests.get(url=url, params=params, timeout=30)
    # Surface HTTP errors directly instead of failing later on a missing key.
    resp.raise_for_status()

    venues = resp.json()

    # Use the items that were actually returned: indexing range(lim) raises
    # IndexError as soon as the response holds fewer than `lim` venues.
    items = venues['response']['groups'][0]['items'][:lim]

    # One pass builds (city, venue name, first category name) for each venue.
    records = [
        (city, item['venue']['name'], item['venue']['categories'][0]['name'])
        for item in items
    ]

    # Explicit columns keep the schema stable even when no venue is returned.
    return pd.DataFrame(records, columns=['city', 'venue_name', 'venue_category'])


In [26]:
df_berlin = venues_of_a_city('Berlin')
df_berlin.head()

Unnamed: 0,city,venue_name,venue_category
0,Berlin,Tiergarten,Park
1,Berlin,Landwehrkanal,Canal
2,Berlin,Urban Nation,Art Gallery
3,Berlin,Tempelhofer Feld,Park
4,Berlin,Café Komine,Café


## Transform the test df

In [27]:
# One indicator column per venue category, with the city label as first column.
indicator_frame = pd.get_dummies(df_berlin[['venue_category']])
cols = ['city'] + list(indicator_frame.columns)
berlin_dummies = indicator_frame.assign(city=df_berlin['city'])[cols]
berlin_dummies.head()

Unnamed: 0,city,venue_category_Art Gallery,venue_category_Art Museum,venue_category_Arts & Crafts Store,venue_category_Austrian Restaurant,venue_category_Bakery,venue_category_Bar,venue_category_Beer Bar,venue_category_Beer Garden,venue_category_Beer Store,venue_category_Bike Rental / Bike Share,venue_category_Bistro,venue_category_Bookstore,venue_category_Breakfast Spot,venue_category_Café,venue_category_Canal,venue_category_Caucasian Restaurant,venue_category_Climbing Gym,venue_category_Cocktail Bar,venue_category_Coffee Shop,venue_category_Concert Hall,venue_category_Event Space,venue_category_Falafel Restaurant,venue_category_Farmers Market,venue_category_Food Court,venue_category_Forest,venue_category_French Restaurant,venue_category_Gourmet Shop,venue_category_Hostel,venue_category_Hotel,venue_category_Ice Cream Shop,venue_category_Indie Movie Theater,venue_category_Indie Theater,venue_category_Juice Bar,venue_category_Karaoke Bar,venue_category_Korean Restaurant,venue_category_Mediterranean Restaurant,venue_category_Monument / Landmark,venue_category_Mountain,venue_category_Museum,venue_category_Music Venue,venue_category_Park,venue_category_Pizza Place,venue_category_Plaza,venue_category_Scenic Lookout,venue_category_Spa,venue_category_Spanish Restaurant,venue_category_Vegetarian / Vegan Restaurant,venue_category_Whisky Bar,venue_category_Wine Bar
0,Berlin,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,Berlin,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Berlin,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Berlin,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
4,Berlin,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [28]:
berlin_dummies.shape

(100, 50)

In [29]:
berlin_dummies['venue_category_Park'].sum()

13

## Define a function that collects all the venues of the capital cities of Europe

In [30]:
def venue_collector(lst, lim=100):
    '''Collect the venues of a list of cities from the Foursquare API.

    Parameters
    ----------
    lst : iterable of str
        City names; each one is sent as the ``near`` parameter of a request.
    lim : int, default 100
        Sent as the ``limit`` parameter; also caps the rows kept per city.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns ``city``, ``venue_name`` and
        ``venue_category`` (the name of the venue's first listed category),
        in the order of ``lst`` and with a fresh 0..n-1 index. An empty
        ``lst`` yields an empty frame with the same columns.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code for any city.
    '''

    # Foursquare API
    client_id = os.getenv('FOURSQUARE_CLIENT_ID')
    client_secret = os.getenv('FOURSQUARE_CLIENT_SECRET')
    version = '20180323'

    url = 'https://api.foursquare.com/v2/venues/explore'

    records = []

    for city in lst:

        params = dict(
            client_id=client_id,
            client_secret=client_secret,
            v=version,
            near=city,
            limit=lim,
        )

        # The timeout keeps one stalled request from blocking the whole run.
        resp = requests.get(url=url, params=params, timeout=30)
        # Surface HTTP errors directly instead of failing later on a missing key.
        resp.raise_for_status()

        venues = resp.json()

        # Trust the items that were actually returned rather than the reported
        # total: the loop can then never index past the end of the list.
        items = venues['response']['groups'][0]['items'][:lim]

        # One pass collects (city, venue name, first category name) per venue.
        records.extend(
            (city, item['venue']['name'], item['venue']['categories'][0]['name'])
            for item in items
        )

    # Building the frame once from all records also covers an empty city list,
    # where concatenating an empty list of frames would raise ValueError.
    return pd.DataFrame(records, columns=['city', 'venue_name', 'venue_category'])

## Test the function

In [39]:
test_cap_cities_europe = ['Amsterdam',
                          'Athens',
                          'Andorra la Vella',
                          'Belgrade',
                          'Berlin',
                          'Bern']

In [40]:
test_cap_venues = venue_collector(test_cap_cities_europe)

test_cap_venues.head()

Unnamed: 0,city,venue_name,venue_category
0,Amsterdam,Vondelpark,Park
1,Amsterdam,Amsterdamse Grachten,Canal
2,Amsterdam,Concerto Records,Record Shop
3,Amsterdam,Zoku,Hotel
4,Amsterdam,Margherita Tutta La Vita!,Pizza Place


## Run the function on all the capital cities of Europe

In [41]:
cap_cities_europe = ['Amsterdam',
                     'Andorra la Vella',
                     'Athens',
                     'Belgrade',
                     'Berlin',
                     'Bern',
                     'Bratislava',
                     'Brussels',
                     'Bucharest',
                     'Budapest',
                     'Chisinau',
                     'Copenhagen',
                     'Dublin',
                     'Helsinki',
                     'Kiev',
                     'Lisbon',
                     'Ljubljana',
                     'London',
                     'Luxembourg',
                     'Madrid',
                     'Minsk',
                     'Monaco',
                     'Moscow',
                     'Nicosia',
                     'Nuuk',
                     'Oslo',
                     'Paris',
                     'Podgorica',
                     'Prague',
                     'Reykjavik',
                     'Riga',
                     'Rome',
                     'San Marino',
                     'Sarajevo',
                     'Skopje',
                     'Sofia',
                     'Stockholm',
                     'Tallinn',
                     'Tirana',
                     'Vaduz',
                     'Valletta',
                     'Vatican City',
                     'Vienna',
                     'Vilnius',
                     'Warsaw',
                     'Zagreb'] # 'Zurich'

In [42]:
len(cap_cities_europe)

46

In [43]:
cap_cities_europe_venues = venue_collector(cap_cities_europe)

In [44]:
cap_cities_europe_venues.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4314 entries, 0 to 4313
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   city            4314 non-null   object
 1   venue_name      4314 non-null   object
 2   venue_category  4314 non-null   object
dtypes: object(3)
memory usage: 101.2+ KB


In [45]:
cap_cities_europe_venues.head()

Unnamed: 0,city,venue_name,venue_category
0,Amsterdam,Vondelpark,Park
1,Amsterdam,Amsterdamse Grachten,Canal
2,Amsterdam,Concerto Records,Record Shop
3,Amsterdam,Zoku,Hotel
4,Amsterdam,Margherita Tutta La Vita!,Pizza Place


## Check and clear the venue categories

In [46]:
df1 = cap_cities_europe_venues.copy()

In [47]:
df1['venue_category'].value_counts()

Park                   277
Café                   203
Hotel                  194
Coffee Shop            185
Restaurant             130
                      ... 
Paella Restaurant        1
Basketball Court         1
Garden Center            1
Filipino Restaurant      1
Comic Shop               1
Name: venue_category, Length: 355, dtype: int64

In [48]:
df1['venue_category'].nunique()

355

In [49]:
df1['venue_category'].unique()

array(['Park', 'Canal', 'Record Shop', 'Hotel', 'Pizza Place', 'Hostel',
       'French Restaurant', 'Yoga Studio', 'Wine Bar', 'Cocktail Bar',
       'Concert Hall', 'Coffee Shop', 'Brewery', 'Breakfast Spot', 'Pub',
       'Garden', 'Bakery', 'Theater', 'Deli / Bodega', 'Dance Studio',
       'Distillery', 'Event Space', 'Plaza', 'Bookstore', 'Dessert Shop',
       'Sandwich Place', 'Turkish Restaurant', 'Museum', 'Gastropub',
       'Farmers Market', 'Bistro', 'Art Museum', 'Tour Provider',
       'Liquor Store', 'Music Venue', 'Café', 'Cycle Studio',
       'Monument / Landmark', 'Beer Bar', 'Caribbean Restaurant',
       'Multiplex', 'Grocery Store', 'Ice Cream Shop',
       'Falafel Restaurant', 'Restaurant', 'Japanese Restaurant', 'Bar',
       'Supermarket', 'Movie Theater', 'Marijuana Dispensary',
       'Italian Restaurant', 'Indie Movie Theater', 'Cheese Shop',
       'BBQ Joint', 'Chocolate Shop', 'Design Studio',
       'Comfort Food Restaurant', 'Arcade', 'Lounge',
      

**create a new column and standardize the categories**

In [50]:
# New frame (df1 is left untouched) with venue_cat_standard starting out as a
# plain copy of venue_category.
df2 = df1.assign(venue_cat_standard=df1['venue_category'])
df2.head()

Unnamed: 0,city,venue_name,venue_category,venue_cat_standard
0,Amsterdam,Vondelpark,Park,Park
1,Amsterdam,Amsterdamse Grachten,Canal,Canal
2,Amsterdam,Concerto Records,Record Shop,Record Shop
3,Amsterdam,Zoku,Hotel,Hotel
4,Amsterdam,Margherita Tutta La Vita!,Pizza Place,Pizza Place


In [51]:
# 1. Restaurant
df2.loc[df2['venue_cat_standard'].str.contains('Restaurant'), 'venue_cat_standard'] = 'Restaurant'

# 2. Museum
df2.loc[df2['venue_cat_standard'].str.contains('Museum'), 'venue_cat_standard'] = 'Museum'

# 3. Club
df2.loc[df2['venue_cat_standard'].str.contains('Club'), 'venue_cat_standard'] = 'Club'

# 3. Bar
df2.loc[df2['venue_cat_standard'].str.contains('Bar'), 'venue_cat_standard'] = 'Bar'

# 4. Cafe
df2.loc[df2['venue_cat_standard'].str.contains('Cafe'), 'venue_cat_standard'] = 'Café'

In [52]:
df2['venue_cat_standard'].value_counts()

Restaurant            760
Bar                   343
Park                  277
Café                  207
Hotel                 194
                     ... 
Frozen Yogurt Shop      1
Aquarium                1
Roof Deck               1
Food Service            1
Drive-in Theater        1
Name: venue_cat_standard, Length: 269, dtype: int64

In [53]:
df2['venue_cat_standard'].nunique()

269

In [54]:
df2['venue_cat_standard'].unique()

array(['Park', 'Canal', 'Record Shop', 'Hotel', 'Pizza Place', 'Hostel',
       'Restaurant', 'Yoga Studio', 'Bar', 'Concert Hall', 'Coffee Shop',
       'Brewery', 'Breakfast Spot', 'Pub', 'Garden', 'Bakery', 'Theater',
       'Deli / Bodega', 'Dance Studio', 'Distillery', 'Event Space',
       'Plaza', 'Bookstore', 'Dessert Shop', 'Sandwich Place', 'Museum',
       'Gastropub', 'Farmers Market', 'Bistro', 'Tour Provider',
       'Liquor Store', 'Music Venue', 'Café', 'Cycle Studio',
       'Monument / Landmark', 'Multiplex', 'Grocery Store',
       'Ice Cream Shop', 'Supermarket', 'Movie Theater',
       'Marijuana Dispensary', 'Indie Movie Theater', 'Cheese Shop',
       'BBQ Joint', 'Chocolate Shop', 'Design Studio', 'Arcade', 'Lounge',
       'Public Art', 'Sporting Goods Shop', 'Spa', 'Resort',
       'Basketball Court', 'Stadium', 'Shopping Mall',
       'Miscellaneous Shop', 'Historic Site', 'Food & Drink Shop',
       'Lingerie Store', 'Buffet', 'Gym', 'Gym / Fitness Center',


In [55]:
df2.head()

Unnamed: 0,city,venue_name,venue_category,venue_cat_standard
0,Amsterdam,Vondelpark,Park,Park
1,Amsterdam,Amsterdamse Grachten,Canal,Canal
2,Amsterdam,Concerto Records,Record Shop,Record Shop
3,Amsterdam,Zoku,Hotel,Hotel
4,Amsterdam,Margherita Tutta La Vita!,Pizza Place,Pizza Place


## Group by cities and venue categories

In [56]:
df3 = df2.copy()

In [57]:
df3['venue_cat_count'] = 1

In [58]:
df3.head()

Unnamed: 0,city,venue_name,venue_category,venue_cat_standard,venue_cat_count
0,Amsterdam,Vondelpark,Park,Park,1
1,Amsterdam,Amsterdamse Grachten,Canal,Canal,1
2,Amsterdam,Concerto Records,Record Shop,Record Shop,1
3,Amsterdam,Zoku,Hotel,Hotel,1
4,Amsterdam,Margherita Tutta La Vita!,Pizza Place,Pizza Place,1


In [59]:
df3.isna().sum()

city                  0
venue_name            0
venue_category        0
venue_cat_standard    0
venue_cat_count       0
dtype: int64

In [60]:
df3['venue_category'].nunique()

355

In [61]:
df3['venue_cat_standard'].nunique()

269

In [62]:
df3 = df3.groupby(['city', 'venue_cat_standard']).agg({'venue_cat_count': 'sum'})

In [63]:
df3 = df3.reset_index()

In [64]:
df3.head()

Unnamed: 0,city,venue_cat_standard,venue_cat_count
0,Amsterdam,Arcade,1
1,Amsterdam,BBQ Joint,1
2,Amsterdam,Bakery,2
3,Amsterdam,Bar,7
4,Amsterdam,Bistro,1


In [66]:
df3.describe()

Unnamed: 0,venue_cat_count
count,1762.0
mean,2.448354
std,3.520992
min,1.0
25%,1.0
50%,1.0
75%,2.0
max,34.0


**Check if the number of parks in Berlin is still 13**

In [None]:
df3[(df3['city'] == 'Berlin') & (df3['venue_cat_standard'] == 'Park')]

## Transform the venue categories to columns

In [None]:
df4 = df3.copy()

In [None]:
df4 = df4.pivot(index='city', columns='venue_cat_standard', values='venue_cat_count')

In [None]:
df4.shape

In [None]:
df4.head()

In [None]:
df4 = df4.reset_index()

In [None]:
df4.head()

In [None]:
df4.tail()

In [None]:
df4.info()

## Export to JSON format

In [None]:
df4.to_json('../data/Foursquare_venues_data.json')

In [None]:
check = pd.read_json('../data/Foursquare_venues_data.json')
check.head()