In [71]:
import requests
import pandas as pd
import numpy as np

# !pip install folium==0.5.0  
import folium 

# !pip install geocoder
import geocoder
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

from bs4 import BeautifulSoup
from sklearn.cluster import KMeans

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

Use the requests library to download the webpage

In [72]:
url = 'https://en.wikipedia.org/wiki/List_of_districts_of_Istanbul'
html_data= requests.get(url).text

Parse the html data using beautiful_soup.

In [73]:
soup = BeautifulSoup(html_data, 'html5lib')

Using beautiful soup extract the table and store it into a dataframe named istanbul_data. The dataframe have columns District, Population, Area, Density, Annual Household Income($). Fill in each variable with the correct data from the list cell.

In [74]:
istanbul_data = pd.DataFrame(columns=['District', 'Population', 'Area(km2)', 'Density(per km2)', 'Annual Household Income($)'])

# skip the first empty row with [1:]
for row in soup.find('tbody').find_all('tr')[1:40]:
    item = row.find_all('td')
    district = item[0].text.replace('\n','')
    population = item[1].text.replace('\n','').replace(',','')
    # population = int(population)
    area = item[2].text.replace('\n','')
    density = item[3].text.replace('\n','')
    income = item[5].text.replace('\n','').split('(')[1].replace('$)','')
    
    istanbul_data = istanbul_data.append({'District': district, 'Population': population, 'Area(km2)': area, 'Density(per km2)': density, 'Annual Household Income($)': income}, ignore_index=True)

istanbul_data.head()
# istanbul_data.astype({'Population': 'int32'})


Unnamed: 0,District,Population,Area(km2),Density(per km2),Annual Household Income($)
0,Adalar,16033,11.05,1451,10978
1,Arnavutköy,296709,450.35,659,3350
2,Ataşehir,422594,25.23,16750,10854
3,Avcılar,436897,42.01,10400,6064
4,Bağcılar,737206,22.36,32970,5295


In [75]:
# define a function to get coordinates
def get_latlng(neighborhood):
    # initialize your variable to None
    lat_lng_coords = None
    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, Istanbul, Turkey'.format(neighborhood))
        lat_lng_coords = g.latlng
    return lat_lng_coords

In [76]:
# call the function to get the coordinates, store in a new list using list comprehension
coords = [ get_latlng(neighborhood) for neighborhood in istanbul_data["District"].tolist() ]

In [77]:
coords

[[40.863060000000075, 29.12423000000007],
 [41.18558000000007, 28.74147000000005],
 [40.99248000000006, 29.127770000000055],
 [40.97813000000008, 28.721010000000035],
 [41.03323000000006, 28.863510000000076],
 [40.998050000000035, 28.84737000000007],
 [40.98273000000006, 28.87480000000005],
 [41.10745000000003, 28.800610000000063],
 [41.043810000000065, 28.91214000000008],
 [41.041900000000055, 29.005750000000035],
 [41.12647000000004, 29.097410000000025],
 [41.00579000000005, 28.65930000000003],
 [41.03672000000006, 28.986120000000028],
 [41.02046000000007, 28.587290000000053],
 [41.14369000000005, 28.460350000000062],
 [41.03290000000004, 29.173830000000066],
 [41.03932000000003, 28.881150000000048],
 [41.03246000000007, 28.660930000000064],
 [41.04657000000003, 28.931790000000035],
 [41.019580000000076, 28.94726000000003],
 [41.05728000000005, 28.91605000000004],
 [41.022050000000036, 28.878280000000075],
 [40.98867000000007, 29.02732000000003],
 [41.08011000000005, 28.9733200000000

In [78]:
# create temporary dataframe to populate the coordinates into Latitude and Longitude
df_coords = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])

In [79]:
# merge the coordinates into the original dataframe
istanbul_data['Latitude'] = df_coords['Latitude']
istanbul_data['Longitude'] = df_coords['Longitude']
istanbul_data.head()

Unnamed: 0,District,Population,Area(km2),Density(per km2),Annual Household Income($),Latitude,Longitude
0,Adalar,16033,11.05,1451,10978,40.86306,29.12423
1,Arnavutköy,296709,450.35,659,3350,41.18558,28.74147
2,Ataşehir,422594,25.23,16750,10854,40.99248,29.12777
3,Avcılar,436897,42.01,10400,6064,40.97813,28.72101
4,Bağcılar,737206,22.36,32970,5295,41.03323,28.86351


In [80]:
# Istanbul coordinates
latitude = 41.015137
longitude = 28.979530

# create map of Istanbul using latitude and longitude values
map_ist = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, neighborhood in zip(istanbul_data['Latitude'], istanbul_data['Longitude'], istanbul_data['District']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_ist)  
    
map_ist

In [81]:
# define credentials
CLIENT_ID = 'U3OEYP4H1CZFH54N1MSBLG0YHJ5AET3BO4TJC3KWPXRTNVDW' # your Foursquare ID
CLIENT_SECRET = 'KHRSLJGTAHPYO4WJS3E42ZYTVQBULODHOEQ2XHPPK5LLR3SQ' # your Foursquare Secret
VERSION = '20210321' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

In [82]:
# Now, let's get the top 100 venues that are within a radius of 2000 meters.
radius = 2000

venues = []

for lat, long, neighborhood in zip(istanbul_data['Latitude'], istanbul_data['Longitude'], istanbul_data['District']):
    
    # create the API request URL
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)
    
    # make the GET request
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    # return only relevant information for each nearby venue
    for venue in results:
        venues.append((
            neighborhood,
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))

In [83]:
# convert the venues list into a new DataFrame
venues_df = pd.DataFrame(venues)

# define the column names
venues_df.columns = ['District', 'Latitude', 'Longitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']

print(venues_df.shape)
venues_df.head()

(3833, 7)


Unnamed: 0,District,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,Adalar,40.86306,29.12423,Büyükada Lale köşkü,40.865657,29.125223,Bed & Breakfast
1,Adalar,40.86306,29.12423,Büyükada Tepesi,40.861107,29.117418,Mountain
2,Adalar,40.86306,29.12423,Eski Rum Yetimhanesi,40.861705,29.123323,Historic Site
3,Adalar,40.86306,29.12423,Büyükada Loc'Ada,40.857548,29.126157,Restaurant
4,Adalar,40.86306,29.12423,Büyükada Bisiklet Parkuru,40.865,29.116861,Bike Trail


In [84]:
# print out the list of categories 
venues_df['VenueCategory'].unique()[:50]

array(['Bed & Breakfast', 'Mountain', 'Historic Site', 'Restaurant',
       'Bike Trail', 'Hotel', 'Beach', 'Garden', 'Motel', 'Dessert Shop',
       'Bistro', 'Rest Area', 'Breakfast Spot', 'Ice Cream Shop',
       'Art Gallery', 'Campground', 'Café', 'Hot Dog Joint', 'Hotel Bar',
       'History Museum', 'Scenic Lookout', 'Entertainment Service',
       'Waterfront', 'National Park', 'Seafood Restaurant', 'Pool',
       'American Restaurant', 'Turkish Restaurant', 'Gourmet Shop',
       'Plaza', 'Pedestrian Plaza', 'Diner', 'Road',
       'Mediterranean Restaurant', 'Tea Room', 'Water Park', 'Trail',
       'BBQ Joint', 'Island', 'Kofte Place', 'Gym',
       'Gym / Fitness Center', 'Electronics Store', 'Bakery', 'Cafeteria',
       'Fast Food Restaurant', 'Hookah Bar',
       'Turkish Home Cooking Restaurant', 'Fish & Chips Shop', 'Buffet'],
      dtype=object)

### Analyze districts

In [85]:
# one hot encoding
ist_onehot = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
ist_onehot['District'] = venues_df['District'] 

# move neighborhood column to the first column
fixed_columns = [ist_onehot.columns[-1]] + list(ist_onehot.columns[:-1])
ist_onehot = ist_onehot[fixed_columns]

print(ist_onehot.shape)
ist_onehot.head()

(3833, 302)


Unnamed: 0,District,Accessories Store,American Restaurant,Antique Shop,Aquarium,Arcade,Art Gallery,Art Museum,Arts & Crafts Store,Arts & Entertainment,...,Water Park,Waterfront,Wedding Hall,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yemeni Restaurant,Yoga Studio,Çöp Şiş Place
0,Adalar,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Adalar,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Adalar,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Adalar,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Adalar,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [86]:
ist_grouped = ist_onehot.groupby(["District"]).mean().reset_index()

print(ist_grouped.shape)
ist_grouped

(39, 302)


Unnamed: 0,District,Accessories Store,American Restaurant,Antique Shop,Aquarium,Arcade,Art Gallery,Art Museum,Arts & Crafts Store,Arts & Entertainment,...,Water Park,Waterfront,Wedding Hall,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yemeni Restaurant,Yoga Studio,Çöp Şiş Place
0,Adalar,0.0,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.0,...,0.01,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Arnavutköy,0.0,0.0,0.0,0.0,0.022727,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Ataşehir,0.0,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0
3,Avcılar,0.0,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.0,...,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0
4,Bahçelievler,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.0,...,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0
5,Bakırköy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0
6,Bayrampaşa,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.01,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0
7,Bağcılar,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Başakşehir,0.0,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01
9,Beykoz,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,...,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [87]:
# create a new dataframe for vegetarian/ vegan restaurant only
ist_veggie = ist_grouped[["District","Vegetarian / Vegan Restaurant"]]
ist_veggie

Unnamed: 0,District,Vegetarian / Vegan Restaurant
0,Adalar,0.0
1,Arnavutköy,0.0
2,Ataşehir,0.0
3,Avcılar,0.0
4,Bahçelievler,0.0
5,Bakırköy,0.0
6,Bayrampaşa,0.01
7,Bağcılar,0.0
8,Başakşehir,0.0
9,Beykoz,0.0


### Cluster districts

In [88]:
# set number of clusters
kclusters = 2

ist_clustering = ist_veggie.drop(["District"], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(ist_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0], dtype=int32)

In [89]:
# create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.
ist_merged = ist_veggie.copy()

# add clustering labels
ist_merged["Cluster Labels"] = kmeans.labels_

In [90]:
# merge ist_grouped with toronto_data to add latitude/longitude for each neighborhood
ist_merged = ist_merged.join(istanbul_data.set_index("District"), on="District")

print(ist_merged.shape)
ist_merged.head() # check the last columns!

(39, 9)


Unnamed: 0,District,Vegetarian / Vegan Restaurant,Cluster Labels,Population,Area(km2),Density(per km2),Annual Household Income($),Latitude,Longitude
0,Adalar,0.0,0,16033,11.05,1451,10978,40.86306,29.12423
1,Arnavutköy,0.0,0,296709,450.35,659,3350,41.18558,28.74147
2,Ataşehir,0.0,0,422594,25.23,16750,10854,40.99248,29.12777
3,Avcılar,0.0,0,436897,42.01,10400,6064,40.97813,28.72101
4,Bahçelievler,0.0,0,592371,16.62,35642,7741,40.99805,28.84737


In [91]:
# sort the results by Cluster Labels
print(ist_merged.shape)
# istanbul_data.astype({'Annual Household Income($)': 'int32'})
ist_merged.sort_values(["Annual Household Income($)"], inplace=True)
ist_merged

(39, 9)


Unnamed: 0,District,Vegetarian / Vegan Restaurant,Cluster Labels,Population,Area(km2),Density(per km2),Annual Household Income($),Latitude,Longitude
2,Ataşehir,0.0,0,422594,25.23,16750,10854.0,40.99248,29.12777
0,Adalar,0.0,0,16033,11.05,1451,10978.0,40.86306,29.12423
36,Üsküdar,0.0,0,520771,35.33,14740,11572.0,41.02733,29.01566
27,Sarıyer,0.0,0,335298,175.39,1912,12104.0,41.15998,29.0418
38,Şişli,0.01,1,266793,10.71,24911,12955.0,41.06201,28.98877
5,Bakırköy,0.0,0,226229,29.64,7633,14650.0,40.98273,28.8748
20,Kadıköy,0.0,0,481983,25.09,19210,14948.0,40.98867,29.02732
12,Beşiktaş,0.0,0,176513,18.01,9801,17490.0,41.0419,29.00575
1,Arnavutköy,0.0,0,296709,450.35,659,3350.0,41.18558,28.74147
33,Çatalca,0.0,0,74975,1115.13,67,3524.0,41.14369,28.46035


### visualize the clusters

In [70]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(ist_merged['Latitude'], ist_merged['Longitude'], ist_merged['District'], ist_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

Observations:

Vegetarian/ vegan restaurants can be find in only 5 districts mapped in cluster 1. 
The remaining districts dont have any vegan/ vegetarian restaurants. 
When we look at top 3 districts based on Annual Household Income($), Besiktas, Kadikoy and Bakiy