# Geodata of Dubai

## Getting the data 

In this first phase, I build a dataset of Dubai communities with their coordinates and plot those communities on a Folium map of Dubai.

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from geopy.geocoders import Nominatim 

import requests 
import json, lxml
from pandas.io.json import json_normalize

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

from bs4 import BeautifulSoup

import warnings
warnings.filterwarnings('ignore')

try:
    import folium
except:
    !pip install folium
    import folium
    
!pip install geocoder
import geocoder

Collecting folium
  Downloading folium-0.11.0-py2.py3-none-any.whl (93 kB)
[K     |████████████████████████████████| 93 kB 3.3 MB/s  eta 0:00:01
[?25hCollecting branca>=0.3.0
  Downloading branca-0.4.1-py3-none-any.whl (24 kB)
Installing collected packages: branca, folium
Successfully installed branca-0.4.1 folium-0.11.0
Collecting geocoder
  Downloading geocoder-1.38.1-py2.py3-none-any.whl (98 kB)
[K     |████████████████████████████████| 98 kB 8.4 MB/s  eta 0:00:01
Collecting ratelim
  Downloading ratelim-0.1.6-py2.py3-none-any.whl (4.0 kB)
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6


In [2]:
# Foursquare API settings interpolated into the request URL built by getNearbyVenues:
# client id, client secret and the API version date (sent as the `v` parameter).
# NOTE(review): the '-' values look like redacted placeholders — supply real keys before running.
CLIENT_ID = '-' 
CLIENT_SECRET = '-' 
VERSION = '20201216'

In [3]:
# Parse every HTML table found on the Wikipedia page; the first one is used as
# the communities dataset.
communities_url = 'https://en.wikipedia.org/wiki/List_of_communities_in_Dubai'
lt = pd.read_html(communities_url)
df = lt[0]

In [4]:
# Discard the 'Unnamed: 6' column that comes with the scraped table
df = df.drop(columns='Unnamed: 6')
df.head()

Unnamed: 0,Community Number,Community (English),Community (Arabic),Area(km2),Population(2000),Population density(/km2)
0,126.0,Abu Hail,أبو هيل,1.27 km²,21414.0,"16,861.4/km²"
1,711.0,Al Awir First,العوير الأولى,,,
2,721.0,Al Awir Second,العوير الثانية,,,
3,283.0,Aleyas,العياص,162.4 km2,1706.0,162.4/km2
4,333.0,Al Bada'a,البدع,0.82 km²,18816.0,22946/km²


In [5]:
# Keep only the rows that have an English community name
df = df.dropna(subset=['Community (English)'])
df.head()

Unnamed: 0,Community Number,Community (English),Community (Arabic),Area(km2),Population(2000),Population density(/km2)
0,126.0,Abu Hail,أبو هيل,1.27 km²,21414.0,"16,861.4/km²"
1,711.0,Al Awir First,العوير الأولى,,,
2,721.0,Al Awir Second,العوير الثانية,,,
3,283.0,Aleyas,العياص,162.4 km2,1706.0,162.4/km2
4,333.0,Al Bada'a,البدع,0.82 km²,18816.0,22946/km²


In [6]:
def getCoordsByPostalCode(community, max_attempts=10, retry_delay=1.0):
    """Look up the coordinates of a Dubai community with the ArcGIS geocoder.

    Despite the function name, the lookup key is a community name: the query
    sent to the geocoder is '<community>, Dubai, UAE'.

    Parameters
    ----------
    community : str
        Community name to geocode.
    max_attempts : int, optional
        Maximum number of geocoder calls before giving up (default 10).
    retry_delay : float, optional
        Seconds to wait between two failed attempts (default 1.0).

    Returns
    -------
    tuple
        (latitude, longitude) as returned by the geocoder.

    Raises
    ------
    RuntimeError
        If no coordinates were obtained after `max_attempts` attempts.
    """
    import time  # local import so the function does not depend on another cell's imports

    query = '{}, Dubai, UAE'.format(community)

    # Retry a bounded number of times instead of looping forever: a name the
    # geocoder cannot resolve (or a network outage) would otherwise hang the notebook.
    for attempt in range(1, max_attempts + 1):
        lat_lng_coords = geocoder.arcgis(query).latlng

        # Treat both None and an empty result as "no coordinates yet"
        if lat_lng_coords:
            latitude = lat_lng_coords[0]
            longitude = lat_lng_coords[1]
            return latitude, longitude

        if attempt < max_attempts:
            time.sleep(retry_delay)

    raise RuntimeError(
        'Could not geocode {!r} after {} attempts'.format(query, max_attempts)
    )

In [7]:
# Geocode every community and collect one [name, latitude, longitude] record per community
postalCodesWithCoordsList = [
    [community, *getCoordsByPostalCode(community)]
    for community in df['Community (English)']
]

# Turn the collected records into a dataframe with named columns
df_coords = pd.DataFrame(postalCodesWithCoordsList)
df_coords.columns = ['Community (English)', 'Latitude', 'Longitude']

df_coords.head()

Unnamed: 0,Community (English),Latitude,Longitude
0,Abu Hail,25.28308,55.33435
1,Al Awir First,25.18605,55.54108
2,Al Awir Second,25.16792,55.54331
3,Aleyas,25.20292,55.52626
4,Al Bada'a,25.23184,55.27329


In [8]:
# Attach the geocoded coordinates to the communities table, matching rows on the
# English community name, then turn that name back into a regular column.
coords_by_name = df_coords.set_index('Community (English)')
df = (
    df.set_index('Community (English)')
      .join(coords_by_name)
      .reset_index()
)
df.head()

Unnamed: 0,Community (English),Community Number,Community (Arabic),Area(km2),Population(2000),Population density(/km2),Latitude,Longitude
0,Abu Hail,126.0,أبو هيل,1.27 km²,21414.0,"16,861.4/km²",25.28308,55.33435
1,Al Awir First,711.0,العوير الأولى,,,,25.18605,55.54108
2,Al Awir Second,721.0,العوير الثانية,,,,25.16792,55.54331
3,Aleyas,283.0,العياص,162.4 km2,1706.0,162.4/km2,25.20292,55.52626
4,Al Bada'a,333.0,البدع,0.82 km²,18816.0,22946/km²,25.23184,55.27329


In [9]:
# Work on a copy whose English-name column is simply called 'Community';
# the later cells (maps, venue lookup) read this column name.
df_new = df.rename(columns={'Community (English)': 'Community'})
df_new.head()

Unnamed: 0,Community,Community Number,Community (Arabic),Area(km2),Population(2000),Population density(/km2),Latitude,Longitude
0,Abu Hail,126.0,أبو هيل,1.27 km²,21414.0,"16,861.4/km²",25.28308,55.33435
1,Al Awir First,711.0,العوير الأولى,,,,25.18605,55.54108
2,Al Awir Second,721.0,العوير الثانية,,,,25.16792,55.54331
3,Aleyas,283.0,العياص,162.4 km2,1706.0,162.4/km2,25.20292,55.52626
4,Al Bada'a,333.0,البدع,0.82 km²,18816.0,22946/km²,25.23184,55.27329


In [10]:
# Base map centred on Dubai
map_dubai = folium.Map(location=[25.276987, 55.296249], zoom_start=10)

# One circle marker per community; the popup shows the community name
community_points = df_new[['Latitude', 'Longitude', 'Community']]
for lat, lng, community in community_points.itertuples(index=False, name=None):
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup('{}'.format(community), parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_dubai)

map_dubai

In [11]:
import os

from project_lib import Project

# SECURITY: credentials are read from the environment so they are never stored in
# the notebook. Set PROJECT_ID and PROJECT_ACCESS_TOKEN before running this cell;
# a missing variable fails fast with a KeyError. Any access token that was ever
# committed inside this notebook should be revoked and regenerated.
project = Project(
    project_id=os.environ['PROJECT_ID'],
    project_access_token=os.environ['PROJECT_ACCESS_TOKEN'],
)
pc = project.project_context

In [12]:
# Store the communities table (without its row index) as CSV in the project storage,
# overwriting any existing file with the same name.
project.save_data(data=df_new.to_csv(index=False),file_name='Communities dataset.csv',overwrite=True)


{'file_name': 'Communities dataset.csv',
 'message': 'File saved to project storage.',
 'bucket_name': 'ibmdataanalysiscapstone-donotdelete-pr-tkbv7qp1n32y6i',
 'asset_id': '74df44cf-2b2c-471e-ad64-5b2366bca49f'}

In [None]:
# Credentials intentionally removed from this scratch cell: never paste a project
# access token or project id into a notebook. Supply them through environment
# variables or a secrets manager, and rotate any token that was exposed here.

In [None]:
# Scratch note (kept as a comment so the cell is valid Python): category filter
# appended to the Foursquare venues/explore request URL.
# NOTE(review): presumably the "Outdoors & Recreation" top-level category — confirm
# against the Foursquare category list.
#   &categoryId=4d4b7105d754a06377d81259

## Venues data from Foursquare

In [20]:
# Values sent with every Foursquare explore request made by getNearbyVenues:
# the `radius` of the search around each community's coordinates
# (presumably metres, per the Foursquare API — TODO confirm) and the `limit`
# on the number of venues returned per request.
CONST_venuesRadiusScan = 1100
CONST_venuesLimit = 100

In [31]:
def getNearbyVenues(communities, latitudes, longitudes, radius=None, limit=None):
    """Collect the Foursquare venues found around each community.

    One `venues/explore` request is sent per community, restricted to the
    category id 4d4b7105d754a06377d81259.

    Parameters
    ----------
    communities, latitudes, longitudes : iterable
        Parallel sequences giving each community's name and coordinates.
    radius : optional
        Search radius sent to the API; defaults to CONST_venuesRadiusScan.
    limit : optional
        Maximum venues per request; defaults to CONST_venuesLimit.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Community', 'Community Latitude',
        'Community Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude'
        and 'Venue Category'. The frame is empty (but has these columns) when
        no venue was found. 'Venue Category' is None for a venue that carries
        no category.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status (e.g. bad credentials or quota).
    """
    # Resolve the defaults at call time so edits to the notebook constants are honoured
    if radius is None:
        radius = CONST_venuesRadiusScan
    if limit is None:
        limit = CONST_venuesLimit

    columns = [
        'Community',
        'Community Latitude',
        'Community Longitude',
        'Venue',
        'Venue Latitude',
        'Venue Longitude',
        'Venue Category',
    ]

    venue_rows = []
    for community, lat, lng in zip(communities, latitudes, longitudes):
        # Let requests build and escape the query string; the timeout keeps a
        # stalled connection from blocking the whole loop.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': limit,
                'categoryId': '4d4b7105d754a06377d81259',
            },
            timeout=30,
        )
        # Surface API failures explicitly instead of failing later with a KeyError
        response.raise_for_status()

        groups = response.json().get('response', {}).get('groups', [])
        items = groups[0].get('items', []) if groups else []

        # Keep only the relevant fields of each venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            venue_rows.append((
                community,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None,
            ))

    # Passing the column names to the constructor also works when no venue was found
    nearby_venues = pd.DataFrame(venue_rows, columns=columns)

    return nearby_venues

In [32]:
# Fetch the venues around the coordinates of every community
dubai_venues = getNearbyVenues(
    df_new['Community'],
    df_new['Latitude'],
    df_new['Longitude'],
)

dubai_venues.head()

Unnamed: 0,Community,Community Latitude,Community Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Abu Hail,25.28308,55.33435,Gold's Gym,25.282698,55.341019,Gym
1,Abu Hail,25.28308,55.33435,Bait Al Jinnie Junction,25.280546,55.330471,Scenic Lookout
2,Abu Hail,25.28308,55.33435,Hamriya Park,25.28571,55.333,Park
3,Abu Hail,25.28308,55.33435,Pond Park - Al Qusais,25.28806,55.332606,Park
4,Abu Hail,25.28308,55.33435,Muteena Jogging Park,25.278762,55.326875,Park


In [44]:
# Store the venues table (without its row index) as CSV in the project storage,
# overwriting any existing file with the same name.
project.save_data(data=dubai_venues.to_csv(index=False),file_name='Dubai venues dataset.csv',overwrite=True)

{'file_name': 'Dubai venues dataset.csv',
 'message': 'File saved to project storage.',
 'bucket_name': 'ibmdataanalysiscapstone-donotdelete-pr-tkbv7qp1n32y6i',
 'asset_id': 'e122c987-37e0-4d43-9680-4a292b05dd4f'}

Total venues = 6152
Venues in the "Outdoors and recreation" top category = 1332

## Group venues

In [45]:
# Base map centred on Dubai
map_venues = folium.Map(location=[25.276987, 55.296249], zoom_start=10)

# One circle marker per venue; the popup shows the venue's category
venue_points = dubai_venues[['Venue Latitude', 'Venue Longitude', 'Venue Category']]
for lat, lng, category in venue_points.itertuples(index=False, name=None):
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup('{}'.format(category), parse_html=True),
        color='blue',
        fill=True,
        parse_html=False).add_to(map_venues)

map_venues

In [43]:
# get counts per venue
venue_counts = dubai_venues.groupby(['Venue Category']).agg(['count'])
project.save_data(data=venue_counts.to_csv(index=False),file_name='Venue counts.csv',overwrite=True)

{'file_name': 'Venue counts.csv',
 'message': 'File saved to project storage.',
 'bucket_name': 'ibmdataanalysiscapstone-donotdelete-pr-tkbv7qp1n32y6i',
 'asset_id': 'ee398347-0b22-4354-9bbf-0b7dc6536dd8'}

In [46]:
# Preview the venues table before a group label is added to it
dubai_venues.head()

Unnamed: 0,Community,Community Latitude,Community Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Abu Hail,25.28308,55.33435,Gold's Gym,25.282698,55.341019,Gym
1,Abu Hail,25.28308,55.33435,Bait Al Jinnie Junction,25.280546,55.330471,Scenic Lookout
2,Abu Hail,25.28308,55.33435,Hamriya Park,25.28571,55.333,Park
3,Abu Hail,25.28308,55.33435,Pond Park - Al Qusais,25.28806,55.332606,Park
4,Abu Hail,25.28308,55.33435,Muteena Jogging Park,25.278762,55.326875,Park


In [60]:
# Venue categories that make up each group; any category not listed here is 'Other'.
_GROUP_A_CATEGORIES = (
    'River', 'Park', 'Beach', 'Cave', 'Waterfront', 'Canal', 'Tree',
    'Nature Preserve', 'Lake', 'Mountain',
)
_GROUP_B_CATEGORIES = (
    'Gym', 'Scenic Lookout', 'Pool', 'Ski Area', 'Athletics & Sports',
    'Gym / Fitness Center', 'Skate Park', 'Soccer Field', 'Tennis Court',
    'Pilates Studio', 'Yoga Studio', 'Botanical Garden', 'Garden',
    'Playground', 'Sports Club', 'National Park', 'Skating Rink',
    'Boxing Gym', 'Roof Deck', 'Recreation Center', 'Gymnastics Gym',
    'Martial Arts School', 'Farm', 'Stables', 'Indoor Play Area',
    'Baseball Field', 'Badminton Court', 'Surf Spot', 'Golf Course',
    'Cycle Studio', 'Fountain', 'Castle', 'Skydiving Drop Zone',
    'Lighthouse', 'Hot Spring', 'Paintball Field', 'Volleyball Court',
    'Basketball Court',
)
_GROUP_C_CATEGORIES = (
    'Plaza', 'Harbor / Marina', 'Trail', 'Bike Trail', 'Track',
    'Pedestrian Plaza',
)

# Flat lookup table: category name -> group label
_CATEGORY_TO_GROUP = {
    category: label
    for label, categories in (
        ('A', _GROUP_A_CATEGORIES),
        ('B', _GROUP_B_CATEGORIES),
        ('C', _GROUP_C_CATEGORIES),
    )
    for category in categories
}


def group(row):
    """Return the group label for a venue row.

    Parameters
    ----------
    row : mapping
        Anything indexable by 'Venue Category' (e.g. a DataFrame row).

    Returns
    -------
    str
        'A', 'B' or 'C' when the category (matched exactly, case-sensitively)
        belongs to one of the listed groups, otherwise 'Other'.
    """
    return _CATEGORY_TO_GROUP.get(row['Venue Category'], 'Other')

In [62]:
# Label every venue with its group, computed row by row from the venue category
dubai_venues['Group'] = dubai_venues.apply(group, axis=1)
dubai_venues.head(20)

Unnamed: 0,Community,Community Latitude,Community Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,Group
0,Abu Hail,25.28308,55.33435,Gold's Gym,25.282698,55.341019,Gym,B
1,Abu Hail,25.28308,55.33435,Bait Al Jinnie Junction,25.280546,55.330471,Scenic Lookout,B
2,Abu Hail,25.28308,55.33435,Hamriya Park,25.28571,55.333,Park,A
3,Abu Hail,25.28308,55.33435,Pond Park - Al Qusais,25.28806,55.332606,Park,A
4,Abu Hail,25.28308,55.33435,Muteena Jogging Park,25.278762,55.326875,Park,A
5,Abu Hail,25.28308,55.33435,Old Memzar Corniche,25.290534,55.329992,Beach,A
6,Abu Hail,25.28308,55.33435,Lively,25.285194,55.325276,Track,C
7,Abu Hail,25.28308,55.33435,Hor Al Anz Park,25.274142,55.335593,Park,A
8,Al Awir First,25.18605,55.54108,El 3zbeh,25.180099,55.544318,Farm,B
9,Al Awir Second,25.16792,55.54331,Aweer farmhouse,25.1749,55.550523,Farm,B


## Cluster communities

In [102]:
# One-hot encode the venue category (one indicator column per category, named after
# the category) and keep the community name alongside as the last column
dubai_dummies = (
    pd.get_dummies(dubai_venues[['Venue Category']], prefix="", prefix_sep="")
      .assign(Community=dubai_venues['Community'])
)
dubai_dummies.head()

Unnamed: 0,Athletics & Sports,Badminton Court,Baseball Field,Basketball Court,Beach,Bike Trail,Botanical Garden,Boxing Gym,Campground,Canal,...,Stables,Surf Spot,Tennis Court,Track,Trail,Tree,Volleyball Court,Waterfront,Yoga Studio,Community
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Abu Hail
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Abu Hail
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Abu Hail
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Abu Hail
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Abu Hail


In [103]:
# Average the indicator columns per community: each value is the share of the
# community's venues that fall in that category
dubai_grouped = dubai_dummies.groupby('Community', as_index=False).mean()
dubai_grouped.head()

Unnamed: 0,Community,Athletics & Sports,Badminton Court,Baseball Field,Basketball Court,Beach,Bike Trail,Botanical Garden,Boxing Gym,Campground,...,Sports Club,Stables,Surf Spot,Tennis Court,Track,Trail,Tree,Volleyball Court,Waterfront,Yoga Studio
0,Abu Hail,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0
1,Al Awir First,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Al Awir Second,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Al Bada'a,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.0
4,Al Baraha,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0


In [107]:
num_top_venues = 5
CONST_dfColumns = ['Community']

# Print, for every community, its most frequent venue categories
for _, community_row in dubai_grouped.iterrows():
    print("----"+community_row['Community']+"----")

    # Skip the leading key column(s) (here only 'Community'); what remains is one
    # frequency per venue category, rounded to two digits
    freqs = community_row.iloc[len(CONST_dfColumns):].astype(float).round(2)
    temp = pd.DataFrame({'venue': freqs.index, 'freq': freqs.values})

    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Abu Hail----
            venue  freq
0            Park  0.50
1           Beach  0.12
2           Track  0.12
3             Gym  0.12
4  Scenic Lookout  0.12


----Al Awir First----
              venue  freq
0              Farm   1.0
1      Skating Rink   0.0
2   Paintball Field   0.0
3              Park   0.0
4  Pedestrian Plaza   0.0


----Al Awir Second----
              venue  freq
0              Farm   1.0
1      Skating Rink   0.0
2   Paintball Field   0.0
3              Park   0.0
4  Pedestrian Plaza   0.0


----Al Bada'a----
              venue  freq
0               Gym  0.45
1              Pool  0.18
2              Park  0.09
3  Basketball Court  0.09
4  Volleyball Court  0.09


----Al Baraha----
                venue  freq
0                Park   0.5
1               Track   0.5
2  Athletics & Sports   0.0
3          Skate Park   0.0
4     Paintball Field   0.0


----Al Barsha First----
                  venue  freq
0                  Pool  0.35
1  Gym / Fitness Center  0.2

                  venue  freq
0                  Park  0.33
1  Gym / Fitness Center  0.33
2                   Gym  0.33
3    Athletics & Sports  0.00
4            Skate Park  0.00


----Al Qusais Industrial Fourth----
                  venue  freq
0  Gym / Fitness Center  0.50
1   Martial Arts School  0.25
2                   Gym  0.25
3    Athletics & Sports  0.00
4       Nature Preserve  0.00


----Al Qusais Industrial Second----
                  venue  freq
0  Gym / Fitness Center  0.50
1                  Pool  0.25
2                   Gym  0.25
3    Athletics & Sports  0.00
4            Skate Park  0.00


----Al Qusais Industrial Third----
                  venue  freq
0   Martial Arts School  0.33
1  Gym / Fitness Center  0.33
2                   Gym  0.33
3    Athletics & Sports  0.00
4       Nature Preserve  0.00


----Al Qusais Second----
                 venue  freq
0                 Park   0.4
1     Basketball Court   0.2
2  Martial Arts School   0.2
3                  Gym  



----Muhaisnah First----
                  venue  freq
0                   Gym  0.57
1      Indoor Play Area  0.14
2          Soccer Field  0.14
3  Gym / Fitness Center  0.14
4    Athletics & Sports  0.00


----Nad Al Hammar----
                venue  freq
0    Basketball Court   0.5
1                Park   0.5
2  Athletics & Sports   0.0
3        Skating Rink   0.0
4     Paintball Field   0.0


----Nad Shamma----
                  venue  freq
0  Gym / Fitness Center  0.33
1       Badminton Court  0.17
2                 Beach  0.17
3           Sports Club  0.17
4                   Gym  0.17


----Nadd Al Shiba Fourth----
                venue  freq
0                Park   1.0
1  Athletics & Sports   0.0
2       National Park   0.0
3     Paintball Field   0.0
4    Pedestrian Plaza   0.0


----Nadd Al Shiba Second----
                venue  freq
0                Pool  0.25
1           Surf Spot  0.25
2              Castle  0.25
3                 Gym  0.25
4  Athletics & Sports  0.00


-

In [108]:
def return_most_common_venues(row, num_top_venues):
    """Return the names of the highest-valued venue categories in ``row``.

    The first ``len(CONST_dfColumns)`` entries of ``row`` are treated as
    identifying (non-venue) fields and are skipped; the remaining entries
    are ranked by value, largest first.

    Parameters
    ----------
    row : pandas.Series
        One row whose leading entries are key fields and whose remaining
        entries are per-category values indexed by category name.
    num_top_venues : int
        How many category names to return.

    Returns
    -------
    numpy.ndarray
        Index labels of the top ``num_top_venues`` entries, in descending
        order of value.
    """
    key_width = len(CONST_dfColumns)
    venue_scores = row.iloc[key_width:]
    # Descending order: the most frequent category comes first.
    ranked_labels = venue_scores.sort_values(ascending=False).index.values
    return ranked_labels[:num_top_venues]

In [110]:
# How many ranked venue categories to keep per community.
num_top_venues = 10

# Ordinal suffixes for ranks 1-3; every later rank uses 'th'.
indicators = ['st', 'nd', 'rd']

# Build the column headers: 'Community', then '1st Most Common Venue', '2nd ...', etc.
columns = ['Community']

for ind in np.arange(num_top_venues):
    # Explicit bounds check instead of a bare `except:`, which would also
    # hide unrelated errors (typos, KeyboardInterrupt, ...).
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

communities_venues_sorted = pd.DataFrame(columns=columns)

communities_venues_sorted['Community'] = dubai_grouped['Community']

# Fill each community's row with its top categories; the leading key
# column(s) are skipped so only the ranked-venue columns are written.
for ind in np.arange(dubai_grouped.shape[0]):
    communities_venues_sorted.iloc[ind, len(CONST_dfColumns):] = return_most_common_venues(dubai_grouped.iloc[ind, :], num_top_venues)

communities_venues_sorted.head()

Unnamed: 0,Community,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Abu Hail,Park,Scenic Lookout,Track,Beach,Gym,Dog Run,Indoor Play Area,Hot Spring,Harbor / Marina,Gymnastics Gym
1,Al Awir First,Farm,Yoga Studio,Martial Arts School,Lake,Indoor Play Area,Hot Spring,Harbor / Marina,Gymnastics Gym,Gym Pool,Gym / Fitness Center
2,Al Awir Second,Farm,Yoga Studio,Martial Arts School,Lake,Indoor Play Area,Hot Spring,Harbor / Marina,Gymnastics Gym,Gym Pool,Gym / Fitness Center
3,Al Bada'a,Gym,Pool,Volleyball Court,Basketball Court,Park,Plaza,Yoga Studio,Farm,Hot Spring,Harbor / Marina
4,Al Baraha,Track,Park,Yoga Studio,Dog Run,Indoor Play Area,Hot Spring,Harbor / Marina,Gymnastics Gym,Gym Pool,Gym / Fitness Center


In [111]:
# Persist the ranked-venue table to project storage as CSV (row index omitted).
project.save_data(
    data=communities_venues_sorted.to_csv(index=False),
    file_name='Most common venues for kcluster.csv',
    overwrite=True,
)

{'file_name': 'Most common venues for kcluster.csv',
 'message': 'File saved to project storage.',
 'bucket_name': 'ibmdataanalysiscapstone-donotdelete-pr-tkbv7qp1n32y6i',
 'asset_id': '801a4d16-c907-4977-b315-f59032c28811'}

### Cluster neighbourhoods based on most common venues

In [115]:
# Number of clusters for k-means.
kclusters = 5

# Feature matrix: every column except the community name.
# `columns=` replaces the positional axis argument (`drop('Community', 1)`),
# which was deprecated in pandas 1.3 and is rejected by pandas 2.x.
dubai_grouped_clustering = dubai_grouped.drop(columns='Community')

# Run k-means clustering. `n_init=10` pins the historical scikit-learn default
# so the labels stay reproducible on releases where the default is 'auto'.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(dubai_grouped_clustering)

# Peek at the cluster labels assigned to the first ten rows.
kmeans.labels_[0:10]

array([0, 4, 4, 2, 0, 1, 2, 1, 0, 0], dtype=int32)

In [116]:
# Attach the k-means label of each community as the first column.
# DataFrame.insert raises ValueError if the column already exists, so drop a
# stale copy first; this keeps the cell safe to re-run.
if 'Cluster Labels' in communities_venues_sorted.columns:
    communities_venues_sorted = communities_venues_sorted.drop(columns='Cluster Labels')
communities_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

In [120]:
# Left-join the cluster label and ranked venues onto the community table,
# matching on the community name. Communities without a match get NaN in the
# joined columns.
dubai_merged = df_new.join(
    communities_venues_sorted.set_index(['Community']),
    on=['Community'],
)

# The cluster label and venue columns appear at the far right.
dubai_merged.head()

Unnamed: 0,Community,Community Number,Community (Arabic),Area(km2),Population(2000),Population density(/km2),Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Abu Hail,126.0,أبو هيل,1.27 km²,21414.0,"16,861.4/km²",25.28308,55.33435,0.0,Park,Scenic Lookout,Track,Beach,Gym,Dog Run,Indoor Play Area,Hot Spring,Harbor / Marina,Gymnastics Gym
1,Al Awir First,711.0,العوير الأولى,,,,25.18605,55.54108,4.0,Farm,Yoga Studio,Martial Arts School,Lake,Indoor Play Area,Hot Spring,Harbor / Marina,Gymnastics Gym,Gym Pool,Gym / Fitness Center
2,Al Awir Second,721.0,العوير الثانية,,,,25.16792,55.54331,4.0,Farm,Yoga Studio,Martial Arts School,Lake,Indoor Play Area,Hot Spring,Harbor / Marina,Gymnastics Gym,Gym Pool,Gym / Fitness Center
3,Aleyas,283.0,العياص,162.4 km2,1706.0,162.4/km2,25.20292,55.52626,1.0,Stables,Yoga Studio,Dog Run,Lake,Indoor Play Area,Hot Spring,Harbor / Marina,Gymnastics Gym,Gym Pool,Gym / Fitness Center
4,Al Bada'a,333.0,البدع,0.82 km²,18816.0,22946/km²,25.23184,55.27329,2.0,Gym,Pool,Volleyball Court,Basketball Court,Park,Plaza,Yoga Studio,Farm,Hot Spring,Harbor / Marina


In [121]:
# Persist the merged community/cluster table to project storage as CSV.
project.save_data(
    data=dubai_merged.to_csv(index=False),
    file_name='Communities with K-labels.csv',
    overwrite=True,
)

{'file_name': 'Communities with K-labels.csv',
 'message': 'File saved to project storage.',
 'bucket_name': 'ibmdataanalysiscapstone-donotdelete-pr-tkbv7qp1n32y6i',
 'asset_id': 'eb7a5cb2-f0bb-46a2-941f-092dabf6a937'}

### Let's take a look at the clusters:

In [130]:
# Communities assigned to cluster 0.
dubai_merged[dubai_merged['Cluster Labels'].eq(0.0)]

Unnamed: 0,Community,Community Number,Community (Arabic),Area(km2),Population(2000),Population density(/km2),Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Abu Hail,126.0,أبو هيل,1.27 km²,21414,"16,861.4/km²",25.28308,55.33435,0.0,Park,Scenic Lookout,Track,Beach,Gym,Dog Run,Indoor Play Area,Hot Spring,Harbor / Marina,Gymnastics Gym
5,Al Baraha,122.0,البراحة,1.104 km²,7823,"7,086/km²",25.28286,55.31674,0.0,Track,Park,Yoga Studio,Dog Run,Indoor Play Area,Hot Spring,Harbor / Marina,Gymnastics Gym,Gym Pool,Gym / Fitness Center
9,Al Barsha South First,671.0,البرشاء جنوب الاولى,38.1 km²,1248,33/km²,25.08958,55.23424,0.0,Park,Tennis Court,Yoga Studio,Dog Run,Indoor Play Area,Hot Spring,Harbor / Marina,Gymnastics Gym,Gym Pool,Gym / Fitness Center
12,Al Barsha South Fourth,681.0,البرشاء جنوب الرابعة,38.1 km2,1248,33/km2,25.05521,55.20885,0.0,Park,Gym,Yoga Studio,Farm,Lake,Indoor Play Area,Hot Spring,Harbor / Marina,Gymnastics Gym,Gym Pool
36,Al Mizhar Second,263.0,المزهر الثانية,11.2 km²,4326,386.25/km²,25.25236,55.45677,0.0,Park,Campground,Yoga Studio,Farm,Lake,Indoor Play Area,Hot Spring,Harbor / Marina,Gymnastics Gym,Gym Pool
57,Al Qusais Second,233.0,القصيص الثانية,,7657,,25.26563,55.38771,0.0,Park,Basketball Court,Gym,Martial Arts School,Yoga Studio,Fountain,Lake,Indoor Play Area,Hot Spring,Harbor / Marina
58,Al Qusais Third,234.0,القصيص الثالثة,,7506,,25.26064,55.39643,0.0,Park,Basketball Court,Yoga Studio,Farm,Lake,Indoor Play Area,Hot Spring,Harbor / Marina,Gymnastics Gym,Gym Pool
72,Al Twar Second,227.0,الطوار الثانية,6.8km²,18457,"2,714/km²",25.26141,55.38141,0.0,Park,Basketball Court,Gym / Fitness Center,Gym,Yoga Studio,Farm,Lake,Indoor Play Area,Hot Spring,Harbor / Marina
75,Al Warqa'a First,421.0,الورقاء الأولى,21.6 km²,5,0.23/km²,25.19409,55.40157,0.0,Park,Yoga Studio,Farm,Lake,Indoor Play Area,Hot Spring,Harbor / Marina,Gymnastics Gym,Gym Pool,Gym / Fitness Center
77,Al Warqa'a Second,422.0,الورقاء الثانية,21.6 km²,5,0.23/km²,25.19511,55.40933,0.0,Park,Stables,Gym,Yoga Studio,Dog Run,Indoor Play Area,Hot Spring,Harbor / Marina,Gymnastics Gym,Gym Pool


In [137]:
# Communities assigned to cluster 1, kept in Q1 for export.
Q1 = dubai_merged[dubai_merged['Cluster Labels'].eq(1.0)]
Q1

Unnamed: 0,Community,Community Number,Community (Arabic),Area(km2),Population(2000),Population density(/km2),Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,Aleyas,283.0,العياص,162.4 km2,1706.0,162.4/km2,25.20292,55.52626,1.0,Stables,Yoga Studio,Dog Run,Lake,Indoor Play Area,Hot Spring,Harbor / Marina,Gymnastics Gym,Gym Pool,Gym / Fitness Center
6,Al Barsha First,373.0,البرشاء الأولى,38.1 km²,1248.0,33/km²,25.11483,55.19136,1.0,Pool,Gym / Fitness Center,Gym,Park,Skate Park,Ski Area,Soccer Field,Athletics & Sports,Dog Run,Golf Course
8,Al Barsha Third,375.0,البرشاء الثالثة,38.1 km2,1248.0,33/km2,25.09342,55.19044,1.0,Yoga Studio,Tennis Court,Pilates Studio,Dog Run,Indoor Play Area,Hot Spring,Harbor / Marina,Gymnastics Gym,Gym Pool,Gym / Fitness Center
11,Al Barsha South Third,673.0,البرشاء جنوب الثالثة,38.1 km²,1248.0,33/km²,25.06229,55.23995,1.0,Botanical Garden,Gym / Fitness Center,Garden,Yoga Studio,Farm,Lake,Indoor Play Area,Hot Spring,Harbor / Marina,Gymnastics Gym
13,Al Barsha South Fifth,684.0,البرشاء جنوب الخامسة,3.81 km2,1248.0,33/km2,25.04238,55.1855,1.0,Basketball Court,Park,Sports Club,Playground,Garden,Yoga Studio,Farm,Indoor Play Area,Hot Spring,Harbor / Marina
14,Al Buteen,114.0,البطين,0.07 km²,2364.0,"33,771/km²",25.26925,55.29944,1.0,Beach,Plaza,National Park,Park,Gym / Fitness Center,Gym,Yoga Studio,Fountain,Indoor Play Area,Hot Spring
15,Al Corniche,111.0,الكورنيش,2.90 km2,1135.0,"2,890/km2",25.28114,55.30684,1.0,Beach,Park,Gym,Pool,Skating Rink,Yoga Studio,Farm,Indoor Play Area,Hot Spring,Harbor / Marina
16,Al Dhagaya,113.0,الضغاية,0.125 km²,10896.0,"21,451/km²",25.27217,55.30157,1.0,Beach,Plaza,National Park,Park,Gym / Fitness Center,Gym,Skating Rink,Pool,Yoga Studio,Fountain
19,Al Guoz Fourth,359.0,القوز الرابعة,,,,25.26951,55.30884,1.0,Plaza,Gym,Park,Gym / Fitness Center,Harbor / Marina,Track,Tennis Court,Beach,National Park,Skating Rink
21,Al Hamriya Port,131.0,ميناء الحمرية,0.89 km²,83.0,93.25/km²,25.29871,55.33546,1.0,Beach,Harbor / Marina,Stables,Yoga Studio,Farm,Lake,Indoor Play Area,Hot Spring,Gymnastics Gym,Gym Pool


In [138]:
# Persist the cluster-1 communities to project storage as CSV.
project.save_data(
    data=Q1.to_csv(index=False),
    file_name='Q1.csv',
    overwrite=True,
)

{'file_name': 'Q1.csv',
 'message': 'File saved to project storage.',
 'bucket_name': 'ibmdataanalysiscapstone-donotdelete-pr-tkbv7qp1n32y6i',
 'asset_id': '77b881d4-e803-4c39-9a5d-09319cc6cd58'}

In [139]:
# Communities assigned to cluster 2, kept in Q2 for export.
Q2 = dubai_merged[dubai_merged['Cluster Labels'].eq(2.0)]
Q2

Unnamed: 0,Community,Community Number,Community (Arabic),Area(km2),Population(2000),Population density(/km2),Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,Al Bada'a,333.0,البدع,0.82 km²,18816,22946/km²,25.23184,55.27329,2.0,Gym,Pool,Volleyball Court,Basketball Court,Park,Plaza,Yoga Studio,Farm,Hot Spring,Harbor / Marina
7,Al Barsha Second,376.0,البرشاء الثانية,38.1 km²,1248,33/km²,25.10723,55.20485,2.0,Gym,Gym / Fitness Center,Athletics & Sports,Park,Tennis Court,Pool,Skate Park,Dog Run,Hot Spring,Harbor / Marina
10,Al Barsha South Second,672.0,البرشاء جنوب الثانية,38.1 km²,1248,33/km²,25.07739,55.24267,2.0,Pool,Athletics & Sports,Gym / Fitness Center,Gym,Farm,Lake,Indoor Play Area,Hot Spring,Harbor / Marina,Gymnastics Gym
18,Al Garhoud,214.0,القرهود,4 km²,4466,"1,116.5/km²",25.24337,55.35267,2.0,Gym / Fitness Center,Gym,Basketball Court,Tennis Court,Park,Plaza,Pool,Yoga Studio,Farm,Hot Spring
20,"Al Hamriya, Dubai",313.0,الحمرية,0.72 km²,15104,"20,890/km²",25.25696,55.30246,2.0,Gym,Beach,Gym / Fitness Center,Plaza,National Park,Harbor / Marina,Pool,Yoga Studio,Fountain,Indoor Play Area
22,Al Hudaiba,322.0,الحضيبة,0.84 km²,7699,"9,165/km²",25.23713,55.27707,2.0,Gym,Gym / Fitness Center,Volleyball Court,Basketball Court,Harbor / Marina,Plaza,Pool,Yoga Studio,Farm,Indoor Play Area
24,Al Jafiliya,323.0,الجافلية,1.63 km²,11619,"7,128/km²",25.23342,55.29001,2.0,Gym / Fitness Center,Gym,Athletics & Sports,Basketball Court,Park,Dog Run,Track,Volleyball Court,Harbor / Marina,Farm
25,Al Karama,318.0,الكرامة,1.509 km2,45674,"30,267/km2",25.24529,55.30364,2.0,Gym,Park,Gym / Fitness Center,Dog Run,Soccer Field,Plaza,Farm,Indoor Play Area,Hot Spring,Harbor / Marina
26,Al Khabisi,128.0,الخبيصي,1.255 km²,6737,"5,368/km²",25.27177,55.33762,2.0,Gym / Fitness Center,Gym,Park,Campground,Pool,Yoga Studio,Farm,Indoor Play Area,Hot Spring,Harbor / Marina
29,Al Kifaf,324.0,الكفاف,0.8 km²,35,44/km²,25.2381,55.29778,2.0,Park,Gym,Dog Run,Track,Plaza,Farm,Indoor Play Area,Hot Spring,Harbor / Marina,Gymnastics Gym


In [140]:
# Persist the cluster-2 communities to project storage as CSV.
project.save_data(
    data=Q2.to_csv(index=False),
    file_name='Q2.csv',
    overwrite=True,
)

{'file_name': 'Q2.csv',
 'message': 'File saved to project storage.',
 'bucket_name': 'ibmdataanalysiscapstone-donotdelete-pr-tkbv7qp1n32y6i',
 'asset_id': 'c845a6e3-5502-4392-8bbe-a2bc0b813cec'}

In [133]:
# Communities assigned to cluster 3.
dubai_merged[dubai_merged['Cluster Labels'].eq(3.0)]

Unnamed: 0,Community,Community Number,Community (Arabic),Area(km2),Population(2000),Population density(/km2),Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
17,Al Fagaa,945.0,الفقع,2.811 km2,22,2.811/km2,24.72179,55.62126,3.0,Scenic Lookout,Yoga Studio,Lake,Indoor Play Area,Hot Spring,Harbor / Marina,Gymnastics Gym,Gym Pool,Gym / Fitness Center,Gym
74,Al Warqa'a Fifth,425.0,الورقاء الخامسة,21.6 km²,5,0.23/km²,25.19216,55.44937,3.0,Scenic Lookout,Yoga Studio,Lake,Indoor Play Area,Hot Spring,Harbor / Marina,Gymnastics Gym,Gym Pool,Gym / Fitness Center,Gym
76,Al Warqa'a Fourth,424.0,الورقاء الرابعة,21.6 km²,5,0.23/km²,25.18917,55.44016,3.0,Scenic Lookout,Yoga Studio,Lake,Indoor Play Area,Hot Spring,Harbor / Marina,Gymnastics Gym,Gym Pool,Gym / Fitness Center,Gym


In [134]:
# Communities assigned to cluster 4.
dubai_merged[dubai_merged['Cluster Labels'].eq(4.0)]

Unnamed: 0,Community,Community Number,Community (Arabic),Area(km2),Population(2000),Population density(/km2),Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Al Awir First,711.0,العوير الأولى,,,,25.18605,55.54108,4.0,Farm,Yoga Studio,Martial Arts School,Lake,Indoor Play Area,Hot Spring,Harbor / Marina,Gymnastics Gym,Gym Pool,Gym / Fitness Center
2,Al Awir Second,721.0,العوير الثانية,,,,25.16792,55.54331,4.0,Farm,Yoga Studio,Martial Arts School,Lake,Indoor Play Area,Hot Spring,Harbor / Marina,Gymnastics Gym,Gym Pool,Gym / Fitness Center
131,Wadi Alamardi,271.0,وادي العمردي,,,,25.21703,55.48636,4.0,Farm,Yoga Studio,Martial Arts School,Lake,Indoor Play Area,Hot Spring,Harbor / Marina,Gymnastics Gym,Gym Pool,Gym / Fitness Center
140,Umm Nahad Third,913.0,,,,,25.03249,55.44087,4.0,Farm,Campground,Yoga Studio,Lake,Indoor Play Area,Hot Spring,Harbor / Marina,Gymnastics Gym,Gym Pool,Gym / Fitness Center
145,Lehbab First,731.0,,,,,25.05696,55.59867,4.0,Farm,Yoga Studio,Martial Arts School,Lake,Indoor Play Area,Hot Spring,Harbor / Marina,Gymnastics Gym,Gym Pool,Gym / Fitness Center


## Design maps and visuals

In [None]:
# Split the venue table into one frame per 'Group' value (A, B, C).
a_venues, b_venues, c_venues = (
    dubai_venues[dubai_venues['Group'] == group_name]
    for group_name in ('A', 'B', 'C')
)

In [71]:
# Base map centred on Dubai.
map_group_a = folium.Map(location=[25.276987, 55.296249], zoom_start=10)

# One green marker per group-A venue; the popup shows the venue's category.
for lat, lng, category in zip(a_venues['Venue Latitude'], a_venues['Venue Longitude'], a_venues['Venue Category']):
    label = folium.Popup(str(category), parse_html=True)
    # `parse_html` is a Popup option, not a CircleMarker one, so it is only
    # passed to the Popup above.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='green',
        fill=True).add_to(map_group_a)

map_group_a

In [72]:
# Base map centred on Dubai.
map_group_b = folium.Map(location=[25.276987, 55.296249], zoom_start=10)

# One yellow marker per group-B venue; the popup shows the venue's category.
for lat, lng, category in zip(b_venues['Venue Latitude'], b_venues['Venue Longitude'], b_venues['Venue Category']):
    label = folium.Popup(str(category), parse_html=True)
    # `parse_html` is a Popup option, not a CircleMarker one, so it is only
    # passed to the Popup above.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='yellow',
        fill=True).add_to(map_group_b)

map_group_b

In [74]:
# Base map centred on Dubai.
map_group_c = folium.Map(location=[25.276987, 55.296249], zoom_start=10)

# One grey marker per group-C venue; the popup shows the venue's category.
for lat, lng, category in zip(c_venues['Venue Latitude'], c_venues['Venue Longitude'], c_venues['Venue Category']):
    label = folium.Popup(str(category), parse_html=True)
    # `parse_html` is a Popup option, not a CircleMarker one, so it is only
    # passed to the Popup above.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='grey',
        fill=True).add_to(map_group_c)

map_group_c

In [66]:
# Count the rows in each venue group (one 'count' sub-column per original column).
group_counts = dubai_venues.groupby(['Group']).agg(['count'])
# Keep the index when exporting: after the groupby it holds the group labels,
# so writing with index=False would leave counts with nothing to identify them.
project.save_data(data=group_counts.to_csv(),file_name='Venue Group counts.csv',overwrite=True)
group_counts

In [92]:
# Number of venues found in each community. Every column carries the same
# count, so all but 'Group' are dropped after aggregating.
venues_per_community = (
    dubai_venues
    .groupby(['Community'])
    .agg(['count'])
    .drop(columns=['Community Latitude', 'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category', 'Community Longitude'])
)
venues_per_community.head()

Unnamed: 0_level_0,Group
Unnamed: 0_level_1,count
Community,Unnamed: 1_level_2
Abu Hail,8
Al Awir First,1
Al Awir Second,1
Al Bada'a,11
Al Baraha,2


In [95]:
# Keep the index when exporting: it holds the community names, so writing with
# index=False would save bare counts that cannot be attributed to a community.
project.save_data(data=venues_per_community.to_csv(),file_name='Venues in communities counts.csv',overwrite=True)

{'file_name': 'Venues in communities counts.csv',
 'message': 'File saved to project storage.',
 'bucket_name': 'ibmdataanalysiscapstone-donotdelete-pr-tkbv7qp1n32y6i',
 'asset_id': '9484e003-f689-4ec5-898f-9847e69dec64'}

In [88]:
# Summary statistics of the per-community venue counts.
# NOTE(review): the output saved with this cell lists all seven count columns,
# so it appears to predate the column drop done where venues_per_community is
# built (execution count 88 vs. 92) - re-run top to bottom to refresh it.
venues_per_community.describe()

Unnamed: 0_level_0,Community Latitude,Community Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,Group
Unnamed: 0_level_1,count,count,count,count,count,count,count
count,133.0,133.0,133.0,133.0,133.0,133.0,133.0
mean,10.015038,10.015038,10.015038,10.015038,10.015038,10.015038,10.015038
std,11.753133,11.753133,11.753133,11.753133,11.753133,11.753133,11.753133
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,3.0,3.0,3.0,3.0,3.0,3.0,3.0
50%,6.0,6.0,6.0,6.0,6.0,6.0,6.0
75%,12.0,12.0,12.0,12.0,12.0,12.0,12.0
max,75.0,75.0,75.0,75.0,75.0,75.0,75.0


In [124]:
# Keep only the communities that received a cluster label (rows where the
# label is missing are excluded from the cluster map).
dubai_merged2 = dubai_merged[dubai_merged['Cluster Labels'].notna()]

In [125]:
# Base map centred on Dubai.
map_clusters = folium.Map(location=[25.276987, 55.296249], zoom_start=10)

# One evenly spaced colour from the rainbow colormap per cluster, as hex strings.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# One marker per labelled community, coloured by its cluster.
for lat, lon, community, cluster in zip(dubai_merged2['Latitude'], dubai_merged2['Longitude'], dubai_merged2['Community'], dubai_merged2['Cluster Labels']):
    # Labels arrive as floats (e.g. 0.0); convert first so the popup reads
    # "Cluster 0" rather than "Cluster 0.0".
    cluster = int(cluster)
    label = folium.Popup(str(community) + ' - Cluster ' + str(cluster), parse_html=True)
    # The `cluster - 1` offset is kept so colours match earlier renders of this
    # map; cluster 0 picks the last colour through negative indexing.
    marker_color = rainbow[cluster - 1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters