# Capstone Project - Segmenting and Clustering Neighborhoods in Doha and Dubai

The main objective of this project is to cluster similar neighborhoods in Dubai and Doha, leveraging Foursquare location data. With the final result of the study, people that are moving from Doha to Dubai, and vice versa, will be able to easily find neighborhoods with the characteristics they are looking for in the new city.

### Import Libraries

In [None]:
import requests # library to handle requests

from matplotlib import pyplot as plt
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

import pandas as pd # library for data analysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

#handle html data
!pip install bs4
from bs4 import BeautifulSoup

print('Libraries imported.')

### 1. Webscraping Doha and Dubai Neighborhoods

In [None]:
# Columns shared by every neighborhood record collected below
neighborhood_columns = ["City", "Neighborhood", "Latitude", "Longitude"]

# webscrape neighborhoods from Doha wikipedia page
r = requests.get('https://en.wikipedia.org/wiki/List_of_communities_in_Doha')

soup = BeautifulSoup(r.text.replace('\n', ''), "html.parser")  # line breaks are stripped before parsing

# finds the correct table based on its class
doha_neighborhood_table = soup.find("table", {"class": "wikitable"})

# Records are gathered in a plain list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
doha_rows = []

for row in doha_neighborhood_table.find("tbody").find_all("tr"):
    if row.find_all("th"):  # header row, nothing to extract
        continue

    col = row.find_all("td")
    if not col:  # row without data cells
        continue

    for link in col[0].find_all("a", href=True):
        neighborhood = link.text

        # The coordinates are read from the "geo-dec" span of the linked page
        page = requests.get('https://en.wikipedia.org' + link["href"])
        geo = BeautifulSoup(page.text.replace('\n', ''), "html.parser").find("span", {"class": "geo-dec"})

        if geo is None:
            # No coordinates on the linked page: skip it, as the Dubai cell does
            continue

        coordinates = geo.text.split()
        latitude = coordinates[0].replace("°N", "")
        longitude = coordinates[1].replace("°E", "")

        doha_rows.append({"City": "Doha", "Neighborhood": neighborhood, "Latitude": float(latitude), "Longitude": float(longitude)})

neighborhood_data = pd.DataFrame(doha_rows, columns=neighborhood_columns)

neighborhood_data.head()

In [None]:
# webscrape neighborhoods from Dubai wikipedia page
r = requests.get('https://en.wikipedia.org/wiki/List_of_communities_in_Dubai')

soup = BeautifulSoup(r.text.replace('\n', ''), "html.parser")  # line breaks are stripped before parsing

# finds the correct table
dubai_neighborhood_div = soup.find("div", {"aria-labelledby": "Neighbourhoods_and_communities_in_Dubai"})

dubai_neighborhood_tables = dubai_neighborhood_div.find("table")

# Records are gathered in a plain list and concatenated once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
dubai_rows = []

for item in dubai_neighborhood_tables.find("tbody").find_all("li", attrs={"class": False}):
    # items without any link yield an empty list here and are skipped naturally
    for link in item.find_all("a", attrs={"href": True}):
        neighborhood = link.text

        # The coordinates are read from the "geo-dec" span of the linked page
        page = requests.get('https://en.wikipedia.org' + link["href"])
        geo = BeautifulSoup(page.text.replace('\n', ''), "html.parser").find("span", {"class": "geo-dec"})

        if geo is None:  # linked page has no coordinates
            continue

        coordinates = geo.text.split()
        latitude = coordinates[0].replace("°N", "")
        longitude = coordinates[1].replace("°E", "")

        dubai_rows.append({"City": "Dubai", "Neighborhood": neighborhood, "Latitude": float(latitude), "Longitude": float(longitude)})

if dubai_rows:
    neighborhood_data = pd.concat(
        [neighborhood_data, pd.DataFrame(dubai_rows, columns=neighborhood_data.columns)],
        ignore_index=True,
    )

neighborhood_data.tail()

Remove duplicates if they exist (e.g.: Dubai-Al Karama is listed twice)

In [None]:
# Discard fully identical rows, renumber the remaining ones from 0,
# then persist the cleaned table and preview its last rows.
neighborhood_data = neighborhood_data.drop_duplicates().reset_index(drop=True)
neighborhood_data.to_csv('neighborhood_data.csv')
neighborhood_data.tail()

Import necessary libraries for geolocation and map rendering:

In [None]:
#!conda install -c conda-forge geopy --yes 
!pip install geopy
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

#!conda install -c conda-forge folium=0.5.0 --yes
!pip install folium
import folium # map rendering library

### Create Maps of Doha and Dubai, marking their neighborhoods:

In [None]:
# get latitude and longitude based on an address
address = 'Doha, Qatar'

geolocator = Nominatim(user_agent="doha_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() gives None when nothing matches; fail with a clear message
    # instead of an AttributeError on the next line.
    raise ValueError('Could not geocode address: ' + address)
doha_latitude = location.latitude
doha_longitude = location.longitude

# create a map of Doha with the latitude and longitude
doha_map = folium.Map(location=[doha_latitude, doha_longitude], zoom_start=11)

# filter Doha neighborhoods
doha_df = neighborhood_data.loc[neighborhood_data["City"] == "Doha"]

# add one circle marker per neighborhood; the popup shows the neighborhood name
for lat, lng, neighborhood in zip(doha_df['Latitude'], doha_df['Longitude'], doha_df['Neighborhood']):
    label = folium.Popup(neighborhood, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(doha_map)

doha_map

Dubai:

In [None]:
# get latitude and longitude based on an address
address = 'Dubai, United Arab Emirates'

geolocator = Nominatim(user_agent="dubai_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() gives None when nothing matches; fail with a clear message
    # instead of an AttributeError on the next line.
    raise ValueError('Could not geocode address: ' + address)
dubai_latitude = location.latitude
dubai_longitude = location.longitude

# create a map of Dubai with the latitude and longitude
dubai_map = folium.Map(location=[dubai_latitude, dubai_longitude], zoom_start=10)

# filter Dubai neighborhoods
dubai_df = neighborhood_data.loc[neighborhood_data["City"] == "Dubai"]

# add one circle marker per neighborhood; the popup shows the neighborhood name
for lat, lng, neighborhood in zip(dubai_df['Latitude'], dubai_df['Longitude'], dubai_df['Neighborhood']):
    label = folium.Popup(neighborhood, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(dubai_map)

dubai_map

### Explore Neighborhoods with Foursquare API

Set API credentials and parameters:

In [None]:
import os

# The credentials are read from the environment so that no secret is stored in
# the notebook itself.
# NOTE(review): the key pair that used to be hard-coded in this cell has been
# shared together with the file and should be rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
VERSION = '20180605' # Foursquare API version

if not (CLIENT_ID and CLIENT_SECRET):
    print('WARNING: FOURSQUARE_CLIENT_ID and/or FOURSQUARE_CLIENT_SECRET are not set in the environment.')

Create a function to call Foursquare API for each neighborhood:

In [None]:
def getNearbyVenues(cities, names, latitudes, longitudes, radius=1000, limit=100, min_venues=10):
    """Collect the venues around each neighborhood into a single DataFrame.

    One request is sent to the Foursquare "venues/explore" endpoint per
    neighborhood, using the module-level CLIENT_ID, CLIENT_SECRET and VERSION.

    Parameters
    ----------
    cities, names, latitudes, longitudes : iterables
        Parallel sequences describing each neighborhood; iterated in lockstep.
    radius : int
        Value of the ``radius`` query parameter.
    limit : int
        Value of the ``limit`` query parameter.
    min_venues : int
        A neighborhood is kept only when the API returns MORE than this many
        venues (the default reproduces the previous hard-coded threshold,
        which was meant to leave out industrial areas with few venues).

    Returns
    -------
    pandas.DataFrame or None
        One row per venue with the columns City, Neighborhood,
        Neighborhood Latitude, Neighborhood Longitude, Venue, Venue Latitude,
        Venue Longitude and Venue Category. ``None`` is returned as soon as a
        response does not have the expected structure (the response is
        printed first).

    Side effects
    ------------
    A non-empty result is written to 'doha_dubai_venues.csv'.
    """
    venue_columns = ['City', 'Neighborhood',
                     'Neighborhood Latitude',
                     'Neighborhood Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']

    venue_rows = []
    for city, name, lat, lng in zip(cities, names, latitudes, longitudes):

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}&time=any&day=any'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            limit)

        # make the GET request (once: sending it a second time to read the
        # items would spend twice the API quota)
        foursquare_return = requests.get(url).json()

        try:
            items = foursquare_return["response"]['groups'][0]['items']
        except (KeyError, IndexError, TypeError):
            print('ERROR: Foursquare API response: ' + str(foursquare_return))
            return None

        if len(items) <= min_venues:
            continue

        for item in items:
            venue = item['venue']
            categories = venue['categories']
            if not categories:
                # a venue without any category cannot be labelled; skip it
                continue
            venue_rows.append((
                city,
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name']))

    # Passing the columns to the constructor also works when no venue was collected
    nearby_venues = pd.DataFrame(venue_rows, columns=venue_columns)

    if not nearby_venues.empty:
        nearby_venues.to_csv('doha_dubai_venues.csv')

    return nearby_venues

Run the function to process all neighborhoods, and store the returned information in a new dataframe:

In [None]:
try:
    # Reuse the saved venues when available, to avoid spending Foursquare API quota again.
    # index_col=0 reads the first CSV column (the index written by to_csv) back as the index;
    # a bare positional 0 would be taken as the separator instead.
    doha_dubai_venues = pd.read_csv('doha_dubai_venues.csv', index_col=0)
    print('Finished! (CSV File)')
except (FileNotFoundError, pd.errors.EmptyDataError):
    # No usable cache file: query the API for every neighborhood
    doha_dubai_venues = getNearbyVenues(cities=neighborhood_data['City'],
                                        names=neighborhood_data['Neighborhood'],
                                        latitudes=neighborhood_data['Latitude'],
                                        longitudes=neighborhood_data['Longitude']
                                        )
    print('Finished! (Foursquare API)')

In [None]:
doha_dubai_venues.shape

We can check how many venues were returned for each neighborhood:

In [None]:
df_total = doha_dubai_venues.groupby(['City', 'Neighborhood']).size()
df_total

In order to apply clustering algorithms, we must convert the categorical variable (venue category) into dummy/indicator variables:

In [None]:
# One-hot encoding: one indicator column per venue category, followed by the
# neighborhood and city of each venue.
# NOTE(review): a venue category literally named 'Neighborhood' or 'City' would
# be overwritten by the columns assigned here - confirm none exists in the data.
doha_dubai_onehot = pd.get_dummies(
    doha_dubai_venues[['Venue Category']], prefix="", prefix_sep=""
).assign(
    Neighborhood=doha_dubai_venues['Neighborhood'],
    City=doha_dubai_venues['City'],
)

doha_dubai_onehot.head()

In [None]:
doha_dubai_onehot.shape

Let's group the rows by city and neighborhood, taking the mean of the frequency of occurrence of each category:

In [None]:
doha_dubai_grouped = doha_dubai_onehot.groupby(['City', 'Neighborhood']).mean().reset_index()
doha_dubai_grouped

Let's create a function to sort venues in descending order:

In [None]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first two entries of ``row`` are left out; the remaining entries are
    ordered from highest to lowest value and the index labels of the first
    ``num_top_venues`` of them are returned as an array.
    """
    ranked = row.iloc[2:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

Store the sorted venues in a data frame:

In [None]:
import numpy as np

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['City', 'Neighborhood']
for ind in np.arange(num_top_venues):
    # ranks 1-3 take 'st'/'nd'/'rd'; every later rank takes 'th'
    # (explicit test instead of a bare except around the list lookup)
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# create a new dataframe with one row per (city, neighborhood) pair
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['City'] = doha_dubai_grouped['City']
neighborhoods_venues_sorted['Neighborhood'] = doha_dubai_grouped['Neighborhood']

# fill the ranking columns (everything after City and Neighborhood) row by row
for ind in range(len(doha_dubai_grouped)):
    neighborhoods_venues_sorted.iloc[ind, 2:] = return_most_common_venues(doha_dubai_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

### Cluster Neighborhoods

Find the best K:

In [None]:
import matplotlib.pyplot as plt
#%matplotlib inline    

In [None]:
# plot the silhouette score for different values of k
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

max_range = 10

# Keyword form: pandas 2.0 no longer accepts the axis of drop() as a positional argument
df = doha_dubai_grouped.drop(columns=['City', 'Neighborhood'])

indices = []
scores = []

for kclusters in range(3, max_range):

    # Run k-means clustering; fit_predict returns the cluster label of each row
    cluster_labels = KMeans(n_clusters=kclusters, init='k-means++', random_state=0).fit_predict(df)

    # Gets the score for the clustering operation performed
    score = silhouette_score(df, cluster_labels)

    # Appending the index and score to the respective lists
    indices.append(kclusters)
    scores.append(score)

plt.figure(figsize=(10,5))
plt.plot(indices, scores, 'o-')
plt.xlabel("Clusters")
plt.ylabel("Score")
plt.xticks(indices)
plt.show()

In [None]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

kclusters = 5

# Feature matrix: the category frequencies only, without the two label columns.
# Keyword form: pandas 2.0 no longer accepts the axis of drop() as a positional argument.
df = doha_dubai_grouped.drop(columns=['City', 'Neighborhood'])

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

Add the cluster labels:

In [None]:
# Remove a 'Cluster Labels' column left over from a previous run so the labels
# can be inserted again; errors='ignore' makes this a no-op on the first run,
# when the column does not exist yet.
neighborhoods_venues_sorted.drop(columns='Cluster Labels', errors='ignore', inplace=True)
neighborhoods_venues_sorted.head()

In [None]:
# Put the k-means label of every neighborhood in the first column
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# Attach latitude/longitude by looking each row up on its (City, Neighborhood) pair
coordinates_by_neighborhood = neighborhood_data.set_index(['City', 'Neighborhood'])
doha_dubai_merged = neighborhoods_venues_sorted.join(coordinates_by_neighborhood, on=['City', 'Neighborhood'])

In [None]:
doha_dubai_merged.head()

In [None]:
# create the cluster map for Doha
map_clusters_doha = folium.Map(location=[doha_latitude, doha_longitude], zoom_start=11)

# The mask must come from doha_dubai_merged itself: neighborhood_data is a
# different frame with its own index, so a mask built from it does not line up
# with the rows being selected here.
doha_merged = doha_dubai_merged.loc[doha_dubai_merged["City"] == "Doha"]

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map, colored by cluster label
for lat, lon, poi, cluster in zip(doha_merged['Latitude'], doha_merged['Longitude'], doha_merged['Neighborhood'], doha_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)],
        fill=True,
        fill_color=rainbow[int(cluster)],
        fill_opacity=0.7).add_to(map_clusters_doha)

map_clusters_doha

In [None]:
# create the cluster map for Dubai
map_clusters_dubai = folium.Map(location=[dubai_latitude, dubai_longitude], zoom_start=11)

# The mask must come from doha_dubai_merged itself: neighborhood_data is a
# different frame with its own index, so a mask built from it does not line up
# with the rows being selected here.
dubai_merged = doha_dubai_merged.loc[doha_dubai_merged["City"] == "Dubai"]

# add markers to the map, colored by cluster label (colors come from `rainbow`)
for lat, lon, poi, cluster in zip(dubai_merged['Latitude'], dubai_merged['Longitude'], dubai_merged['Neighborhood'], dubai_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)],
        fill=True,
        fill_color=rainbow[int(cluster)],
        fill_opacity=0.7).add_to(map_clusters_dubai)

map_clusters_dubai

Neighborhood distribution in clusters:

In [None]:
total_neighborhoods = []
labels = []

# Count the neighborhoods of each city in every cluster.
# range(kclusters) covers all labels 0 .. kclusters-1 (the last cluster included).
for c in range(kclusters):
    total_dubai = len(dubai_merged.loc[dubai_merged['Cluster Labels'] == c])
    total_doha = len(doha_merged.loc[doha_merged['Cluster Labels'] == c])

    total_neighborhoods.append((total_dubai, total_doha))
    labels.append(str(c))

totals_df = pd.DataFrame({
    'Dubai': [x[0] for x in total_neighborhoods],
    'Doha': [x[1] for x in total_neighborhoods]
}, index = labels)

# figsize has to be a (width, height) pair in inches.
# NOTE(review): 20 was given as a single number; (20, 10) is an assumed size - adjust as needed.
fig, ax = plt.subplots(figsize=(20, 10), facecolor='#000000')
totals_df.plot(kind='bar', color=['#2d1e86', '#f6962b'], rot=0, ax=ax)
# NOTE(review): plot_conf is not defined anywhere in the visible part of this file - confirm where it comes from.
plot_conf(ax, xlbl='Cluster', ylbl='Number of neighborhoods', t='')

In [14]:
# NOTE(review): this cell looks like a stand-alone bar-chart example unrelated
# to the neighborhood analysis. It rebinds `df`, the name earlier cells use for
# the clustering feature matrix - confirm whether the cell can be removed.
speed = [0.1, 17.5, 40, 48, 52, 69, 88]
lifespan = [2, 8, 70, 1.5, 25, 12, 28]
index = ['snail', 'pig', 'elephant',
         'rabbit', 'giraffe', 'coyote', 'horse']
# Two numeric columns indexed by animal name
df = pd.DataFrame({'speed': speed,
                   'lifespan': lifespan}, index=index)
# Grouped bar chart with horizontal tick labels (rot=0)
ax = df.plot.bar(rot=0)

Let's Analyze each cluster:

In [None]:
cluster0 = doha_dubai_merged.loc[doha_dubai_merged['Cluster Labels'] == 0]
cluster0

In [None]:
cluster1 = doha_dubai_merged.loc[doha_dubai_merged['Cluster Labels'] == 1]
cluster1

In [None]:
cluster2 = doha_dubai_merged.loc[doha_dubai_merged['Cluster Labels'] == 2]
cluster2

In [None]:
cluster3 = doha_dubai_merged.loc[doha_dubai_merged['Cluster Labels'] == 3]
cluster3

In [None]:
cluster4 = doha_dubai_merged.loc[doha_dubai_merged['Cluster Labels'] == 4]
cluster4

In [None]:
# Rows assigned to cluster label 5.
# NOTE(review): kclusters is set to 5 above, so labels presumably run 0-4 and
# this selection is expected to be empty - confirm whether the cell is needed.
cluster5 = doha_dubai_merged.loc[doha_dubai_merged['Cluster Labels'] == 5]
cluster5