# Segmenting and Clustering Neighborhoods in Harare

## In this project I explore, segment, and cluster the neighborhoods in the city of Toronto. The neighborhood data though is not readily available on the internet.

### Let get our libraries 

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!pip install geopy 
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
from geopy.extra.rate_limiter import RateLimiter 
from geopy import distance 

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
from matplotlib import pyplot as plt
%matplotlib inline

# import k-means from clustering stage
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report

!pip install yellowbrick
from yellowbrick.cluster import KElbowVisualizer

!pip install folium 
import folium # map rendering library
from folium.plugins import MarkerCluster

import requests 

from bs4 import BeautifulSoup 

! pip install googlemaps
import googlemaps

print('Libraries imported.')

Libraries imported.


### Lets get our data from Wikipedia

In [2]:
harare = pd.read_csv(r'C:\Users\prosper.duwugwani\Desktop\projects\Coursera_Capstone\Harare Suburbs v2.csv')

df = pd.DataFrame(harare)
df.head()
df.shape


(180, 2)

In [3]:
df.head()

Unnamed: 0,Borough,Neighborhood
0,Budiriro,"Budiriro 1,Harare,Zimbabwe"
1,Budiriro,"Budiriro 3,Harare,Zimbabwe"
2,Budiriro,"Budiriro 2,Harare,Zimbabwe"
3,Budiriro,"Budiriro 5,Harare,Zimbabwe"
4,Budiriro,"Budiriro 4,Harare,Zimbabwe"


In [4]:
#group postcode and borough
#df = harare_data.groupby(['Borough'], as_index=False).agg(lambda x: ','.join(x))

#df

In [5]:
gmaps_key = googlemaps.Client(key = "AIzaSyCXmmn-yhrT2Tnc7s2-5jwebjx25Ko7fwA")

In [6]:
df["lat"] = None
df["lng"] = None

for i in range(len(df)):
    geocode_result = gmaps_key.geocode(df.loc[i,'Neighborhood'])
    try:
        lat = geocode_result[0]["geometry"]["location"]["lat"]
        lng = geocode_result[0]["geometry"]["location"]["lng"]
        df.loc[i,'lat'] = lat
        df.loc[i,'lng'] = lng
    except:
        lat = None
        lon = None
        
df

Unnamed: 0,Borough,Neighborhood,lat,lng
0,Budiriro,"Budiriro 1,Harare,Zimbabwe",-17.900426,30.921771
1,Budiriro,"Budiriro 3,Harare,Zimbabwe",-17.898069,30.924164
2,Budiriro,"Budiriro 2,Harare,Zimbabwe",-17.879094,30.935077
3,Budiriro,"Budiriro 5,Harare,Zimbabwe",-17.880828,30.932598
4,Budiriro,"Budiriro 4,Harare,Zimbabwe",-17.889796,30.935999
5,Dzivarasekwa,"Kuwdzana Park,Harare,Zimbabwe",-17.823438,31.047127
6,Dzivarasekwa,"Cold Comfort,Harare,Zimbabwe",-17.839628,30.947775
7,Dzivarasekwa,"Tynwald South,Harare,Zimbabwe",-17.806393,30.944606
8,Dzivarasekwa,"Glaudina ,Harare,Zimbabwe",-17.816536,30.897614
9,Dzivarasekwa,"Warren Park North,Harare,Zimbabwe",-17.832069,30.979028


In [7]:
df.dtypes

Borough         object
Neighborhood    object
lat             object
lng             object
dtype: object

### Preproccessing the data

In [8]:
df.rename(columns = {'lat': 'Latitude'}, inplace = True )
df.rename(columns = {'lng': 'Longitude'}, inplace = True )


In [9]:
df['Latitude'] = df.Latitude.astype(float)


In [10]:
df['Longitude'] = df.Longitude.astype(float)

In [11]:
df.dropna(how = 'any', inplace = True)

In [12]:
df.dtypes


Borough          object
Neighborhood     object
Latitude        float64
Longitude       float64
dtype: object

In [13]:
df.isnull().sum()

Borough         0
Neighborhood    0
Latitude        0
Longitude       0
dtype: int64

In [14]:
address = 'Harare'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Harare are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Harare are -17.831773, 31.045686.


In [20]:
# create map of Harare using latitude and longitude values
map_harare = folium.Map(location=[-17.831773, 31.045686], zoom_start=10)

makerCluster = MarkerCluster().add_to(map_harare)

# add markers to map
for lat, lng, label in zip(df['Latitude'], df['Longitude'], df['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(makerCluster )  
    
map_harare

Get the neighborhood's latitude and longitude values.

In [16]:
neighborhood_latitude = df.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = df.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = df.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Budiriro 1,Harare,Zimbabwe are -17.9004264, 30.9217711.


### Lets set our credentials 

In [17]:
CLIENT_ID = 'G0MONIGTBERK11KC0DJHYSMQBTLHECT0WJPF5TQMDJAJQNPT' # your Foursquare ID
CLIENT_SECRET = 'ZXUA5JKO5IDKZMJGGZE0RMY1IYGHKAJNIDML0WN5RJKV4E5B' # your Foursquare Secret
ACCESS_TOKEN = 'ASFEUSQSBOPDKRF0IJE352I4W0WHUWEUPY1ST0ERPE0GTQBZ' # your FourSquare Access Token
VERSION = '20180604'
LIMIT = 100
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: G0MONIGTBERK11KC0DJHYSMQBTLHECT0WJPF5TQMDJAJQNPT
CLIENT_SECRET:ZXUA5JKO5IDKZMJGGZE0RMY1IYGHKAJNIDML0WN5RJKV4E5B


### Now lets get the venues in the neighborhoods

In [21]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

### Lets create a new dataframe with the results from the the above query 

In [24]:
harare_venues = getNearbyVenues(names=df['Neighborhood'],
                                   latitudes=df['Latitude'],
                                   longitudes=df['Longitude']
                                  )

Budiriro 1,Harare,Zimbabwe


KeyError: 'groups'

### Let's check the size of the resulting dataframe

In [23]:
print(harare_venues.shape)
harare_venues.head()

NameError: name 'harare_venues' is not defined

### Let's check how many venues were returned for each neighborhood

In [None]:
harare_venues.groupby('Neighborhood').count()

Let's find out how many unique categories can be curated from all the returned venues

In [None]:
print('There are {} uniques categories.'.format(len(harare_venues['Venue Category'].unique())))

### Analyze Each Neighborhood

In [None]:
# one hot encoding
harare_onehot = pd.get_dummies(harare_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
harare_onehot['Neighborhood'] = harare_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [harare_onehot.columns[-1]] + list(harare_onehot.columns[:-1])
harare_onehot = harare_onehot[fixed_columns]

harare_onehot.head()

In [None]:
harare_onehot.shape

In [None]:
harare_grouped = harare_onehot.groupby('Neighborhood').mean().reset_index()
harare_grouped

### Let's confirm the new size

In [None]:
harare_grouped.shape

### Let's print each neighborhood along with the top 5 most common venues

In [None]:
num_top_venues = 5

for hood in harare_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = harare_grouped[harare_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

### Let's put that into a pandas dataframe

In [None]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

### Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [None]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = harare_grouped['Neighborhood']

for ind in np.arange(harare_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(harare_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

### Cluster Neighborhoods

In [None]:
k_range = range(1,10)
sse = []

for k in k_range:
    km = KMeans(n_clusters = k, random_state=0)
    km.fit(harare_grouped.drop('Neighborhood', 1))
    sse.append(km.inertia_)


In [None]:
plt.xlabel('K')
plt.ylabel('Sum of Squared Error')
plt.plot(k_range,sse)

In [None]:
# set number of clusters
kclusters = 4

harare_grouped_clustering = harare_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(harare_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

### Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [None]:

# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

harare_merged = df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
harare_merged = harare_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')


In [None]:
harare_merged.dtypes

In [None]:
#harare_merged['Cluster Labels'] = harare_merged['Cluster Labels'].replace(np.nan, 5)

In [None]:
harare_merged['Cluster Labels'] = pd.to_numeric(harare_merged['Cluster Labels'].round(0), errors='coerce', downcast = 'integer')

In [None]:
harare_merged.dtypes

### Finally, let's visualize the resulting clusters

In [None]:
# create map
map_clusters = folium.Map(location=[-17.831773, 31.045686], tiles = 'OpenStreetMap', zoom_start=10)

makerCluster = MarkerCluster().add_to(map_clusters)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(harare_merged['Latitude'], harare_merged['Longitude'], harare_merged['Neighborhood'], harare_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=10,
        popup=label,
        #color=rainbow[cluster-1],
        fill=True,
        #fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(makerCluster)
       
map_clusters

### Examine Cluster 1

In [None]:
harare_merged.loc[harare_merged['Cluster Labels'] == 0, harare_merged.columns[[1] + list(range(5, harare_merged.shape[1]))]]

### Examine Cluster 2

In [None]:
harare_merged.loc[harare_merged['Cluster Labels'] == 1, harare_merged.columns[[1] + list(range(5, harare_merged.shape[1]))]]

### Examine Cluster 3

In [None]:
harare_merged.loc[harare_merged['Cluster Labels'] == 2, harare_merged.columns[[1] + list(range(5, harare_merged.shape[1]))]]

### Examine Cluster 4

In [None]:
harare_merged.loc[harare_merged['Cluster Labels'] == 3, harare_merged.columns[[1] + list(range(5, harare_merged.shape[1]))]]

### Examine Cluster 5

In [None]:
harare_merged.loc[harare_merged['Cluster Labels'] == 4, harare_merged.columns[[1] + list(range(5, harare_merged.shape[1]))]]