# Clustering cities depending on their most common features

## 1. Imports and such things

Use pandas to handle dataframes

In [5]:
import pandas as pd

Use Scikit-learn to do clustering

In [19]:
import numpy as np
from sklearn.cluster import KMeans

## 2. Import dataframes from notebook 1

This dataframe contains Cities, with their nation, population, Wikipedia URL, Latitude and Longitude

In [7]:
df_cities = pd.read_pickle('cities.pickle')
df_cities.head()

Unnamed: 0,City,Nation,Population,URL,Latitude,Longitude
0,Chongqing,China,30751600,https://en.wikipedia.org/wiki/Chongqing,29.558333,106.566667
1,Shanghai,China,24256800,https://en.wikipedia.org/wiki/Shanghai,31.228611,121.474722
2,Delhi,India,11034555,https://en.wikipedia.org/wiki/Delhi,28.61,77.23
3,Beijing,China,21516000,https://en.wikipedia.org/wiki/Beijing,39.916667,116.383333
4,Dhaka,Bangladesh,14399000,https://en.wikipedia.org/wiki/Dhaka,23.716111,90.396111


This dataframe contains a long list of the top pick venues according to Foursquare in those cities 

In [8]:
df_venues = pd.read_pickle('venues.pickle')
df_venues.head()

Unnamed: 0,City,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Chongqing,The Harp Irish Pub,29.565116,106.57531,Bar
1,Chongqing,重庆八一路好吃街,29.558669,106.573631,Chinese Restaurant
2,Chongqing,Paulaner Brauhaus,29.538571,106.557791,German Restaurant
3,Chongqing,重庆Muse酒吧,29.556987,106.571565,Nightclub
4,Chongqing,TT酒吧,29.557011,106.571404,Nightclub


This dataframe contains a left join of the previous two

In [9]:
df_joined = pd.read_pickle('joined.pickle')
df_joined.head()

Unnamed: 0,City,Venue,Venue Latitude,Venue Longitude,Venue Category,Nation,Population,URL,Latitude,Longitude
0,Chongqing,The Harp Irish Pub,29.565116,106.57531,Bar,China,30751600,https://en.wikipedia.org/wiki/Chongqing,29.558333,106.566667
1,Chongqing,重庆八一路好吃街,29.558669,106.573631,Chinese Restaurant,China,30751600,https://en.wikipedia.org/wiki/Chongqing,29.558333,106.566667
2,Chongqing,Paulaner Brauhaus,29.538571,106.557791,German Restaurant,China,30751600,https://en.wikipedia.org/wiki/Chongqing,29.558333,106.566667
3,Chongqing,重庆Muse酒吧,29.556987,106.571565,Nightclub,China,30751600,https://en.wikipedia.org/wiki/Chongqing,29.558333,106.566667
4,Chongqing,TT酒吧,29.557011,106.571404,Nightclub,China,30751600,https://en.wikipedia.org/wiki/Chongqing,29.558333,106.566667


## 3. Finding the most recommended venues categories

In [15]:
# one hot encoding of venue category
df_venues_onehot = pd.get_dummies(df_venues[['Venue Category']], prefix="", prefix_sep="")

# add city column back to the dataframe
df_venues_onehot['City'] = df_venues['City']

# move city column to the first column
fixed_columns = [df_venues_onehot.columns[-1]] + list(df_venues_onehot.columns[:-1])
df_venues_onehot = df_venues_onehot[fixed_columns]

df_venues_onehot.head()

Unnamed: 0,City,ATM,Accessories Store,Afghan Restaurant,African Restaurant,American Restaurant,Antique Shop,Arepa Restaurant,Argentinian Restaurant,Art Gallery,...,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Xinjiang Restaurant,Yakitori Restaurant,Yoga Studio,Yunnan Restaurant,Zoo Exhibit
0,Chongqing,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Chongqing,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Chongqing,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Chongqing,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Chongqing,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


We can now group by city

In [17]:
df_venues_grouped = df_venues_onehot.groupby('City').mean().reset_index()

In [18]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [36]:
num_top_venues = 10

# create columns according to number of top venues
columns = ['City']
for ind in np.arange(num_top_venues):
    columns.append('Common Venue {}'.format(ind+1))

# create a new dataframe
df_common_venues = pd.DataFrame(columns=columns)
df_common_venues['City'] = df_venues_grouped['City']

for ind in np.arange(df_venues_grouped.shape[0]):
    df_common_venues.iloc[ind, 1:] = return_most_common_venues(df_venues_grouped.iloc[ind, :], num_top_venues)

df_common_venues.head()

Unnamed: 0,City,Common Venue 1,Common Venue 2,Common Venue 3,Common Venue 4,Common Venue 5,Common Venue 6,Common Venue 7,Common Venue 8,Common Venue 9,Common Venue 10
0,Abidjan,African Restaurant,Shopping Mall,Nightclub,Multiplex,Fast Food Restaurant,Bar,Music Venue,Beach Bar,Golf Course,Vietnamese Restaurant
1,Abu Dhabi,Coffee Shop,Pub,Frozen Yogurt Shop,Music Store,Café,Mongolian Restaurant,Shopping Mall,Park,Turkish Restaurant,Clothing Store
2,Abuja,Bar,Restaurant,Café,Movie Theater,Bistro,Shopping Mall,Garden,Park,Frozen Yogurt Shop,Indian Restaurant
3,Accra,Bar,Restaurant,Sports Bar,Pool,Nightclub,Snack Place,Lounge,Hotel,Music Venue,Jazz Club
4,Addis Ababa,Italian Restaurant,Hotel,Coffee Shop,Burger Joint,Café,Greek Restaurant,Grocery Store,Middle Eastern Restaurant,Massage Studio,Brewery


## 4. Finding similar cities

In [37]:
# set number of clusters
kclusters = 5

df_clustering = df_venues_grouped.drop('City', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df_clustering)

In [38]:
df_common_venues['Cluster label'] = kmeans.labels_

df_cities2 = pd.merge(df_common_venues, df_cities, how='left',
        on='City', validate="1:1")
df_cities2.head()

MergeError: Merge keys are not unique in right dataset; not a one-to-one merge

In [39]:
df_cities

Unnamed: 0,City,Nation,Population,URL,Latitude,Longitude
0,Chongqing,China,30751600,https://en.wikipedia.org/wiki/Chongqing,29.558333,106.566667
1,Shanghai,China,24256800,https://en.wikipedia.org/wiki/Shanghai,31.228611,121.474722
2,Delhi,India,11034555,https://en.wikipedia.org/wiki/Delhi,28.610000,77.230000
3,Beijing,China,21516000,https://en.wikipedia.org/wiki/Beijing,39.916667,116.383333
4,Dhaka,Bangladesh,14399000,https://en.wikipedia.org/wiki/Dhaka,23.716111,90.396111
5,Mumbai,India,12478447,https://en.wikipedia.org/wiki/Mumbai,18.975000,72.825833
6,Lagos,Nigeria,16060303,https://en.wikipedia.org/wiki/Lagos,6.455027,3.384082
7,Chengdu,China,16044700,https://en.wikipedia.org/wiki/Chengdu,30.657000,104.066000
8,Karachi,Pakistan,14910352,https://en.wikipedia.org/wiki/Karachi,24.860000,67.010000
9,Guangzhou,China,14043500,https://en.wikipedia.org/wiki/Guangzhou,23.132000,113.266000


In [40]:
df_common_venues

Unnamed: 0,City,Common Venue 1,Common Venue 2,Common Venue 3,Common Venue 4,Common Venue 5,Common Venue 6,Common Venue 7,Common Venue 8,Common Venue 9,Common Venue 10,Cluster label
0,Abidjan,African Restaurant,Shopping Mall,Nightclub,Multiplex,Fast Food Restaurant,Bar,Music Venue,Beach Bar,Golf Course,Vietnamese Restaurant,1
1,Abu Dhabi,Coffee Shop,Pub,Frozen Yogurt Shop,Music Store,Café,Mongolian Restaurant,Shopping Mall,Park,Turkish Restaurant,Clothing Store,0
2,Abuja,Bar,Restaurant,Café,Movie Theater,Bistro,Shopping Mall,Garden,Park,Frozen Yogurt Shop,Indian Restaurant,1
3,Accra,Bar,Restaurant,Sports Bar,Pool,Nightclub,Snack Place,Lounge,Hotel,Music Venue,Jazz Club,2
4,Addis Ababa,Italian Restaurant,Hotel,Coffee Shop,Burger Joint,Café,Greek Restaurant,Grocery Store,Middle Eastern Restaurant,Massage Studio,Brewery,2
5,Ahmedabad,Indian Restaurant,Tea Room,Snack Place,Bookstore,Café,Mexican Restaurant,Fast Food Restaurant,Coffee Shop,Farmers Market,Hotel,1
6,Ahvaz,Ice Cream Shop,Restaurant,Park,Market,Plaza,Falafel Restaurant,Pizza Place,Bookstore,River,Shopping Mall,2
7,Alexandria,Coffee Shop,Sports Club,Art Gallery,Sandwich Place,Café,Clothing Store,Opera House,Restaurant,Waterfront,Sushi Restaurant,0
8,Algiers,French Restaurant,Diner,Burger Joint,History Museum,Plaza,Indian Restaurant,Lounge,Art Museum,Park,Café,2
9,Allahabad,Café,Shopping Mall,Restaurant,River,Multiplex,Clothing Store,Flea Market,Zoo Exhibit,Farmers Market,Fast Food Restaurant,3


In [46]:
df_venues.City.unique().size

246