# Where to Open a New Commercial Center in Shanghai

## 1. Import libraries

In [1]:
# Standard library
import json
import os
import time

# Data processing
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize

# Geographical coordinates
import geocoder #  Get geographical coordinates
from geopy.geocoders import Nominatim # Convert address to lat & lng values

# Web scraping
import requests
from bs4 import BeautifulSoup

# Plotting
import matplotlib.cm as cm
import matplotlib.colors as colors

# Clustering and mapping
import folium
from sklearn.cluster import KMeans
print("Libraries imported successfully.")

Libraries imported successfully.


## 2. Scrape neighbourhoods and districts of Shanghai from a webpage

In [2]:
# Send the requests and store the scraped names into the neighbourhoods list


def fetch_html(url, timeout=30):
    """Download `url` and return the response body as text.

    A timeout is always applied so a stalled connection cannot block the
    notebook, and an HTTP error status raises `requests.HTTPError` instead of
    letting an error page be parsed as if it were the expected content.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text


# Neighbourhood list
neighbors = fetch_html("https://en.wikipedia.org/wiki/Category:Neighbourhoods_of_Shanghai")
soup_neighbors = BeautifulSoup(neighbors, 'html.parser')

# Every <li> of the first "mw-category" block is one neighbourhood entry
neighbourhoods = [
    row.text
    for row in soup_neighbors.find_all("div", class_="mw-category")[0].find_all("li")
]

# District list
districts = fetch_html("https://en.wikipedia.org/wiki/Category:Districts_of_Shanghai")
soup_districts = BeautifulSoup(districts, 'html.parser')

# Links carrying the category-tree label classes
neighbourhoods.extend(
    row.text
    for row in soup_districts.find_all("a", class_="CategoryTreeLabel CategoryTreeLabelNs14 CategoryTreeLabelCategory")
)

# <li> entries of the first "mw-category" block inside the "mw-pages" section
neighbourhoods.extend(
    row.text
    for row in soup_districts.find_all("div", id="mw-pages")[0].find_all("div", class_="mw-category")[0].find_all("li")
)

In [3]:
# Build a one-column dataframe from the scraped names
shanghai = pd.DataFrame(neighbourhoods, columns=["Neighbourhood"])
shanghai.head()

Unnamed: 0,Neighbourhood
0,Anting
1,Changshou Road Subdistrict
2,Fengjing
3,"Gaoqiao, Shanghai"
4,"Gubei, Shanghai"


In [4]:
# Show the dataframe size as (rows, columns) before duplicates are removed
shanghai.shape

(49, 1)

In [5]:
# Drop duplicated rows in the dataframe.
# The index is rebuilt as 0..n-1 because the Latitude/Longitude columns added
# later come from a separate dataframe and are aligned on the index.
shanghai = shanghai.drop_duplicates().reset_index(drop=True)
shanghai.shape

(42, 1)

## 3. Get the geographical coordinates

In [6]:
# Define a function to get coordinates
def get_latlng(neighbourhood, max_attempts=10, retry_delay=1.0):
    """Return the coordinates the ArcGIS geocoder reports for a neighbourhood.

    The geocoder is queried with "<neighbourhood>, Shanghai, China". A lookup
    whose `latlng` is None is retried, but only `max_attempts` times in total,
    so a name the service cannot resolve no longer blocks the notebook in an
    endless loop.

    Parameters
    ----------
    neighbourhood : str
        Name inserted into the geocoding query.
    max_attempts : int, default 10
        Maximum number of lookups before giving up.
    retry_delay : float, default 1.0
        Seconds to wait between two attempts, to avoid hammering the service.

    Returns
    -------
    The first non-None `latlng` value returned by `geocoder.arcgis`
    (the notebook's output shows `[latitude, longitude]` lists).

    Raises
    ------
    ValueError
        If `max_attempts` is smaller than 1.
    RuntimeError
        If none of the attempts produced coordinates.
    """
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")

    query = '{}, Shanghai, China'.format(neighbourhood)
    for attempt in range(1, max_attempts + 1):
        lat_lng_coord = geocoder.arcgis(query).latlng
        if lat_lng_coord is not None:
            return lat_lng_coord
        # Pause only when another attempt will follow
        if attempt < max_attempts:
            time.sleep(retry_delay)

    raise RuntimeError(
        "No coordinates found for '{}' after {} attempts".format(query, max_attempts)
    )

In [7]:
# Geocode every neighbourhood in dataframe order and collect the results in a list
coordinates = list(map(get_latlng, shanghai["Neighbourhood"].tolist()))
coordinates[:5]

[[31.29890000000006, 121.15760000000012],
 [30.916040000000066, 121.15409000000011],
 [31.116700000000037, 121.12902000000008],
 [31.22222000000005, 121.45806000000005],
 [31.22222000000005, 121.45806000000005]]

In [8]:
# Data preparation: attach the coordinates and drop the Wikipedia index page entry
temp_df = pd.DataFrame(coordinates, columns=['Latitude', 'Longitude'])

# Both frames share the same 0..n-1 index, so the columns line up row by row
for coord_column in ('Latitude', 'Longitude'):
    shanghai[coord_column] = temp_df[coord_column]

# This scraped entry is a list article rather than a place
shanghai = shanghai.loc[shanghai["Neighbourhood"] != "List of administrative divisions of Shanghai"]
shanghai

Unnamed: 0,Neighbourhood,Latitude,Longitude
0,Anting,31.2989,121.1576
1,Changshou Road Subdistrict,30.91604,121.15409
2,Fengjing,31.1167,121.12902
3,"Gaoqiao, Shanghai",31.22222,121.45806
4,"Gubei, Shanghai",31.22222,121.45806
5,"Koreatown, Shanghai",31.22222,121.45806
6,Lujiazui,30.79141,121.34888
7,"Luodian, Shanghai",31.22222,121.45806
8,Nanxiang,31.29979,121.3118
9,Qiantan International Business Zone (Shanghai),31.22222,121.45806


In [9]:
# Save the dataframe to a CSV file in the working directory; index=False keeps
# the row index out of the file
shanghai.to_csv("shanghai.csv", index=False)

## 4. Create a map of Shanghai neighbourhoods

In [10]:
# Get the geographical coordinates of Shanghai (Nominatim is imported in the first cell)
address = "Shanghai, China"

# Nominatim's usage policy asks every client to identify itself; current geopy
# releases refuse to create the geocoder without an explicit user agent.
geolocator = Nominatim(user_agent="shanghai_commercial_center_explorer")
location = geolocator.geocode(address)

# geocode() yields None when nothing matches; fail here with a clear message
# rather than with an AttributeError on the next line
if location is None:
    raise ValueError("Could not geocode '{}'".format(address))

latitude = location.latitude
longitude = location.longitude
print('The geographical coordinates of Shanghai are {}, {}'.format(latitude, longitude))

  


The geographical coordinates of Shanghai are 31.2252985, 121.4890497


In [11]:
# Create the map of Shanghai with neighbourhoods superposed on top
# (folium is imported once, in the first cell)

# create map of Shanghai using latitude and longitude values
map_shanghai = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one small red circle per neighbourhood; its popup shows the name
for lat, lng, neighbourhood in zip(shanghai['Latitude'], shanghai['Longitude'], shanghai['Neighbourhood']):
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=folium.Popup('{}'.format(neighbourhood), parse_html=True),
        color='#ff0000',
        fill=True,
        fill_color='#ff0000',
        fill_opacity=0.7,
    ).add_to(map_shanghai)

map_shanghai

In [12]:
# Save the neighbourhood map as a standalone html file in the working directory
map_shanghai.save("map_shanghai.html")

## 5. Use the Foursquare API to explore the neighbourhoods

In [13]:
# Define Foursquare api and the version.
# The credentials are read from the environment so that they never end up in
# the notebook or its version history. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): the key pair that used to be hard-coded in this cell has been
# exposed and should be revoked and regenerated.
CLIENT_ID = os.environ.get("FOURSQUARE_CLIENT_ID", "")
CLIENT_SECRET = os.environ.get("FOURSQUARE_CLIENT_SECRET", "")
VERSION = "20190706"

if not (CLIENT_ID and CLIENT_SECRET):
    print("WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; "
          "Foursquare requests will be rejected.")

# Get top 100 venues within a radius of 2000 meters
LIMIT = 100
RADIUS = 2000

In [14]:
# Define a function to get venues via Foursquare
def getNearbyVenues(names, latitudes, longitudes, timeout=30):
    """Collect the Foursquare venues around each neighbourhood.

    One "venues/explore" request is sent per (name, latitude, longitude)
    triple, using the module-level CLIENT_ID, CLIENT_SECRET, VERSION, RADIUS
    and LIMIT settings. Each name is printed as a progress indicator.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Neighbourhood names and their coordinates, consumed in parallel.
    timeout : float, default 30
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighbourhood',
        'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty (but still has these columns) when no venue is found.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status (for instance bad credentials).
    """
    columns = ['Neighbourhood',
               'Neighbourhood Latitude',
               'Neighbourhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and escape the query string
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': RADIUS,
                'limit': LIMIT,
            },
            timeout=timeout,
        )
        response.raise_for_status()

        # A location without recommendations may come back without any group
        groups = response.json()["response"].get("groups") or []
        items = groups[0]['items'] if groups else []

        # Keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            # Some venues carry no category; keep them with a missing value
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None,
            ))

    # Passing the columns here also works when `rows` is empty
    return pd.DataFrame(rows, columns=columns)

In [15]:
# Fetch the Foursquare venues around every neighbourhood of the dataframe
venue_shanghai = getNearbyVenues(
    shanghai["Neighbourhood"],
    shanghai["Latitude"],
    shanghai["Longitude"],
)

Anting
Changshou Road Subdistrict
Fengjing
Gaoqiao, Shanghai
Gubei, Shanghai
Koreatown, Shanghai
Lujiazui
Luodian, Shanghai
Nanxiang
Qiantan International Business Zone (Shanghai)
Qibao
Songjiang Town
Tianzifang
Wusong
Xintiandi
Xinzhuang, Shanghai
Xujiahui
Zhangjiang Town
Zhujiajiao
Baoshan District, Shanghai
Changning District
Chongming District
Hongkou District
Huangpu District, Shanghai
Jing'an District
Minhang District
Pudong
Putuo District, Shanghai
Qingpu District
Xuhui District
Yangpu District
Chuansha County
Fengxian District
Jiading District
Jinshan District
Luwan District
Nanhui District
Nanshi District, Shanghai
Shanghai County
Songjiang District
Zhabei


In [16]:
# Preview the first rows of the venue table
venue_shanghai.head()

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Anting,31.2989,121.1576,Alibaba,31.297209,121.162602,German Restaurant
1,Anting,31.2989,121.1576,Wirtshaus,31.291667,121.154532,Bar
2,Anting,31.2989,121.1576,Starbucks (星巴克),31.289777,121.157733,Coffee Shop
3,Anting,31.2989,121.1576,Life Hub (嘉亭荟城市生活广场),31.289792,121.157673,Shopping Mall
4,Anting,31.2989,121.1576,KFC (肯德基),31.297443,121.158709,Fast Food Restaurant


In [17]:
# Size of the venue table as (rows, columns); each row is one venue
venue_shanghai.shape

(1982, 7)

### Find out how many unique categories can be curated from all the returned venues

In [18]:
# Count the distinct venue categories that were returned
unique_categories = venue_shanghai["Venue Category"].unique()
print("There are {} unique categories".format(len(unique_categories)))

There are 168 unique categories


In [19]:
# Check whether "Shopping Mall" appears among the distinct venue categories
# (the expression evaluates to True or False)
"Shopping Mall" in venue_shanghai["Venue Category"].unique()

True

In [20]:
# Check whether "Movie Theater" appears among the distinct venue categories
# (the expression evaluates to True or False)
"Movie Theater" in venue_shanghai["Venue Category"].unique()

True

In [21]:
# Check whether the exact category "Restaurant" appears among the distinct
# venue categories (the expression evaluates to True or False)
"Restaurant" in venue_shanghai["Venue Category"].unique()

True

## 6. Analyzing each neighbourhood

In [22]:
# One hot encoding: one indicator column per venue category, named after the category
venue_shanghai_onehot = pd.get_dummies(venue_shanghai[["Venue Category"]], prefix="", prefix_sep="")

# Put the neighbourhood of each venue in front of the indicator columns
nbh = venue_shanghai["Neighbourhood"]
venue_shanghai_onehot.insert(0, 'Neighbourhood', nbh)

venue_shanghai_onehot.head()

Unnamed: 0,Neighbourhood,American Restaurant,Arcade,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,BBQ Joint,Bakery,...,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Water Park,Waterfront,Whisky Bar,Wine Bar,Xinjiang Restaurant,Yoga Studio,Yunnan Restaurant,Zhejiang Restaurant
0,Anting,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Anting,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Anting,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Anting,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Anting,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Group rows by neighbourhood and average every indicator column, which gives
# the frequency of occurrence of each category within a neighbourhood
venue_shanghai_grouped = venue_shanghai_onehot.groupby("Neighbourhood", as_index=False).mean()
venue_shanghai_grouped

Unnamed: 0,Neighbourhood,American Restaurant,Arcade,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,BBQ Joint,Bakery,...,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Water Park,Waterfront,Whisky Bar,Wine Bar,Xinjiang Restaurant,Yoga Studio,Yunnan Restaurant,Zhejiang Restaurant
0,Anting,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Baoshan District, Shanghai",0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.05,...,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.01,0.0
2,Changning District,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.0,0.01,...,0.02,0.0,0.0,0.0,0.0,0.03,0.01,0.01,0.02,0.0
3,Changshou Road Subdistrict,0.0,0.0,0.333333,0.0,0.0,0.333333,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Chongming District,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Chuansha County,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Fengjing,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Fengxian District,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,"Gaoqiao, Shanghai",0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.05,...,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.01,0.0
9,"Gubei, Shanghai",0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.05,...,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.01,0.0


In [24]:
# Number of neighbourhoods whose "Shopping Mall" frequency is above zero,
# i.e. that have at least one shopping mall among their returned venues
len(venue_shanghai_grouped[venue_shanghai_grouped["Shopping Mall"]>0])

25

### Create a dataframe for Shopping Mall only

In [25]:
# Keep only the neighbourhood name and its shopping-mall frequency
mall_columns = ["Neighbourhood", "Shopping Mall"]
shopping_mall_shanghai = venue_shanghai_grouped.loc[:, mall_columns].reset_index(drop=True)
shopping_mall_shanghai.shape

(39, 2)

## 7. Clustering of neighbourhoods in Shanghai

### Run K-Means to cluster the neighbourhoods into 3 clusters

In [26]:
# set number of clusters
kclusters = 3

# Keep only the numeric feature. The column is dropped by keyword because the
# positional `axis` argument of DataFrame.drop is no longer accepted by
# current pandas releases.
shopping_mall_clustered = shopping_mall_shanghai.drop(columns=["Neighbourhood"])

# run k-means clustering; n_init is pinned so the number of restarts (and with
# it the result) does not depend on the scikit-learn version's default
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(shopping_mall_clustered)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([0, 0, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 2, 0, 0, 2, 0, 2, 1, 2, 0, 1,
       0, 0, 0, 1, 2, 1, 2, 2, 0, 2, 2, 0, 2, 0, 0, 0, 2])

In [27]:
# Build a new dataframe holding, for every neighbourhood, its shopping-mall
# frequency, its cluster label and its latitude/longitude
shopping_mall_merged = (
    shopping_mall_shanghai
    .assign(**{"Cluster Labels": kmeans.labels_})                    # labels follow row order
    .join(shanghai.set_index("Neighbourhood"), on="Neighbourhood")  # coordinates looked up by name
)

shopping_mall_merged

Unnamed: 0,Neighbourhood,Shopping Mall,Cluster Labels,Latitude,Longitude
0,Anting,0.052632,0,31.2989,121.1576
1,"Baoshan District, Shanghai",0.04,0,31.22222,121.45806
2,Changning District,0.01,2,31.21739,121.42105
3,Changshou Road Subdistrict,0.0,2,30.91604,121.15409
4,Chongming District,0.0,2,31.63318,121.46795
5,Chuansha County,0.0,2,31.47713,121.36461
6,Fengjing,0.0,2,31.1167,121.12902
7,Fengxian District,0.0,2,30.83381,121.52128
8,"Gaoqiao, Shanghai",0.04,0,31.22222,121.45806
9,"Gubei, Shanghai",0.04,0,31.22222,121.45806


In [28]:
# Show the size, then order the rows by their cluster label
print(shopping_mall_merged.shape)
shopping_mall_merged = shopping_mall_merged.sort_values(by="Cluster Labels")
shopping_mall_merged

(39, 5)


Unnamed: 0,Neighbourhood,Shopping Mall,Cluster Labels,Latitude,Longitude
0,Anting,0.052632,0,31.2989,121.1576
24,Qiantan International Business Zone (Shanghai),0.04,0,31.22222,121.45806
23,"Putuo District, Shanghai",0.04,0,31.22222,121.45806
22,Pudong,0.055556,0,31.23513,121.52759
20,"Nanshi District, Shanghai",0.04,0,31.22222,121.45806
37,Zhangjiang Town,0.060606,0,31.20861,121.60889
16,"Luodian, Shanghai",0.04,0,31.22222,121.45806
14,"Koreatown, Shanghai",0.04,0,31.22222,121.45806
30,Tianzifang,0.04,0,31.22222,121.45806
11,"Huangpu District, Shanghai",0.04,0,31.22222,121.45806


### Visualize the 3 clusters on map

In [29]:
# Create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per label
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(shopping_mall_merged['Latitude'],
                                  shopping_mall_merged['Longitude'],
                                  shopping_mall_merged['Neighbourhood'],
                                  shopping_mall_merged['Cluster Labels']):
    # Labels start at 0, so they index the colour list directly
    cluster_color = rainbow[int(cluster)]
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [30]:
# Save the cluster map as a standalone html file in the working directory
map_clusters.save("map_clusters.html")

## 8. Analyze the clusters

### Cluster 0

In [31]:
# Rows whose cluster label is 0
shopping_mall_merged.loc[shopping_mall_merged['Cluster Labels'] == 0]

Unnamed: 0,Neighbourhood,Shopping Mall,Cluster Labels,Latitude,Longitude
0,Anting,0.052632,0,31.2989,121.1576
24,Qiantan International Business Zone (Shanghai),0.04,0,31.22222,121.45806
23,"Putuo District, Shanghai",0.04,0,31.22222,121.45806
22,Pudong,0.055556,0,31.23513,121.52759
20,"Nanshi District, Shanghai",0.04,0,31.22222,121.45806
37,Zhangjiang Town,0.060606,0,31.20861,121.60889
16,"Luodian, Shanghai",0.04,0,31.22222,121.45806
14,"Koreatown, Shanghai",0.04,0,31.22222,121.45806
30,Tianzifang,0.04,0,31.22222,121.45806
11,"Huangpu District, Shanghai",0.04,0,31.22222,121.45806


### Cluster 1

In [32]:
# Rows whose cluster label is 1
shopping_mall_merged.loc[shopping_mall_merged['Cluster Labels'] == 1]

Unnamed: 0,Neighbourhood,Shopping Mall,Cluster Labels,Latitude,Longitude
25,Qibao,0.136364,1,31.15267,121.35688
18,Minhang District,0.105263,1,31.1088,121.37472
21,Nanxiang,0.230769,1,31.29979,121.3118
27,Shanghai County,0.125,1,31.10027,121.38565


### Cluster 2

In [33]:
# Rows whose cluster label is 2
shopping_mall_merged.loc[shopping_mall_merged['Cluster Labels'] == 2]

Unnamed: 0,Neighbourhood,Shopping Mall,Cluster Labels,Latitude,Longitude
31,Wusong,0.0,2,31.37566,121.49041
29,Songjiang Town,0.0,2,31.03595,121.2146
32,Xintiandi,0.0,2,31.02474,121.6788
28,Songjiang District,0.0,2,31.03595,121.2146
34,Xujiahui,0.02,2,31.19,121.43194
19,Nanhui District,0.0,2,31.03188,121.75906
17,Luwan District,0.01,2,31.20908,121.46335
15,Lujiazui,0.0,2,30.79141,121.34888
12,Jiading District,0.0,2,31.36637,121.22153
7,Fengxian District,0.0,2,30.83381,121.52128


### Conclusion

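The results suggest that __most of the commercial centers are located in the city center, in cluster 0__. Note, however, that several neighborhoods were geocoded to identical coordinates (31.22222, 121.45806), so their venue profiles are identical and should be interpreted with caution.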

However, __shopping malls in cluster 1 have intense competition__ due to too many commercial units and high concentration of shopping malls.

On the other hand, __neighborhoods in cluster 2 have few or no shopping malls__ (for example, Xujiahui at 0.02 and Luwan District at 0.01), which suggests a good opportunity and high business potential to open new commercial centers because there is little competition from existing markets.

From another perspective, this also shows that the oversupply of shopping malls mostly occurs in the city center, while the suburban areas still have very few shopping malls.

Therefore, this project recommends that property investors capitalize on these findings and open new shopping malls in cluster 2 neighborhoods, where competition is scarce, __especially the 3 locations of cluster 2 which are also in the city center: Luwan District, Changning District and Xujiahui__.

Furthermore, property investors with unique selling propositions to stand out from the competition can also open new shopping malls in neighborhoods in cluster 0 with moderate competition.

Finally, property investors are advised to avoid neighborhoods in cluster 1, which already have a high concentration of shopping malls and suffer from intense competition.