In [55]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [72]:
# Download the Wikipedia list page and parse it into `doc` for the scraping cells.
url = 'https://en.wikipedia.org/wiki/List_of_shopping_malls_in_Singapore'
# A timeout keeps this cell from hanging indefinitely on a stalled connection.
result = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page
# (which would yield an empty list of malls further down).
result.raise_for_status()
doc = BeautifulSoup(result.text, 'html.parser')


In [73]:
def convert_mall(list_item):
    """Convert one list entry into a ``(name, link)`` tuple.

    Args:
        list_item: A parsed ``<li>`` element (any object exposing ``.text``
            and ``.find_all('a')`` works).

    Returns:
        tuple: ``(list_item.text, href)`` where ``href`` is the target of the
        first anchor whose ``href`` starts with ``/wiki``, or ``""`` when no
        anchor qualifies. The text is returned unmodified, so footnote
        markers such as ``[1]`` remain part of the name.
    """
    for link in list_item.find_all('a'):
        # Anchors are not guaranteed to carry an href attribute; .get() avoids
        # the KeyError that link['href'] raises on such anchors.
        href = link.get('href') or ''
        if href.startswith("/wiki"):
            return (list_item.text, href)
    return (list_item.text, "")

In [74]:
# Walk every <li> inside each 'div-col' container of the parsed page and turn
# it into a (name, link) tuple via convert_mall.
mall_links = list(map(
    convert_mall,
    (
        entry
        for column in doc.find_all('div', class_='div-col')
        for entry in column.find_all('li')
    ),
))
mall_links

        

[('100 AM[1]', ''),
 ('313@Somerset[2]', ''),
 ('Aperia', ''),
 ('Balestier Hill Shopping Centre', ''),
 ('Bugis Cube[3]', ''),
 ('Bugis Junction', '/wiki/Bugis_Junction'),
 ('Bugis+', '/wiki/Bugis%2B'),
 ('Capitol Piazza', '/wiki/Capitol_Piazza'),
 ('Cathay Cineleisure Orchard', '/wiki/Cathay_Cineleisure_Orchard'),
 ('Clarke Quay Central', ''),
 ('The Centrepoint', '/wiki/The_Centrepoint'),
 ('City Square Mall', '/wiki/City_Square_Mall_(Singapore)'),
 ('City Gate Mall[4]', ''),
 ('CityLink Mall', '/wiki/CityLink_Mall'),
 ('Duo', '/wiki/DUO'),
 ('Far East Plaza', '/wiki/Far_East_Plaza'),
 ('Funan', '/wiki/Funan,_Singapore'),
 ('Great World City', '/wiki/Great_World_City'),
 ('HDB Hub', '/wiki/HDB_Hub'),
 ('Holland Village Shopping Mall', '/wiki/Holland_Village,_Singapore'),
 ('ION Orchard', '/wiki/ION_Orchard'),
 ('Junction 8', '/wiki/Junction_8_Shopping_Centre'),
 ('Knightsbridge[5]', ''),
 ('Liat Towers', '/wiki/Liat_Towers'),
 ('Lucky Plaza', '/wiki/Lucky_Plaza'),
 ('Marina Bay Sand

In [75]:
# Tabulate the scraped (name, link) tuples; subsequent cells add columns to this frame.
df = pd.DataFrame(mall_links, columns=['Name', 'Link'])
df.head(10)

Unnamed: 0,Name,Link
0,100 AM[1],
1,313@Somerset[2],
2,Aperia,
3,Balestier Hill Shopping Centre,
4,Bugis Cube[3],
5,Bugis Junction,/wiki/Bugis_Junction
6,Bugis+,/wiki/Bugis%2B
7,Capitol Piazza,/wiki/Capitol_Piazza
8,Cathay Cineleisure Orchard,/wiki/Cathay_Cineleisure_Orchard
9,Clarke Quay Central,


#### Add Opening Date
Retrieve the opening date from each mall's Wikipedia page, if available

In [76]:
def get_opening_date(url_ext):
    """Fetch a mall's Wikipedia page and return its opening date text.

    Args:
        url_ext: Site-relative link such as ``/wiki/Bugis_Junction``. Empty or
            non-string values (``""``, ``None``, ``NaN``) are treated as
            "no article".

    Returns:
        str: Text of the ``<td>`` that follows a ``<th>`` whose string is
        exactly ``Opened`` or ``Opening date``. ``''`` is returned when there
        is no link, no such header, or no sibling cell.
    """
    # Without a link there is no article to fetch; returning early avoids
    # downloading the bare https://en.wikipedia.org page for every such row.
    if not isinstance(url_ext, str) or not url_ext:
        return ''

    url = f"https://en.wikipedia.org{url_ext}"
    # Bound the wait so a single stalled connection cannot hang the caller.
    result = requests.get(url, timeout=30)
    doc = BeautifulSoup(result.text, 'html.parser')

    # Look for the header cell that labels the opening date.
    opened_row = doc.find('th', string='Opened') or doc.find('th', string='Opening date')
    if not opened_row:
        return ''

    # The value lives in the <td> next to the header; it may be absent, in
    # which case there is nothing to report.
    opened_date_cell = opened_row.find_next_sibling('td')
    if opened_date_cell is None:
        return ''
    return opened_date_cell.text or ''

In [77]:
# Call get_opening_date once per row; it performs HTTP requests, so this cell is network-bound.
df['Opening Date'] = df['Link'].apply(get_opening_date)
df.head(10)

Unnamed: 0,Name,Link,Opening Date
0,100 AM[1],,
1,313@Somerset[2],,
2,Aperia,,
3,Balestier Hill Shopping Centre,,
4,Bugis Cube[3],,
5,Bugis Junction,/wiki/Bugis_Junction,"September 8, 1995; 28 years ago (1995-09-08)"
6,Bugis+,/wiki/Bugis%2B,1 June 2009 (as Iluma)
7,Capitol Piazza,/wiki/Capitol_Piazza,March 2015
8,Cathay Cineleisure Orchard,/wiki/Cathay_Cineleisure_Orchard,1997
9,Clarke Quay Central,,


#### Add Geocoordinates
Use OneMap SG API to retrieve coordinate information, if available

In [78]:
def get_geocoordinates_from_address(address: str) -> tuple:
    """Look up an address with the OneMap search endpoint.

    Args:
        address: Free-text search value (the notebook passes mall names).

    Returns:
        tuple: ``(LATITUDE, LONGITUDE)`` taken from the first entry of the
        response's ``results`` list, exactly as the API returned them, or
        ``("", "")`` when the response contains no results.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    url = "https://www.onemap.gov.sg/api/common/elastic/search"
    # Pass the query through `params` so requests URL-encodes the address;
    # interpolating it into the URL lets characters such as '+', '&' or '#'
    # corrupt the query string.
    params = {
        'searchVal': address,
        'returnGeom': 'Y',
        'getAddrDetails': 'Y',
        'pageNum': 1,
    }

    # Bound the wait and surface HTTP errors explicitly.
    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()
    data = response.json()

    # A payload without a 'results' key is treated the same as an empty result list.
    results = data.get('results')
    if not results:
        return ("", "")

    first_result = results[0]
    return (first_result['LATITUDE'], first_result['LONGITUDE'])

In [79]:
# Geocode every mall by its name. Wrapping the returned (latitude, longitude)
# tuple in a Series makes apply() produce two columns, assigned here in one step.
df[['Latitude', 'Longitude']] = df['Name'].apply(lambda address: pd.Series(get_geocoordinates_from_address(address)))
df.head(10)

Unnamed: 0,Name,Link,Opening Date,Latitude,Longitude
0,100 AM[1],,,1.28155949555229,103.847208361003
1,313@Somerset[2],,,1.30101436404056,103.838360664485
2,Aperia,,,1.3097112065077,103.864326436447
3,Balestier Hill Shopping Centre,,,1.32559594839311,103.842571612968
4,Bugis Cube[3],,,,
5,Bugis Junction,/wiki/Bugis_Junction,"September 8, 1995; 28 years ago (1995-09-08)",1.2991371723215,103.855450325604
6,Bugis+,/wiki/Bugis%2B,1 June 2009 (as Iluma),1.30095171530648,103.855172625542
7,Capitol Piazza,/wiki/Capitol_Piazza,March 2015,1.29307884763132,103.851261982149
8,Cathay Cineleisure Orchard,/wiki/Cathay_Cineleisure_Orchard,1997,1.30149264852924,103.836406753067
9,Clarke Quay Central,,,,


In [80]:
# Summary of the assembled frame (count / unique / top / freq per column) to gauge how many values are blank.
df.describe()

Unnamed: 0,Name,Link,Opening Date,Latitude,Longitude
count,169,169.0,169.0,169.0,169.0
unique,168,91.0,69.0,143.0,143.0
top,Junction 8,,,,
freq,2,79.0,96.0,25.0,25.0


#### Save to CSV

In [81]:
# Persist the scraped dataset. The index is written too, which is why the
# reload step later drops an 'Unnamed: 0' column.
df.to_csv('../data/modified/malls_dataset_v1.csv')

#### Conduct EDA + Data Cleaning
Check missing data and manually add information
- Unable to find the coordinates of 17 malls via the API. They either do not exist or have been closed down.

Of the above, the following malls have been closed:
- The Verge
- City Vibe
- JCube
- Jurong Entertainment Centre
- Ellenborough Market
- Capitol Centre
- Amber Mansions
- Serangoon Plaza
- Specialist Shopping Centre

The coordinates of the following malls are manually added:
- Clarke Quay Central
- Scotts Shopping Centre -> Scotts Square
- Shaw House and Centre -> Shaw House
- Mandarin Gallery
- Cosford Container Park
- Change Alley


In [91]:
# Reload the saved snapshot and discard the columns not needed for cleaning:
# the written-out index ('Unnamed: 0') and the Wikipedia link.
raw_df = (
    pd.read_csv('../data/modified/malls_dataset_v1.csv')
    .drop(columns=['Unnamed: 0', 'Link'])
)
raw_df.head(10)

Unnamed: 0,Name,Opening Date,Latitude,Longitude
0,100 AM[1],,1.281559,103.847208
1,313@Somerset[2],,1.301014,103.838361
2,Aperia,,1.309711,103.864326
3,Balestier Hill Shopping Centre,,1.325596,103.842572
4,Bugis Cube[3],,,
5,Bugis Junction,"September 8, 1995; 28 years ago (1995-09-08)",1.299137,103.85545
6,Bugis+,1 June 2009 (as Iluma),1.300952,103.855173
7,Capitol Piazza,March 2015,1.293079,103.851262
8,Cathay Cineleisure Orchard,1997,1.301493,103.836407
9,Clarke Quay Central,,,


In [92]:
# Rows whose latitude is missing after the reload.
malls_no_coordinates = raw_df.loc[raw_df['Latitude'].isna()]
malls_no_coordinates.head(5)

Unnamed: 0,Name,Opening Date,Latitude,Longitude
4,Bugis Cube[3],,,
9,Clarke Quay Central,,,
12,City Gate Mall[4],,,
19,Holland Village Shopping Mall,,,
22,Knightsbridge[5],,,


In [93]:
# Rows whose opening date is missing after the reload.
malls_no_opening_dates = raw_df.loc[raw_df['Opening Date'].isna()]
malls_no_opening_dates.head(5)

Unnamed: 0,Name,Opening Date,Latitude,Longitude
0,100 AM[1],,1.281559,103.847208
1,313@Somerset[2],,1.301014,103.838361
2,Aperia,,1.309711,103.864326
3,Balestier Hill Shopping Centre,,1.325596,103.842572
4,Bugis Cube[3],,,
