# 1. Import Libraries

In [40]:
import warnings
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Silence only deprecation-style noise from libraries. A blanket
# `filterwarnings('ignore')` would also hide pandas' SettingWithCopyWarning,
# which signals real data-handling problems in the cleansing cells.
# NOTE(review): the category list is an assumption about which warnings were
# meant to be hidden -- extend it if other harmless warnings show up.
for _noisy_category in (DeprecationWarning, FutureWarning):
    warnings.filterwarnings('ignore', category=_noisy_category)

# 2. Load Data

In [2]:
# Remote CSV sources (raw files hosted on GitHub)
location_file = 'https://raw.githubusercontent.com/prattapong/DADS5001/main/Homework%203/data/tambon.csv'
population_file = 'https://raw.githubusercontent.com/prattapong/DADS5001/main/Homework%203/data/bangkok_population.csv'

# Read both CSVs, one DataFrame per source, in the order listed above
df_location, df_population = map(pd.read_csv, (location_file, population_file))

In [3]:
# Take a 5-row preview of each raw frame up front ...
location_preview = df_location.head(5)
population_preview = df_population.head(5)

# ... then report each frame's dimensions followed by its preview
print(f'Location DataFrame Shape: {df_location.shape}')
display(location_preview)
print(f'Bangkok Population DataFrame Shape: {df_population.shape}')
display(population_preview)

Location DataFrame Shape: (7769, 12)


Unnamed: 0,AD_LEVEL,TA_ID,TAMBON_T,TAMBON_E,AM_ID,AMPHOE_T,AMPHOE_E,CH_ID,CHANGWAT_T,CHANGWAT_E,LAT,LONG
0,4.0,910106.0,ต. เกาะสาหร่าย,Ko Sarai,9101.0,,Mueang Satun,91.0,จ. สตูล,Satun,6.546,99.706
1,4.0,210114.0,ต. มาบตาพุด,Maptaphut,2101.0,อ. เมืองระยอง,Mueang Rayong,21.0,จ. ระยอง,Rayong,12.646,101.171
2,4.0,210114.0,ต. มาบตาพุด,Maptaphut,2101.0,อ. เมืองระยอง,Mueang Rayong,21.0,จ. ระยอง,Rayong,12.645,101.17
3,4.0,210114.0,ต. มาบตาพุด,Maptaphut,2101.0,อ. เมืองระยอง,Mueang Rayong,21.0,จ. ระยอง,Rayong,12.649,101.174
4,4.0,210114.0,ต. มาบตาพุด,Maptaphut,2101.0,อ. เมืองระยอง,Mueang Rayong,21.0,จ. ระยอง,Rayong,12.644,101.169


Bangkok Population DataFrame Shape: (54, 4)


Unnamed: 0,พื้นที่,ชาย,หญิง,รวม
0,ยอดรวมทั้งหมด,2554230,2915202,5469432
1,ท้องถิ่นเขตพระนคร,19381,20754,40135
2,ท้องถิ่นเขตดุสิต,40325,35510,75835
3,ท้องถิ่นเขตหนองจอก,89451,94899,184350
4,ท้องถิ่นเขตบางรัก,20064,22759,42823


# 3. Data Cleansing

## 3.1 Clean Location Data

In [4]:
# List the distinct province names (CHANGWAT_E) present in the location data
province_column = df_location['CHANGWAT_E']
province_column.unique()

array(['Satun', 'Rayong', 'Chumphon', 'Maha Sarakham', 'Surat Thani',
       'Chon Buri', 'Trang', 'Chanthaburi', 'Phang-nga', 'Trat', 'Ranong',
       'Phuket', 'Krabi', 'Bangkok', 'Nakhon Si Thammarat',
       'Samut Sakhon', 'Phra Nakhon Si Ayutthaya', 'Lop Buri',
       'Samut Prakarn', 'Ang Thong', 'Pattani', 'Chiang Mai', 'Chai Nat',
       'Nonthaburi', 'Uttaradit', 'Khon Kaen', 'Nakhon Sawan',
       'Phetchaburi', 'Sing Buri', 'Samut Songkhram', 'Ratchaburi',
       'Nakhon Nayok', 'Saraburi', 'Nakhon Pathom', 'Kanchanaburi',
       'Pathum Thani', 'Phitsanulok', 'Chachoengsao', 'Chiang Rai',
       'Songkhla', 'Lamphun', 'Nong Khai', 'Ubon Ratchathani',
       'Sukhothai', 'Phayao', 'Uthai Thani', 'Phrae', 'Phetchabun',
       'Mae Hong Son', 'Buri Ram', 'Phichit', 'Nan', 'Yala',
       'Suphan Buri', 'Nakhon Ratchasima', 'Prachin Buri', 'Tak',
       'Roi Et', 'Nakhon Phanom', 'Mukdahan', 'Sa kaeo', 'Udon Thani',
       'Lampang', 'Si Sa Ket', 'Yasothon', 'Phatthalung', 'Nar

In [5]:
# NaN appears among the province names, so pull out the rows where
# CHANGWAT_E is missing to see what they look like
province_missing = df_location['CHANGWAT_E'].isna()
df_location[province_missing]

Unnamed: 0,AD_LEVEL,TA_ID,TAMBON_T,TAMBON_E,AM_ID,AMPHOE_T,AMPHOE_E,CH_ID,CHANGWAT_T,CHANGWAT_E,LAT,LONG
7768,,,,,,,,,,,,


In [6]:
# The row without a province is NaN in every column, so discard it
df_location = df_location.dropna(subset = ['CHANGWAT_E'], axis = 0)
print(df_location.shape)

(7768, 12)


In [7]:
# Show the province names alphabetically, followed by how many there are
province_names = sorted(df_location['CHANGWAT_E'].unique())
print(province_names)
print(len(province_names))

['Amnat Charoen', 'Ang Thong', 'Bangkok', 'Bueng Kan', 'Buri Ram', 'Chachoengsao', 'Chai Nat', 'Chaiyaphum', 'Chanthaburi', 'Chiang Mai', 'Chiang Rai', 'Chon Buri', 'Chumphon', 'Kalasin', 'Kamphaeng Phet', 'Kanchanaburi', 'Khon Kaen', 'Krabi', 'Lampang', 'Lamphun', 'Loei', 'Lop Buri', 'Mae Hong Son', 'Maha Sarakham', 'Mukdahan', 'Nakhon Nayok', 'Nakhon Pathom', 'Nakhon Phanom', 'Nakhon Ratchasima', 'Nakhon Sawan', 'Nakhon Si Thammarat', 'Nan', 'Narathiwat', 'Nong Bua Lam Phu', 'Nong Khai', 'Nonthaburi', 'Pathum Thani', 'Pattani', 'Phang-nga', 'Phatthalung', 'Phayao', 'Phetchabun', 'Phetchaburi', 'Phichit', 'Phitsanulok', 'Phra Nakhon Si Ayutthaya', 'Phrae', 'Phuket', 'Prachin Buri', 'Prachuap Khiri Khan', 'Ranong', 'Ratchaburi', 'Rayong', 'Roi Et', 'Sa kaeo', 'Sakon Nakhon', 'Samut Prakarn', 'Samut Sakhon', 'Samut Songkhram', 'Saraburi', 'Satun', 'Si Sa Ket', 'Sing Buri', 'Songkhla', 'Sukhothai', 'Suphan Buri', 'Surat Thani', 'Surin', 'Tak', 'Trang', 'Trat', 'Ubon Ratchathani', 'Udon T

In [8]:
# Keep only the rows whose province is Bangkok
in_bangkok = df_location['CHANGWAT_E'] == 'Bangkok'
df_bangkok = df_location[in_bangkok]

print(f'Bangkok Location DataFrame Shape: {df_bangkok.shape}')
display(df_bangkok.head())

Bangkok Location DataFrame Shape: (154, 12)


Unnamed: 0,AD_LEVEL,TA_ID,TAMBON_T,TAMBON_E,AM_ID,AMPHOE_T,AMPHOE_E,CH_ID,CHANGWAT_T,CHANGWAT_E,LAT,LONG
199,4.0,100106.0,แขวง เสาชิงช้า,Sao Chingcha,1001.0,เขต พระนคร,Phra Nakhon,10.0,กรุงเทพมหานคร,Bangkok,13.753,100.5
202,4.0,100105.0,แขวง ศาลเจ้าพ่อเสือ,San Chaopho Suea,1001.0,เขต พระนคร,Phra Nakhon,10.0,กรุงเทพมหานคร,Bangkok,13.754,100.497
214,4.0,100108.0,แขวง ตลาดยอด,Talat Yot,1001.0,เขต พระนคร,Phra Nakhon,10.0,กรุงเทพมหานคร,Bangkok,13.76,100.498
224,4.0,100103.0,แขวง วัดราชบพิธ,Wat Ratchabophit,1001.0,เขต พระนคร,Phra Nakhon,10.0,กรุงเทพมหานคร,Bangkok,13.75,100.499
243,4.0,100104.0,แขวง สำราญราษฎร์,Samran Rat,1001.0,เขต พระนคร,Phra Nakhon,10.0,กรุงเทพมหานคร,Bangkok,13.751,100.503


In [9]:
# Build one row per district in a single chain:
#   1. average LAT/LONG over all rows sharing the same AMPHOE_T,
#   2. remove the word "เขต" from the district name and trim whitespace,
#   3. rename the columns to District / lat / long.
df_latlong = (
    df_bangkok
    .groupby('AMPHOE_T', as_index = False)[['LAT', 'LONG']]
    .mean()
    .assign(AMPHOE_T = lambda frame: frame['AMPHOE_T'].str.replace('เขต', '').str.strip())
    .rename(columns = {'AMPHOE_T': 'District', 'LAT': 'lat', 'LONG': 'long'})
)
df_latlong.head()

Unnamed: 0,District,lat,long
0,คลองสาน,13.7265,100.5025
1,คลองสามวา,13.8678,100.7398
2,คลองเตย,13.713333,100.578667
3,คันนายาว,13.821,100.677
4,จตุจักร,13.826,100.565


In [10]:
# Check dtypes: the coordinates must be numeric (float) for plotting,
# while District is a text (object) column used as the merge key later on
df_latlong.dtypes

District     object
lat         float64
long        float64
dtype: object

## 3.2 Clean Population Data

In [11]:
# The source headers are in Thai; replace them with English names.
# Assignment is positional, so the list order must match the column order.
english_columns = ['District', 'Male', 'Female', 'Total']
df_population.columns = english_columns

In [12]:
# Count missing values per column, then show every row that has at least one
na_mask = df_population.isna()
print(na_mask.sum())

rows_with_na = na_mask.any(axis = 1)
display(df_population[rows_with_na])

District    3
Male        3
Female      3
Total       3
dtype: int64


Unnamed: 0,District,Male,Female,Total
51,,,,
52,,,,
53,,,,


In [13]:
# Discard every row that contains a missing value
df_population = df_population.dropna(axis = 0)

In [14]:
# Only the district name and the total head-count are needed from here on
kept_columns = ['District', 'Total']
df_population = df_population[kept_columns]

In [15]:
# Remove the word "ท้องถิ่นเขต" from the District column and trim whitespace.
# `assign` returns a new frame instead of writing a column into a frame that
# was itself derived by selection -- the chained-assignment pattern pandas
# flags with SettingWithCopyWarning.
# `regex = False` makes it explicit that the pattern is a literal string.
df_population = df_population.assign(
    District = df_population['District'].str.replace('ท้องถิ่นเขต', '', regex = False).str.strip()
)

In [16]:
# Drop the grand-total row (its District text contains "ยอดรวม") so that
# only individual districts remain
is_total_row = df_population['District'].str.contains('ยอดรวม')
df_population = df_population[~is_total_row]

In [17]:
# Check dtypes: Total needs to be numeric before it can drive the marker
# size/colour in the plot, so confirm how it was parsed
df_population.dtypes

District    object
Total       object
dtype: object

In [18]:
# Cast the Total column to integer.
#  * `assign` builds a new frame rather than writing into a filtered frame
#    (the chained-assignment pattern behind SettingWithCopyWarning).
#  * `.astype(str)` keeps the cell re-runnable: once Total is already
#    integer, the `.str` accessor would otherwise raise AttributeError.
#  * `regex = False` treats ',' as a literal thousands separator and avoids
#    the single-character-pattern FutureWarning on older pandas releases.
df_population = df_population.assign(
    Total = (
        df_population['Total']
        .astype(str)
        .str.replace(',', '', regex = False)
        .astype('int')
    )
)
df_population.dtypes

District    object
Total        int32
dtype: object

## 3.3 Merge DataFrames

In [19]:
# Check whether the District values of both DataFrames share the same
# format, so that they can be used as the merge key
for frame in (df_population, df_latlong):
    display(frame.sort_values(by = 'District').head())

Unnamed: 0,District,Total
18,คลองสาน,64805
46,คลองสามวา,212131
33,คลองเตย,89398
43,คันนายาว,95069
30,จตุจักร,153969


Unnamed: 0,District,lat,long
0,คลองสาน,13.7265,100.5025
1,คลองสามวา,13.8678,100.7398
2,คลองเตย,13.713333,100.578667
3,คันนายาว,13.821,100.677
4,จตุจักร,13.826,100.565


In [20]:
# Attach coordinates to every population row. A left join keeps all
# districts from df_population; any district without a match in df_latlong
# would show up as NaN in lat/long, which the count below makes visible.
df_merge = df_population.merge(df_latlong, on = 'District', how = 'left')

missing_per_column = df_merge.isna().sum()
print(missing_per_column)
display(df_merge.head())

District    0
Total       0
lat         0
long        0
dtype: int64


Unnamed: 0,District,Total,lat,long
0,พระนคร,40135,13.756417,100.49925
1,ดุสิต,75835,13.7728,100.5154
2,หนองจอก,184350,13.85125,100.856125
3,บางรัก,42823,13.728,100.5232
4,บางเขน,184496,13.867,100.628


In [21]:
# Give the Total column the more descriptive name Population
df_merge = df_merge.rename(columns = {'Total': 'Population'})
df_merge.head()

Unnamed: 0,District,Population,lat,long
0,พระนคร,40135,13.756417,100.49925
1,ดุสิต,75835,13.7728,100.5154
2,หนองจอก,184350,13.85125,100.856125
3,บางรัก,42823,13.728,100.5232
4,บางเขน,184496,13.867,100.628


# 4. Plot in Plotly

In [70]:
# Bubble map of the districts in df_merge: one marker per district, placed at
# its lat/long, with both marker size and colour encoding Population.
fig = px.scatter_mapbox(df_merge,
                        lat = 'lat',
                        lon = 'long',
                        color = 'Population',
                        size = 'Population',
                        hover_name = 'District',
                        hover_data = ['Population'],
                        color_continuous_scale = px.colors.diverging.balance_r,
                        size_max = 30,
                        width = 800,
                        height = 500)

# Base-map style, margins and title placement
fig.update_layout(mapbox_style = 'open-street-map',
                  margin = dict(l = 20, r = 20, t = 40, b = 20),
                  title = {'text':'Number of Population in Bangkok by District',
                           'font': {'size': 20},
                           'x': 0.025,
                           'y': 0.9675})
# (A stray bare `fig.update` expression stood here; it only looked up the
# method without calling it and had no effect, so it was removed.)
fig.show()