# 1. Import Libraries

In [195]:
import warnings
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import plotly.express as px
import plotly.graph_objects as go

warnings.filterwarnings('ignore')

# 2. Load Data

In [72]:
# Remote CSV sources (raw files in the course GitHub repository)
location_file = 'https://raw.githubusercontent.com/prattapong/DADS5001/main/Homework%203/data/tambon.csv'
population_file = 'https://raw.githubusercontent.com/prattapong/DADS5001/main/Homework%203/data/bangkok_population.csv'

# Read both CSVs straight from their URLs, one DataFrame per source
df_location, df_population = map(pd.read_csv, (location_file, population_file))

In [116]:
# Download the page holding the Bangkok area table.
# The timeout stops the cell from hanging forever on an unresponsive server,
# and raise_for_status() fails loudly on an HTTP error instead of parsing an error page.
url = "https://e-report.energy.go.th/area/Bangkok.htm"
response = requests.get(url, timeout = 30)
response.raise_for_status()

# The cell texts are repaired below by re-encoding them to latin1 and decoding
# the resulting bytes as TIS-620. That round trip is only lossless when
# response.text was itself decoded as latin1, so pin the encoding explicitly
# rather than relying on whatever charset requests infers from the headers.
response.encoding = 'latin1'

soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find('table', class_ = 'MsoNormalTable')
if table is None:
    # Without this check a changed page layout surfaces as an opaque AttributeError on None
    raise ValueError(f'No table with class "MsoNormalTable" found at {url}')

# Extract table rows and columns: one list of decoded cell texts per <tr>
rows = table.find_all('tr')
data = [
    [column.text.strip().encode('latin1').decode('tis-620') for column in row.find_all('td')]
    for row in rows
]

df_area = pd.DataFrame(data)

In [107]:
# Report each frame's shape, followed by its first 5 rows
previews = [
    (f'Location DataFrame Shape: {df_location.shape}', df_location),
    (f'Bangkok Population DataFrame Shape: {df_population.shape}', df_population),
    (f'District Area DataFrame Shape: {df_area.shape}', df_area),
]
for shape_line, frame in previews:
    print(shape_line)
    display(frame.head())

Location DataFrame Shape: (7768, 12)


Unnamed: 0,AD_LEVEL,TA_ID,TAMBON_T,TAMBON_E,AM_ID,AMPHOE_T,AMPHOE_E,CH_ID,CHANGWAT_T,CHANGWAT_E,LAT,LONG
0,4.0,910106.0,ต. เกาะสาหร่าย,Ko Sarai,9101.0,,Mueang Satun,91.0,จ. สตูล,Satun,6.546,99.706
1,4.0,210114.0,ต. มาบตาพุด,Maptaphut,2101.0,อ. เมืองระยอง,Mueang Rayong,21.0,จ. ระยอง,Rayong,12.646,101.171
2,4.0,210114.0,ต. มาบตาพุด,Maptaphut,2101.0,อ. เมืองระยอง,Mueang Rayong,21.0,จ. ระยอง,Rayong,12.645,101.17
3,4.0,210114.0,ต. มาบตาพุด,Maptaphut,2101.0,อ. เมืองระยอง,Mueang Rayong,21.0,จ. ระยอง,Rayong,12.649,101.174
4,4.0,210114.0,ต. มาบตาพุด,Maptaphut,2101.0,อ. เมืองระยอง,Mueang Rayong,21.0,จ. ระยอง,Rayong,12.644,101.169


Bangkok Population DataFrame Shape: (50, 4)


Unnamed: 0,District,Male,Female,Total
1,พระนคร,19381,20754,40135
2,ดุสิต,40325,35510,75835
3,หนองจอก,89451,94899,184350
4,บางรัก,20064,22759,42823
5,บางเขน,85943,98553,184496


District Area DataFrame Shape: (211, 3)


Unnamed: 0,0,1,2
0,ลำดับ,อำเภอ/กิ่งอำเภอ,เนื้อที่ (ตร.กม.)
1,1,เขตคลองเตย,12.99
2,,แขวงคลองเตย,7.25
3,,แขวงคลองตัน,1.90
4,,แขวงพระโขนง,3.85


# 3. Data Cleansing

## 3.1 Clean Location Data

In [74]:
# List the distinct values of CHANGWAT_E to spot anything unexpected (e.g. missing values)
df_location['CHANGWAT_E'].unique()

array(['Satun', 'Rayong', 'Chumphon', 'Maha Sarakham', 'Surat Thani',
       'Chon Buri', 'Trang', 'Chanthaburi', 'Phang-nga', 'Trat', 'Ranong',
       'Phuket', 'Krabi', 'Bangkok', 'Nakhon Si Thammarat',
       'Samut Sakhon', 'Phra Nakhon Si Ayutthaya', 'Lop Buri',
       'Samut Prakarn', 'Ang Thong', 'Pattani', 'Chiang Mai', 'Chai Nat',
       'Nonthaburi', 'Uttaradit', 'Khon Kaen', 'Nakhon Sawan',
       'Phetchaburi', 'Sing Buri', 'Samut Songkhram', 'Ratchaburi',
       'Nakhon Nayok', 'Saraburi', 'Nakhon Pathom', 'Kanchanaburi',
       'Pathum Thani', 'Phitsanulok', 'Chachoengsao', 'Chiang Rai',
       'Songkhla', 'Lamphun', 'Nong Khai', 'Ubon Ratchathani',
       'Sukhothai', 'Phayao', 'Uthai Thani', 'Phrae', 'Phetchabun',
       'Mae Hong Son', 'Buri Ram', 'Phichit', 'Nan', 'Yala',
       'Suphan Buri', 'Nakhon Ratchasima', 'Prachin Buri', 'Tak',
       'Roi Et', 'Nakhon Phanom', 'Mukdahan', 'Sa kaeo', 'Udon Thani',
       'Lampang', 'Si Sa Ket', 'Yasothon', 'Phatthalung', 'Nar

In [75]:
# The unique values include NaN, so display the rows where CHANGWAT_E is missing
df_location[df_location['CHANGWAT_E'].isna()]

Unnamed: 0,AD_LEVEL,TA_ID,TAMBON_T,TAMBON_E,AM_ID,AMPHOE_T,AMPHOE_E,CH_ID,CHANGWAT_T,CHANGWAT_E,LAT,LONG
7768,,,,,,,,,,,,


In [76]:
# The offending row is NaN in every column, so discard any record without a CHANGWAT_E value
df_location = df_location.dropna(subset = ['CHANGWAT_E'], axis = 0)
print(df_location.shape)

(7768, 12)


In [77]:
# Show the distinct CHANGWAT_E values in alphabetical order, then how many there are
province_names = sorted(df_location['CHANGWAT_E'].unique())
print(province_names)
print(len(province_names))

['Amnat Charoen', 'Ang Thong', 'Bangkok', 'Bueng Kan', 'Buri Ram', 'Chachoengsao', 'Chai Nat', 'Chaiyaphum', 'Chanthaburi', 'Chiang Mai', 'Chiang Rai', 'Chon Buri', 'Chumphon', 'Kalasin', 'Kamphaeng Phet', 'Kanchanaburi', 'Khon Kaen', 'Krabi', 'Lampang', 'Lamphun', 'Loei', 'Lop Buri', 'Mae Hong Son', 'Maha Sarakham', 'Mukdahan', 'Nakhon Nayok', 'Nakhon Pathom', 'Nakhon Phanom', 'Nakhon Ratchasima', 'Nakhon Sawan', 'Nakhon Si Thammarat', 'Nan', 'Narathiwat', 'Nong Bua Lam Phu', 'Nong Khai', 'Nonthaburi', 'Pathum Thani', 'Pattani', 'Phang-nga', 'Phatthalung', 'Phayao', 'Phetchabun', 'Phetchaburi', 'Phichit', 'Phitsanulok', 'Phra Nakhon Si Ayutthaya', 'Phrae', 'Phuket', 'Prachin Buri', 'Prachuap Khiri Khan', 'Ranong', 'Ratchaburi', 'Rayong', 'Roi Et', 'Sa kaeo', 'Sakon Nakhon', 'Samut Prakarn', 'Samut Sakhon', 'Samut Songkhram', 'Saraburi', 'Satun', 'Si Sa Ket', 'Sing Buri', 'Songkhla', 'Sukhothai', 'Suphan Buri', 'Surat Thani', 'Surin', 'Tak', 'Trang', 'Trat', 'Ubon Ratchathani', 'Udon T

In [78]:
# Keep only the rows whose CHANGWAT_E is Bangkok
is_bangkok = df_location['CHANGWAT_E'] == 'Bangkok'
df_bangkok = df_location[is_bangkok]
print(f'Bangkok Location DataFrame Shape: {df_bangkok.shape}')
display(df_bangkok.head())

Bangkok Location DataFrame Shape: (154, 12)


Unnamed: 0,AD_LEVEL,TA_ID,TAMBON_T,TAMBON_E,AM_ID,AMPHOE_T,AMPHOE_E,CH_ID,CHANGWAT_T,CHANGWAT_E,LAT,LONG
199,4.0,100106.0,แขวง เสาชิงช้า,Sao Chingcha,1001.0,เขต พระนคร,Phra Nakhon,10.0,กรุงเทพมหานคร,Bangkok,13.753,100.5
202,4.0,100105.0,แขวง ศาลเจ้าพ่อเสือ,San Chaopho Suea,1001.0,เขต พระนคร,Phra Nakhon,10.0,กรุงเทพมหานคร,Bangkok,13.754,100.497
214,4.0,100108.0,แขวง ตลาดยอด,Talat Yot,1001.0,เขต พระนคร,Phra Nakhon,10.0,กรุงเทพมหานคร,Bangkok,13.76,100.498
224,4.0,100103.0,แขวง วัดราชบพิธ,Wat Ratchabophit,1001.0,เขต พระนคร,Phra Nakhon,10.0,กรุงเทพมหานคร,Bangkok,13.75,100.499
243,4.0,100104.0,แขวง สำราญราษฎร์,Samran Rat,1001.0,เขต พระนคร,Phra Nakhon,10.0,กรุงเทพมหานคร,Bangkok,13.751,100.503


In [79]:
# Average the LAT/LONG of all rows belonging to the same district (AMPHOE_T)
district_mean_coords = df_bangkok.groupby('AMPHOE_T', as_index = False)[['LAT', 'LONG']].mean()

# Drop the word "เขต" from the district name and trim the surrounding whitespace
bare_district_names = district_mean_coords['AMPHOE_T'].str.replace('เขต', '').str.strip()
df_latlong = district_mean_coords.assign(AMPHOE_T = bare_district_names)

# Rename columns (positional: AMPHOE_T, LAT, LONG)
df_latlong.columns = ['District', 'lat', 'long']
df_latlong.head()

Unnamed: 0,District,lat,long
0,คลองสาน,13.7265,100.5025
1,คลองสามวา,13.8678,100.7398
2,คลองเตย,13.713333,100.578667
3,คันนายาว,13.821,100.677
4,จตุจักร,13.826,100.565


In [80]:
# Check Dtypes: inspect the column types of the per-district coordinate table
df_latlong.dtypes

District     object
lat         float64
long        float64
dtype: object

## 3.2 Clean Population Data

In [82]:
# Since column names of df_population are in Thai, we will rename the column names first.
# The assignment is positional, so the list must match the existing column order and count.
df_population.columns = ['District', 'Male', 'Female', 'Total']

In [83]:
# Count missing values per column, then show every row that has at least one
missing_mask = df_population.isna()
print(missing_mask.sum())
display(df_population[missing_mask.any(axis = 1)])

District    3
Male        3
Female      3
Total       3
dtype: int64


Unnamed: 0,District,Male,Female,Total
51,,,,
52,,,,
53,,,,


In [84]:
# Discard every row that contains a missing value
df_population = df_population.dropna(axis = 0)

In [85]:
# Drop the prefix "ท้องถิ่นเขต" from the District column, then trim whitespace
district_without_prefix = df_population['District'].str.replace('ท้องถิ่นเขต', '')
df_population['District'] = district_without_prefix.str.strip()

In [86]:
# Remove total row (the row whose District contains "ยอดรวม").
# .copy() makes the result an independent DataFrame, so subsequent column
# assignments modify it directly instead of writing into a filtered slice
# (the situation pandas reports as SettingWithCopyWarning).
is_total_row = df_population['District'].str.contains('ยอดรวม')
df_population = df_population[~is_total_row].copy()

In [87]:
# Check Dtypes to see whether the count columns were read as numbers or as text
df_population.dtypes

District    object
Male        object
Female      object
Total       object
dtype: object

In [88]:
# Every column except District holds counts stored as text with thousands separators:
# remove the commas and store the values as integers
count_columns = [name for name in df_population.columns if name != 'District']
for name in count_columns:
    without_commas = df_population[name].str.replace(',', '')
    df_population[name] = without_commas.astype('int')
df_population.dtypes

District    object
Male         int32
Female       int32
Total        int32
dtype: object

## 3.3 Clean District Area Data

In [131]:
# Clean the scraped table only while it is still raw (header row present).
# Guarding the whole step keeps this cell safe to re-run: once cleaned, the
# names no longer contain "เขต", so filtering again would silently empty df_area.
if df_area.iloc[0, 0] == 'ลำดับ':
    # Remove row of column name and column ลำดับ; copy so later edits do not touch a slice
    area_table = df_area.iloc[1:, 1:].copy()

    # Set column name
    area_table.columns = ['District', 'Area (sq. km)']

    # Slice only rows with the word เขต (the district rows).
    # na = False skips rows whose name cell is missing instead of raising on them.
    is_district = area_table['District'].str.contains('เขต', regex = False, na = False)

    # Remove the word เขต and trim, so the names line up with the other tables' District keys
    df_area = area_table.loc[is_district].assign(
        District = lambda frame: frame['District'].str.replace('เขต', '', regex = False).str.strip()
    )

df_area.head()

Unnamed: 0,District,Area (sq. km)
1,คลองเตย,12.99
5,คลองสาน,6.05
10,คลองสามวา,110.69
16,คันนายาว,25.98
18,จตุจักร,32.91


In [139]:
# Check Dtypes to see whether the area column is numeric yet
df_area.dtypes

District         object
Area (sq. km)    object
dtype: object

In [140]:
# The area is stored as text: remove thousands separators, then convert to float
area_column = 'Area (sq. km)'
area_without_commas = df_area[area_column].str.replace(',', '')
df_area[area_column] = area_without_commas.astype('float')
df_area.dtypes

District          object
Area (sq. km)    float64
dtype: object

## 3.4 Merge DataFrames

In [141]:
# Check whether the District values of the three DataFrames share a format, so they can be merged
for frame in (df_population, df_latlong, df_area):
    display(frame.sort_values(by = 'District').head())

Unnamed: 0,District,Male,Female,Total
18,คลองสาน,29795,35010,64805
46,คลองสามวา,99482,112649,212131
33,คลองเตย,42190,47208,89398
43,คันนายาว,44193,50876,95069
30,จตุจักร,70422,83547,153969


Unnamed: 0,District,lat,long
0,คลองสาน,13.7265,100.5025
1,คลองสามวา,13.8678,100.7398
2,คลองเตย,13.713333,100.578667
3,คันนายาว,13.821,100.677
4,จตุจักร,13.826,100.565


Unnamed: 0,District,Area (sq. km)
5,คลองสาน,6.05
10,คลองสามวา,110.69
1,คลองเตย,12.99
16,คันนายาว,25.98
18,จตุจักร,32.91


In [142]:
# Left-join the area and then the mean coordinates onto the population table by District
population_with_area = df_population.merge(right = df_area, how = 'left', on = 'District')
df_merge = population_with_area.merge(right = df_latlong, how = 'left', on = 'District')

# Any non-zero count below means a district found no match in one of the joined tables
print(df_merge.isna().sum())
display(df_merge.head())

District         0
Male             0
Female           0
Total            0
Area (sq. km)    0
lat              0
long             0
dtype: int64


Unnamed: 0,District,Male,Female,Total,Area (sq. km),lat,long
0,พระนคร,19381,20754,40135,5.54,13.756417,100.49925
1,ดุสิต,40325,35510,75835,10.67,13.7728,100.5154
2,หนองจอก,89451,94899,184350,236.26,13.85125,100.856125
3,บางรัก,20064,22759,42823,5.54,13.728,100.5232
4,บางเขน,85943,98553,184496,42.12,13.867,100.628


In [143]:
# Call the Total column "Population" from here on
df_merge = df_merge.rename(columns = {'Total': 'Population'})

In [145]:
# Derived metrics, computed with vectorized column arithmetic rather than a
# row-by-row apply: same values, less code, and no per-row Python call.
# Male Ratio: share of the district's population that is male
df_merge['Male Ratio'] = df_merge['Male'] / df_merge['Population']
# People per sq. km: population divided by the district's area
df_merge['People per sq. km'] = df_merge['Population'] / df_merge['Area (sq. km)']
df_merge.head()

Unnamed: 0,District,Male,Female,Population,Area (sq. km),lat,long,Male Ratio,People per sq. km
0,พระนคร,19381,20754,40135,5.54,13.756417,100.49925,0.482895,7244.584838
1,ดุสิต,40325,35510,75835,10.67,13.7728,100.5154,0.531747,7107.310216
2,หนองจอก,89451,94899,184350,236.26,13.85125,100.856125,0.485224,780.284432
3,บางรัก,20064,22759,42823,5.54,13.728,100.5232,0.468533,7729.783394
4,บางเขน,85943,98553,184496,42.12,13.867,100.628,0.465826,4380.246914


# 4. Analyze and Plot with Plotly

In [147]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df_merge.describe()

Unnamed: 0,Male,Female,Population,Area (sq. km),lat,long,Male Ratio,People per sq. km
count,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
mean,51084.6,58304.04,109388.64,31.3758,13.755893,100.556705,0.467703,6015.412268
std,22659.778115,25692.077219,48271.63979,40.214137,0.069119,0.105864,0.013946,3485.523207
min,9440.0,10011.0,19451.0,1.42,13.58,100.3545,0.445918,780.284432
25%,36512.5,40010.5,76258.25,10.7325,13.7135,100.499562,0.460406,4306.917629
50%,46002.0,52489.0,98206.0,19.18,13.7536,100.538625,0.465898,5184.659172
75%,65784.25,74741.5,140958.25,34.29,13.78575,100.6235,0.471469,7210.266182
max,99482.0,112649.0,212131.0,236.26,13.925,100.856125,0.531747,19678.756477


## 4.1 Understanding Population in each district

In [188]:
# Bubble map of population by district: both bubble size and colour encode Population
population_map_title = {'text': 'Number of Population in Bangkok by District',
                        'font': {'size': 20},
                        'x': 0.025,
                        'y': 0.9675}
fig = px.scatter_mapbox(
    df_merge,
    lat = 'lat',
    lon = 'long',
    color = 'Population',
    size = 'Population',
    hover_name = 'District',
    hover_data = ['Population'],
    color_continuous_scale = px.colors.diverging.balance_r,
    size_max = 30,
    width = 800,
    height = 500,
)
fig.update_layout(
    mapbox_style = 'open-street-map',
    margin = {'l': 20, 'r': 20, 't': 40, 'b': 20},
    title = population_map_title,
)
fig.show()

In the scatter map above, the population of the central districts is notably lower than that of the districts surrounding them. However, some outer districts, such as Bangbon and Taweewattana, also have relatively small populations.

In [166]:
# Pick the five most populous districts, then order them by ascending population for plotting
top_districts = df_merge.sort_values(by = 'Population', ascending = False).head(5)
top_districts = top_districts.sort_values(by = 'Population')

top_bar_title = {'text': 'Top 5 Districts by Population',
                 'font': {'size': 20},
                 'x': 0.025,
                 'y': 0.9675}
fig = px.bar(
    top_districts,
    x = 'Population',
    y = 'District',
    orientation = 'h',
    width = 800,
    height = 500,
)
fig.update_layout(
    margin = {'l': 20, 'r': 20, 't': 40, 'b': 20},
    title = top_bar_title,
)
fig.show()

The top 5 districts each have more than 180,000 residents. Khlongsamwa is the district with the most people living in it.

In [172]:
# Pick the five least populous districts (ascending by population)
bottom_districts = df_merge.sort_values(by = 'Population').head(5)

bottom_bar_title = {'text': 'Lowest 5 Districts by Population',
                    'font': {'size': 20},
                    'x': 0.025,
                    'y': 0.9675}
fig = px.bar(
    bottom_districts,
    x = 'Population',
    y = 'District',
    orientation = 'h',
    color_discrete_sequence = ['lightcoral'],
    width = 800,
    height = 500,
)
fig.update_layout(
    margin = {'l': 20, 'r': 20, 't': 40, 'b': 20},
    title = bottom_bar_title,
)
fig.show()

The 5 least populous districts each have fewer than 45,000 residents.
Samphanthawong has the fewest, with 19,451 residents, which is less than one tenth of the population of the most populous district (Khlongsamwa).

## 4.2 Male proportion in each district 

In [183]:
# Bubble map of the male share per district: colour encodes Male Ratio, bubble size encodes Population
male_ratio_map_title = {'text': 'Male Ratio in each District',
                        'font': {'size': 20},
                        'x': 0.025,
                        'y': 0.9675}
fig = px.scatter_mapbox(
    df_merge,
    lat = 'lat',
    lon = 'long',
    color = 'Male Ratio',
    size = 'Population',
    hover_name = 'District',
    hover_data = ['Male Ratio'],
    color_continuous_scale = px.colors.diverging.RdBu,
    size_max = 30,
    width = 800,
    height = 500,
)
fig.update_layout(
    mapbox_style = 'open-street-map',
    margin = {'l': 20, 'r': 20, 't': 40, 'b': 20},
    title = male_ratio_map_title,
)
fig.show()

In [194]:
# Districts where the Male Ratio exceeds 0.5, i.e. men outnumber women
df_merge[df_merge['Male Ratio'] > 0.5]

Unnamed: 0,District,Male,Female,Population,Area (sq. km),lat,long,Male Ratio,People per sq. km
1,ดุสิต,40325,35510,75835,10.67,13.7728,100.5154,0.531747,7107.310216


Dusit is the only district that has more men than women.

## 4.3 Understanding the density of population per area of each district

In [193]:
# Bubble map of population density: colour encodes People per sq. km, bubble size encodes the district's area
density_map_title = {'text': 'Population per sq. km in Bangkok by District',
                     'font': {'size': 20},
                     'x': 0.025,
                     'y': 0.9675}
fig = px.scatter_mapbox(
    df_merge,
    lat = 'lat',
    lon = 'long',
    color = 'People per sq. km',
    size = 'Area (sq. km)',
    hover_name = 'District',
    hover_data = ['People per sq. km'],
    color_continuous_scale = px.colors.diverging.balance,
    size_max = 80,
    width = 800,
    height = 500,
)
fig.update_layout(
    mapbox_style = 'open-street-map',
    margin = {'l': 20, 'r': 20, 't': 40, 'b': 20},
    title = density_map_title,
)
fig.show()

In [202]:
# Districts whose density is an upper outlier by the 1.5 * IQR rule.
# The threshold Q3 + 1.5 * (Q3 - Q1) is evaluated in its expanded form 2.5 * Q3 - 1.5 * Q1.
density = df_merge['People per sq. km']
first_quartile = np.quantile(density, 0.25)
third_quartile = np.quantile(density, 0.75)
upper_fence = (2.5 * third_quartile) - (1.5 * first_quartile)
df_merge[density > upper_fence]

Unnamed: 0,District,Male,Female,Population,Area (sq. km),lat,long,Male Ratio,People per sq. km
7,ป้อมปราบศัตรูพ่าย,18523,19457,37980,1.93,13.7512,100.5114,0.487704,19678.756477
12,สัมพันธวงศ์,9440,10011,19451,1.42,13.738,100.509333,0.485322,13697.887324
25,ดินแดง,50057,59651,109708,8.35,13.778,100.567,0.456275,13138.682635


Pom Prap Sattruphai and Samphanthawong rank among the five districts with the lowest population counts. With their small geographical areas, only around 1 to 2 square kilometers each, these districts are characterized by high population density, making them among the most densely populated areas in the region.