In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd

from bs4 import BeautifulSoup
import requests
import time, os

pd.options.mode.chained_assignment = None

### Import 2021 demographic data (DATA-SMART CITY SAPPORO)

In [2]:
# Import CSV from Sapporo municipal gov't, after editing in Excel (ensure 'header=1')
# Source: DATA-SMART CITY SAPPORO (町名・条丁目別世帯数及び男女別人口 令和3年（2021年）4月1日現在.csv)

df = pd.read_csv('町名・条丁目別_export.csv', header=1)
df.columns = ['Ward', 'Block', 'N_Households_2021',
              "Population_2021", "Male", "Female"]

In [None]:
# For checking individual entries
# df[df['町条丁目'] == "宮の森一条１０丁目"] 

### Import shape data (Geoshape Repository)

In [3]:
# Import SHP from Geoshape Repository > 国勢調査町丁・字等別境界データ
# Source: https://geoshape.ex.nii.ac.jp/ka/

dfgeo_japan = gpd.read_file('h27ka01.shp')
dfgeo_sapp = dfgeo_japan.iloc[:5796, :]  #  清田区→里塚緑ケ丘１２丁目 = last entry for Sapporo City

In [None]:
# For checking individual entries
# dfgeo_sapp[dfgeo_sapp['S_NAME'] == "宮の森（番地）"]

In [None]:
# Save Sapporo data as CSV

dfgeo_sapp.to_csv('GISrefdata_for_checking.csv')

In [4]:
# Reduce feature for data analysis

dfgeo_sapp_cut = dfgeo_sapp[["KEY_CODE",   # unique ID for join in Tableau
                            "PREF_NAME", "CITY_NAME", "S_NAME", "KIGO_E",
                            "AREA", "JINKO", "SETAI",
                            "X_CODE", "Y_CODE"]]

dfgeo_sapp_cut.rename(columns={'JINKO': 'Population_2015', 'SETAI': 'N_Households_2015', 'KIGO_E': 'Kigo_E',
                              'PREF_NAME': 'Pref_ref', 'CITY_NAME': 'Ward_ref', 'S_NAME': 'Block_ref',
                              'AREA': 'Area', "X_CODE": 'Lat', 'Y_CODE': 'Lng'}, inplace=True)

### Join datasets on neighborhood

In [5]:
# Join datasets on ['Ward', 'Block'], ['CITY_NAME', "S_NAME"]

joined_df = pd.merge(df, dfgeo_sapp_cut, left_on=['Ward', 'Block'], right_on=['Ward_ref', 'Block_ref'], how='left')
# new_df = joined_df[joined_df['KEY_CODE'].notna()]
new_df = joined_df[["KEY_CODE",   # unique ID for join in Tableau
                    "Ward", "Block", "Pref_ref", "Ward_ref", "Block_ref", "Kigo_E",
                    "N_Households_2015", "N_Households_2021",
                    "Population_2021", "Population_2015",
                    "Male", "Female", "Area", "Lat", "Lng"]]

### Derived features

In [6]:
# Population density
new_df['PopDen_2015'] = new_df['Population_2015'] / new_df['Area']
new_df['PopDen_2021'] = new_df['Population_2021'] / new_df['Area']

# Gender ratio
new_df['Sex_ratio'] = new_df['Female'] / (new_df['Female'] + new_df['Male'])

# People per household
# Greater values = Families, multigenerational households
new_df["Household_size_2015"] = new_df['Population_2015'] / new_df['N_Households_2015']
new_df["Household_size_2021"] = new_df['Population_2021'] / new_df['N_Households_2021']

new_df

Unnamed: 0,KEY_CODE,Ward,Block,Pref_ref,Ward_ref,Block_ref,Kigo_E,N_Households_2015,N_Households_2021,Population_2021,...,Male,Female,Area,Lat,Lng,PopDen_2015,PopDen_2021,Sex_ratio,Household_size_2015,Household_size_2021
0,01101200002,中央区,大通東２丁目,北海道,中央区,大通東２丁目,,142.0,147,276,...,127,149,27586.502,141.35944,43.06220,0.009860,0.010005,0.539855,1.915493,1.877551
1,01101200003,中央区,大通東３丁目,北海道,中央区,大通東３丁目,,267.0,273,414,...,214,200,26893.377,141.36100,43.06240,0.012754,0.015394,0.483092,1.284644,1.516484
2,01101200004,中央区,大通東４丁目,北海道,中央区,大通東４丁目,,167.0,181,290,...,128,162,26602.436,141.36253,43.06261,0.013119,0.010901,0.558621,2.089820,1.602210
3,01101200005,中央区,大通東５丁目,北海道,中央区,大通東５丁目,,143.0,225,264,...,117,147,28813.286,141.36423,43.06284,0.006733,0.009162,0.556818,1.356643,1.173333
4,01101200006,中央区,大通東６丁目,北海道,中央区,大通東６丁目,,97.0,135,266,...,111,155,26944.193,141.36578,43.06299,0.006829,0.009872,0.582707,1.896907,1.970370
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5593,01109480005,手稲区,明日風５丁目,北海道,手稲区,明日風５丁目,,317.0,414,1238,...,610,628,147336.933,141.23002,43.14310,0.007384,0.008403,0.507270,3.432177,2.990338
5594,01109480006,手稲区,明日風６丁目,北海道,手稲区,明日風６丁目,,38.0,42,127,...,57,70,98540.050,141.23316,43.14167,0.001329,0.001289,0.551181,3.447368,3.023810
5595,,手稲区,明日風,,,,,,1458,4400,...,2167,2233,,,,,,0.507500,,3.017833
5596,,手稲区,合 計,,,,,,70509,141918,...,66871,75047,,,,,,0.528805,,2.012764


In [None]:
# Export for checking in Numbers
# (10-Oct) Some gaps, but mostly clean

new_df.to_csv('joined_dataframe_for_checking_in_numbers.csv')

In [7]:
# Remove rows that failed to match (<1% of rows; fix later)

new_df = new_df[new_df['KEY_CODE'].notna()].reset_index(drop=True)

# Convert ID to string for matching in Tableau

new_df['KEY_CODE'] = [str(id_no) for id_no in new_df.KEY_CODE]
new_df

Unnamed: 0,KEY_CODE,Ward,Block,Pref_ref,Ward_ref,Block_ref,Kigo_E,N_Households_2015,N_Households_2021,Population_2021,...,Male,Female,Area,Lat,Lng,PopDen_2015,PopDen_2021,Sex_ratio,Household_size_2015,Household_size_2021
0,01101200002,中央区,大通東２丁目,北海道,中央区,大通東２丁目,,142.0,147,276,...,127,149,27586.502,141.35944,43.06220,0.009860,0.010005,0.539855,1.915493,1.877551
1,01101200003,中央区,大通東３丁目,北海道,中央区,大通東３丁目,,267.0,273,414,...,214,200,26893.377,141.36100,43.06240,0.012754,0.015394,0.483092,1.284644,1.516484
2,01101200004,中央区,大通東４丁目,北海道,中央区,大通東４丁目,,167.0,181,290,...,128,162,26602.436,141.36253,43.06261,0.013119,0.010901,0.558621,2.089820,1.602210
3,01101200005,中央区,大通東５丁目,北海道,中央区,大通東５丁目,,143.0,225,264,...,117,147,28813.286,141.36423,43.06284,0.006733,0.009162,0.556818,1.356643,1.173333
4,01101200006,中央区,大通東６丁目,北海道,中央区,大通東６丁目,,97.0,135,266,...,111,155,26944.193,141.36578,43.06299,0.006829,0.009872,0.582707,1.896907,1.970370
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5382,01109480002,手稲区,明日風２丁目,北海道,手稲区,明日風２丁目,,189.0,203,677,...,334,343,152076.985,141.22857,43.13916,0.004373,0.004452,0.506647,3.518519,3.334975
5383,01109480003,手稲区,明日風３丁目,北海道,手稲区,明日風３丁目,,167.0,209,636,...,302,334,80237.215,141.22475,43.13927,0.008076,0.007926,0.525157,3.880240,3.043062
5384,01109480004,手稲区,明日風４丁目,北海道,手稲区,明日風４丁目,,224.0,327,894,...,443,451,98352.476,141.22645,43.14186,0.007900,0.009090,0.504474,3.468750,2.733945
5385,01109480005,手稲区,明日風５丁目,北海道,手稲区,明日風５丁目,,317.0,414,1238,...,610,628,147336.933,141.23002,43.14310,0.007384,0.008403,0.507270,3.432177,2.990338


In [None]:
new_df.to_csv('sapporo_df_plus_geodata.csv')

### EDA

In [8]:
# Mask data here
# Limit only districts with population >= 10
df_eda = new_df[((new_df['Population_2021'] >= 10) &
                 (new_df['Sex_ratio'] != 0.0) &
                 (new_df['Sex_ratio'] != 1.0))]

In [9]:
# Top five blocks with greatest gender imbalance in residents

df_eda.sort_values(by='Sex_ratio', ascending=True)

Unnamed: 0,KEY_CODE,Ward,Block,Pref_ref,Ward_ref,Block_ref,Kigo_E,N_Households_2015,N_Households_2021,Population_2021,...,Male,Female,Area,Lat,Lng,PopDen_2015,PopDen_2021,Sex_ratio,Household_size_2015,Household_size_2021
3408,01108540001,厚別区,大谷地東１丁目,北海道,厚別区,大谷地東１丁目,,4.0,27,30,...,26,4,119684.802,141.45026,43.03041,0.000092,0.000251,0.133333,2.750000,1.111111
932,01102241813,北区,北十八条西１３丁目,北海道,北区,北十八条西１３丁目,,6.0,170,170,...,143,27,10175.106,141.33140,43.08028,0.052284,0.016707,0.158824,88.666667,1.000000
505,01101692610,中央区,南二十六条西１０丁目,北海道,中央区,南二十六条西１０丁目,,0.0,177,177,...,148,29,23661.132,141.34674,43.02464,0.000000,0.007481,0.163842,,1.000000
422,01101691805,中央区,南十八条西５丁目,北海道,中央区,南十八条西５丁目,,9.0,7,10,...,8,2,3658.023,141.35447,43.03683,0.004647,0.002734,0.200000,1.888889,1.428571
994,01102242507,北区,北二十五条西７丁目,北海道,北区,北二十五条西７丁目,,38.0,32,43,...,34,9,14146.541,141.33874,43.09153,0.004312,0.003040,0.209302,1.605263,1.343750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3724,01105110001,豊平区,月寒中央通１丁目,北海道,豊平区,月寒中央通１丁目,,9.0,11,15,...,3,12,10796.239,141.39220,43.03788,0.001852,0.001389,0.800000,2.222222,1.363636
4565,01107030101,西区,二十四軒一条１丁目,北海道,西区,二十四軒一条１丁目,,4.0,4,10,...,2,8,38377.716,141.31800,43.07360,0.000261,0.000261,0.800000,2.500000,2.500000
2309,01103233608,東区,北三十六条東８丁目,北海道,東区,北三十六条東８丁目,,18.0,8,10,...,2,8,9576.182,141.35823,43.10584,0.004804,0.001044,0.800000,2.555556,1.250000
128,01101690314,中央区,南三条西１４丁目,北海道,中央区,南三条西１４丁目,,0.0,26,26,...,2,24,4614.702,141.33492,43.05472,0.000000,0.005634,0.923077,,1.000000


In [None]:
# Top five blocks with largest/smallest households

df_eda.sort_values(by='Household_size_2021', ascending=True)