In [None]:
import os
import random
import time

import geopandas as gpd
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

### Import 2021 demographic data (DATA-SMART CITY SAPPORO)

In [None]:
# Load the demographic CSV published by the Sapporo municipal government (edited in Excel beforehand).
# header=1 makes the second line of the file the column header; the line above it is skipped.
# Source: DATA-SMART CITY SAPPORO, file "町名・条丁目別世帯数及び男女別人口 令和3年（2021年）4月1日現在.csv"
#         (households and population by sex, per town / chome, as of 1 April 2021)

df = pd.read_csv('町名・条丁目別_export.csv', header=1)
df  # shown as the cell output

In [None]:
# Check individual entries
# df[df['町条丁目'] == "宮の森一条１０丁目"] 

### Import shape data (Geoshape Repository)

In [None]:
# Load the census small-area boundary shapefile (boundary data by town / chome / aza)
# from the Geoshape Repository.
# Source: https://geoshape.ex.nii.ac.jp/ka/
dfgeo_japan = gpd.read_file('h27ka01.shp')

# Number of leading rows kept for Sapporo City. The slice relies on the row order of the
# shapefile: the last row kept (position 5795) should be Kiyota ward, Satozuka-Midorigaoka
# 12-chome (清田区 里塚緑ケ丘１２丁目).
SAPPORO_ROW_COUNT = 5796
dfgeo_sapp = dfgeo_japan.iloc[:SAPPORO_ROW_COUNT, :]

# Dump the Sapporo subset to CSV so it can be inspected by hand.
dfgeo_sapp.to_csv('GISrefdata_for_checking.csv')

In [None]:
# Check individual entries
# dfgeo_sapp[dfgeo_sapp['S_NAME'] == "宮の森（番地）"]

### Join datasets on neighborhood

In [None]:
# Left-join the demographic table onto the boundary table:
#   demographic keys ['区別', '町条丁目']  <->  boundary keys ['CITY_NAME', 'MOJI']
# Every demographic row is kept; rows without a boundary match get NaN in the
# boundary columns (e.g. 'KEY_CODE', 'MOJI').

joined_df = df.merge(
    dfgeo_sapp,
    how='left',
    left_on=['区別', '町条丁目'],
    right_on=['CITY_NAME', 'MOJI'],
)
joined_df

In [None]:
# Export the joined table so it can be checked by hand in a spreadsheet app (Numbers).
# (8-Oct) Some gaps, but mostly clean
# The separator before the trailing comment must be an ASCII space: a full-width
# (U+3000) space there is a SyntaxError.

joined_df.to_csv('joined_dataframe_for_checking_in_excel.csv')  # in Numbers

In [None]:
# Keep only the rows that found a boundary match, i.e. whose 'MOJI' value
# (taken from the boundary table) is not missing, then renumber the index from 0.
# (Unmatched rows are <1% of the total; fix later)

new_df = joined_df.dropna(subset=['MOJI']).reset_index(drop=True)
new_df

In [None]:
# Save the matched rows (demographic columns plus boundary columns) for later use.
# With pandas' default index=True the row index is written as the first CSV column.
new_df.to_csv('sapporo_df_with_geodata.csv')

### Scrape features from ward/district-level pages on the apartment-hunting website Sumaity (BeautifulSoup)

In [None]:
# Download and parse the ward-level Sumaity page for Chuo ward (chuo_ku).
# `random` and `UserAgent` are imported in the notebook's top import cell.
url = 'https://sumaity.com/town/hokkaido/sapporo/chuo_ku/'

# Send a randomly chosen browser User-Agent string with the request.
ua = UserAgent()
user_agent = {'User-agent': ua.random}

# requests has no default timeout; without one a stalled connection blocks the kernel forever.
response = requests.get(url, headers=user_agent, timeout=30)
# Stop here on an HTTP error (4xx/5xx) instead of parsing an error page in later cells.
response.raise_for_status()

page = response.text
soup = BeautifulSoup(page, "lxml")

In [None]:
# Accumulators for the scraped values: one entry is appended per ward page.

# Ward name and the figures read from the page's statistic buttons
names, popden, LTCcover, income, land, residarea = ([] for _ in range(6))

# Rent figures read from the page's rent table
rent_studio, rent_1bdrm = [], []

In [None]:
def _mapbtn_value(tag):
    """Return the figure shown in one 'mapBtn' element as a comma-free string."""
    # The figure is the text between the closing </em> and the next <span>
    # (presumably the label and the unit, respectively — confirm against the page).
    return str(tag).split('</em>')[1].split('<span>')[0].replace(',', '')


def get_ward_data(url):
    """Scrape one Sumaity ward page and append its figures to the module-level lists.

    Appends one entry each to `names`, `popden`, `LTCcover`, `income`, `land`
    and `residarea`. The figures are stored as strings with thousands separators
    removed; no numeric conversion is done here.

    Args:
        url: Address of the ward-level page to download.

    Returns:
        None. The results are the side effects on the lists named above.

    Raises:
        requests.HTTPError: The server answered with an error status.
        IndexError: The page has fewer 'mapBtn' elements than expected, or one
            of them lacks the expected </em> markup.
    """
    # Fetch the page named by `url`, with a random browser User-Agent and a timeout.
    response = requests.get(url, headers={'User-agent': UserAgent().random}, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    # Ward name: the page title up to the first 'の'
    names.append(soup.find('title').text.split('の')[0])

    # The statistics are read by their position among the 'mapBtn' elements.
    stat_buttons = soup.find_all(class_='mapBtn')
    for position, target in (
        (1, popden),     # 2nd entry: population density (人口密度)
        (4, LTCcover),   # 5th entry: long-term-care facility coverage rate (介護施設カバー率; %)
        (5, income),     # 6th entry: mean annual income (平均所得; in units of 10,000 yen)
        (6, land),       # 7th entry: land price (地価; yen/m2)
        (7, residarea),  # 8th entry: residence floor area (住宅面積; m2)
    ):
        target.append(_mapbtn_value(stat_buttons[position]))

In [None]:
# Rent price data: read two rows of the first <table> on the page and convert them to yen.

# The prices are multiplied by 10,000 after dropping their last two characters
# (presumably the unit suffix '万円', i.e. units of 10,000 yen — confirm against the page).
YEN_PER_MAN = 10_000

rent_table = soup.find('table').find_all('tr')

# Row index 1 (2nd row): studio
# strip() keeps stray whitespace around the cell text from shifting the [:-2] slice.
studio_yen = rent_table[1].find('td', class_='price').text.strip()
studio_num = float(studio_yen[:-2]) * YEN_PER_MAN
rent_studio.append(studio_num)

# Row index 3 (4th row): one-bedroom
onebed_yen = rent_table[3].find('td', class_='price').text.strip()
onebed_num = float(onebed_yen[:-2]) * YEN_PER_MAN
rent_1bdrm.append(onebed_num)

In [None]:
# Pause like a human: wait a random 0.5 to 10.5 seconds before the next request
# so the pages are not fetched at a fixed, machine-like rate.

wait = random.uniform(0.5, 10.5)
time.sleep(wait)

In [None]:
# Sanity check after scraping: the four lengths printed below must all be equal.
# If they differ, an error occurred part-way through a page -> re-scrape.

print(*map(len, (rent_85m2_expens, mobile_wifi_128gb, list_COL_sin1, list_n_prices)))

In [None]:
# Assemble the scraped lists into three DataFrames for the full cost-of-living (COL) dataset.
# NOTE(review): the input lists and `list_feature_names` are not assigned in the visible
# part of this notebook — confirm they exist before running.
# zip() stops at the shortest list, so lists of unequal length are truncated silently;
# run the length check first.

# One row per city, one column per scraped price item (labels come from list_feature_names).
list_tuples2 = list(zip(rent_85m2_expens,
                      rent_85m2_normal,
                      rent_45m2_expens,
                      rent_45m2_normal,
                      eatout_lunch,
                      eatout_dinner,
                      taxi_5mi,
                      gas_liter,
                      pubtrans_monthly,
                      internet_monthly,
                      TV_40in,
                      cappuccino,
                      mobile_wifi_128gb))

# Summary cost-of-living figures: (single person, family of four), in that order.
list_tuples3 = list(zip(list_COL_sin1,
                       list_COL_fam4))

# Sample sizes: (number of prices, number of people).
list_tuples4 = list(zip(list_n_prices,
                      list_n_people))

df2 = pd.DataFrame(list_tuples2, columns = list_feature_names)
# Column labels follow the zip order above (sin1 first, fam4 second); the two labels
# used to be listed the other way round, which put each figure under the wrong label.
df3 = pd.DataFrame(list_tuples3, columns = ['COL (Single of 1)',
                                            'COL (Family of 4)'])
df4 = pd.DataFrame(list_tuples4, columns = ['No. Prices',
                                            'No. Contributors'])

In [None]:
# df1_split = df1[1781:].reset_index(drop=True)  #resetting renames
# df1_split

In [None]:
# Trial join of the city frame with the three feature frames, for inspection.
# Check which frame `df1_split` refers to before joining.
# NOTE(review): the only assignment of `df1_split` visible in this notebook is
# commented out — confirm it is defined.
# ignore_index is left out on purpose: with axis=1 it relabels the columns 0..n-1
# and throws away the column names given to df2/df3/df4. Row alignment is unaffected.

df_cities_test = pd.concat([df1_split, df2, df3, df4], axis=1)
df_cities_test

In [None]:
# Join the city frame with the three feature frames side by side (rows align on the index).
# ignore_index is left out on purpose: with axis=1 it relabels the columns 0..n-1
# and throws away the column names given to df2/df3/df4.
# NOTE(review): the only assignment of `df1_split` visible in this notebook is
# commented out — confirm it is defined.
df_cities = pd.concat([df1_split, df2, df3, df4], axis=1)
# df_allcities = pd.concat([df1, df2, df3, df4], axis=1) # once
df_cities.tail(20)

In [None]:
# Save your progress to disk!!

# df_cities1781_2474 = df_cities.copy()
# df_cities1781_2474.to_pickle('df_cities1781_2474')

# For quick recovery

# df_cities = pd.read_pickle('df_cities981_1780.pkl')  # use the exact name given to to_pickle() (the call above saves without '.pkl')

In [None]:
# Print the (rows, columns) shape of each saved chunk on one line.
# Last city in each chunk:
#   df_cities000_980   -> Pontianak, Indonesia
#   df_cities981_1780  -> Sao Tome, ST&P
#   df_cities1781_2474 -> Zvishavane District, Zimbabwe
print(*(chunk.shape for chunk in (df_cities000_980,
                                  df_cities981_1780,
                                  df_cities1781_2474)))

In [None]:
# Show the combined all-cities frame as the cell output.
# NOTE(review): `df_allcities` is not assigned in the visible part of this notebook
# (its only construction appears in a commented-out line) — confirm it exists.
df_allcities

In [None]:
# Save your work: write the combined frame to disk as a pickle and as a CSV.
# Both files are written without a file extension.

df_allcities.to_pickle('df_allcities')
df_allcities.to_csv('allcitiescsvcopy', index=False)  # index=False: the row index is not written

# For quick recovery (the paths are the names written above)

# df_allcities = pd.read_pickle('df_allcities')
# df_allcities = pd.read_csv('allcitiescsvcopy')