In [8]:
import csv
import json
import xlrd
import zipfile
import requests
import functools
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

In [9]:
def get_json(url, timeout=60):
    """
    Send an HTTP GET request to the given url (for REST API)
    and return the decoded JSON body of the response.

    Args:
    ====
    url: string
        valid url for REST API
    timeout: float or (connect, read) tuple
        value forwarded to ``requests.get`` as ``timeout`` so that an
        unresponsive server cannot block the caller forever

    Returns:
    ====
    The object produced by ``Response.json()``, or None when the request
    fails, the server answers with an HTTP error status, or the body is
    not valid JSON. The error is printed in each of those cases.
    """
    print("HTTP GET", url)
    try:
        r = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx answers into an HTTPError instead of parsing them.
        r.raise_for_status()
        return r.json()
    except (requests.exceptions.RequestException, ValueError) as error:
        # ValueError covers JSON decoding failures on requests versions whose
        # decode error is not a RequestException subclass.
        print(error)
        return None

In [10]:
def download_csv(url, filepath, enc="utf-8", dec="utf-8", logging=False,
                 timeout=60):
    """
    Send an HTTP GET request to the given url (for REST API)
    and save the response body as a csv file.

    url: string
        valid url for REST API
    filepath: string
        valid path to the destination file
    enc: string
        codec the destination file is written with
            enc = 'utf-8' for general env
            enc = 'sjis'  for Excel on Win
            enc = 'cp932' for Excel with extended JP str on Win
    dec: string
        codec used to decode the raw bytes of the HTTP response
    logging: True/False
        flag whether to print the requested url
    timeout: float or (connect, read) tuple
        value forwarded to ``requests.get`` as ``timeout``

    NOTE(review): earlier documentation described ``enc`` as the codec of
    the remote content and ``dec`` as the codec of the saved file, which is
    the reverse of what the code does. The runtime behaviour is unchanged
    here; confirm which meaning callers rely on.

    Request failures and HTTP error statuses are printed and no file is
    written. A UnicodeDecodeError from decoding the body propagates to the
    caller, also without creating the file.
    """
    try:
        if logging:
            print("HTTP GET", url)
        r = requests.get(url, timeout=timeout)
        # Do not store an HTTP error page as if it were CSV data.
        r.raise_for_status()
        payload = r.content
    except requests.exceptions.RequestException as error:
        print(error)
        return

    # Decode before opening the destination so that a decoding failure does
    # not leave an empty file behind.
    text = payload.decode(dec)

    # newline='' writes line endings exactly as received; the default text
    # mode would expand "\n" on Windows and turn "\r\n" into "\r\r\n".
    with open(filepath, 'w', encoding=enc, newline='') as f:
        f.write(text)


def download_all_csv(
        urls,
        filepathes,
        max_workers=10,
        enc="utf-8",
        dec="utf-8",
        logging=False):
    """
    Send one HTTP GET request per url (for REST API) and save each
    response as a csv file by calling ``download_csv`` for every
    (url, filepath) pair.
    (!! The calls are dispatched on a thread pool so that several
    downloads run concurrently.)

    urls: iterable of strings
        valid urls for REST API
    filepathes: iterable of strings
        valid pathes to the destination files, index-aligned with ``urls``
    max_workers: int
        maximum number of worker threads in the pool
    enc: string
        forwarded to ``download_csv`` as ``enc``
    dec: string
        forwarded to ``download_csv`` as ``dec``
    logging: True/False
        forwarded to ``download_csv`` as ``logging``

    Raises ValueError when ``urls`` and ``filepathes`` differ in length;
    ``map`` would otherwise silently drop the unmatched tail.
    An exception raised inside a worker is re-raised here.
    """
    # Materialise once: len() is needed for the progress bar and for the
    # length check, and the inputs may be one-shot iterables.
    urls = list(urls)
    filepathes = list(filepathes)
    if len(urls) != len(filepathes):
        raise ValueError(
            f"urls and filepathes must have the same length "
            f"({len(urls)} != {len(filepathes)})"
        )

    func = functools.partial(download_csv, enc=enc, dec=dec, logging=logging)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Draining the iterator drives the progress bar and surfaces any
        # exception raised by a worker.
        for _ in tqdm(executor.map(func, urls, filepathes), total=len(urls)):
            pass


In [11]:
import os
from pprint import pprint
import requests
from estat_api import EstatRestAPI_URLParser
# Shared URL builder for the e-Stat REST API. The functions below read its
# `app_id` attribute and call its getStatsListURL / getStatsDataURL methods.
estatapi_url_parser = EstatRestAPI_URLParser()  # URL Parser

def search_tables(params=None):
    """
    Search e-Stat for statistical tables and return the matching entries.

    The query is built from a default parameter dictionary (government
    statistics code "00550020", Japanese, statistical information, no
    explanations). ``params`` may override or extend these defaults.
    For more details on the parameters, see also
    https://www.e-stat.go.jp/api/api-info/e-stat-manual3-0#api_3_2

        - appId: Application ID (*required)
        - lang: language (J: Japanese, E: English)
        - surveyYears: survey period (YYYY or YYYYMM or YYYYMM-YYYYMM)
        - openYears: same format as surveyYears
        - statsField: statistics field
              (2 digits: major classification, 4 digits: minor classification)
        - statsCode: government statistics code (8 digits)
        - searchWord: search keyword
        - searchKind: kind of data
              (1: statistical information, 2: small area / regional mesh)
        - collectArea: aggregation area level
              (1: nationwide, 2: prefecture, 3: municipality)
        - explanationGetFlg: include explanatory information (Y or N)
        - ...

    params: dict or None
        extra query parameters merged over the defaults

    Returns a list of table entries (the ``TABLE_INF`` items of the
    response). The list is empty when nothing matched or when the
    request itself failed.
    """
    params_dict = {
        "appId": estatapi_url_parser.app_id,
        "lang": "J",
        "statsCode": "00550020",
        "searchKind": 1,
        "explanationGetFlg": "N"
    }
    # Further filters such as "searchWord" or "collectArea" can be supplied
    # through `params`.
    if params:
        params_dict.update(params)

    url = estatapi_url_parser.getStatsListURL(params_dict, format="json")
    json_dict = get_json(url)
    if json_dict is None:
        # get_json returns None (after printing the error) when the request
        # failed; treat that as "no tables" instead of crashing on indexing.
        return []

    datalist = json_dict['GET_STATS_LIST']['DATALIST_INF']
    if datalist['NUMBER'] in (0, "0"):
        return []

    tables = datalist["TABLE_INF"]
    # NOTE(review): a single hit appears to be delivered as one object rather
    # than a one-element array; normalise so callers always get a list.
    if isinstance(tables, dict):
        tables = [tables]
    return tables


def parse_table_id(table):
    """Return the value stored under the ``"@id"`` key of a table entry."""
    table_id = table["@id"]
    return table_id


def parse_table_raw_size(table):
    """
    Return the value stored under ``"OVERALL_TOTAL_NUMBER"`` of a table entry.

    The caller uses it as the table's total row count when splitting the
    download into chunks.
    """
    total_rows = table["OVERALL_TOTAL_NUMBER"]
    return total_rows


def parse_table_urls(table_id, table_raw_size, csv_raw_size=100000):
    """
    Build the CSV download URLs that together cover one statistical table.

    Each URL asks for at most ``csv_raw_size`` rows, starting at row
    1, 1 + csv_raw_size, 1 + 2 * csv_raw_size, ...

    table_id: string or int
        statistical table ID (sent as ``statsDataId``)
    table_raw_size: int
        total number of rows in the table
    csv_raw_size: int
        maximum number of rows requested per URL (sent as ``limit``)

    Returns the list of URLs built by
    ``estatapi_url_parser.getStatsDataURL``; empty when the table has no
    rows. Raises ValueError when ``csv_raw_size`` is not positive.
    """
    if csv_raw_size <= 0:
        raise ValueError("csv_raw_size must be a positive integer")

    # Ceiling division: a table whose size is an exact multiple of
    # csv_raw_size must not get an extra, empty trailing request.
    n_chunks = -(-int(table_raw_size) // csv_raw_size)

    urls = []
    for j in range(n_chunks):
        start_pos = j * csv_raw_size + 1
        params_dict = {
            "appId": estatapi_url_parser.app_id,  # Application ID
            "lang": "J",  # language (J: Japanese, E: English)
            "statsDataId": str(table_id),  # statistical table ID
            "startPosition": start_pos,  # first row to fetch
            "limit": csv_raw_size,  # number of rows to fetch
            "explanationGetFlg": "N",  # include explanations (Y or N)
            "annotationGetFlg": "N",  # include annotations (Y or N)
            "metaGetFlg": "N",  # include meta information (Y or N)
            "sectionHeaderFlg": "2",  # CSV header flag (1: include, 2: omit)
        }
        url = estatapi_url_parser.getStatsDataURL(params_dict, format="csv")
        urls.append(url)
    return urls


if __name__ == '__main__':
    CSV_RAW_SIZE = 100000

    # list of tables
    tables = search_tables()
    if len(tables) == 0:
        print("No tables were found.")

    # ids and row counts, index-aligned with `tables`
    # (both are simply empty when nothing was found)
    table_ids = list(map(parse_table_id, tables))
    table_raw_size = list(map(parse_table_raw_size, tables))

    # Build every URL together with its destination path so that the two
    # lists handed to download_all_csv can never get out of step.
    table_urls = []
    filepathes = []
    for table, table_id, raw_size in zip(tables, table_ids, table_raw_size):
        table_name = table["TITLE_SPEC"]["TABLE_NAME"]
        table_dir = f"../data/{table_name}_{table_id}"
        os.makedirs(table_dir, exist_ok=True)
        chunk_urls = parse_table_urls(table_id, raw_size, CSV_RAW_SIZE)
        for j, url in enumerate(chunk_urls):
            table_urls.append(url)
            filepathes.append(f"{table_dir}/{table_name}_{table_id}_{j}.csv")

    if table_urls:
        download_all_csv(table_urls, filepathes, max_workers=30)

HTTP GET https://api.e-stat.go.jp/rest/3.0/app/json/getStatsList?appId=d6c94977d11c6f2f1c3af613bc83a1c7a93941b1&lang=J&statsCode=00550020&searchKind=1&explanationGetFlg=N


100%|██████████| 197/197 [00:22<00:00,  8.75it/s]
