# Sakura Data Processing (Japan Meteorological Agency)
The Japan Meteorological Agency (JMA) has a lot more data than JMC does, so I'm using their data for processing. Hopefully they have accompanying temperature data that I can use once I get to the modeling stage, otherwise I might 

Notes:
* When I get to the modeling stage, I will probably have to weight recent weather/temperature data higher than old temperature data because of global climate change impacts.
* Bloom URLs are sakura003_00.html up to sakura003_07.html
* Full Bloom URLs are sakura004_00.html up to sakura004_07.html

The bottom of each page has some definitions and notes on the data. I'm using those to determine column names and do some of the translations manually. 

For posterity (said notes):

 「*」が現在観測中の地点です。<br>
 代替種目が空欄の地点はそめいよしのを観測しています。<br>
 「-」は観測値なしを表しています。<br>
 平年値は、1981-2010年の平均値です。<br>
 「#」は、前年に観測したことを表します。<br>
 (注）倶知安は1994年までえぞやまざくらを、1995年から2006年までそめいよしのを観測していました。
 
Which roughly translates to

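\*: The asterisk denotes locations that are currently being observed. <br>
代替種目: Locations where the alternative-species column is blank observe Yoshino Cherries (Somei Yoshino). <br>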
\-: The dash represents no observation value. <br>
平年値: This column is the normal year average from 1981 to 2010. <br>
\#: This represents that the observation was made in the previous year. <br>
注: Until 1994 the observations at Kutchan were Sargent Cherries. From 1995 to 2006 they were Yoshino Cherries.


In [1]:
import pandas as pd
import numpy as np
import urllib
import requests
from bs4 import BeautifulSoup
import lxml
import time
import re
from io import StringIO
from googletrans import Translator

In [2]:
# Old pandas truncates long cell values when rendering; raise the column-width
# limit far enough that nothing gets cut off.
pd.options.display.max_colwidth = 10000

In [3]:
def batch_translation(df, column_src, batch_size=100, pause_length=10):
    """Translate a column of Japanese text to English in place, batch by batch.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``column_src`` column is overwritten with the translated
        text. Rows are addressed with ``.loc`` using integer labels starting
        at 0, so the frame is expected to carry a default integer index
        (e.g. straight after ``reset_index()``).
    column_src : str
        Name of the column to translate.
    batch_size : int, default 100
        Number of rows translated per translator session.
    pause_length : int or float, default 10
        Seconds to sleep after each batch to stay under Google's rate limit.

    Returns
    -------
    None
        ``df`` is modified in place.
    """
    total = df[column_src].size
    idx = 0
    while idx < total:
        # Spawn a new translator session to see if that gets past the 429 code from Google.
        translator = Translator()
        # The attribute is lower-case in googletrans; the previous
        # ``raise_Exception`` spelling only created an unused attribute, so
        # failed requests were never surfaced.
        translator.raise_exception = True

        # .loc slices include both end points, hence the -1.
        rows = slice(idx, idx + batch_size - 1)
        df.loc[rows, column_src] = df.loc[rows, column_src].apply(
            lambda text: translator.translate(text, src='ja').text
        )

        # Clamp so the progress message never reports more rows than exist.
        idx = min(idx + batch_size, total)
        print(f"Current index: {idx} of {total}")
        time.sleep(pause_length)

In [4]:
def extract_sakura_data(url,batch=False,pause_length=2,timeout=30):
    """Download one JMA sakura table page and return it as a cleaned DataFrame.

    The page stores its data as a fixed-width text table inside a ``<pre>``
    element under the element with id ``main``. The column positions are
    derived from where the four-digit years sit in the first non-blank line,
    because the number of year columns differs from page to page.

    Parameters
    ----------
    url : str
        Address of the JMA page to download.
    batch : bool, default False
        When True, sleep ``pause_length`` seconds before the request so a
        series of calls does not hammer the endpoint.
    pause_length : int or float, default 2
        Seconds to wait when ``batch`` is True.
    timeout : int or float, default 30
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'Site Name', with the columns 'Currently Being Observed',
        one datetime column per year, '30 Year Average 1981-2010' and 'Notes'.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If no four-digit year can be found in the table header.

    Notes
    -----
    Debug output is controlled by the notebook-level ``debug_print`` flag and
    is off when that flag has not been defined. The final debug step uses the
    notebook's ``display`` helper.
    """
    # Be nice to the endpoint and wait a bit if we're doing batch processing of multiple function calls.
    if batch:
        time.sleep(pause_length)

    # Without a timeout a stalled connection would hang the notebook forever;
    # raise_for_status surfaces 4xx/5xx instead of parsing an error page.
    bloom_req = requests.get(url, timeout=timeout)
    bloom_req.raise_for_status()
    bloom_content = BeautifulSoup(bloom_req.content, 'lxml')

    print(f"Processing: {bloom_content.title.text}")

    # The flag lives in the notebook namespace; fall back to quiet mode
    # instead of failing with a NameError when no cell has set it yet.
    verbose = globals().get('debug_print', False)

    # Convert the text table to a StringIO so that pandas can read it in.
    bloom_string = StringIO(bloom_content.find(id='main').pre.text)

    # Find the first real line so we can dynamically determine the column spacings.
    header_line = next((line for line in bloom_string if not line.isspace()), '')

    # Find all of the character locations of each year. Each year column is
    # taken to run three characters past the end of the four-digit year.
    year_indices = [(m.start(), m.end() + 3) for m in re.finditer(r"\d{4}", header_line)]
    if not year_indices:
        raise ValueError(f"No year columns found in the table header of {url}")

    # Pick the ending character of the last year so we can add the last two columns.
    end_char = year_indices[-1][1]

    # Site name, observed marker, the year columns, then the average and the
    # alternative-species columns.
    dynamic_colspecs = [(0,5), (5,9), *year_indices,
                        (end_char,end_char+8), (end_char+8,None)]

    # Reset the string stream so we can re-parse the entire thing.
    bloom_string.seek(0)

    if verbose:
        print(bloom_string.getvalue())

    bloom_df = pd.read_fwf(bloom_string,header=2,colspecs=dynamic_colspecs,true_values=['*'])
    if verbose:
        print(bloom_df.head())

    # Header cells carry surrounding whitespace (including a full-width space).
    bloom_df.columns = bloom_df.columns.str.strip()

    # Get rid of the extra headers that showed up for readability on a web page:
    # exact duplicate rows, rows without a site name, and repeated header rows.
    bloom_df = bloom_df.drop_duplicates()
    bloom_df = bloom_df.loc[bloom_df['地点名'].notna()]
    bloom_df = bloom_df.loc[~bloom_df['地点名'].str.contains('地点名')].copy()

    # Parse the year columns into datetime format.
    year_cols = [col for col in bloom_df.columns if col.isnumeric()]
    for col in year_cols:
        # Account for the # entries (which are usually dates measured in december):
        # they belong to the previous calendar year.
        last_year = bloom_df[col].str.contains('#',na=False)
        if last_year.any():
            this_year = ~last_year
            bloom_df.loc[last_year,col] = bloom_df.loc[last_year,col] + f' {int(col)-1}'
            bloom_df.loc[this_year,col] = bloom_df.loc[this_year,col] + f' {col}'
            bloom_df[col] = bloom_df[col].str.replace("#","")
        else:
            bloom_df[col] = bloom_df[col] + f' {col}'
        # Anything that is not "month day year" (e.g. the '-' placeholder) becomes NaT.
        bloom_df[col] = pd.to_datetime(bloom_df[col],errors='coerce',exact=False,format="%m %d %Y")
        # NOTE: nothing here enforces that dates after October belong to the
        # previous year; only the '#' marker is used for that.

    # Translate the non date columns
    bloom_df = bloom_df.rename(columns={bloom_df.columns[0]: 'Site Name',
                                        'Unnamed: 1': 'Currently Being Observed',
                                        bloom_df.columns[-2]: '30 Year Average 1981-2010',
                                        bloom_df.columns[-1]: 'Notes' })

    bloom_df = bloom_df.set_index('Site Name')

    # Fix stray #'s
    # Note: There's probably a better way of doing this, but I haven't found it yet.
    bloom_df['30 Year Average 1981-2010'] = bloom_df['30 Year Average 1981-2010'].str.replace("#","")

    # Fix the boolean data.
    # '*' was read as True; blank got converted into NaN. Assign the result back
    # rather than calling fillna(inplace=True) on a column selection, which does
    # not reliably update the parent frame.
    bloom_df['Currently Being Observed'] = bloom_df['Currently Being Observed'].fillna(False)

    if verbose:
        with pd.option_context('display.max_rows', None):
            display(bloom_df)

    return bloom_df

#TODO: Come up with a more descriptive title that isn't super long.
def combine_sakura_data(bloom_dfs):
    """Merge the per-page sakura frames into one table with English labels.

    The frames are joined side by side on their site-name index, the site
    names are machine translated, the columns repeated on every page are
    collapsed, and the species notes are mapped to English by hand.

    Parameters
    ----------
    bloom_dfs : list of pandas.DataFrame
        Frames as returned by ``extract_sakura_data``.

    Returns
    -------
    pandas.DataFrame
        One row per site, with 'Currently Being Observed' as first column.
    """
    # Google translate doesn't properly translate the notes column, so that
    # lookup is maintained manually.
    species_names = {
        'えぞやまざくら': 'Sargent cherry (Prunus sargentii)',
        'ちしまざくら': 'Kurile Island Cherry (Cerasus nipponica var. kurilensis)',
        'ひかんざくら': 'Taiwan cherry (Prunus campanulata)',
        '（注）': 'Until 1994 Sargent Cherry, from 1995 to 2006 they were Yoshino Cherry.',
    }

    merged = pd.concat(bloom_dfs, axis=1).drop_duplicates().reset_index()

    # Appending the country before translating and stripping it afterwards:
    # the suffix is sent along with every site name.
    merged['Site Name'] = merged['Site Name'] + ', 日本'
    batch_translation(merged, 'Site Name', batch_size=10)
    merged['Site Name'] = merged['Site Name'].str.replace(', Japan', "")
    merged = merged.set_index('Site Name')

    # Every page contributes its own 'Currently Being Observed' column; keep the first.
    observed_col = merged['Currently Being Observed'].iloc[:, 0]

    # Columns repeated across pages become duplicate rows once transposed,
    # so they can be dropped by value, keeping the last occurrence.
    by_column = merged.T.drop_duplicates(keep='last')
    by_column = by_column.drop('Currently Being Observed')

    combined_blooms = by_column.T
    combined_blooms.insert(0, 'Currently Being Observed', observed_col)
    combined_blooms['Notes'] = combined_blooms['Notes'].map(species_names)

    return combined_blooms

In [5]:
debug_print = False

# The first-bloom tables are split over the pages sakura003_00 ... sakura003_07.
# Build the addresses from the page number instead of repeating the literal,
# so adding a page only means changing the count.
bloom_urls = [f'https://www.data.jma.go.jp/sakura/data/sakura003_{page:02d}.html'
              for page in range(8)]

# batch=True makes every download wait first so the endpoint is not hammered.
bloom_dfs = [extract_sakura_data(x,batch=True) for x in bloom_urls]
bloom_start = combine_sakura_data(bloom_dfs)

Processing: 気象庁 | さくらの開花日(1953-1960年)
Processing: 気象庁 | さくらの開花日(1961-1970年)
Processing: 気象庁 | さくらの開花日(1971-1980年)
Processing: 気象庁 | さくらの開花日(1981-1990年)
Processing: 気象庁 | さくらの開花日(1991-2000年)
Processing: 気象庁 | さくらの開花日(2001-2010年)
Processing: 気象庁 | さくらの開花日(2011-2020年)
Processing: 気象庁 | さくらの開花日(2021-2023年)
Current index: 10 of 102
Current index: 20 of 102
Current index: 30 of 102
Current index: 40 of 102
Current index: 50 of 102
Current index: 60 of 102
Current index: 70 of 102
Current index: 80 of 102
Current index: 90 of 102
Current index: 100 of 102
Current index: 110 of 102


In [6]:
debug_print = False

# The full-bloom tables are split over the pages sakura004_00 ... sakura004_07.
# Build the addresses from the page number instead of repeating the literal,
# so adding a page only means changing the count.
full_bloom_urls = [f'https://www.data.jma.go.jp/sakura/data/sakura004_{page:02d}.html'
                   for page in range(8)]

# batch=True makes every download wait first so the endpoint is not hammered.
full_bloom_dfs = [extract_sakura_data(x,batch=True) for x in full_bloom_urls]
full_bloom = combine_sakura_data(full_bloom_dfs)

Processing: 気象庁 | さくらの満開日(1953-1960年)
Processing: 気象庁 | さくらの満開日(1961-1970年)
Processing: 気象庁 | さくらの満開日(1971-1980年)
Processing: 気象庁 | さくらの満開日(1981-1990年)
Processing: 気象庁 | さくらの満開日(1991-2000年)
Processing: 気象庁 | さくらの満開日(2001-2010年)
Processing: 気象庁 | さくらの満開日(2011-2020年)
Processing: 気象庁 | さくらの満開日(2021-2023年)
Current index: 10 of 102
Current index: 20 of 102
Current index: 30 of 102
Current index: 40 of 102
Current index: 50 of 102
Current index: 60 of 102
Current index: 70 of 102
Current index: 80 of 102
Current index: 90 of 102
Current index: 100 of 102
Current index: 110 of 102


In [7]:
# Sanity check: a full-bloom date must never fall before the first-bloom date
# of the same site and year. If one does, something went wrong upstream.

date_cols = [col for col in bloom_start.columns if str.isnumeric(col)]

# Missing dates are replaced by sentinels chosen so that a gap on either side
# can never be reported as a violation: late for full bloom, early for first bloom.
full_bloom_filled = full_bloom[date_cols].fillna(pd.Timestamp('2024-01-01'))
bloom_start_filled = bloom_start[date_cols].fillna(pd.Timestamp('1950-01-01'))
broken_dates = full_bloom_filled < bloom_start_filled

# True if any site/year pair is out of order.
broken_dates.any().any()

False

In [10]:
# Translations Final Fixes
#bloom_start = bloom_start.rename(index={"Iriomotei sand": "Iriomote Island"},errors="raise")
#full_bloom = full_bloom.rename(index={"Iriomotei sand": "Iriomote Island"},errors="raise")

In [9]:
# Write both result tables next to the notebook.
for frame, csv_name in ((bloom_start, 'sakura_first_bloom_dates.csv'),
                        (full_bloom, 'sakura_full_bloom_dates.csv')):
    frame.to_csv(csv_name)

# Troubleshooting
This is my area for misc troubleshooting. Fully working code is above this.

In [14]:
# Scratch version of the batched translation loop, run on the first page only.
df = bloom_dfs[0].reset_index()
idx = 0
batch_size = 10
column_src = "Site Name"

while idx < df[column_src].size:
    # Spawn a new translator session to see if that gets past the 429 code from Google.
    translator = Translator()
    # Lower-case attribute name; ``raise_Exception`` only created an unused attribute.
    translator.raise_exception = True
    # .loc slices include both end points, so stop one label early; otherwise
    # the row at each batch boundary is translated a second time.
    rows = slice(idx, idx + batch_size - 1)
    df.loc[rows, column_src] = df.loc[rows, column_src].apply(
        lambda text: translator.translate(text, src='ja').text
    )
    idx = idx+batch_size
    print(f"Current index: {idx} of {df[column_src].size}")
    time.sleep(10)

Current index: 10 of 102
Current index: 20 of 102
Current index: 30 of 102
Current index: 40 of 102
Current index: 50 of 102


ReadTimeout: The read operation timed out

In [15]:
bloom_start = bloom_start.rename(index={"Iriomotei sand": "Iriomote Island"},errors="raise")

NameError: name 'bloom_start' is not defined

In [37]:
full_bloom

Unnamed: 0_level_0,Currently Being Observed,1953,1954,1955,1956,1957,1958,1959,1960,1961,...,2013,2014,2015,2016,2017,2018,2019,2020,30 Year Average 1981-2010,Notes
Site Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Wakkanai,True,1953-05-30,1954-05-27,1955-05-23,1956-05-14,1957-05-22,1958-05-25,1959-05-12,1960-05-24,1961-05-18,...,2013-05-28,2014-05-13,2015-05-06,2016-05-15,2017-05-11,2018-05-15,2019-05-09,2020-05-12,5 17,Sargent cherry (Prunus sargentii)
Rumoi,False,1953-05-13,1954-05-17,1955-05-16,1956-05-14,1957-05-14,1958-05-16,NaT,NaT,1961-05-13,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,-,Sargent cherry (Prunus sargentii)
Asahikawa,True,1953-05-16,1954-05-18,1955-05-14,1956-05-13,1957-05-14,1958-05-15,1959-05-13,1960-05-16,1961-05-12,...,2013-05-20,2014-05-04,2015-04-28,2016-05-04,2017-05-05,2018-05-01,2019-05-04,2020-05-04,5 7,Sargent cherry (Prunus sargentii)
Abashiri,True,1953-05-26,1954-05-19,1955-05-20,1956-05-13,NaT,NaT,1959-05-07,NaT,NaT,...,2013-05-27,2014-05-11,2015-05-03,2016-05-10,2017-05-07,2018-05-06,2019-05-07,2020-05-09,5 14,Sargent cherry (Prunus sargentii)
Sapporo,True,1953-05-14,1954-05-08,1955-05-16,1956-05-09,1957-05-13,NaT,1959-05-10,1960-05-09,1961-05-09,...,2013-05-17,2014-05-01,2015-04-26,2016-05-01,2017-05-03,2018-04-29,2019-04-29,2020-05-02,5 7,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Kumejima,False,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,-,Taiwan cherry (Prunus campanulata)
Naha,True,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,...,2013-01-23,2014-01-27,2015-01-29,2016-02-12,2017-02-08,2018-01-30,2019-02-12,2020-02-03,2 4,Taiwan cherry (Prunus campanulata)
Nago,False,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,-,Taiwan cherry (Prunus campanulata)
Iriomote Island,False,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,-,Taiwan cherry (Prunus campanulata)


In [78]:
full_bloom[full_bloom[date_cols].fillna(pd.Timestamp('2020-01-02')) < bloom_start[date_cols].fillna(pd.Timestamp('2020-01-01'))]

Unnamed: 0_level_0,Currently Being Observed,1953,1954,1955,1956,1957,1958,1959,1960,1961,...,2013,2014,2015,2016,2017,2018,2019,2020,30 Year Average 1981-2010,Notes
Site Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Wakkanai,,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,,
Rumoi,,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,,
Asahikawa,,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,,
Abashiri,,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,,
Sapporo,,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Kumejima,,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,,
Naha,,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,,
Nago,,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,,
Iriomotei sand,,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,,


In [20]:
debug_print = True
test_url = 'https://www.data.jma.go.jp/sakura/data/sakura004_00.html'

test_df = extract_sakura_data(test_url)

Processing: 気象庁 | さくらの満開日(1953-1960年)


地点名　     1953   1954   1955   1956   1957   1958   1959   1960   平年値   代替種目
            月 日  月 日  月 日  月 日  月 日  月 日  月 日  月 日   月 日

稚内     *   5 30   5 27   5 23   5 14   5 22   5 25   5 12   5 24    5 17    えぞやまざくら
留萌         5 13   5 17   5 16   5 14   5 14   5 16      -      -       -    えぞやまざくら
旭川     *   5 16   5 18   5 14   5 13   5 14   5 15   5 13   5 16    5  7    えぞやまざくら
網走     *   5 26   5 19   5 20   5 13      -      -   5  7      -    5 14    えぞやまざくら
札幌     *   5 14   5  8   5 16   5  9   5 13      -   5 10   5  9    5  7    
岩見沢       5 11   5  9   5 13   5  9      -   5 14      -   5 15       -    えぞやまざくら
帯広     *   5 19   5 12   5 14   5  9   5 14   5 16   5  5   5 15    5  7    えぞやまざくら
釧路     *      -   5 28   5 26      -   5 19   5 29   5 20   5 28    5 20    えぞやまざくら
根室            -      -      -      -      -      -      -   6  8       -    ちしまざくら
室蘭     *   5 17   5 13   5 23   5 13   5 14   5 17   4 30   5 14    5 11    

浦

Unnamed: 0_level_0,Currently Being Observed,1953,1954,1955,1956,1957,1958,1959,1960,30 Year Average 1981-2010,Notes
Site Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
稚内,True,1953-05-30,1954-05-27,1955-05-23,1956-05-14,1957-05-22,1958-05-25,1959-05-12,1960-05-24,5 17,えぞやまざくら
留萌,,1953-05-13,1954-05-17,1955-05-16,1956-05-14,1957-05-14,1958-05-16,NaT,NaT,-,えぞやまざくら
旭川,True,1953-05-16,1954-05-18,1955-05-14,1956-05-13,1957-05-14,1958-05-15,1959-05-13,1960-05-16,5 7,えぞやまざくら
網走,True,1953-05-26,1954-05-19,1955-05-20,1956-05-13,NaT,NaT,1959-05-07,NaT,5 14,えぞやまざくら
札幌,True,1953-05-14,1954-05-08,1955-05-16,1956-05-09,1957-05-13,NaT,1959-05-10,1960-05-09,5 7,
岩見沢,,1953-05-11,1954-05-09,1955-05-13,1956-05-09,NaT,1958-05-14,NaT,1960-05-15,-,えぞやまざくら
帯広,True,1953-05-19,1954-05-12,1955-05-14,1956-05-09,1957-05-14,1958-05-16,1959-05-05,1960-05-15,5 7,えぞやまざくら
釧路,True,NaT,1954-05-28,1955-05-26,NaT,1957-05-19,1958-05-29,1959-05-20,1960-05-28,5 20,えぞやまざくら
根室,,NaT,NaT,NaT,NaT,NaT,NaT,NaT,1960-06-08,-,ちしまざくら
室蘭,True,1953-05-17,1954-05-13,1955-05-23,1956-05-13,1957-05-14,1958-05-17,1959-04-30,1960-05-14,5 11,


In [24]:
# Blank markers were read as NaN; turn them into False. Assign the result back
# instead of calling fillna(inplace=True) on a column selection, which does not
# reliably update the parent frame.
test_df['Currently Being Observed'] = test_df['Currently Being Observed'].fillna(False)
test_df.head()

Unnamed: 0_level_0,Currently Being Observed,1953,1954,1955,1956,1957,1958,1959,1960,30 Year Average 1981-2010,Notes
Site Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
稚内,True,1953-05-30,1954-05-27,1955-05-23,1956-05-14,1957-05-22,1958-05-25,1959-05-12,1960-05-24,5 17,えぞやまざくら
留萌,False,1953-05-13,1954-05-17,1955-05-16,1956-05-14,1957-05-14,1958-05-16,NaT,NaT,-,えぞやまざくら
旭川,True,1953-05-16,1954-05-18,1955-05-14,1956-05-13,1957-05-14,1958-05-15,1959-05-13,1960-05-16,5 7,えぞやまざくら
網走,True,1953-05-26,1954-05-19,1955-05-20,1956-05-13,NaT,NaT,1959-05-07,NaT,5 14,えぞやまざくら
札幌,True,1953-05-14,1954-05-08,1955-05-16,1956-05-09,1957-05-13,NaT,1959-05-10,1960-05-09,5 7,


In [37]:
translator = Translator()

notes_map = pd.DataFrame(bloom_start['Notes'].unique(), columns=['jp'])

notes_map['en'] = notes_map['jp'].apply(translator.translate).apply(getattr, args=('text',))

In [38]:
notes_map

Unnamed: 0,jp,en
0,えぞやまざくら,Ezo Yamazakura
1,,
2,ちしまざくら,Chishima Sakura
3,（注）,(Note)
4,ひかんざくら,Hikanzakura


In [40]:
notes_dict = {'えぞやまざくら': 'Sargent cherry (Prunus sargentii)',
              'ちしまざくら': 'Kurile Island Cherry (Cerasus nipponica var. kurilensis)',
             'ひかんざくら': 'Taiwan cherry (Prunus campanulata)',
              '（注）': 'Note: Translation pending'}

In [30]:
concated['Currently Being Observed'].iloc[:,0]

Site Name
Wakkanai           True
Rumoi               NaN
Asahikawa          True
Abashiri           True
Sapporo            True
                   ... 
Kumejima            NaN
Naha               True
Nago                NaN
Iriomotei sand      NaN
Minamidaitojima    True
Name: Currently Being Observed, Length: 102, dtype: object

In [163]:
trans = Translator()
trans.translate('山形').src

'ja'

In [166]:
with pd.option_context('display.max_rows', None):
    display(transposed.T['30 Year Average 1981-2010'])

Site Name
Wakkanai           5 14
Rumoi                 -
Asahikawa          5  5
Abashiri           5 11
Sapporo            5  3
Iwamizawa             -
Obihiro            5  4
Kushiro            5 17
Nemuro                -
Muroran            5  6
Urakawa               -
Esashi                -
Hakodate           4 30
Kutchan               -
Monbetsu              -
Hiroo                 -
Shinjo                -
Aomori             4 24
Hachinohe             -
Akita              4 18
Morioka            4 21
Miyako                -
Sakata                -
Yamagata           4 15
Sendai             4 11
Fukushima          4  9
Shirakawa             -
Onahama               -
Wajima                -
Aikawa                -
Niigata            4  9
Kanazawa           4  4
Toyama             4  5
Nagano             4 13
Takada                -
Utsunomiya         4  1
Fukui              4  3
Takayama              -
Matsumoto             -
Maebashi           3 31
Kumagaya           3 29
Mito  

In [128]:
with pd.option_context('display.max_rows', None):
    print(concated.loc[['津']].T)

Site Name                                    津
Currently Being Observed                  True
1953                                       NaT
1954                       1954-03-03 00:00:00
1955                       1955-03-03 00:00:00
1956                                       NaT
1957                                       NaT
1958                                       NaT
1959                       1959-03-02 00:00:00
1960                       1960-03-02 00:00:00
30 Year Average 1981-2010             8    3 3
Notes                                        0
Currently Being Observed                  True
1961                                       NaT
1962                                       NaT
1963                                       NaT
1964                                       NaT
1965                       1965-04-01 00:00:00
1966                       1966-03-02 00:00:00
1967                       1967-03-03 00:00:00
1968                                       NaT
1969         

In [138]:
debug_print=True
process_sakura_url('https://www.data.jma.go.jp/sakura/data/sakura003_03.html')

Processing: 気象庁 | さくらの開花日(1981-1990年)


地点名　     1981   1982   1983   1984   1985   1986   1987   1988   1989   1990   平年値   代替種目
            月 日  月 日  月 日  月 日  月 日  月 日  月 日  月 日  月 日  月 日   月 日

稚内     *   5 11   5 16   5  6   5 24   5 13   5 13   5 16   5 15   5 16   5  6    5 14    えぞやまざくら
留萌         5  9   5 10   4 30   5 22   5 11   5  9   5  9   5  9   5  3   4 26       -    えぞやまざくら
旭川     *   5  5   5  5   4 27   5 14   5  3   5  6   5  9   5  7   5  3   4 28    5  5    えぞやまざくら
網走     *   5 14   5 11   5  3   5 24   5 10   5  8   5  9   5 12   5 16   5  7    5 11    えぞやまざくら
札幌     *   5  5   5  4   4 28   5 12   5  3   5  5   5  6   5  3   4 29   4 25    5  3    
岩見沢       5  7   5  5   4 27   5 17   5  4   5  5   5  8   5  5   4 28   4 26       -    えぞやまざくら
帯広     *   5  4   5  6   5  1   5 19   5  3   5  6   5  4   5  4   5  5   5  1    5  4    えぞやまざくら
釧路     *   5 22   5 17   5 12   5 30   5 18   5 16   5 17   5 18   5 18   5 10    5 17    えぞやまざくら
根室         5 24   5 19   5

Unnamed: 0_level_0,Currently Being Observed,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,30 Year Average 1981-2010,Notes
Site Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
稚内,True,1981-05-11,1982-05-16,1983-05-06,1984-05-24,1985-05-13,1986-05-13,1987-05-16,1988-05-15,1989-05-16,1990-05-06,5 14,えぞやまざくら
留萌,,1981-05-09,1982-05-10,1983-04-30,1984-05-22,1985-05-11,1986-05-09,1987-05-09,1988-05-09,1989-05-03,1990-04-26,-,えぞやまざくら
旭川,True,1981-05-05,1982-05-05,1983-04-27,1984-05-14,1985-05-03,1986-05-06,1987-05-09,1988-05-07,1989-05-03,1990-04-28,5 5,えぞやまざくら
網走,True,1981-05-14,1982-05-11,1983-05-03,1984-05-24,1985-05-10,1986-05-08,1987-05-09,1988-05-12,1989-05-16,1990-05-07,5 11,えぞやまざくら
札幌,True,1981-05-05,1982-05-04,1983-04-28,1984-05-12,1985-05-03,1986-05-05,1987-05-06,1988-05-03,1989-04-29,1990-04-25,5 3,
岩見沢,,1981-05-07,1982-05-05,1983-04-27,1984-05-17,1985-05-04,1986-05-05,1987-05-08,1988-05-05,1989-04-28,1990-04-26,-,えぞやまざくら
帯広,True,1981-05-04,1982-05-06,1983-05-01,1984-05-19,1985-05-03,1986-05-06,1987-05-04,1988-05-04,1989-05-05,1990-05-01,5 4,えぞやまざくら
釧路,True,1981-05-22,1982-05-17,1983-05-12,1984-05-30,1985-05-18,1986-05-16,1987-05-17,1988-05-18,1989-05-18,1990-05-10,5 17,えぞやまざくら
根室,,1981-05-24,1982-05-19,1983-05-15,1984-05-30,1985-05-19,1986-05-19,1987-05-15,1988-05-22,1989-05-20,1990-05-11,-,ちしまざくら
室蘭,True,1981-05-06,1982-05-10,1983-04-29,1984-05-25,1985-05-10,1986-05-08,1987-05-09,1988-05-06,1989-04-30,1990-04-26,5 6,


Unnamed: 0_level_0,Currently Being Observed,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,30 Year Average 1981-2010,Notes
Site Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
稚内,True,1981-05-11,1982-05-16,1983-05-06,1984-05-24,1985-05-13,1986-05-13,1987-05-16,1988-05-15,1989-05-16,1990-05-06,5 14,えぞやまざくら
留萌,,1981-05-09,1982-05-10,1983-04-30,1984-05-22,1985-05-11,1986-05-09,1987-05-09,1988-05-09,1989-05-03,1990-04-26,-,えぞやまざくら
旭川,True,1981-05-05,1982-05-05,1983-04-27,1984-05-14,1985-05-03,1986-05-06,1987-05-09,1988-05-07,1989-05-03,1990-04-28,5 5,えぞやまざくら
網走,True,1981-05-14,1982-05-11,1983-05-03,1984-05-24,1985-05-10,1986-05-08,1987-05-09,1988-05-12,1989-05-16,1990-05-07,5 11,えぞやまざくら
札幌,True,1981-05-05,1982-05-04,1983-04-28,1984-05-12,1985-05-03,1986-05-05,1987-05-06,1988-05-03,1989-04-29,1990-04-25,5 3,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
久米島,,1981-01-27,1982-01-22,1983-01-30,1984-01-24,1985-01-23,1986-01-15,1987-01-08,1988-01-06,1988-12-31,1989-12-31,-,ひかんざくら
那覇,True,1981-01-24,1982-01-05,1983-01-16,1984-01-23,1985-01-23,1986-01-09,1987-01-23,1988-01-19,1989-01-09,1990-01-21,1 18,ひかんざくら
名護,,1981-01-09,1981-12-30,1983-01-05,1984-01-13,1985-01-22,1985-12-26,1987-01-09,1988-01-04,1988-12-31,1989-12-30,-,ひかんざくら
西表島,,1981-01-27,1982-01-10,1983-01-22,1984-01-17,1985-01-26,1986-01-23,1987-01-21,1988-01-15,1989-01-15,1990-01-18,-,ひかんざくら


In [136]:
import re
debug_print = True

bloom_req = requests.get('https://www.data.jma.go.jp/sakura/data/sakura003_03.html')
bloom_content = BeautifulSoup(bloom_req.content, 'lxml')

# Convert the text table to a StringIO so that pandas can read it in.
bloom_string = StringIO(bloom_content.find(id='main').pre.text)

# Sample header lines with eight and ten year columns; not referenced below.
String = '地点名　     1953   1954   1955   1956   1957   1958   1959   1960   平年値   代替種目'
String2 = '地点名　     2011   2012   2013   2014   2015   2016   2017   2018   2019   2020   平年値   代替種目'

# Find the first real line so we can dynamically determine the column spacings
for line in bloom_string:
    if not line.isspace():
        break

# Find all of the character locations of each year. The pattern is a raw
# string so that \d is not treated as an (invalid) string escape.
year_iter = re.finditer(r"\d{4}", line)
year_indices = [(m.start(0),m.end(0)+3) for m in year_iter]

# Pick the ending character of the last year so we can add the last two columns.
end_char = year_indices[-1][1]

dynamic_colspecs = [(0,5),(5,9)]
end_colspecs = [(end_char,end_char+8),(end_char+8,None)]

# Put everything together in the same list
dynamic_colspecs.extend(year_indices)
dynamic_colspecs.extend(end_colspecs)

print(dynamic_colspecs)
# Reset the string stream so we can re-parse the entire thing.
bloom_string.seek(0)

if debug_print:
    print(bloom_string.getvalue())

# Candidate column names; not passed to read_fwf below.
tst_names = ['地点名', 'Unnamed: 1', '1953', '1954', '1955','1956','1957','1958','1959','1960','平年値','代替種目']
unparsed_data = pd.read_fwf(bloom_string,header=0,colspecs=dynamic_colspecs,true_values=['*'])
with pd.option_context('display.max_rows', None):
    if debug_print:
        display(unparsed_data)

[(0, 5), (5, 9), (9, 16), (16, 23), (23, 30), (30, 37), (37, 44), (44, 51), (51, 58), (58, 65), (65, 72), (72, 79), (79, 87), (87, None)]


地点名　     1981   1982   1983   1984   1985   1986   1987   1988   1989   1990   平年値   代替種目
            月 日  月 日  月 日  月 日  月 日  月 日  月 日  月 日  月 日  月 日   月 日

稚内     *   5 11   5 16   5  6   5 24   5 13   5 13   5 16   5 15   5 16   5  6    5 14    えぞやまざくら
留萌         5  9   5 10   4 30   5 22   5 11   5  9   5  9   5  9   5  3   4 26       -    えぞやまざくら
旭川     *   5  5   5  5   4 27   5 14   5  3   5  6   5  9   5  7   5  3   4 28    5  5    えぞやまざくら
網走     *   5 14   5 11   5  3   5 24   5 10   5  8   5  9   5 12   5 16   5  7    5 11    えぞやまざくら
札幌     *   5  5   5  4   4 28   5 12   5  3   5  5   5  6   5  3   4 29   4 25    5  3    
岩見沢       5  7   5  5   4 27   5 17   5  4   5  5   5  8   5  5   4 28   4 26       -    えぞやまざくら
帯広     *   5  4   5  6   5  1   5 19   5  3   5  6   5  4   5  4   5  5   5  1    5  4    えぞやまざくら
釧路     *   5 22   5 17  

Unnamed: 0,地点名,Unnamed: 1,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,平年値 代替,種目
0,,,月 日,月 日 月,日 月 日,月 日,月 日 月,日 月 日,月 日 月,日 月,日,,,
1,稚内,True,5 11,5 16,5 6,5 24,5 13,5 13,5 16,5 15,5 16,5 6,5 14,えぞやまざくら
2,留萌,,5 9,5 10,4 30,5 22,5 11,5 9,5 9,5 9,5 3,4 26,-,えぞやまざくら
3,旭川,True,5 5,5 5,4 27,5 14,5 3,5 6,5 9,5 7,5 3,4 28,5 5,えぞやまざくら
4,網走,True,5 14,5 11,5 3,5 24,5 10,5 8,5 9,5 12,5 16,5 7,5 11,えぞやまざくら
5,札幌,True,5 5,5 4,4 28,5 12,5 3,5 5,5 6,5 3,4 29,4 25,5 3,
6,岩見沢,,5 7,5 5,4 27,5 17,5 4,5 5,5 8,5 5,4 28,4 26,-,えぞやまざくら
7,帯広,True,5 4,5 6,5 1,5 19,5 3,5 6,5 4,5 4,5 5,5 1,5 4,えぞやまざくら
8,釧路,True,5 22,5 17,5 12,5 30,5 18,5 16,5 17,5 18,5 18,5 10,5 17,えぞやまざくら
9,根室,,5 24,5 19,5 15,5 30,5 19,5 19,5 15,5 22,5 20,5 11,-,ちしまざくら


In [70]:
debug_print = False
testing = process_sakura_url('https://www.data.jma.go.jp/sakura/data/sakura003_03.html')

print(testing.iloc[1])

Processing: 気象庁 | さくらの開花日(1981-1990年)
Currently Being Observed                     NaN
1981                         1981-05-09 00:00:00
1982                         1982-05-10 00:00:00
1983                         1983-04-30 00:00:00
1984                         1984-05-22 00:00:00
1985                         1985-05-11 00:00:00
1986                         1986-05-09 00:00:00
1987                         1987-05-09 00:00:00
1988                         1988-05-09 00:00:00
1989                         1989-05-03 00:00:00
1990                         1990-04-26 00:00:00
30 Year Average 1981-2010                      -
Notes                                    えぞやまざくら
Name: 留萌, dtype: object


In [45]:
testing.loc[testing['1989'] > '1989-10-01','1989']

Series([], Name: 1989, dtype: datetime64[ns])

In [52]:
testing.loc[testing['1989'] < '1989-01-01','1989']

Timestamp('1988-12-29 16:00:00')

In [28]:
concated.T.loc[concated.T.duplicated()]

Site Name,稚内,留萌,旭川,網走,札幌,岩見沢,帯広,釧路,根室,室蘭,...,種子島,名瀬,与那国島,石垣島,宮古島,久米島,那覇,名護,西表島,南大東島
Currently Being Observed,True,,True,True,True,,True,True,,True,...,,True,,True,True,,True,,,True
Notes,えぞやまざくら,えぞやまざくら,えぞやまざくら,えぞやまざくら,,えぞやまざくら,えぞやまざくら,えぞやまざくら,ちしまざくら,,...,,ひかんざくら,ひかんざくら,ひかんざくら,ひかんざくら,ひかんざくら,ひかんざくら,ひかんざくら,ひかんざくら,ひかんざくら
Currently Being Observed,True,,True,True,True,,True,True,,True,...,,True,,True,True,,True,,,True
Notes,えぞやまざくら,えぞやまざくら,えぞやまざくら,えぞやまざくら,,えぞやまざくら,えぞやまざくら,えぞやまざくら,ちしまざくら,,...,,ひかんざくら,ひかんざくら,ひかんざくら,ひかんざくら,ひかんざくら,ひかんざくら,ひかんざくら,ひかんざくら,ひかんざくら
Currently Being Observed,True,,True,True,True,,True,True,,True,...,,True,,True,True,,True,,,True
Notes,えぞやまざくら,えぞやまざくら,えぞやまざくら,えぞやまざくら,,えぞやまざくら,えぞやまざくら,えぞやまざくら,ちしまざくら,,...,,ひかんざくら,ひかんざくら,ひかんざくら,ひかんざくら,ひかんざくら,ひかんざくら,ひかんざくら,ひかんざくら,ひかんざくら
Currently Being Observed,True,,True,True,True,,True,True,,True,...,,True,,True,True,,True,,,True
Notes,えぞやまざくら,えぞやまざくら,えぞやまざくら,えぞやまざくら,,えぞやまざくら,えぞやまざくら,えぞやまざくら,ちしまざくら,,...,,ひかんざくら,ひかんざくら,ひかんざくら,ひかんざくら,ひかんざくら,ひかんざくら,ひかんざくら,ひかんざくら,ひかんざくら
Currently Being Observed,True,,True,True,True,,True,True,,True,...,,True,,True,True,,True,,,True
Notes,えぞやまざくら,えぞやまざくら,えぞやまざくら,えぞやまざくら,,えぞやまざくら,えぞやまざくら,えぞやまざくら,ちしまざくら,,...,,ひかんざくら,ひかんざくら,ひかんざくら,ひかんざくら,ひかんざくら,ひかんざくら,ひかんざくら,ひかんざくら,ひかんざくら


In [64]:
with pd.option_context('display.max_rows', None):
    display(concated.loc['名護','30 Year Average 1981-2010'])
    
concated.loc['名護','30 Year Average 1981-2010'].str.contains('#').any()

derp = '2019'

for entry in concated.loc['名護','30 Year Average 1981-2010']:
    if '#' in entry:
         print(entry)

30 Year Average 1981-2010           -
30 Year Average 1981-2010           -
30 Year Average 1981-2010    #      -
30 Year Average 1981-2010    #      -
30 Year Average 1981-2010           -
30 Year Average 1981-2010           -
30 Year Average 1981-2010           -
Name: 名護, dtype: object

#      -
#      -


In [32]:
untransposed = transposed.T

with pd.option_context('display.max_rows', None):
    display(untransposed['30 Year Average 1981-2010'])

Unnamed: 0_level_0,30 Year Average 1981-2010,30 Year Average 1981-2010,30 Year Average 1981-2010,30 Year Average 1981-2010,30 Year Average 1981-2010,30 Year Average 1981-2010
Site Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
稚内,5 14,5 14,5 14,5 14,5 14,5 14
留萌,-,-,-,-,-,-
旭川,5 5,5 5,5 5,5 5,5 5,5 5
網走,5 11,5 11,5 11,5 11,5 11,5 11
札幌,5 3,5 3,5 3,5 3,5 3,5 3
岩見沢,-,-,-,-,-,-
帯広,5 4,5 4,5 4,5 4,5 4,5 4
釧路,5 17,5 17,5 17,5 17,5 17,5 17
根室,-,-,-,-,-,-
室蘭,5 6,5 6,5 6,5 6,5 6,5 6


In [65]:
debug_print = True
#process_sakura_url('https://www.data.jma.go.jp/sakura/data/sakura003_01.html')

In [133]:
col = '1990'

has_stuff = unparsed_data[col].str.contains('#',na=False)
no_stuff = ~unparsed_data[col].str.contains('#',na=False)
#unparsed_data.loc[~unparsed_data[col].str.contains('#',na=False),col] = unparsed_data.loc[~unparsed_data[col].str.contains('#',na=False),col] + f' {int(col)-1}'

In [134]:
unparsed_data.loc[has_stuff,col] = unparsed_data.loc[has_stuff,col].str.strip('#') + f' {int(col)-1}'

In [141]:
unparsed_data.loc[has_stuff,'1989'].str.replace("#","")

106    12 31
108    12 31
Name: 1989, dtype: object

In [108]:
bloom_content.title.text

'気象庁 | さくらの開花日(1981-1990年)'