## Read in BOM monthly rainfall text files and convert to Pandas dataframe

In [1]:
import pandas as pd
import requests
from io import StringIO
from pathlib import Path

In [2]:
# wa_silo_stations contains stations only in the SILO database. 116 fewer than wa_bom_stations
wa_silo_stations = ['9909', '9519', '9803', '9509', '9508', '9842', '9771', '9877', '9628', '9994', '9738', '9556', '9504', '9617', '9690', '9552', '9666', '9587', '9517', '9616', '9573', '9530', '9585', '9590', '9908', '9592', '9968', '9611', '9805', '9904', '9619', '9871', '9215', '9225', '9007', '10138', '9192', '9193', '9789', '9584', '9579', '9827', '9804', '10878', '9822', '9772', '9542', '9631', '12028', '9557', '9961', '12077', '12312', '12223', '10895', '9813', '12044', '10811', '9626', '10633', '12114', '12071', '12198', '9739', '9922', '8050', '8044', '8057', '8276', '8273', '8013', '8157', '8025', '8037', '8238', '8077', '8067', '8072', '8078', '8079', '8088', '8296', '8107', '8007', '8060', '8121', '8264', '8143', '8004', '8240', '8200', '8288', '8052', '8168', '8065', '8251', '8075', '8095', '8096', '8028', '8100', '8104', '8113', '8116', '8294', '8147', '9599', '9520', '9754', '9678', '9633', '9930', '9615', '9848', '9609', '10502', '10519', '9654', '10729', '10792', '10558', '10905', '10707', '10699', '10619', '10541', '10622', '9594', '10627', '10595', '10694', '9835', '10508', '10725', '10831', '10566', '10916', '10700', '10659', '10520', '10525', '10530', '10531', '10537', '10866', '10543', '9635', '9843', '10582', '10797', '10635', '10643', '10893', '9506', '9752', '9561', '9581', '9875', '9591', '9515', '9511', '9964', '9661', '9607', '9621', '9625', '9112', '9024', '9040', '10052', '10002', '10012', '10044', '10294', '10286', '10040', '10041', '10163', '10073', '10104', '10628', '10257', '10121', '10122', '10143', '10145', '10036', '10000', '8002', '10155', '10032', '10039', '10042', '10045', '10058', '10061', '8066', '8254', '10076', '10077', '10097', '8137', '10140', '9037', '9144', '9006', '9014', '9018', '9178', '9131', '9210', '9114', '10503', '10507', '10006', '10118', '10016', '10019', '10034', '10053', '10151', '10119', '10298', '10092', '10095', '10702', '10124', '10612', '10108', '10904', '10123', '10083', '8022', '8005', '8008', '10009', '10156', '8297', '8014', '10084', '8016', '10026', '8061', '8064', '10070', '8085', '8087', '9033', '8151', '8130', '8139', '10007', '10158', '12026', '10011', '12007', '12011', '10149', '10055', '10192', '10264', '10082', '12056', '10102', '12064', '10112', '12101', '12320', '10126', '12079', '10030', '12083', '10135', '10137', '10136', '12201', '10244', '10150', '10152', '10111', '10125', '10134', '10515', '10120', '10115', '10634', '10311', '10524', '10527', '10534', '10536', '10823', '10564', '10571', '10626', '10917', '10894', '10696', '10671', '10644', '10513', '10518', '10546', '10872', '10560', '10603', '10565', '10568', '10705', '10581', '10665', '10584', '10606', '10670', '10911', '10889', '10611', '10692', '10625', '10638', '10636', '10654', '10662', '10614', '10505', '10704', '10510', '10542', '9668', '10641', '10793', '9914', '10647', '10655', '10658']
len(wa_silo_stations)

335

In [204]:
def download_bom_monthly():
    """fn: downloads BOM monthly rainfall data from website and returns a tuple containing a 
    pandas dataframe with monthly rainfall totals and a string with month and year of observations"""
    
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36'}
    url="http://www.bom.gov.au/web03/ncc/www/awap/rainfall/totals/month/station.list"
    s=requests.get(url, headers= headers).text
    df = pd.read_csv(StringIO(s), sep='[\t]', engine='python',skiprows = 2, header = None)
    return df, s[0:6]

    

In [168]:
def clean_bom_df(df):
    """fn: Cleans and reformats pandas dataframe downloaded from website"""
    df_rain = df.iloc[1:] #slice first row which contains month and year of observations
    df_split = df_rain[0].str.split(expand=True) 
    cols = df_split.columns
    df_split[cols] = df_split[cols].apply(pd.to_numeric, errors='coerce') # Converting selected columns to numeric dtypes:
    df_merge = df_split.merge(pd.DataFrame(df_rain[1]) ,left_index=True, right_index = True)
    df_merge.columns = ['station', 'lat', 'lon', 'rain', 'name']  # rename columns
    return df_merge

In [210]:
# returns reformatted rainfall data as a pandas dataframe
clean_bom_df(download_bom_monthly()[0]).head()

Unnamed: 0,station,lat,lon,rain,name
1,1007,-13.75,126.15,0.2,TROUGHTON ISLAND
2,1010,-14.79,126.5,0.0,THEDA
3,1018,-16.42,126.1,0.0,MOUNT ELIZABETH
4,1019,-14.3,126.65,0.0,KALUMBURU
5,1020,-14.09,126.39,0.4,TRUSCOTT


In [212]:
# returns data as a string. Will use in creating file name
date = download_bom_monthly()[1]

In [213]:
date

'201907'

In [214]:
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36'}

url="http://www.bom.gov.au/web03/ncc/www/awap/rainfall/totals/month/station.list"
s=requests.get(url, headers= headers).text

df=pd.read_csv(StringIO(s), sep='[\t]', engine='python',skiprows = 2, header = None)


In [215]:
df.head()

Unnamed: 0,0,1
0,001006 -15.510 128.150 0.0,WYNDHAM AERO
1,001007 -13.750 126.150 0.2,TROUGHTON ISLAND
2,001010 -14.790 126.500 0.0,THEDA
3,001018 -16.420 126.100 0.0,MOUNT ELIZABETH
4,001019 -14.300 126.650 0.0,KALUMBURU


In [216]:
# split first column into 4
df_split = df[0].str.split(expand=True)

In [218]:
df_split.head()

Unnamed: 0,0,1,2,3
0,1006,-15.51,128.15,0.0
1,1007,-13.75,126.15,0.2
2,1010,-14.79,126.5,0.0
3,1018,-16.42,126.1,0.0
4,1019,-14.3,126.65,0.0


In [144]:
# Converting selected columns to numeric dtypes:

cols = df_split.columns

df_split[cols] = df_split[cols].apply(pd.to_numeric, errors='coerce')

In [145]:
# merge df's back into one
df_merge = df_split.merge(pd.DataFrame(df[1]) ,left_index=True, right_index = True)

In [146]:
df_merge.head()

Unnamed: 0,0,1_x,2,3,1_y
0,1006,-15.51,128.15,0.0,WYNDHAM AERO
1,1007,-13.75,126.15,0.2,TROUGHTON ISLAND
2,1010,-14.79,126.5,0.0,THEDA
3,1018,-16.42,126.1,0.0,MOUNT ELIZABETH
4,1019,-14.3,126.65,0.0,KALUMBURU


In [147]:
# rename columns
df_merge.columns = ['station', 'lat', 'lon', 'rain', 'name']

In [148]:
# filter BOM monthly rainfall list to return stations only in SILO database
df_bom_july = df_merge[df_merge['station'].isin([int(x) for x in wa_silo_stations])]

In [149]:
df_bom_july.head()

Unnamed: 0,station,lat,lon,rain,name
121,8002,-30.6,116.77,42.3,BALLIDU
122,8005,-30.73,116.02,43.8,BARBERTON
123,8008,-30.58,116.14,55.0,BERKSHIRE VALLEY
124,8013,-29.33,116.14,30.2,BOWGADA
125,8014,-30.22,116.78,33.8,DALWALLINU NORTH


#### Sends Bom monthly rainfall data to data\interim folder.
#### May lookm at sending to AWS bucket

In [224]:
df_bom_july.to_csv(dirname / filename, index = False)

### Experimenting with pathlib module for better file path handling

In [44]:
current_dir = Path.cwd()

In [45]:
home_dir = Path.home()

In [46]:
home_dir.joinpath('geo-projects')

WindowsPath('C:/Users/rj71b/geo-projects')

In [47]:
current_dir.parents[0]

WindowsPath('C:/Users/rj71b/geo-projects/wheatbelt_rainfall_analyser')

In [97]:
# directory path where BOM monthy rainfall data will be stored
dirname = Path(r'C:\Users\rj71b\geo-projects\wheatbelt_rainfall_analyser\data\interim')


In [98]:
dirname

WindowsPath('C:/Users/rj71b/geo-projects/wheatbelt_rainfall_analyser/data/interim')

In [219]:
# get month and year date of BOM rainfall data
# returns data as a string. Will use in creating file name
date = download_bom_monthly()[1]

In [220]:
date

'201907'

In [221]:
filename = 'bom_'+ date + '.csv'

In [222]:
filename

'bom_201907.csv'

In [223]:
dirname / filename

WindowsPath('C:/Users/rj71b/geo-projects/wheatbelt_rainfall_analyser/data/interim/bom_201907.csv')