## Concatenate the most recent BOM monthly rainfall onto the SILO database
#### Reason: SILO rainfall for the most recent month shows high variance from BOM observations

In [1]:
import pandas as pd

### BOM Rainfall for July

In [2]:
# Load the July rainfall observations from BOM (noted as containing 207 SILO
# database stations). Only the columns at positions 0 and 3 are read; later
# cells refer to them as 'station' and 'rain'.
df_bom = pd.read_csv(
    filepath_or_buffer=r'C:\Users\rj71b\geo-projects\wheatbelt_rainfall_analyser\data\interim\bom_july.csv',
    usecols=[0, 3],
)

In [3]:
# Tag every BOM row with its observation year and month so the frame can be
# concatenated with the historical rainfall data
df_bom = df_bom.assign(year=2019, month=7)

In [4]:
# Put the columns in the same order as the historical rainfall dataframe
df_bom = df_bom.loc[:, ['station', 'year', 'month', 'rain']]

In [5]:
# Preview the first five BOM rows
df_bom.head(n=5)

Unnamed: 0,station,year,month,rain
0,8002,2019,7,42.3
1,8005,2019,7,43.8
2,8008,2019,7,55.0
3,8013,2019,7,30.2
4,8016,2019,7,35.0


In [50]:
# Keep only the BOM stations that also appear in df_silo.
# NOTE: df_silo is loaded in the SILO section further down this notebook, so
# that cell must have been run before this one.
# Series.isin takes the station values directly; no intermediate Python list
# is needed, and unique() keeps the lookup to one entry per station.
# TODO: this filtering may be doable at the point the two dataframes are concatenated.
df_bom = df_bom[df_bom['station'].isin(df_silo['station'].unique())]

In [51]:
# Show (rows, columns) of the BOM frame after filtering to stations present in df_silo
df_bom.shape

(198, 4)

### SILO historical rainfall

In [39]:
# Load the SILO historical rainfall file. Only the columns at positions 0, 3
# and 7 are read; the preview below shows them as 'date', 'rain' and 'station'.
df_silo = pd.read_csv(
    filepath_or_buffer=r'C:\Users\rj71b\geo-projects\wheatbelt_rainfall_analyser\data\external\wa_silo_weather_data.csv',
    usecols=[0, 3, 7],
)

In [27]:
# Preview the first six SILO rows
df_silo.head(n=6)

Unnamed: 0,date,rain,station
0,188902,0.0,8002
1,188903,0.8,8002
2,188904,36.2,8002
3,188905,76.4,8002
4,188906,68.9,8002
5,188907,31.2,8002


In [57]:
# Number of distinct stations in the SILO data
df_silo['station'].unique().size

372

In [58]:
# Keep only the SILO records for stations that also appear in the BOM frame.
# .copy() makes the result an independent DataFrame, so the column assignments
# in the following cells do not write to a slice of the original frame (the
# situation that triggers pandas' SettingWithCopyWarning).
df_silo = df_silo[df_silo['station'].isin(df_bom['station'].unique())].copy()

In [59]:
# Convert the date column to text so it can be sliced by character position
date_as_text = df_silo['date'].astype(str)
df_silo['date'] = date_as_text

In [60]:
# Split the date text (e.g. '188902') into integer year and month columns:
# the first four characters are the year, the next two the month
df_silo['year'] = df_silo['date'].str.slice(0, 4).astype('int64')
df_silo['month'] = df_silo['date'].str.slice(4, 6).astype('int64')
# The text date column is redundant once year and month exist (optional clean-up)
df_silo = df_silo.drop(columns='date')

In [61]:
# Arrange the columns as station, year, month, rain
df_silo = df_silo.loc[:, ['station', 'year', 'month', 'rain']]

In [62]:
# Inspect the last five SILO rows (the most recent observations shown)
df_silo.tail(n=5)

Unnamed: 0,station,year,month,rain
582547,12320,2019,3,63.6
582548,12320,2019,4,8.0
582549,12320,2019,5,2.2
582550,12320,2019,6,96.9
582551,12320,2019,7,36.0


In [63]:
# Exclude the rows observed in July 2019; every other row is kept
is_july_2019 = (df_silo['year'] == 2019) & (df_silo['month'] == 7)
df_silo_drop_july = df_silo[~is_july_2019]

In [64]:
# Show (rows, columns) of the SILO frame with the July 2019 observations dropped,
# in readiness to concat the BOM observations
df_silo_drop_july.shape

(309870, 4)

## Concat BOM July to main SILO data

In [65]:
# Stack the SILO history (July 2019 removed) on top of the BOM July rows
concat = pd.concat(objs=[df_silo_drop_july, df_bom], axis=0)

In [66]:
# Use the station number as the row index of the combined frame
concat = concat.set_index('station')

In [67]:
# Order the combined rows by station, then year, then month, now that the
# monthly rainfall has been concatenated onto the main dataframe
concat_sort = concat.sort_values(by=['station', 'year', 'month'], ascending=True)

In [68]:
# Show (rows, columns) of the sorted, combined frame (station is the index, so it is not counted as a column)
concat_sort.shape

(310068, 3)

## Create a groupby object. Can then get specific station and compare to BOM

#### The longer a station has been open, the more closely aligned the means, medians, etc. are
#### Interpolation estimates have less of an effect

In [69]:
# Group the combined rainfall records by station so a single station can be
# pulled out with gb.get_group(<station number>).
# The key is passed as a scalar, not a one-element list: with a length-1 list,
# newer pandas versions expect get_group to be given a 1-tuple such as (8008,)
# and warn on a bare station number.
gb = concat.groupby('station')

In [71]:
# Per-month rainfall summary (mean, min, median, max) for station 8008
station_8008 = gb.get_group(8008)
station_8008.groupby(['month'])['rain'].agg(['mean', 'min', 'median', 'max'])

Unnamed: 0_level_0,mean,min,median,max
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,12.427692,0.0,2.45,125.8
2,14.957252,0.0,5.4,152.9
3,18.408397,0.0,8.4,176.1
4,21.338168,0.0,15.8,100.5
5,56.205344,0.5,50.5,156.9
6,80.649618,12.0,78.5,236.1
7,80.89542,13.6,76.9,229.0
8,61.200769,6.6,59.35,153.7
9,37.838462,2.3,34.2,108.5
10,24.237692,0.4,18.6,102.1


### Send concatenated dataframe to data/interim folder to use in percentile analysis.
### See if using most recent monthly rainfall from BOM gives a better result

In [72]:
# Write the sorted, combined rainfall frame to the interim data folder
concat_sort.to_csv(
    path_or_buf=r'C:\Users\rj71b\geo-projects\wheatbelt_rainfall_analyser\data\interim\silo_bom_july_concat.csv',
)

In [73]:
# List the distinct station numbers present in the index of the exported frame
concat_sort.index.unique()

Int64Index([ 8002,  8005,  8008,  8013,  8016,  8025,  8028,  8037,  8044,
             8050,
            ...
            10917, 12011, 12026, 12044, 12064, 12071, 12083, 12201, 12223,
            12320],
           dtype='int64', name='station', length=198)