# Extract all Daily Rainfall

An archive containing high quality rainfall data from the BOM has been downloaded from here:

http://www.bom.gov.au/climate/change/hqsites/about-hq-site-data.shtml 

There is a file with the list of stations and then a zip file for each station containing that station’s data.

Create a notebook that will extract all the folders, and extract all the files, filters just the latest data according the minYearToLoad and Saves the final file into the ./data_files/ folder for future processing


In [41]:
import pandas as pd
import os
from pathlib import Path
from unlzw import unlzw

# For testing, set this a small number to just do a few files. Otherwise, set to 9999 to do unlimited (basically)
maxFiles = 9999
# When extracting the rainfall data per station, set to what year you what to go back to
minYearToLoad = 2014

sourceFolder = "./data_files_raw/Daily_Rainfall/extracted/"
workingSubFolder = "./data_files_raw/Daily_Rainfall/working/"
dataFolder = "./data_files/"
stationsFile = "HQDR_stations.txt"

if not os.path.exists(workingSubFolder):
  Path(workingSubFolder).mkdir(parents=True,exist_ok=True)      


if not os.path.exists(dataFolder):
  Path(dataFolder).mkdir(parents=True,exist_ok=True)        

First, load the stations file into a dataframe. Since the delimiters are only spaces, but there are spaces in words, need to do it a bit more manually  

In [42]:
lstStationId = []
lstLatitude = []
lstLongitude = []
lstElevationMetres = []
lstStationName = []

stationName = ""
with open(sourceFolder + stationsFile, "r") as station_file:
  for line in station_file:
    line = line.strip()
    tokens = line.split(" ")

    for i in range(4, len(tokens)):    
      if i == 4:
        stationName = tokens[4]
      else:
        stationName += " " + tokens[i]    

    lstStationId.append(tokens[0])
    lstLatitude.append(float(tokens[1]))
    lstLongitude.append(float(tokens[2]))
    lstElevationMetres.append(float(tokens[3]))    
    lstStationName.append(stationName)

In [43]:
dfStations = pd.DataFrame(
  { 
    "StationId" : lstStationId,
    "Latitude" : lstLatitude,
    "Longitude" : lstLongitude,
    "ElevationMs" : lstElevationMetres, 
    "StationName" : lstStationName
  }  
)

del lstStationId
del lstLatitude
del lstLongitude
del lstElevationMetres
del lstStationName

dfStations.head(10)


Unnamed: 0,StationId,Latitude,Longitude,ElevationMs,StationName
0,4035,-20.78,117.15,12.0,ROEBOURNE
1,5008,-21.19,115.98,11.0,MARDIE
2,6055,-27.75,115.83,300.0,WOOLGORONG
3,7007,-26.98,116.54,300.0,BOOLARDY
4,7057,-28.06,117.84,426.0,MOUNT MAGNET
5,7095,-28.23,117.65,400.0,YOWERAGABBIE
6,8066,-30.7,117.06,310.0,KOKARDINE
7,8079,-29.02,115.62,260.0,MANARRA
8,8088,-29.19,115.44,153.0,MINGENEW POST OFFICE
9,8106,-29.37,116.4,280.0,PERANGERY


Loop through each file in extracted. All files that end in .Z, unzip to a subfolder in the working folder

In [44]:
# For testing, set this a small number to just do a few files. Otherwise, set to 9999 to do unlimited (basically)
stepper = 0

lstRainStationId = []
lstRainYear = []
lstRainMonth = []
lstRainDay = []
lstRainValue = []

for filename in os.listdir(sourceFolder):
  if os.path.isfile(sourceFolder+filename) and filename.lower().endswith(".z"):
    if (stepper > maxFiles):
      break
    
    # This is one of the zip files, extract it to a subfolder in the working folder, named after the file
    if not os.path.exists(workingSubFolder + filename):
      Path(workingSubFolder + filename).mkdir(parents=True,exist_ok=True) 

    fh = open(sourceFolder+filename, 'rb')
    compressed_data = fh.read()
    uncompressed_data = unlzw(compressed_data)    
    fh.close()

    fileContents = uncompressed_data.decode("utf-8")
    
    fileLines = fileContents.split("\n")

    if len(fileLines) > 0:
      # The first line is some header, just extract the stationid from it

      firstLine = fileLines[0]
      firstLine = firstLine.replace("PRCP", "").strip()
      stationId = firstLine[0:6]

      for i in range(1, len(fileLines)):        
      #for i in range(1, 3):      
        fileLines[i] = fileLines[i].replace("     ", " ")
        tokens = fileLines[i].split(" ")

        if len(tokens) == 2:

          year = int(tokens[0][0:4])
        
          if year >= minYearToLoad:
            month = int(tokens[0][5:6])
            day = int(tokens[0][7:8])
            value = float(tokens[1])

            lstRainStationId.append(stationId)
            lstRainYear.append(year)
            lstRainMonth.append(month)
            lstRainDay.append(day)
            lstRainValue.append(value)

    stepper += 1



In [45]:
dfRainfall = pd.DataFrame(
  { 
    "StationId" : lstRainStationId,
    "Year" : lstRainYear,
    "Month" : lstRainMonth,
    "Day" : lstRainDay, 
    "Value" : lstRainValue
  }  
)
dfRainfall.head(10)

Unnamed: 0,StationId,Year,Month,Day,Value
0,4035,2014,1,2,0.0
1,4035,2014,1,3,0.0
2,4035,2014,1,4,0.0
3,4035,2014,1,5,0.0
4,4035,2014,1,6,0.0
5,4035,2014,1,7,0.0
6,4035,2014,1,8,0.0
7,4035,2014,1,9,0.0
8,4035,2014,1,0,0.0
9,4035,2014,1,1,0.0


Join on the Stations

In [46]:
dfMerged = pd.merge(dfStations, dfRainfall, on="StationId")

In [47]:
dfMerged.to_csv(dataFolder + "daily_rainfall_" + str(minYearToLoad) + ".csv", index=False)