# Extract all Cloud Amount - Annual Mean Monthly 3pm readings

An archive containing high quality rainfall data from the BOM has been downloaded from here:

http://www.bom.gov.au/climate/change/hqsites/about-hq-site-data.shtml 

There is a file with the list of stations and then a zip file for each station containing that station’s data.

Create a notebook that will extract all the folders, and extract all the files, filters just the latest data according the minYearToLoad and Saves the final file into the ./data_files/ folder for future processing


In [4]:
import pandas as pd
import os
from pathlib import Path
from unlzw import unlzw

# For testing, set this a small number to just do a few files. Otherwise, set to 9999 to do unlimited (basically)
maxFiles = 9999
# When extracting the rainfall data per station, set to what year you what to go back to
minYearToLoad = 2014

sourceFolder = "./data_files_raw/Cloud_Amount_3pm/extracted/"
workingSubFolder = "./data_files_raw/Cloud_Amount_3pm/working/"
dataFolder = "./data_files/"
stationsFile = "HQMC_stations"

if not os.path.exists(workingSubFolder):
  Path(workingSubFolder).mkdir(parents=True,exist_ok=True)      


if not os.path.exists(dataFolder):
  Path(dataFolder).mkdir(parents=True,exist_ok=True)        

First, load the stations file into a dataframe. Since the delimiters are only spaces, but there are spaces in words, need to do it a bit more manually  

In [5]:
lstStationId = []
lstLatitude = []
lstLongitude = []
lstElevationMetres = []
lstStationName = []

stationName = ""
with open(sourceFolder + stationsFile, "r") as station_file:
  for line in station_file:
    line = line.strip()
    tokens = line.split(" ")

    for i in range(4, len(tokens)):    
      if i == 4:
        stationName = tokens[4]
      else:
        stationName += " " + tokens[i]    

    lstStationId.append(tokens[0])
    lstLatitude.append(float(tokens[1]))
    lstLongitude.append(float(tokens[2]))
    lstElevationMetres.append(float(tokens[3]))    
    lstStationName.append(stationName)

In [6]:
dfStations = pd.DataFrame(
  { 
    "StationId" : lstStationId,
    "Latitude" : lstLatitude,
    "Longitude" : lstLongitude,
    "ElevationMs" : lstElevationMetres, 
    "StationName" : lstStationName
  }  
)

del lstStationId
del lstLatitude
del lstLongitude
del lstElevationMetres
del lstStationName

dfStations.head(10)


Unnamed: 0,StationId,Latitude,Longitude,ElevationMs,StationName
0,1013,-15.49,128.12,9999.9,WYNDHAM POST OFFICE
1,2012,-18.23,127.66,9999.9,HALLS CREEK AIRPORT
2,2032,-17.02,128.22,9999.9,WARMUN
3,3003,-17.95,122.23,9999.9,BROOME AIRPORT
4,4032,-20.37,118.63,9999.9,PORT HEDLAND AIRPORT
5,4035,-20.78,117.15,9999.9,ROEBOURNE
6,5007,-22.24,114.1,9999.9,LEARMONTH AP
7,5016,-21.64,115.11,9999.9,ONSLOW
8,5026,-22.24,118.34,9999.9,WITTENOOM
9,6011,-24.88,113.67,9999.9,CARNARVON AIRPORT


Loop through each file in extracted. All files that end in .Z, unzip to a subfolder in the working folder

In [7]:
# For testing, set this a small number to just do a few files. Otherwise, set to 9999 to do unlimited (basically)
stepper = 0

lstCloudStationId = []
lstCloudFromYear = []
lstCloudFromMonth = []
lstCloudFromDay = []
lstCloudToYear = []
lstCloudToMonth = []
lstCloudToDay = []
lstCloudValue = []

for filename in os.listdir(sourceFolder):
  if os.path.isfile(sourceFolder+filename) and filename.lower().endswith(".z"):
    if (stepper > maxFiles):
      break
    
    # This is one of the zip files, extract it to a subfolder in the working folder, named after the file
    if not os.path.exists(workingSubFolder + filename):
      Path(workingSubFolder + filename).mkdir(parents=True,exist_ok=True) 

    fh = open(sourceFolder+filename, 'rb')
    compressed_data = fh.read()
    uncompressed_data = unlzw(compressed_data)    
    fh.close()

    fileContents = uncompressed_data.decode("utf-8")
    
    fileLines = fileContents.split("\n")

    if len(fileLines) > 0:
      # The first line is some header, just extract the stationid from it

      firstLine = fileLines[0]
      firstLine = firstLine.replace("CLD3PM", "").strip()
      stationId = firstLine[0:6]

      for i in range(1, len(fileLines)):        
      #for i in range(1, 3):      
        fileLines[i] = fileLines[i].replace("     ", " ")
        tokens = fileLines[i].split(" ")

        if len(tokens) == 3:

          year = int(tokens[0][0:4])
        
          if year >= minYearToLoad:
            month = int(tokens[0][5:6])
            day = int(tokens[0][7:8])

            yearTo = int(tokens[1][0:4])
            monthTo = int(tokens[1][5:6])
            dayTo = int(tokens[1][7:8])

            value = float(tokens[2])

            lstCloudStationId.append(stationId)
            lstCloudFromYear.append(year)
            lstCloudFromMonth.append(month)
            lstCloudFromDay.append(day)
            lstCloudToYear.append(yearTo)
            lstCloudToMonth.append(monthTo)
            lstCloudToDay.append(dayTo)
            lstCloudValue.append(value)

    stepper += 1



In [8]:
dfEvap = pd.DataFrame(
  { 
    "StationId" : lstCloudStationId,
    "Year" : lstCloudFromYear,
    "Month" : lstCloudFromMonth,
    "Day" : lstCloudFromDay, 
    "YearTo" : lstCloudToYear,
    "MonthTo" : lstCloudToMonth,
    "DayTo" : lstCloudToDay, 
    "Value" : lstCloudValue
  }  
)
dfEvap.head(10)

Unnamed: 0,StationId,Year,Month,Day,YearTo,MonthTo,DayTo,Value
0,1013,2014,1,1,2014,2,1,2.9
1,1013,2014,1,1,2014,1,1,4.5
2,1013,2014,2,1,2014,2,8,5.5
3,1013,2014,3,1,2014,3,1,3.2
4,1013,2014,4,1,2014,4,0,3.1
5,1013,2014,5,1,2014,5,1,2.9
6,1013,2014,6,1,2014,6,0,0.9
7,1013,2014,7,1,2014,7,1,1.1
8,1013,2014,8,1,2014,8,1,0.2
9,1013,2014,9,1,2014,9,0,0.9


Join on the Stations

In [9]:
dfMerged = pd.merge(dfStations, dfEvap, on="StationId")

In [10]:
dfMerged.to_csv(dataFolder + "annual_monthly_mean_cloud_amount_3pm_" + str(minYearToLoad) + ".csv", index=False)