# Extract all Daily Max Temperature

An archive containing high-quality temperature data from the BOM has been downloaded from here:

http://www.bom.gov.au/climate/change/hqsites/about-hq-site-data.shtml 

The archive contains an ACORN_SAT_daily folder; the v2.2 tmax file was taken from it. Inside is a set of CSV files, one per station. The list of stations and their details was sourced separately.

This notebook loads all the station files, joins them with the station details, filters to just the most recent data according to minYearToLoad, then saves the final file into the ./data_files/ folder for future processing.


In [58]:
import pandas as pd
import os
from pathlib import Path

from utilities import data_basic_utility as databasic

# For testing, set this to a small number to load just a few files. Otherwise, set to 9999 to do unlimited (basically)
maxFiles = 9999
# When extracting the temperature data per station, the earliest year to keep
minYearToLoad = 2014

sourceFolder = "./data_files_raw/Daily_Temp_Max_v2_2/extracted/"
dataFolder = "./data_files/"
stationsFile = "./data_files_raw/acorn_sat_sites_scrape.csv"

# Create the output folder if it doesn't exist.
# exist_ok=True already makes this a no-op when the folder is present, so no separate existence check is needed.
Path(dataFolder).mkdir(parents=True, exist_ok=True)

First, load the stations file into a dataframe. 
Convert the Station Number to a string and pad it on the left with zeros to six characters (e.g. 1019 becomes 001019)

In [59]:
# Load the station list scraped from the ACORN-SAT site details
dfStations = pd.read_csv(stationsFile)

# Turn the station number into a six-character, zero-padded string.
# Only the one column is needed, so map over it instead of applying a function to every full row.
dfStations["stn_num"] = dfStations["stn_num"].map(lambda stnNum: databasic.padStringToLength(str(stnNum), "0", 6))

# info() prints its own report and returns None, so wrapping it in print() only adds a stray "None" line
dfStations.info()
dfStations.head(10)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112 entries, 0 to 111
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   stn_num       112 non-null    object 
 1   stn_name      112 non-null    object 
 2   lat           112 non-null    float64
 3   lon           112 non-null    float64
 4   elevation     112 non-null    int64  
 5   start         112 non-null    int64  
 6   sitesDB_name  112 non-null    object 
dtypes: float64(2), int64(2), object(3)
memory usage: 6.2+ KB
None


Unnamed: 0,stn_num,stn_name,lat,lon,elevation,start,sitesDB_name
0,1019,Kalumburu,-14.3,126.65,23,1941,KALUMBURU
1,2079,Halls Creek,-18.23,127.67,409,1910,HALLS CREEK
2,3003,Broome,-17.95,122.24,7,1910,BROOME
3,4032,Port Hedland,-20.37,118.63,6,1912,PORT HEDLAND
4,4106,Marble Bar,-21.18,119.75,182,1910,MARBLE BAR
5,5007,Learmonth,-22.24,114.1,5,1975,LEARMONTH
6,5098,Karijini North (Wittenoom),-22.3,118.45,474,1951,WITTENOOM
7,6011,Carnarvon,-24.89,113.67,4,1910,CARNARVON
8,7045,Meekatharra,-26.61,118.54,517,1926,MEEKATHARRA
9,8296,Morawa,-29.2,116.02,271,1925,MORAWA


Reformat the table slightly

In [60]:
# Discard the columns that are not needed downstream, then give the
# remaining ones the names used in the output file
dfStations = (
  dfStations
  .drop(columns=["stn_name", "start"])
  .rename(columns={
    "stn_num": "StationId",
    "lat": "Latitude",
    "lon": "Longitude",
    "elevation": "ElevationMs",
    "sitesDB_name": "StationName",
  })
)
dfStations.head(5)

Unnamed: 0,StationId,Latitude,Longitude,ElevationMs,StationName
0,1019,-14.3,126.65,23,KALUMBURU
1,2079,-18.23,127.67,409,HALLS CREEK
2,3003,-17.95,122.24,7,BROOME
3,4032,-20.37,118.63,6,PORT HEDLAND
4,4106,-21.18,119.75,182,MARBLE BAR


Examine the first file

In [61]:
# Load a single station file as a worked example.
# "site number" is read as text and "date" is parsed into datetimes.
dfTempsFile = pd.read_csv(sourceFolder+"tmax.001019.daily.csv", dtype={ "site number": "str" }, parse_dates=["date"])

# Get the site number and name from the first row (the info row) before it is filtered out
stationId = databasic.padStringToLength(dfTempsFile.loc[0, "site number"], "0", 6)
stationName = dfTempsFile.loc[0, "site name"]
print(stationId)
print(stationName)

# Keep only rows that have a date, which removes the info row.
# Take an explicit copy so the column assignments below work on an independent frame, not a slice of the original.
dfTempsFile = dfTempsFile[dfTempsFile["date"].notna()].copy()

# Split the date column into Year Month Day
dfTempsFile["Year"] = dfTempsFile["date"].dt.year
dfTempsFile["Month"] = dfTempsFile["date"].dt.month
dfTempsFile["Day"] = dfTempsFile["date"].dt.day

# Put the station id and name on every row
dfTempsFile["site number"] = stationId
dfTempsFile["site name"] = stationName

del dfTempsFile["date"]

# info() prints its own report and returns None, so it is not wrapped in print()
dfTempsFile.info()
dfTempsFile.head()

001019
KALUMBURU
<class 'pandas.core.frame.DataFrame'>
Int64Index: 29158 entries, 1 to 29158
Data columns (total 6 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   maximum temperature (degC)  28378 non-null  float64
 1   site number                 29158 non-null  object 
 2   site name                   29158 non-null  object 
 3   Year                        29158 non-null  int64  
 4   Month                       29158 non-null  int64  
 5   Day                         29158 non-null  int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 1.6+ MB
None


Unnamed: 0,maximum temperature (degC),site number,site name,Year,Month,Day
1,31.3,1019,KALUMBURU,1941,9,1
2,31.3,1019,KALUMBURU,1941,9,2
3,30.8,1019,KALUMBURU,1941,9,3
4,39.1,1019,KALUMBURU,1941,9,4
5,32.4,1019,KALUMBURU,1941,9,5


Now loop through all the files in the extracted folder; each file holds the data for one station

In [62]:
def loadStationTemps(csvPath, minYear):
  """Load one station's daily max temperature file, keeping records from minYear onwards.

  Args:
    csvPath: path of the station CSV file to read.
    minYear: earliest year (inclusive) to keep.

  Returns:
    A DataFrame with the temperature column plus "site number", "site name",
    "Year", "Month" and "Day"; the "date" column is removed.
  """
  dfStation = pd.read_csv(csvPath, dtype={ "site number": "str" }, parse_dates=["date"])

  # Get station id and name from the first row before that row is filtered out
  stationId = databasic.padStringToLength(dfStation.loc[0, "site number"], "0", 6)
  stationName = dfStation.loc[0, "site name"]

  # Keep only rows that have a date; copy so the assignments below work on an independent frame
  dfStation = dfStation[dfStation["date"].notna()].copy()

  # Split the date column into Year Month Day
  dfStation["Year"] = dfStation["date"].dt.year
  dfStation["Month"] = dfStation["date"].dt.month
  dfStation["Day"] = dfStation["date"].dt.day
  dfStation["site number"] = stationId
  dfStation["site name"] = stationName

  del dfStation["date"]

  # Filter out records that aren't the most recent years
  return dfStation[dfStation["Year"] >= minYear]


# Each CSV in the source folder is the data for one station.
# os.listdir returns names in arbitrary order, so sort them to make the row order reproducible.
stationFiles = sorted(
  filename for filename in os.listdir(sourceFolder)
  if os.path.isfile(sourceFolder+filename) and filename.lower().endswith(".csv")
)

# The slice caps the run at exactly maxFiles files
stationFrames = [loadStationTemps(sourceFolder+filename, minYearToLoad) for filename in stationFiles[:maxFiles]]

if not stationFrames:
  raise ValueError(f"No station CSV files were loaded from {sourceFolder} (maxFiles={maxFiles})")

# Concatenate once at the end rather than re-copying the growing frame for every file
dfAllTemps = pd.concat(stationFrames)


In [63]:
# Summarise the combined frame.
# info() prints its own report and returns None, so wrapping it in print() only adds a stray "None" line.
dfAllTemps.info()
dfAllTemps.head()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 306653 entries, 26421 to 28154
Data columns (total 6 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   maximum temperature (degC)  301819 non-null  float64
 1   site number                 306653 non-null  object 
 2   site name                   306653 non-null  object 
 3   Year                        306653 non-null  int64  
 4   Month                       306653 non-null  int64  
 5   Day                         306653 non-null  int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 16.4+ MB
None


Unnamed: 0,maximum temperature (degC),site number,site name,Year,Month,Day
26421,38.0,1019,KALUMBURU,2014,1,1
26422,37.5,1019,KALUMBURU,2014,1,2
26423,39.2,1019,KALUMBURU,2014,1,3
26424,30.5,1019,KALUMBURU,2014,1,4
26425,37.2,1019,KALUMBURU,2014,1,5


In [64]:
# Show the first few distinct station id / name pairs that were loaded
dfAllTemps.loc[:, ["site number", "site name"]].drop_duplicates().head(5)

Unnamed: 0,site number,site name
26421,1019,KALUMBURU
37987,2079,HALLS CREEK AIRPORT
37986,3003,BROOME AIRPORT
36974,4032,PORT HEDLAND AIRPORT
37987,4106,MARBLE BAR


Join on the Stations

In [65]:
# Drop the site name column and rename the site number so its column
# name matches the StationId key in dfStations
dfAllTemps = (
  dfAllTemps
  .drop(columns=["site name"])
  .rename(columns={"site number": "StationId"})
)

In [66]:
# Attach the station details to every temperature record.
# This is an inner join, so only station ids present in both frames are kept.
dfMerged = dfStations.merge(dfAllTemps, how="inner", on="StationId")
dfMerged.head(5)

Unnamed: 0,StationId,Latitude,Longitude,ElevationMs,StationName,maximum temperature (degC),Year,Month,Day
0,1019,-14.3,126.65,23,KALUMBURU,38.0,2014,1,1
1,1019,-14.3,126.65,23,KALUMBURU,37.5,2014,1,2
2,1019,-14.3,126.65,23,KALUMBURU,39.2,2014,1,3
3,1019,-14.3,126.65,23,KALUMBURU,30.5,2014,1,4
4,1019,-14.3,126.65,23,KALUMBURU,37.2,2014,1,5


In [67]:
# Write the merged records to the data folder; the file name carries the first year loaded
dfMerged.to_csv(f"{dataFolder}daily_max_temp_{minYearToLoad}.csv", index=False)