In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# Display configuration for the rest of the notebook:
#   - max_columns=None : never elide columns when a wide frame is shown
#   - max_rows=100     : frames longer than 100 rows are shown truncated
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Status line confirming that the import/configuration cell ran to completion.
print("✅ Libraries loaded successfully")

✅ Libraries loaded successfully


In [2]:
# Check which raw files are available.
# `data_path` is relative to the notebook's working directory.
data_path = '../data/raw/'
files = os.listdir(data_path)

print("📁 Files in data/raw/:")
# os.listdir returns entries in arbitrary order; sort for a stable listing.
for file in sorted(files):
    print(f"  - {file}")

📁 Files in data/raw/:
  - rainfall_annual_seasonal.csv
  - rainfall_annual_seasonal_dictionary.csv
  - rainfall_daily.csv
  - rainfall_daily_dictionary.csv
  - rainfall_trends.csv
  - rainfall_trends_dictionary.csv
  - temp_annual_seasonal.csv
  - temp_annual_seasonal_dictionary.csv
  - temp_daily_dictionary.csv
  - temp_daily_part1.csv
  - temp_daily_part2.csv
  - temp_daily_part3.csv
  - temp_national.csv
  - temp_national_dictionary.csv
  - temp_national_trends.csv
  - temp_national_trends_dictionary.csv
  - temp_trends.csv
  - temp_trends_dictionary.csv
  - warm_days.csv
  - warm_days_dictionary.csv
  - warm_days_trends.csv
  - warm_days_trends_dictionary.csv


In [4]:
# Load rainfall annual/seasonal data.
# Build the path from `data_path` (set in the file-listing cell) so the raw-data
# directory is defined in exactly one place.
rainfall_df = pd.read_csv(os.path.join(data_path, 'rainfall_annual_seasonal.csv'))

# First-look inspection: size, column names, a sample of rows, the dtypes pandas
# inferred, and the per-column count of missing values.
print("RAINFALL DATA - Annual & Seasonal")
print(f"Shape: {rainfall_df.shape}")
print(f"\nColumns:\n{rainfall_df.columns.tolist()}")
print(f"\nFirst 5 rows:")
print(rainfall_df.head())
print(f"\nData types:")
print(rainfall_df.dtypes)
print(f"\nMissing values:")
print(rainfall_df.isnull().sum())

RAINFALL DATA - Annual & Seasonal
Shape: (9420, 10)

Columns:
['site', 'season', 'precipitation', 'precipitation_units', 'period_start', 'period_end', 'lat', 'lon', 'anomaly_1961', 'anomaly_1991']

First 5 rows:
                              site  season  precipitation precipitation_units  \
0              Auckland (Auckland)  Spring          386.0                  mm   
1           Blenheim (Marlborough)  Spring          194.3                  mm   
2        Christchurch (Canterbury)  Spring          183.0                  mm   
3  Dannevirke (Manawatū-Whanganui)  Spring          312.6                  mm   
4                  Dunedin (Otago)  Spring          119.8                  mm   

  period_start  period_end       lat        lon  anomaly_1961  anomaly_1991  
0   2022-09-01  2022-11-30 -37.00813  174.78873         130.5         135.1  
1   2022-09-01  2022-11-30 -41.52133  173.86439          23.1          14.8  
2   2022-09-01  2022-11-30 -43.49300  172.53700          47.1     

In [5]:
# Load temperature annual/seasonal data.
# Build the path from `data_path` (set in the file-listing cell) so the raw-data
# directory is defined in exactly one place.
temp_df = pd.read_csv(os.path.join(data_path, 'temp_annual_seasonal.csv'))

# First-look inspection: size, column names, a sample of rows, the dtypes pandas
# inferred, and the per-column count of missing values.
print("TEMPERATURE DATA - Annual & Seasonal")

print(f"Shape: {temp_df.shape}")
print(f"\nColumns:\n{temp_df.columns.tolist()}")
print(f"\nFirst 5 rows:")
print(temp_df.head())
print(f"\nData types:")
print(temp_df.dtypes)
print(f"\nMissing values:")
print(temp_df.isnull().sum())

TEMPERATURE DATA - Annual & Seasonal
Shape: (58488, 12)

Columns:
['site', 'statistic', 'season', 'year', 'prop_missing', 'temperature', 'period_start', 'period_end', 'lat', 'lon', 'anomaly', 'reference_period']

First 5 rows:
                  site statistic  season  year  prop_missing  temperature  \
0  Auckland (Auckland)   Average  Autumn  2022           0.0    17.800000   
1  Auckland (Auckland)   Average  Spring  2022           0.0    15.533333   
2  Auckland (Auckland)   Average  Summer  2022           0.0    21.600000   
3  Auckland (Auckland)   Average  Winter  2022           0.0    13.066667   
4  Auckland (Auckland)   Maximum  Autumn  2022           0.0    21.966667   

  period_start  period_end       lat        lon   anomaly reference_period  
0   2022-03-01  2022-05-31 -37.00813  174.78873  1.641333        1961-1990  
1   2022-09-01  2022-11-30 -37.00813  174.78873  1.124000        1961-1990  
2   2021-12-01  2022-02-28 -37.00813  174.78873  2.527778        1961-1990  
3 

In [6]:
# Summary of datasets
print("\n" + "=" * 60)
print("DATA INVENTORY SUMMARY")
print("=" * 60)

# The rainfall table has no 'year' column, so derive the covered years from the
# data instead of hard-coding them; the figure then stays correct if the CSV is
# refreshed.
# NOTE(review): assumes each period is labelled by the year it ends in, as the
# temperature sample rows suggest (Summer 2022 runs 2021-12-01 to 2022-02-28).
# Confirm the result matches the previously hard-coded 1960-2022.
rainfall_years = pd.to_datetime(rainfall_df['period_end']).dt.year

print("\n📊 RAINFALL DATA:")
print(f"  • Sites: {rainfall_df['site'].nunique()}")
print(f"  • Seasons: {rainfall_df['season'].nunique()}")
print(f"  • Year range: {rainfall_years.min()}-{rainfall_years.max()}")
print(f"  • Total records: {len(rainfall_df):,}")
# Total missing cells across every column of the rainfall table.
print(f"  • Missing values: {rainfall_df.isnull().sum().sum()}")

print("\n🌡️ TEMPERATURE DATA:")
print(f"  • Sites: {temp_df['site'].nunique()}")
print(f"  • Statistics: {temp_df['statistic'].unique()}")
print(f"  • Year range: {temp_df['year'].min()}-{temp_df['year'].max()}")
print(f"  • Total records: {len(temp_df):,}")
# Missing counts are reported per column here because only 'temperature' and
# 'anomaly' are inspected for the temperature table.
print(f"  • Missing temps: {temp_df['temperature'].isnull().sum()}")
print(f"  • Missing anomalies: {temp_df['anomaly'].isnull().sum()}")


DATA INVENTORY SUMMARY

📊 RAINFALL DATA:
  • Sites: 30
  • Seasons: 5
  • Year range: 1960-2022
  • Total records: 9,420
  • Missing values: 0

🌡️ TEMPERATURE DATA:
  • Sites: 30
  • Statistics: ['Average' 'Maximum' 'Minimum']
  • Year range: 1928-2022
  • Total records: 58,488
  • Missing temps: 516
  • Missing anomalies: 11082
