In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [9]:
# Load the raw rainfall table, parse its period bounds into datetimes and
# derive a `year` column from the start of each period.
rainfall_df = pd.read_csv('../data/raw/rainfall_annual_seasonal.csv').assign(
    period_start=lambda df: pd.to_datetime(df['period_start']),
    period_end=lambda df: pd.to_datetime(df['period_end']),
    # `assign` evaluates keywords in order, so `period_start` is already a
    # datetime column by the time `year` is computed.
    year=lambda df: df['period_start'].dt.year,
)
print(f"Year range: {rainfall_df['year'].min()} to {rainfall_df['year'].max()}")
print(f"\nUnique years: {rainfall_df['year'].nunique()}")

# Study window, inclusive on both ends. The upper bound is applied explicitly
# so the "1990-2022" labels stay true even if the raw file later gains newer
# years (a bare `>= 1990` would silently let them through).
START_YEAR, END_YEAR = 1990, 2022
in_study_window = rainfall_df['year'].between(START_YEAR, END_YEAR)

# `.copy()` detaches the result from `rainfall_df` so later edits to the
# cleaned frame cannot trigger SettingWithCopyWarning.
rainfall_clean = rainfall_df[in_study_window].copy()
print(f"Original records: {len(rainfall_df):,}")
print(f"Filtered ({START_YEAR}-{END_YEAR}): {len(rainfall_clean):,}")
print(f"Years: {rainfall_clean['year'].min()} to {rainfall_clean['year'].max()}")
print(f"\nSites: {rainfall_clean['site'].nunique()}")
print(f"Seasons: {rainfall_clean['season'].unique()}")


Year range: 1960 to 2022

Unique years: 63
Original records: 9,420
Filtered (1990-2022): 4,920
Years: 1990 to 2022

Sites: 30
Seasons: ['Spring' 'Winter' 'Autumn' 'Annual' 'Summer']


In [11]:
# Load the raw temperature table and report its extent and missing values
# before any filtering.
temp_df = pd.read_csv('../data/raw/temp_annual_seasonal.csv')

print(f"Original shape: {temp_df.shape}")
print(f"Year range: {temp_df['year'].min()} to {temp_df['year'].max()}")
print("\nMissing values:")
print(temp_df[['temperature', 'anomaly']].isnull().sum())

# Study window, inclusive on both ends. The upper bound is applied explicitly
# so the data cannot drift past the "1990-2022" period reported downstream
# (a bare `>= 1990` would keep any newer years added to the raw file).
START_YEAR, END_YEAR = 1990, 2022
temp_in_window = temp_df[temp_df['year'].between(START_YEAR, END_YEAR)]

print(f"Filtered records: {len(temp_in_window):,}")
print(f"\nMissing temps after filter: {temp_in_window['temperature'].isnull().sum()}")

# Drop rows with missing temperature. On the recorded run nothing is missing
# inside the window, so this is a guard rather than an active step. The
# filtered frame keeps its own name so `temp_clean` is assigned exactly once.
temp_clean = temp_in_window.dropna(subset=['temperature']).copy()

print(f"After dropping missing temps: {len(temp_clean):,}")
print(f"Missing temps now: {temp_clean['temperature'].isnull().sum()}")
print(f"\nSites: {temp_clean['site'].nunique()}")
print(f"Statistics types: {temp_clean['statistic'].unique()}")

Original shape: (58488, 12)
Year range: 1928 to 2022

Missing values:
temperature      516
anomaly        11082
dtype: int64
Filtered records: 29,700

Missing temps after filter: 0
After dropping missing temps: 29,700
Missing temps now: 0

Sites: 30
Statistics types: ['Average' 'Maximum' 'Minimum']


In [12]:
# Save cleaned datasets
# `to_csv` does not create missing parent folders, so make sure the output
# directory exists first; this is a no-op when it is already there and lets
# the notebook run end-to-end on a fresh checkout.
processed_dir = Path('../data/processed')
processed_dir.mkdir(parents=True, exist_ok=True)

# index=False: the row index is not written to the files.
rainfall_clean.to_csv(processed_dir / 'rainfall_clean.csv', index=False)
temp_clean.to_csv(processed_dir / 'temperature_clean.csv', index=False)

print("‚úÖ Cleaned data saved to data/processed/")
print("\nüìä Final datasets:")
print(f"  Rainfall: {len(rainfall_clean):,} records")
print(f"  Temperature: {len(temp_clean):,} records")
# NOTE(review): the period and site count below are hard-coded text, not
# derived from the frames — confirm they still hold if the inputs change.
print("  Period: 1990-2022 (33 years)")
print("  Sites: 30 locations across NZ")

‚úÖ Cleaned data saved to data/processed/

üìä Final datasets:
  Rainfall: 4,920 records
  Temperature: 29,700 records
  Period: 1990-2022 (33 years)
  Sites: 30 locations across NZ


In [13]:
# Summary statistics
# Recap of both cleaned tables: record count, number of distinct sites, and
# the mean over the rows labelled 'Annual'.
rule = "=" * 60
print(rule)
print("DATA CLEANING SUMMARY")
print(rule)

# Rainfall: mean of `precipitation` across the 'Annual' season rows.
annual_precip = rainfall_clean.loc[rainfall_clean['season'] == 'Annual', 'precipitation']
print("\nüåßÔ∏è RAINFALL (1990-2022):")
print(f"  Records: {len(rainfall_clean):,}")
print(f"  Sites: {rainfall_clean['site'].nunique()}")
print(f"  Avg annual rainfall: {annual_precip.mean():.1f} mm")

# Temperature: keep only rows that are both 'Annual' and the 'Average'
# statistic, then average their `temperature`.
is_annual = temp_clean['season'] == 'Annual'
is_average = temp_clean['statistic'] == 'Average'
avg_temp = temp_clean[is_annual & is_average]
print("\nüå°Ô∏è TEMPERATURE (1990-2022):")
print(f"  Records: {len(temp_clean):,}")
print(f"  Sites: {temp_clean['site'].nunique()}")
print(f"  Avg annual temp: {avg_temp['temperature'].mean():.1f}¬∞C")

DATA CLEANING SUMMARY

üåßÔ∏è RAINFALL (1990-2022):
  Records: 4,920
  Sites: 30
  Avg annual rainfall: 1278.7 mm

üå°Ô∏è TEMPERATURE (1990-2022):
  Records: 29,700
  Sites: 30
  Avg annual temp: 12.6¬∞C
