# Process Climate Prediction Center (CPC) Outlook

### Prepare Workspace

In [1]:
# Import system libraries
import os
import sys

# Import data manipulation libraries
import pandas as pd
from shapely.geometry import Polygon

# Import geospatial libraries
import geopandas as gpd

# Move into the repository root so the relative 'assets/...' paths used in this notebook resolve
project_root = '/Users/jessicarapson/Documents/GitHub/water-supply-forecast'
os.chdir(project_root)

### Process Precipitation Forecast

In [5]:
# Names of the 23 whitespace-separated fields kept from each precipitation record
header_line = ['YEAR','MN','LEAD','CD','R','98','95','90','80','70','60','50','40',
               '30','20','10','5','2','F MEAN','C MEAN','F SD','C SD','POW']

# Parsed records accumulated across all yearly files
year_data = []

# Loop through years (1994 to 2023 inclusive)
for year in range(1994, 2024):

    # Set path to that year's precipitation outlook file
    path = f'assets/data/cpc_outlooks/raw_data/cpcllfpd.{year}.dat'

    # Read the file, dropping its first TWO lines (same as the earlier
    # readline() followed by readlines()[1:]).
    # NOTE(review): presumably a two-line header -- confirm against the raw files.
    with open(path, 'r') as file:
        data = file.readlines()[2:]

    # Split each record on runs of whitespace and keep only the named fields
    year_data.extend(line.split()[:len(header_line)] for line in data)

# Create a DataFrame (every field is still a string at this point)
df_prec = pd.DataFrame(year_data, columns=header_line)

# Remove rows with no data: anything whose 'POW' field is missing or not numeric.
# The explicit copy makes the result an independent frame, so assigning columns
# to it later does not raise SettingWithCopyWarning.
df_prec['POW'] = pd.to_numeric(df_prec['POW'], errors='coerce')
df_prec = df_prec.dropna(subset=['POW']).copy()

### Process Temperature Forecast

In [6]:
# Names of the 22 whitespace-separated fields kept from each temperature record.
# NOTE(review): 'C S' looks like a truncated 'C SD' (compare the precipitation
# header); it is kept unchanged because it is the column name written to the CSV.
header_line = ['YEAR','MN','LEAD','CD','R','98','95','90','80','70','60','50','40',
               '30','20','10','5','2','F MEAN','C MEAN','F SD','C S']

# Parsed records accumulated across all yearly files
year_data = []

# Loop through years (1994 to 2023 inclusive)
for year in range(1994, 2024):

    # Set path to that year's temperature outlook file
    path = f'assets/data/cpc_outlooks/raw_data/cpcllftd.{year}.dat'

    # Read the file, dropping its first TWO lines (same as the earlier
    # readline() followed by readlines()[1:]).
    # NOTE(review): presumably a two-line header -- confirm against the raw files.
    with open(path, 'r') as file:
        data = file.readlines()[2:]

    # Split each record on runs of whitespace and keep only the named fields
    year_data.extend(line.split()[:len(header_line)] for line in data)

# Create a DataFrame (every field is still a string at this point)
df_temp = pd.DataFrame(year_data, columns=header_line)

# Remove rows with no data: anything whose 'C S' or '98' field is missing or
# not numeric. Both columns are converted before filtering, and the result is
# copied so later column assignments do not raise SettingWithCopyWarning.
for column in ('C S', '98'):
    df_temp[column] = pd.to_numeric(df_temp[column], errors='coerce')
df_temp = df_temp.dropna(subset=['C S', '98']).copy()

# Export to CSV
df_temp.to_csv('assets/data/cpc_outlooks/cpc_temp.csv', index=False)

### Join Data to Forecast Sites

In [39]:
# Load the climate-division layer and the forecast-site layer
gdf_cd = gpd.read_file('assets/data/cpc_climate_divisions.gpkg')
gdf_sites = gpd.read_file('assets/data/geospatial.gpkg')

# Spatially join every site to each climate division it intersects.
# `predicate` replaces the `op` keyword, which geopandas deprecated in 0.10
# and removed in 1.0.
joined = gpd.sjoin(gdf_sites, gdf_cd, how='left', predicate='intersects')

# Count the joined rows for each ('site_id', 'CD') pair.
# NOTE(review): this is a row count, not an intersection area. If a site should
# be assigned to the division it overlaps most, the overlap area still has to be
# computed; as written, ties resolve to the first 'CD' in group order.
grouped = joined.groupby(['site_id', 'CD']).size().reset_index(name='count')

# Find the index of the maximum count for each 'site_id'
max_count = grouped.groupby('site_id')['count'].idxmax()

# Keep the single ('site_id', 'CD') row selected for each site
largest_CD_per_site = grouped.loc[max_count]

# Build the site -> climate-division lookup table.
# NOTE(review): only 'site_id' and 'CD' survive this merge, so joining gdf_cd
# adds no columns; it would repeat rows if gdf_cd lists a 'CD' more than once.
site_to_cd_dict = largest_CD_per_site.merge(gdf_cd, on='CD', how='left')[['site_id','CD']]

# Right join data on site for precipitation ('CD' is cast to int so the key
# matches the lookup table; every site is kept even without forecast rows)
df_prec['CD'] = df_prec['CD'].apply(int)
df_prec = pd.merge(df_prec, site_to_cd_dict, on='CD', how='right')

# Right join data on site for temperature. The result is assigned back to
# df_temp so the exported temperature table carries 'site_id' as well.
df_temp['CD'] = df_temp['CD'].apply(int)
df_temp = pd.merge(df_temp, site_to_cd_dict, on='CD', how='right')

### Export Cleaned Data

In [40]:
# Write each table to its CSV file, leaving out the DataFrame index
cpc_exports = (
    (df_prec, 'assets/data/cpc_outlooks/cpc_prec.csv'),  # precipitation data
    (df_temp, 'assets/data/cpc_outlooks/cpc_temp.csv'),  # temperature data
)
for table, csv_path in cpc_exports:
    table.to_csv(csv_path, index=False)