In [1]:
import os
import zipfile
import glob
from tqdm import tqdm
import geopandas as gpd
import rasterio
from rasterio.mask import mask
import numpy as np
import pandas as pd
import shutil
# Vineyard boundary geometry, read once when this cell runs. It is the default
# `polygon` argument of clip_raster and is also passed explicitly by the
# extraction loop further down.
# NOTE(review): assumes the GeoJSON is in the same CRS as the rasters it will
# be used to clip — confirm.
vineyard = gpd.read_file('data/polygons/RegressionRidge.geojson')


In [2]:
base_dir = 'data/PRISM'

# weather variables to process; each has its own sub-folder under base_dir
weather_features = ['ppt','tmax','tmean','tmin','vpdmax','vpdmin']
weather_folders = [os.path.join(base_dir, name) for name in weather_features]

# 2016 through 2024 inclusive
years = range(2016, 2025)

# flat list of <base_dir>/<feature>/<year> data folders, grouped by feature
# and ordered by year within each feature
weather_folders_years = [
    os.path.join(feature_dir, str(yr))
    for feature_dir in weather_folders
    for yr in years
]

In [4]:
def unzip_file(zip_file_path, out_dir):
    """
    Extract a zip archive into ``out_dir`` and report where its members land.

    Extraction is skipped only when every member of the archive is already
    present in ``out_dir``.

    Args:
        zip_file_path (str): path to the zip archive
        out_dir (str): directory to extract into

    Returns:
        list of str: ``out_dir``-joined path of every archive member, in
        archive order, so the caller can delete them later. Empty for an
        archive with no members.
    """
    with zipfile.ZipFile(zip_file_path, 'r') as z:
        # paths handed back to the caller for later clean-up
        extracted_files = [os.path.join(out_dir, m) for m in z.namelist()]

        # Check all members, not just the first: a partially extracted or
        # partially deleted archive must be restored. all([]) is True, so an
        # empty archive is a no-op instead of an IndexError.
        if not all(os.path.exists(p) for p in extracted_files):
            z.extractall(out_dir)

    return extracted_files


def clip_raster(file_path, weather_value, polygon = vineyard, date = None):
    """
    Clip one raster to a polygon and summarise the pixels that fall inside it.

    Args:
        file_path (str): path of the raster to open
        weather_value (str): name of the weather variable; used as the key
            under which the value is stored in the returned record
        polygon (GeoDataFrame): geometries to clip to. Only the first row is
            used, which is what the earlier loop-with-return did in practice
        date (optional): date to store in the record. When omitted it is
            parsed from the file name, whose fifth ``_``-separated token is
            expected to be a date string (e.g. ``20240101``)

    Returns:
        dict: ``{'date': date, 'n': <number of valid pixels>,
        weather_value: <mean of the valid pixels>}``. The value is NaN when
        no valid pixel falls inside the polygon. With a single pixel the mean
        is that pixel's value.

    Raises:
        ValueError: if ``polygon`` has no rows
    """
    if len(polygon) == 0:
        raise ValueError('polygon contains no geometries to clip to')

    # Derive the date from the file itself instead of relying on a global
    # that the calling loop happens to have set.
    if date is None:
        date_str = os.path.basename(file_path).split("_")[4]
        date = pd.to_datetime(date_str)

    geom = [polygon.geometry.iloc[0].__geo_interface__]

    with rasterio.open(file_path) as src:
        # clip raster to polygon; band 1 only
        out_image, _ = mask(src, geom, crop=True)
        data = out_image[0]
        nodata = src.nodata if src.nodata is not None else -9999

    # ignore no-data values (a NaN sentinel never compares equal, so test it
    # with isnan instead of !=)
    if isinstance(nodata, float) and np.isnan(nodata):
        vals = data[~np.isnan(data)]
    else:
        vals = data[data != nodata]

    n_val = int(vals.size)
    # mean works for any pixel count; float(vals) only worked for exactly one
    value = float(np.mean(vals)) if n_val > 0 else np.nan

    return {
        'date':date,
        'n':n_val,
        f'{weather_value}':value,
    }

        
def delete_extracted_files(file_list):
    """
    Remove every path in ``file_list`` that currently exists.

    Args:
        file_list (list of str): paths to files to delete; entries that are
            not present on disk are skipped
    """
    # filter() is lazy, so each existence check happens right before its remove
    for path in filter(os.path.exists, file_list):
        os.remove(path)

In [5]:
# Collect one record per clipped raster across all features and years.
results = []
for feature in weather_features:
    print(feature)
    # Year folders of this feature. Match the parent folder name exactly
    # rather than by substring, so a feature name that happens to occur
    # elsewhere in a path cannot pull in the wrong folders.
    wfy = [
        w for w in weather_folders_years
        if os.path.basename(os.path.dirname(w)) == feature
    ]
    wfye = [os.path.join(fy, 'extracted') for fy in wfy]
    # zip() has no length, so give tqdm the total to get a real progress bar
    for zip_dir, out_dir in tqdm(zip(wfy, wfye), total=len(wfy)):
        # make an output folder
        os.makedirs(out_dir, exist_ok=True)

        # iterate through zipfiles
        for zf in glob.glob(os.path.join(zip_dir, '*.zip')):
            # skip provisional archives -- the original author noted that
            # they crash the pipeline
            if 'provisional' in os.path.basename(zf):
                continue

            # extract files and keep the list so they can be deleted afterwards
            extracted_files = unzip_file(zf, out_dir)
            try:
                # Clip only the rasters that belong to this archive. Globbing
                # the whole folder would re-clip any stale .bil left behind
                # by an interrupted run, once per archive.
                bil_files = [p for p in extracted_files if p.endswith('.bil')]

                # `f` and `date` are deliberately kept as module-level names
                # so that any helper looking them up globally keeps working.
                for f in bil_files:
                    date_str = os.path.basename(f).split("_")[4]  # e.g. 20240101
                    date = pd.to_datetime(date_str)
                    res = clip_raster(f, weather_value = feature, polygon = vineyard)
                    results.append(res)
            finally:
                # always remove extracted files, even if clipping failed,
                # to prevent data balloon
                delete_extracted_files(extracted_files)


ppt


9it [01:25,  9.52s/it]


tmax


9it [01:35, 10.65s/it]


tmean


9it [01:18,  8.77s/it]


tmin


9it [01:35, 10.60s/it]


vpdmax


9it [01:27,  9.70s/it]


vpdmin


9it [01:25,  9.46s/it]


In [6]:
# One row per clipped raster. Each record carries only its own feature's
# column, so every row has a single weather column populated and NaN in the
# others (long, sparse layout).
df = pd.DataFrame(results)
df

Unnamed: 0,date,n,ppt,tmax,tmean,tmin,vpdmax,vpdmin
0,2016-04-16,1,0.00,,,,,
1,2016-01-02,1,0.00,,,,,
2,2016-11-30,1,0.00,,,,,
3,2016-12-01,1,0.07,,,,,
4,2016-06-08,1,0.00,,,,,
...,...,...,...,...,...,...,...,...
19723,2024-01-11,1,,,,,,0.3613
19724,2024-10-19,1,,,,,,0.2942
19725,2024-02-28,1,,,,,,0.4319
19726,2024-05-23,1,,,,,,0.4454


In [8]:
# Persist the combined frame next to the PRISM data.
# NOTE(review): pickles should only be loaded from trusted sources — consider
# a typed format such as parquet if this file is shared.
df.to_pickle('data/PRISM/df.pkl')