# Assemble Data

In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np
from scipy.ndimage import uniform_filter1d  # simple smoothing
import re

In [None]:
# Notebook display settings: render every column of a DataFrame instead of
# truncating wide frames.
pd.set_option('display.max_columns', None)  # show all columns
# pd.set_option('display.max_rows', None)  # show all rows
# import os
# os.chdir('..')

### Start with plot features

In [None]:
# get elevation features
# (the cells below read the aspect_*, slope_* and elev_* columns of this frame)
df = pd.read_pickle('../data/plot_elev_features.pkl')

In [None]:
# Preview the loaded elevation features (a bare expression is rendered as the cell output).
df

### Compute vectors for aspect and slope, and some interaction terms

In [None]:
# Aspect is an angle, so encode each aspect statistic as a (cos, sin) pair
# computed from the value converted to radians.
aspect_stats = ('min', 'max', 'mean')
for stat in aspect_stats:
    aspect_in_radians = np.radians(df[f'aspect_{stat}'])
    df[f'aspect_{stat}_cos'] = np.cos(aspect_in_radians)
    df[f'aspect_{stat}_sin'] = np.sin(aspect_in_radians)

# The raw aspect columns are no longer needed once the sin/cos pairs exist.
df = df.drop(columns=[f'aspect_{stat}' for stat in aspect_stats])

# Mean slope: radians first, then the tangent of that angle.
df['slope_rad'] = np.radians(df['slope_mean'])
df['slope_grad'] = np.tan(df['slope_rad'])

# Interaction terms: the gradient split along the mean-aspect cos/sin components.
df['slope_x'] = df['slope_grad'].mul(df['aspect_mean_cos'])
df['slope_y'] = df['slope_grad'].mul(df['aspect_mean_sin'])

# Drop the raw slope statistics; only the derived slope columns are kept.
df = df.drop(columns=['slope_mean', 'slope_min', 'slope_max'])

In [None]:
# Preview df after the aspect/slope feature engineering.
df

## Now add NDVI for each plot to features

### open up the filtered and smoothed ndvi df

In [None]:
# Load the per-plot NDVI table; a later cell merges it onto `df` by `plot_id`.
veg_agg = pd.read_pickle('../data/ndvi/plots/final_df.pkl')
# NOTE(review): keep_cols is not referenced by any other visible cell —
# confirm whether it is still needed.
keep_cols = ['plot_id', 'year']

In [None]:
# Discard every NDVI column that contains at least one missing value,
# then show what is left.
veg_agg = veg_agg.dropna(axis='columns')
veg_agg

In [None]:
# Attach the NDVI columns to the plot features; the inner join keeps only
# plot_ids present in both tables.
df = pd.merge(df, veg_agg, on='plot_id', how='inner')

In [None]:
# Preview df after joining the NDVI table.
df

### Now let's add some weather information

First, load the weather data that has been unzipped and clipped to the vineyard.

In [None]:
# Load the PRISM weather table.
weather = pd.read_pickle('../data/PRISM/df.pkl')

# Collapse to one row per date: for every column, keep the first non-null
# value recorded on that date.
weather = weather.groupby("date", as_index=False).first()

# Calendar helpers derived from the parsed date.
weather['date'] = pd.to_datetime(weather['date'])
weather['doy'] = weather['date'].dt.dayofyear
weather['year'] = weather['date'].dt.year

# Leave out every 2025 row.
weather = weather[weather['year'] != 2025].copy()
weather

### Take a crack at frost days and cumulative growing degree days (GDD) per week

In [None]:
# Daily frost flag and growing degree days (GDD).
# frost: True on days whose tmin is below zero.
weather['frost'] = weather['tmin'].lt(0)
# gdd: amount by which tmean exceeds a base of 10, floored at zero.
weather['gdd'] = weather['tmean'].sub(10).clip(lower=0)

# disregard 2025 for now
weather = weather.loc[weather['year'].ne(2025)].copy()

In [None]:
# Compute cumulative GDD for each week of each year.
#
# `Series.dt.week` was removed in pandas 2.0; `dt.isocalendar().week` returns
# the same ISO-8601 week numbers. Cast to a plain int so the column keeps an
# ordinary integer dtype for the comparisons and pivots that follow.
weather['week'] = weather['date'].dt.isocalendar().week.astype(int)

# Sum GDD per (year, week), then take a running total *within each year*.
# Grouping the cumsum on both index levels would put every (year, week) row in
# its own group and leave the weekly sums unchanged, i.e. not cumulative.
# NOTE(review): `year` is the calendar year while `week` is the ISO week, so
# the last days of December can land in week 1 and the first days of January
# in week 52/53 of the same `year` — confirm this is acceptable for the
# running total.
cumulative_gdd = (
    weather
    .groupby(['year', 'week'])['gdd']
    .sum()
    .groupby(level='year')
    .cumsum()
)
# Reset index to turn MultiIndex into columns
cumulative_gdd = cumulative_gdd.rename('cumulative_gdd').reset_index()

# Merge back to weather: every daily row receives the cumulative value of its
# (year, week).
weather = weather.merge(
    cumulative_gdd,
    on=['year', 'week'],
    how='left'
)


weather

In [None]:
# Keep only weeks 28 through 43 (inclusive) of each year.
weather = weather[weather['week'].between(28, 43)].copy()

In [None]:
# How each weather variable is reduced from daily rows to one value per week.
agg_funcs = {
    'ppt': 'sum',
    'tmax': 'max',
    'tmin': 'min',
    'tmean': 'mean',
    'vpdmax': 'max',
    'vpdmin': 'min',
    'cumulative_gdd': 'max',
}

# Long form: one row per (year, week) with the aggregated variables.
weekly_long = weather.groupby(['year', 'week']).agg(agg_funcs).reset_index()

# Wide form: one row per year and one `<variable>_<week>` column per
# variable/week pair, variables in the order listed in agg_funcs.
wide_blocks = []
for variable in agg_funcs:
    block = weekly_long.pivot(index='year', columns='week', values=variable)
    block.columns = [f'{variable}_{week}' for week in block.columns]
    wide_blocks.append(block)

# All blocks share the same `year` index, so place them side by side and
# move `year` back into an ordinary column.
weekly_wide = pd.concat(wide_blocks, axis=1).reset_index()

In [None]:
# Preview the one-row-per-year weather table.
weekly_wide

### Now we can combine data.

In [None]:
# Cast `year` to int before using it as the join key against weekly_wide.
df['year'] = df['year'].astype(int)
# Inner join: only rows whose year has weekly weather columns are kept.
df = df.merge(weekly_wide, on='year', how='inner')

In [None]:
# Preview df after joining the weekly weather columns.
df

In [None]:
# # Optional: could also add slope magnitude transforms
# # (these would need `slope_mean`, which is dropped in an earlier cell)
# df['slope_squared'] = df['slope_mean'] ** 2
# df['slope_log'] = np.log1p(df['slope_mean'])  # log(1 + slope)

# Per-week interaction features for weeks 28..43, built from the weekly
# weather columns merged onto df.
for i in range(28, 44):
    # Precipitation scaled down by the heat accumulated so far; the +1 keeps
    # the denominator non-zero when cumulative GDD is 0.
    df[f'water_availability_{i}'] = df[f'ppt_{i}'] / (1 + df[f'cumulative_gdd_{i}'])
    # A temperature range is the difference between the extremes. The ratio
    # tmax / tmin used before is not a range, and it blows up or flips sign
    # whenever tmin is zero or negative.
    df[f'diurnal_temp_range_{i}'] = df[f'tmax_{i}'] - df[f'tmin_{i}']
    # Max vapour-pressure deficit relative to precipitation; the 0.1 offset
    # avoids division by zero in dry weeks.
    # NOTE(review): this column name has no underscore before the week number,
    # unlike the other two; kept as is so existing consumers of the saved
    # frame keep working.
    df[f'stress_index{i}'] = df[f'vpdmax_{i}'] / (df[f'ppt_{i}'] + 0.1)

# Height of the plot's mean elevation above its lowest point.
df['local_relief'] = df['elev_mean'] - df['elev_min']

# NOTE(review): `total_relief` is not created in any visible cell; this
# assumes it is already a column of df — confirm.
df['total_relief_log'] = np.log1p(df['total_relief'])

In [None]:
# Load the soil summary table; it is joined to df on `plot_id` in the next cell.
soil = pd.read_pickle('../data/soil/plot_summary.pkl')

In [None]:
# Attach the soil columns; the inner join keeps only plot_ids present in both tables.
df = df.merge(soil, on='plot_id', how='inner')

In [None]:
# Remove the geometry column and rewrap the result as a plain pandas DataFrame.
df = pd.DataFrame(df.drop(columns=['geometry']))

In [None]:
# Preview the final assembled feature table.
df

In [None]:
# Persist the assembled feature table as a pickle.
df.to_pickle('../data/df.pkl')