In [9]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import pandas as pd
from pathlib import Path

# --------- Paths ---------
# Absolute locations of every input table and of the single output file used by this cell.
# IN_CSV : training table; read below, filtered to flux_method == 'EC' and year >= 2001,
#          then passed site-by-site to build_ctx_for_site().
# SOIL   : static soil table keyed by 'site_refer'; only columns ending in '100cm' are merged.
# LC     : land-cover table; only 'site_refer' and 'land_cover_code' are read from it.
# SM     : soil moisture table; must provide site_reference/year/month/sm_surface/sm_rootzone.
# CO2    : table with 'year', 'month', 'value'; 'value' is merged as 'co2_cont'.
# ALT    : table with 'site_reference', 'year', 'ALT'; merged per site and year.
IN_CSV  = "/explore/nobackup/people/spotter5/anna_v/v2/v2_model_training_data_v4.csv"
SOIL    = "/explore/nobackup/people/spotter5/anna_v/v2/integrated_soil_data_1km_v2_sites.csv"
LC      = "/explore/nobackup/people/spotter5/anna_v/v2/extracted_landcover_values_v2.csv"
SM      = "/explore/nobackup/people/spotter5/anna_v/v2/soil_moisture_by_site_monthly_2000_2023.csv"
CO2     = "/explore/nobackup/people/spotter5/anna_v/v2/co2_cont.csv"
ALT     = "/explore/nobackup/people/spotter5/anna_v/v2/ALT_by_site.csv"

# Destination of the merged table; its parent directory is created before writing.
OUT_CSV = "/explore/nobackup/people/spotter5/anna_v/v2/v2_model_training_v4_lagged.csv"

# --------- Helper: context builder for ONE site ---------
def build_ctx_for_site(df_site: pd.DataFrame, nodata=-9999.0) -> pd.DataFrame:
    """
    Add six seasonal "context" columns (*_ctx) to ONE site's monthly records.

    Rules (all aggregates are means of per-(year, month) means within the site):
      - Summer obs (JJA): temp/pr = mean(Dec(prev)–Apr(curr)), snow = mean(Dec(prev)–Mar(curr)),
                          ndvi = current NDVI
      - Fall (SON):       temp/pr/ndvi = same-year JJA means; snow = mean(Dec(prev)–Mar(curr))
      - Winter (DJFM):    temp/pr/ndvi = previous-year JJA means; snow = current month (original)
      - Spring (AM):      temp/pr/ndvi = previous-year JJA means; snow = current month (original)
      - Remaining NaNs are filled, in order, from: the row's own value, the same-month mean
        within the site, the site mean, and finally the raw (unmasked) value.

    Parameters
    ----------
    df_site : pd.DataFrame
        Rows of a single site. Must contain 'year', 'month', 'tmmn', 'tmmx', 'pr', 'NDVI',
        'snow_cover', 'snow_depth' and 'NDSI_snow_cover'. It is not modified.
    nodata : number or None, default -9999.0
        Fill value that marks missing data in the driver columns. Matching entries are
        treated as missing while building the context values so they cannot be averaged
        together with real measurements. Pass None to disable the masking.

    Returns
    -------
    pd.DataFrame
        Copy of ``df_site`` with 'tmean_C' (row mean of tmmn/tmmx) and the float columns
        'temp_ctx', 'pr_ctx', 'ndvi_ctx', 'snow_cover_ctx', 'snow_depth_ctx',
        'ndsi_snow_cover_ctx' appended. The original driver columns keep their raw values,
        including any ``nodata`` entries.

    Raises
    ------
    ValueError
        If a required column is missing.
    """
    need = ['year','month','tmmn','tmmx','pr','NDVI','snow_cover','snow_depth','NDSI_snow_cover']
    miss = [c for c in need if c not in df_site.columns]
    if miss:
        raise ValueError(f"Missing columns for context logic: {miss}")

    driver_cols = ['tmmn','tmmx','pr','NDVI','snow_cover','snow_depth','NDSI_snow_cover']
    # context column -> the per-row column it is derived from / falls back to
    ctx_source = {
        'temp_ctx': 'tmean_C',
        'pr_ctx': 'pr',
        'ndvi_ctx': 'NDVI',
        'snow_cover_ctx': 'snow_cover',
        'snow_depth_ctx': 'snow_depth',
        'ndsi_snow_cover_ctx': 'NDSI_snow_cover',
    }
    ctx_cols = list(ctx_source)
    value_cols = list(ctx_source.values())

    out = df_site.copy()
    out['tmean_C'] = out[['tmmn','tmmx']].mean(axis=1)
    out['year']  = pd.to_numeric(out['year'], errors='coerce').astype(int)
    out['month'] = pd.to_numeric(out['month'], errors='coerce').astype(int)

    # Working copy used for every aggregate: the fill value is blanked out here so that
    # e.g. a Dec–Mar mean over (-9999, 60, 40) yields 50 instead of about -3300.
    clean = out[['year','month'] + driver_cols].copy()
    if nodata is not None:
        clean[driver_cols] = clean[driver_cols].mask(clean[driver_cols] == nodata)
    clean['tmean_C'] = clean[['tmmn','tmmx']].mean(axis=1)

    # Monthly means (within site) for building aggregates; collapses duplicate rows per month
    monthly = clean.groupby(['year','month'], as_index=False)[value_cols].mean()

    def _season_means(months, cols):
        """Mean of `cols` over `months`, indexed by year; December counts toward the next year."""
        sel = monthly[monthly['month'].isin(months)]
        target_year = (sel['year'] + (sel['month'] == 12).astype(int)).rename('target_year')
        return sel.groupby(target_year)[cols].mean()

    def _lookup(years, table, col):
        """Season value for each entry of `years` (NaN where the year has no data)."""
        return years.map(table[col]).astype(float).to_numpy()

    jja     = _season_means([6,7,8],        ['tmean_C','pr','NDVI'])                     # same-year summer
    dec_apr = _season_means([12,1,2,3,4],   ['tmean_C','pr'])                            # Dec(prev)–Apr(curr)
    dec_mar = _season_means([12,1,2,3],     ['snow_cover','snow_depth','NDSI_snow_cover'])  # Dec(prev)–Mar(curr)

    # Float NaN (not pd.NA) keeps the context columns numeric instead of object dtype
    for c in ctx_cols:
        out[c] = float('nan')

    # Plain boolean arrays so the masks select positionally on both `out` and `clean`
    is_summer = out['month'].isin([6,7,8]).to_numpy()            # JJA
    is_fall   = out['month'].isin([9,10,11]).to_numpy()          # SON
    is_cold   = out['month'].isin([12,1,2,3,4,5]).to_numpy()     # DJFM + AM (identical rules)

    # A) Summer rows
    yrs = out.loc[is_summer, 'year']
    out.loc[is_summer, 'temp_ctx'] = _lookup(yrs, dec_apr, 'tmean_C')
    out.loc[is_summer, 'pr_ctx']   = _lookup(yrs, dec_apr, 'pr')
    out.loc[is_summer, 'snow_cover_ctx']      = _lookup(yrs, dec_mar, 'snow_cover')
    out.loc[is_summer, 'snow_depth_ctx']      = _lookup(yrs, dec_mar, 'snow_depth')
    out.loc[is_summer, 'ndsi_snow_cover_ctx'] = _lookup(yrs, dec_mar, 'NDSI_snow_cover')
    out.loc[is_summer, 'ndvi_ctx'] = clean.loc[is_summer, 'NDVI'].astype(float).to_numpy()

    # B) Fall rows (same-year JJA for temp/pr/ndvi; Decprev–Mar for snow)
    yrs = out.loc[is_fall, 'year']
    out.loc[is_fall, 'temp_ctx'] = _lookup(yrs, jja, 'tmean_C')
    out.loc[is_fall, 'pr_ctx']   = _lookup(yrs, jja, 'pr')
    out.loc[is_fall, 'ndvi_ctx'] = _lookup(yrs, jja, 'NDVI')
    out.loc[is_fall, 'snow_cover_ctx']      = _lookup(yrs, dec_mar, 'snow_cover')
    out.loc[is_fall, 'snow_depth_ctx']      = _lookup(yrs, dec_mar, 'snow_depth')
    out.loc[is_fall, 'ndsi_snow_cover_ctx'] = _lookup(yrs, dec_mar, 'NDSI_snow_cover')

    # C/D) Winter and spring rows (prev-year JJA for temp/pr/ndvi; snow = current month)
    # NOTE(review): December rows therefore use JJA of year-1, not the summer that ended a
    # few months earlier in the same calendar year — confirm this is the intended lag.
    prev_yrs = out.loc[is_cold, 'year'] - 1
    out.loc[is_cold, 'temp_ctx'] = _lookup(prev_yrs, jja, 'tmean_C')
    out.loc[is_cold, 'pr_ctx']   = _lookup(prev_yrs, jja, 'pr')
    out.loc[is_cold, 'ndvi_ctx'] = _lookup(prev_yrs, jja, 'NDVI')
    for c in ['snow_cover_ctx','snow_depth_ctx','ndsi_snow_cover_ctx']:
        out.loc[is_cold, c] = clean.loc[is_cold, ctx_source[c]].astype(float).to_numpy()

    # Fallback 1: the row's own (masked) value
    for c, src in ctx_source.items():
        out[c] = out[c].fillna(clean[src])

    # Fallback 2: same-month means within site
    month_means = out.groupby('month')[ctx_cols].transform('mean')
    for c in ctx_cols:
        out[c] = out[c].fillna(month_means[c])

    # Fallback 3: site overall means
    for c in ctx_cols:
        out[c] = out[c].fillna(out[c].mean())

    # Fallback 4: nothing valid exists for this variable at the site -> pass the raw value
    # through unchanged (this may be the nodata marker itself, as in the input columns).
    for c, src in ctx_source.items():
        out[c] = out[c].fillna(out[src])

    return out

# --------- 1) Build context (all sites) ---------
# NOTE(review): the saved cell output echoes this read_csv line as the source of a warning —
# presumably a mixed-type DtypeWarning; consider passing explicit dtype= — TODO confirm.
input_data = pd.read_csv(IN_CSV)

# Restrict to eddy-covariance rows from 2001 onward, then drop rows without a site label
is_ec     = input_data['flux_method'] == 'EC'
from_2001 = input_data['year'] >= 2001
input_data = input_data.loc[is_ec & from_2001].copy()
input_data = input_data.dropna(subset=['site_reference'])

# Context logic is defined per site: run it on each site's rows and stack the results
parts = [
    build_ctx_for_site(g)
    for site, g in input_data.groupby('site_reference', group_keys=False)
]
ctx_all = pd.concat(parts, ignore_index=True)

# --------- 2) Merge additional datasets ---------
# Left-join five auxiliary tables onto ctx_all. Every merge below uses how="left", so the
# rows of ctx_all are kept and rows without a match receive NaN in the added columns.
soil       = pd.read_csv(SOIL)
landcover  = pd.read_csv(LC)[['site_refer','land_cover_code']]
sm         = pd.read_csv(SM)
cont       = pd.read_csv(CO2)
alt        = pd.read_csv(ALT)

# Normalize types
# Site keys are compared as strings. The soil and land-cover tables name the key
# 'site_refer', the other tables 'site_reference'.
# Note: astype(str) also turns a missing key into the literal string 'nan'.
for df in [ctx_all, alt, sm]:
    if 'site_reference' in df.columns:
        df['site_reference'] = df['site_reference'].astype(str)
for df in [soil, landcover]:
    if 'site_refer' in df.columns:
        df['site_refer'] = df['site_refer'].astype(str)

# year/month become nullable Int64 in every table that has them, so the merge keys share
# one dtype and unparseable values become <NA> instead of raising.
for df in [ctx_all, alt, sm, cont]:
    if 'year' in df.columns:
        df['year'] = pd.to_numeric(df['year'], errors='coerce').astype('Int64')
    if 'month' in df.columns:
        df['month'] = pd.to_numeric(df['month'], errors='coerce').astype('Int64')

# Deduplicate on merge keys
# drop_duplicates keeps the first row per key (pandas default); this is what lets the
# validate="m:1" checks on the merges below pass.
ctx_all   = ctx_all.drop_duplicates(subset=['site_reference','year','month'])
soil      = soil.drop_duplicates(subset=['site_refer'])
landcover = landcover.drop_duplicates(subset=['site_refer'])
alt       = alt.drop_duplicates(subset=['site_reference','year'])
sm        = sm.drop_duplicates(subset=['site_reference','year','month'])

# Soil (static); keep only 100cm cols
# filter() keeps the columns whose name ends in '100cm'; the site key is re-attached
# positionally from the same (already de-duplicated) soil frame.
soil_100 = soil.filter(regex='100cm$').copy()
soil_100['site_reference'] = soil['site_refer'].values
ctx_all = ctx_all.merge(soil_100, on="site_reference", how="left", validate="m:1")

# Land cover (static)
# Renamed so the key and value columns match the names used in ctx_all.
landcover = landcover.rename(columns={'site_refer':'site_reference',
                                      'land_cover_code':'land_cover'})
landcover = landcover[['site_reference','land_cover']]
ctx_all = ctx_all.merge(landcover, on="site_reference", how="left", validate="m:1")

# CO2 (year/month)
# Joined on year and month only (no site key); 'value' is exposed as 'co2_cont'.
co2_to_merge = cont[['year','month','value']].drop_duplicates(subset=['year','month']).rename(columns={'value':'co2_cont'})
ctx_all = ctx_all.merge(co2_to_merge, on=['year','month'], how='left', validate="m:1")

# ALT (site/year)
# One ALT value per site and year, repeated across that year's months by the join.
alt_to_merge = alt[['site_reference','year','ALT']].drop_duplicates(subset=['site_reference','year'])
ctx_all = ctx_all.merge(alt_to_merge, on=['site_reference','year'], how='left', validate="m:1")

# Soil moisture (site/year/month)
# Fail early with a readable message if the soil-moisture file lacks an expected column.
needed_cols = {'site_reference','year','month','sm_surface','sm_rootzone'}
missing = needed_cols.difference(set(sm.columns))
if missing:
    raise ValueError(f"Soil moisture CSV missing columns: {missing}")
ctx_all = ctx_all.merge(
    sm[['site_reference','year','month','sm_surface','sm_rootzone']],
    on=['site_reference','year','month'], how='left', validate='m:1'
)

# Land cover type cast
# Sites without a land-cover match get the code -9999 so the column can be stored as int.
if 'land_cover' in ctx_all.columns:
    ctx_all['land_cover'] = ctx_all['land_cover'].fillna(-9999).astype(int)

# --------- Save ---------
# Make sure the destination folder exists, write the merged table, then show a preview.
out_dir = Path(OUT_CSV).parent
out_dir.mkdir(parents=True, exist_ok=True)
ctx_all.to_csv(OUT_CSV, index=False)

print("Saved:", OUT_CSV)

# Raw drivers next to their context counterparts for a quick visual check
preview_cols = [
    'site_reference', 'year', 'month',
    'tmmn', 'tmmx', 'tmean_C', 'pr', 'NDVI',
    'temp_ctx', 'pr_ctx', 'ndvi_ctx',
    'snow_cover', 'snow_cover_ctx',
    'snow_depth', 'snow_depth_ctx',
    'NDSI_snow_cover', 'ndsi_snow_cover_ctx',
]
print(ctx_all[preview_cols].head(12))


  input_data = pd.read_csv(IN_CSV)


Saved: /explore/nobackup/people/spotter5/anna_v/v2/v2_model_training_v4_lagged.csv
                 site_reference  year  month  tmmn  tmmx  tmean_C    pr  \
0   ARM-NSA-Barrow_US-A10_tower  2001      7   0.8   7.6     4.20  36.0   
1   ARM-NSA-Barrow_US-A10_tower  2001      8   0.1   5.3     2.70  33.0   
2   ARM-NSA-Barrow_US-A10_tower  2002      7   1.0   8.3     4.65   2.0   
3   ARM-NSA-Barrow_US-A10_tower  2002      8   0.4   5.9     3.15  23.0   
4   ARM-NSA-Barrow_US-A10_tower  2002      9   0.3   4.7     2.50  51.0   
5   ARM-NSA-Barrow_US-A10_tower  2003      7   1.6   8.7     5.15  24.0   
6   ARM-NSA-Barrow_US-A10_tower  2003      8   0.7   5.3     3.00  23.0   
7   ARM-NSA-Barrow_US-A10_tower  2003      9  -1.2   2.0     0.40  35.0   
8   ARM-NSA-Barrow_US-A10_tower  2004      7   2.9  10.4     6.65  38.0   
9   ARM-NSA-Barrow_US-A10_tower  2004      8   4.4  10.1     7.25  22.0   
10  ARM-NSA-Barrow_US-A10_tower  2004      9  -1.2   2.6     0.70  34.0   
11  ARM-NSA-Barro

In [11]:
import pandas as pd

# Inspect the raw monthly drivers of a single site (2001 onward).
site_ref = 'ARM-NSA-Barrow_US-A10_tower'
#'Zackenberg Heath_GL-ZaH_tower'

df0 = pd.read_csv("/explore/nobackup/people/spotter5/anna_v/v2/v2_model_training_data_v4.csv")

# .copy() makes the filtered rows an independent frame, so the column assignments below
# do not write into a slice of the full table (chained-assignment hazard).
df0 = df0[(df0['year'] >= 2001) & (df0['site_reference'] == site_ref)].copy()

df0['tmean_C'] = df0[['tmmn', 'tmmx']].mean(axis=1)
df0 = df0[['year', 'month', 'nee', 'tmean_C', 'pr', 'NDVI',
           'snow_cover', 'snow_depth', 'NDSI_snow_cover']].copy()
df0['year'] = df0['year'].astype(int)
df0['month'] = df0['month'].astype(int)

# Collapse duplicates to monthly means (if duplicates exist for a month).
# `monthly` is not used further in this cell; it is kept for interactive inspection.
monthly = (
    df0.groupby(['year', 'month'], as_index=False)
       .agg({'tmean_C':'mean', 'pr':'mean'})
       .sort_values(['year','month'])
)

df0

  df0 = pd.read_csv("/explore/nobackup/people/spotter5/anna_v/v2/v2_model_training_data_v4.csv")


Unnamed: 0,year,month,nee,tmean_C,pr,NDVI,snow_cover,snow_depth,NDSI_snow_cover
16079,2001,7,,4.20,36.0,0.2739,0.590437,0.000524,-9999.0
18223,2001,8,,2.70,33.0,0.3450,1.989337,0.001972,-9999.0
28529,2002,7,,4.65,2.0,0.5478,1.012882,0.001026,-9999.0
30673,2002,8,,3.15,23.0,0.5426,3.026446,0.003077,-9999.0
32816,2002,9,,2.50,51.0,0.4426,5.878448,0.005916,-9999.0
...,...,...,...,...,...,...,...,...,...
637713,2024,3,,-24.95,3.0,,98.447266,0.646442,-9999.0
639362,2024,4,,-16.70,7.0,,98.447266,0.596106,-9999.0
640335,2024,5,,-6.00,5.0,,98.256988,0.391686,-9999.0
641744,2024,11,,-12.55,11.0,,98.447266,0.207726,-9999.0


In [12]:
# Distinct NDSI values for the site currently held in df0.
# The saved output lists only -9999, i.e. no other value occurs for that site.
df0.loc[:, 'NDSI_snow_cover'].unique()

array([-9999.])

In [18]:
import pandas as pd

# Inspect the raw monthly drivers plus vegetation-cover fractions of a single site
# (2001 onward).
# Other sites looked at: 'ARM-NSA-Barrow_US-A10_tower', 'Zackenberg Heath_GL-ZaH_tower'
site_ref = 'Iskoras_NO-Isk-fen_tower'

df0 = pd.read_csv("/explore/nobackup/people/spotter5/anna_v/v2/v2_model_training_data_v4.csv")

# .copy() makes the filtered rows an independent frame, so the column assignments below
# do not write into a slice of the full table (chained-assignment hazard).
df0 = df0[(df0['year'] >= 2001) & (df0['site_reference'] == site_ref)].copy()

df0['tmean_C'] = df0[['tmmn', 'tmmx']].mean(axis=1)
df0 = df0[['year', 'month', 'nee', 'tmean_C', 'pr', 'NDVI', 'Percent_Tree_Cover',
           'Percent_NonVegetated', 'Percent_NonTree_Vegetation',
           'snow_cover', 'snow_depth', 'NDSI_snow_cover']].copy()
df0['year'] = df0['year'].astype(int)
df0['month'] = df0['month'].astype(int)

df0

  df0 = pd.read_csv("/explore/nobackup/people/spotter5/anna_v/v2/v2_model_training_data_v4.csv")


Unnamed: 0,year,month,nee,tmean_C,pr,NDVI,Percent_Tree_Cover,Percent_NonVegetated,Percent_NonTree_Vegetation,snow_cover,snow_depth,NDSI_snow_cover
13230,2001,5,,1.95,24.0,0.5581,0.0,100.0,0.0,46.636637,8.730915e-02,50.000000
14695,2001,6,,11.00,33.0,0.5608,0.0,100.0,0.0,0.325456,3.390842e-04,0.000000
16594,2001,7,,12.70,138.0,0.4922,0.0,100.0,0.0,0.000000,-7.345365e-24,0.000000
18738,2001,8,,10.25,87.0,0.7397,0.0,100.0,0.0,0.024267,3.543977e-05,0.000000
20874,2001,9,,6.80,44.0,0.7130,0.0,100.0,0.0,1.153402,1.242405e-03,10.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
634070,2024,1,,-16.20,37.0,,0.0,100.0,0.0,95.802734,5.982955e-01,-9999.000000
636073,2024,2,,-11.55,24.0,,0.0,100.0,0.0,95.802734,6.621388e-01,63.900000
637793,2024,3,,-7.60,23.0,,0.0,100.0,0.0,95.802734,6.315695e-01,65.444444
639430,2024,4,,-6.10,29.0,,0.0,100.0,0.0,95.802734,7.083618e-01,71.142857


In [15]:
# Distinct tree-cover percentages for the site currently held in df0.
# NOTE(review): this cell's execution count (15) is lower than that of the cell above (18),
# so the saved output may predate the current df0 — re-run to confirm.
df0.loc[:, 'Percent_Tree_Cover'].unique()

array([0.])