In [7]:
import os
import geopandas as gpd
import xarray as xr
import pandas as pd
import rioxarray

# Define paths
shapefile_path = "/explore/nobackup/people/spotter5/anna_v/v2/v2_model_sites.shp"
netcdf_dir = "/explore/nobackup/people/spotter5/anna_v/v2/ALT"

# Load shapefile
gdf = gpd.read_file(shapefile_path)

# Optional: Rename for consistency
gdf = gdf.rename(columns={"site_refer": "site_reference"}).dropna().reset_index()

# Initialize results list
records = []

# Loop through NetCDF files
for file_name in sorted(os.listdir(netcdf_dir)):
    if not file_name.endswith(".nc"):
        continue

    # Extract year from filename
    year = int(file_name.split("PP-")[1].split("-")[0])
    file_path = os.path.join(netcdf_dir, file_name)

    # Open dataset and select ALT variable
    ds = xr.open_dataset(file_path)
    alt = ds["ALT"].squeeze()  # remove time dimension
    # Write CRS explicitly (lat/lon grid)
    alt.rio.write_crs("EPSG:4326", inplace=True)

    # Reproject geometry (only) to match raster CRS
    gdf_proj = gdf.copy()
    gdf_proj["geometry"] = gdf_proj["geometry"].to_crs("EPSG:4326")

    # Loop through each site
    for idx, row in gdf_proj.iterrows():
        lat = row.geometry.y
        lon = row.geometry.x

        try:
            val = alt.sel(lat=lat, lon=lon, method="nearest").values.item()
        except Exception:
            val = None

        records.append({
            "site_reference": gdf.iloc[idx]["site_reference"],
            "year": year,
            "ALT": val
        })

# Convert to DataFrame and save
df = pd.DataFrame(records)
df.to_csv("/explore/nobackup/people/spotter5/anna_v/v2/ALT_by_site.csv", index=False)


In [4]:
input_data = pd.read_csv("/explore/nobackup/people/spotter5/anna_v/v2/v2_model_training_data.csv")
soil = pd.read_csv("/explore/nobackup/people/spotter5/anna_v/v2/integrated_soil_data_v2_sites.csv")

input_data = input_data.merge(df, on=["site_reference", "year"], how="left")

input_data.to_csv("/explore/nobackup/people/spotter5/anna_v/v2/v2_model_training_data_alt.csv", index = False)

# Keep only columns that end with '100cm' plus site_refer
soil_filtered = soil.filter(regex='100cm$').copy()
soil_filtered["site_reference"] = soil["site_refer"]  # Rename key column
input_data = input_data.merge(soil_filtered, on="site_reference", how="left")
input_data.to_csv("/explore/nobackup/people/spotter5/anna_v/v2/v2_model_training_data_alt_soil.csv", index = False)


In [None]:
import pandas as pd

# --- Load all data sources ---
input_data = pd.read_csv("/explore/nobackup/people/spotter5/anna_v/v2/v2_model_training_data_v2.csv")
soil = pd.read_csv("/explore/nobackup/people/spotter5/anna_v/v2/integrated_soil_data_1km_v2_sites.csv")
landcover = pd.read_csv("/explore/nobackup/people/spotter5/anna_v/v2/extracted_landcover_values.csv")
cont = pd.read_csv("/explore/nobackup/people/spotter5/anna_v/v2/co2_cont.csv")
alt = pd.read_csv("/explore/nobackup/people/spotter5/anna_v/v2/ALT_by_site.csv")

# --- Initial Data Cleaning ---
input_data = input_data[input_data['flux_method'] == 'EC']
input_data = input_data.dropna(subset=['site_reference'])
soil = soil.dropna(subset=['site_refer'])
landcover = landcover.dropna(subset=['site_refer'])

input_data = input_data.drop_duplicates(subset=['site_reference', 'year', 'month'])
soil = soil.drop_duplicates(subset=['site_refer'])
landcover = landcover.drop_duplicates(subset=['site_refer'])

print(f"Initial shape: {input_data.shape}")

# --- Prepare and Merge Soil Data ---
soil_filtered = soil.filter(regex='100cm$').copy()
soil_filtered["site_reference"] = soil["site_refer"]
input_data = input_data.merge(soil_filtered, on="site_reference", how="left")

print(f"After soil merge: {input_data.shape}")

# --- Prepare and Merge Land Cover Data ---
landcover = landcover.rename(columns={'site_refer': 'site_reference'})
landcover = landcover[['site_reference', 'land_cover']]
input_data = input_data.merge(landcover, on="site_reference", how="left")

print(f"After landcover merge: {input_data.shape}")

# --- Prepare and Merge CO2 Data ---
co2_to_merge = cont[['year', 'month', 'value']].copy()
co2_to_merge = co2_to_merge.rename(columns={'value': 'co2_cont'})
co2_to_merge = co2_to_merge.drop_duplicates(subset=['year', 'month'])
input_data = input_data.merge(co2_to_merge, on=['year', 'month'], how='left')

print(f"After CO2 merge: {input_data.shape}")

# --- Prepare and Merge ALT Data ---
# Select relevant columns and drop duplicates
alt_to_merge = alt[['site_reference', 'year', 'ALT']].copy()
alt_to_merge = alt_to_merge.drop_duplicates(subset=['site_reference', 'year'])

# Merge ALT data into the main dataframe on 'site_reference' and 'year'
input_data = input_data.merge(alt_to_merge, on=['site_reference', 'year'], how='left')

print(f"After ALT merge: {input_data.shape}")

# --- Save Final Combined Data ---
output_path_final = "/explore/nobackup/people/spotter5/anna_v/v2/v2_model_training_alt_soil_lc_co2.csv"
input_data.to_csv(output_path_final, index=False)

print(f"\nSuccessfully merged all data and saved to: {output_path_final}")
print("Final DataFrame head:")
print(input_data.head())
print("\nFinal DataFrame columns:")
print(input_data.columns)

In [1]:
import pandas as pd
alt = pd.read_csv("/explore/nobackup/people/spotter5/anna_v/v2/ALT_by_site.csv")
alt.head()

Unnamed: 0,site_reference,year,ALT
0,Fyodorovskoye_RU-Fyo_tower,1997,
1,"Saskatchewan - Western Boreal, Mature Aspen_CA...",1997,
2,"Saskatchewan - Western Boreal, Mature Jack Pin...",1997,
3,Flakaliden_SE-Fla_tower,1997,
4,Hyytiala_FI-Hyy_tower,1997,


In [15]:
duplicate_keys = soil_filtered['site_reference'].value_counts()

print("Sites with more than one entry in 'soil_filtered':")
print(duplicate_keys[duplicate_keys > 1])

t = soil_filtered[soil_filtered['site_reference'] == 'Yakutsk Spasskaya Pad larch_RU-SkP_tower']

t

Sites with more than one entry in 'soil_filtered':
site_reference
Yakutsk Spasskaya Pad larch_RU-SkP_tower                        3
Saskatchewan - Western Boreal, Mature Jack Pine_CA-Ojp_tower    3
Zackenberg Heath_GL-ZaH_tower                                   2
Stordalen_StordalenSE2_agg_chamber                              2
Kilpisjarvi_chamber_505_chamber                                 2
                                                               ..
Kilpisjarvi_chamber_523_chamber                                 2
Storflaket_StorflaketDS1_agg_chamber                            2
Kilpisjarvi_chamber_247_chamber                                 2
Kilpisjarvi_chamber_11214_chamber                               2
Kilpisjarvi_chamber_37_chamber                                  2
Name: count, Length: 177, dtype: int64


Unnamed: 0,bdod_0_100cm,cec_0_100cm,cfvo_0_100cm,clay_0_100cm,nitrogen_0_100cm,ocd_0_100cm,phh2o_0_100cm,sand_0_100cm,silt_0_100cm,soc_0_100cm,site_reference
681,1.347312,28.282642,15.685455,26.354964,40.955082,19.833244,7.628313,44.511877,29.123439,69.361875,Yakutsk Spasskaya Pad larch_RU-SkP_tower
880,1.347312,28.282642,15.685455,26.354964,40.955082,19.833244,7.628313,44.511877,29.123439,69.361875,Yakutsk Spasskaya Pad larch_RU-SkP_tower
1403,1.352656,29.036796,18.118785,19.172171,31.066944,19.915225,6.971674,49.530927,31.296303,70.418125,Yakutsk Spasskaya Pad larch_RU-SkP_tower


In [5]:
import pandas as pd

# Load your data
df = pd.read_csv("/explore/nobackup/people/spotter5/anna_v/v2/v2_model_training_data_v2.csv")
df = df.dropna(subset=['site_reference'])

df['date'] = pd.to_datetime(df[['year', 'month']].assign(day=1))

print(f"Initial shape: {df.shape}")

# Find and display any rows with duplicate keys
duplicate_rows = df[df.duplicated(subset=['site_reference', 'date'], keep=False)]

if not duplicate_rows.empty:
    print("\n🚨 Found duplicate rows that will cause the merge to add rows:")
    # Print the first few duplicate sets to see the problem
    print(duplicate_rows.sort_values(['site_reference', 'date']).head(10))
else:
    print("\n✅ No duplicate rows found.")

Initial shape: (18928, 39)

✅ No duplicate rows found.


In [8]:
landcover

Unnamed: 0,site_refer,latitude,longitude,land_cover
0,APEX Beta_Active Margin_AMCH1_agg_chamber,64.696183,-148.320086,0
1,APEX Beta_Active Margin_AMCH2_agg_chamber,64.696183,-148.320086,0
2,APEX Beta_Active Margin_AMCH3_agg_chamber,64.696183,-148.320086,0
3,APEX Beta_Permafrost Plateau_PPC1_agg_chamber,64.696344,-148.321881,0
4,APEX Beta_Permafrost Plateau_PPC2_agg_chamber,64.696344,-148.321881,0
...,...,...,...,...
2144,Zackenberg Heath_GL-ZaH_tower,74.473280,-20.550300,4
2145,Zackenberg Heath_GL-ZaH_tower,74.473299,-20.550869,4
2146,Zotino_RU-Zot_tower,60.800800,89.350700,0
2147,Zotino; Central Siberia_RU-Zfw 1_tower,60.750000,89.380000,0


Send one file to tif for viewing in arc

In [2]:
import os
import xarray as xr
import rioxarray  # This import activates the .rio accessor on xarray objects

# --- Configuration ---

# 1. Directory where your source NetCDF files are located
netcdf_dir = "/explore/nobackup/people/spotter5/anna_v/v2/ALT"

# 2. Directory to save the new GeoTIFF file
output_dir = "/explore/nobackup/people/spotter5/anna_v/v2/geotiffs"
os.makedirs(output_dir, exist_ok=True)

# 3. The specific year you want to convert to a .tif file
target_year = 2020

# --- Find the Correct File ---

target_filename = None
for fname in os.listdir(netcdf_dir):
    if fname.endswith(".nc") and f"PP-{target_year}" in fname:
        target_filename = fname
        break

# --- Process and Convert the File ---

if target_filename:
    netcdf_path = os.path.join(netcdf_dir, target_filename)
    output_tif_path = os.path.join(output_dir, f"ALT_{target_year}.tif")
    
    print(f"Processing {netcdf_path}...")

    # Open the NetCDF dataset
    ds = xr.open_dataset(netcdf_path)
    
    # Select the 'ALT' variable and remove singleton dimensions
    alt_data = ds["ALT"].squeeze(drop=True)

    # FIX: Manually remove the conflicting 'grid_mapping' attribute
    if 'grid_mapping' in alt_data.attrs:
        del alt_data.attrs['grid_mapping']
    
    # Assign the correct Coordinate Reference System (CRS)
    alt_data.rio.write_crs("EPSG:4326", inplace=True)
    
    # Write the xarray DataArray to a GeoTIFF file
    alt_data.rio.to_raster(output_tif_path)
    
    print(f"Successfully created GeoTIFF: {output_tif_path} ✨")

else:
    print(f"❌ Error: No file found for the year {target_year} in the directory.")

Processing /explore/nobackup/people/spotter5/anna_v/v2/ALT/ESACCI-PERMAFROST-L4-ALT-MODISLST_CRYOGRID-AREA4_PP-2020-fv04.0.nc...
Successfully created GeoTIFF: /explore/nobackup/people/spotter5/anna_v/v2/geotiffs/ALT_2020.tif ✨
