In [154]:
import pandas as pd
import numpy as np
zwd_df = pd.read_csv('/gnn/rrr/integrated_weather_dataset/data/processed/Troposphere/2004.csv')
guan_df = pd.read_csv('/gnn/rrr/integrated_weather_dataset/data/processed/Guan/2004.csv')
rutz_df = pd.read_csv('/gnn/rrr/integrated_weather_dataset/data/processed/Rutz/2004.csv')

In [155]:
zwd_df

Unnamed: 0,Timestamp,Site,Latitude,Longitude,ZWD
0,2004-01-01 00:00:00.000000,AGMT,34.594282,-116.429377,27.7
1,2004-01-01 00:04:59.180331,AGMT,34.594282,-116.429377,28.3
2,2004-01-01 00:09:58.360654,AGMT,34.594282,-116.429377,28.7
3,2004-01-01 00:14:57.540985,AGMT,34.594282,-116.429377,29.2
4,2004-01-01 00:19:56.721309,AGMT,34.594282,-116.429377,29.6
...,...,...,...,...,...
139600,2004-01-02 23:37:10.327872,WWMT,33.955313,-116.653855,111.0
139601,2004-01-02 23:42:09.508196,WWMT,33.955313,-116.653855,111.2
139602,2004-01-02 23:47:08.688527,WWMT,33.955313,-116.653855,111.4
139603,2004-01-02 23:52:07.868850,WWMT,33.955313,-116.653855,111.6


In [57]:
guan_df

Unnamed: 0,Timestamp,Latitude,Longitude,Guan_AR_Label
0,2004-01-01,38.0,-120.000,0
1,2004-01-01,38.0,-119.375,0
2,2004-01-01,38.0,-118.750,0
3,2004-01-01,38.0,-118.125,0
4,2004-01-01,38.0,-117.500,0
...,...,...,...,...
368041,2004-12-31,31.5,-117.500,1
368042,2004-12-31,31.5,-116.875,1
368043,2004-12-31,31.5,-116.250,1
368044,2004-12-31,31.5,-115.625,1


In [152]:
rutz_df = rutz_df.rename(columns={
    'longitude': 'Longitude', 
    'latitude': 'Latitude',
    'ARs': 'Rutz_AR_Label'
})
rutz_df

Unnamed: 0,Timestamp,Longitude,Latitude,IVT,Rutz_AR_Label
0,2004-01-01 00:00:00,-120.0,31.5,67.215446,0.0
1,2004-01-01 03:00:00,-120.0,31.5,65.255585,0.0
2,2004-01-01 06:00:00,-120.0,31.5,45.925446,0.0
3,2004-01-01 09:00:00,-120.0,31.5,33.749409,0.0
4,2004-01-01 12:00:00,-120.0,31.5,36.337650,0.0
...,...,...,...,...,...
368923,2004-12-31 09:00:00,-115.0,38.0,208.992188,0.0
368924,2004-12-31 12:00:00,-115.0,38.0,235.344757,0.0
368925,2004-12-31 15:00:00,-115.0,38.0,221.132523,0.0
368926,2004-12-31 18:00:00,-115.0,38.0,223.059464,0.0


In [58]:
zwd_df["Timestamp"] = pd.to_datetime(zwd_df["Timestamp"])
guan_df["Timestamp"] = pd.to_datetime(guan_df["Timestamp"])


In [136]:
df1 = zwd_df[0:10000]
df2 = guan_df[0:10000]

df1["Timestamp"] = (
        df1["Timestamp"]
        .dt.floor('5T')
        .dt.round('5T')
    )

NameError: name 'df_rounded' is not defined

In [141]:
LAT_TOL = 0.25
LON_TOL = 0.3125
TIME_TOLERANCE = pd.Timedelta(seconds = 120)
CLOSEST_TIME_WINDOW = pd.Timedelta(hours=3)


In [142]:
def precompute_spatial_matches(zwd_df, guan_df):
    spatial_matches = {}
    for _, site_row in zwd_df.iterrows():
        site_id = site_row["Site"]
        lat1, lon1 = site_row["Latitude"], site_row["Longitude"]
        spatial_mask = (
            (guan_df["Latitude"] >= lat1 - LAT_TOL)
            & (guan_df["Latitude"] <= lat1 + LAT_TOL)
            & (guan_df["Longitude"] >= lon1 - LON_TOL)
            & (guan_df["Longitude"] <= lon1 + LON_TOL)
        )
        spatial_matches[site_id] = guan_df[spatial_mask].reset_index(drop=True)
    return spatial_matches


In [143]:
def process_site_day(site, zwd_site_df, ar_site_df):
    exact_match_labels = []
    closest_match_labels = []

    for _, zwd_row in zwd_site_df.iterrows():
        time1 = zwd_row["Timestamp"]

        exact_label = np.nan
        closest_label = 0

        if not ar_site_df.empty:
            time_diffs = abs(ar_site_df["Timestamp"] - time1)
            exact_matches = ar_site_df[time_diffs <= TIME_TOLERANCE]

            closest_match = None
            if not exact_matches.empty:
                exact_label = exact_matches.iloc[0]["Guan_AR_Label"]
            else:
                time_diffs_within_window = time_diffs[time_diffs <= CLOSEST_TIME_WINDOW]
                if not time_diffs_within_window.empty:
                    closest_index = time_diffs_within_window.idxmin()
                    closest_match = ar_site_df.loc[closest_index]
                    closest_label = closest_match["Guan_AR_Label"]

        exact_match_labels.append(exact_label)
        closest_match_labels.append(closest_label)

    if len(exact_match_labels) == len(zwd_site_df):
        zwd_site_df["Guan_exact_match_label"] = exact_match_labels
        zwd_site_df["Guan_Label"] = closest_match_labels
    else:
        raise ValueError("Length of labels does not match the number of rows in the site dataframe.")

    return zwd_site_df


In [144]:
def process_guans(zwd_df, ar_df):
    zwd_sites_df = zwd_df[["Site", "Latitude", "Longitude"]].drop_duplicates()

    ar_spatial_matches = precompute_spatial_matches(zwd_sites_df, ar_df)
    zwd_df["Day"] = zwd_df["Timestamp"].dt.date
    ar_df["Day"] = ar_df["Timestamp"].dt.date
    all_results = []
    for site_id in zwd_sites_df["Site"].unique():
        print(f"Processing site: {site_id}")

        zwd_site_df = zwd_df[zwd_df["Site"] == site_id].reset_index(drop=True)
        ar_site_df = ar_spatial_matches.get(site_id, pd.DataFrame())

        for day in zwd_site_df["Day"].unique():
            zwd_day_df = zwd_site_df[zwd_site_df["Day"] == day].reset_index(drop=True)

            labeled_day_df = process_site_day(site_id, zwd_day_df, ar_site_df)
            all_results.append(labeled_day_df)

            print(f"Processed day: {day} for site: {site_id}")
    final_result = pd.concat(all_results, ignore_index=True)

    print(f"labeled data done")
    return final_result


In [145]:
final_result = process_guans(df1,df2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  zwd_df["Day"] = zwd_df["Timestamp"].dt.date
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ar_df["Day"] = ar_df["Timestamp"].dt.date


Processing site: AGMT
Processed day: 2004-01-01 for site: AGMT
Processing site: ALAM
Processed day: 2004-01-01 for site: ALAM
Processing site: ALPP
Processed day: 2004-01-01 for site: ALPP
Processing site: AOA1
Processed day: 2004-01-01 for site: AOA1
Processing site: ARGU
Processed day: 2004-01-01 for site: ARGU
Processing site: ARM1
Processed day: 2004-01-01 for site: ARM1
Processing site: ARM2
Processed day: 2004-01-01 for site: ARM2
Processing site: AVRY
Processed day: 2004-01-01 for site: AVRY
Processing site: AZRY
Processed day: 2004-01-01 for site: AZRY
Processing site: BALD
Processed day: 2004-01-01 for site: BALD
Processing site: BBDM
Processed day: 2004-01-01 for site: BBDM
Processing site: BBRY
Processed day: 2004-01-01 for site: BBRY
Processing site: BCWR
Processed day: 2004-01-01 for site: BCWR
Processing site: BEAT
Processed day: 2004-01-01 for site: BEAT
Processing site: BEMT
Processed day: 2004-01-01 for site: BEMT
Processing site: BEPK
Processed day: 2004-01-01 for sit

In [148]:
dff = final_result[final_result['Guan_exact_match_label'] == 0]