In [1]:
import os, sys
import numpy as np
import xarray as xr
import pandas as pd

In [2]:
filename =  '/gnn/rrr/integrated_weather_dataset/data/raw/Guan_AR_Catalog/globalARcatalog_MERRA2_1980-2023_v4.0.nc'
ds = xr.open_dataset(filename, chunks={'time': 1460}, engine='netcdf4')
ds = ds.squeeze()
ds = ds.reset_coords(names=['lev', 'ens'], drop=True)

  ds = xr.open_dataset(filename, chunks={'time': 1460}, engine='netcdf4')


In [3]:
# Step 2: Select the relevant geographical region
MIN_LON = -120+360
MAX_LON = -115+360
MIN_LAT = 31.5
MAX_LAT = 38
start_date = '2016-01-01T00:00:00.000000000'
end_date = '2016-12-31T00:00:00.000000000'
ds = ds.sel(time=slice(start_date, end_date), lat=slice(MIN_LAT,MAX_LAT), lon=slice(MIN_LON,MAX_LON))
print(f"Data selected for date : {start_date} to {end_date} lat: {MIN_LAT} to {MAX_LAT} and lon: {MIN_LON} to {MAX_LON}")


Data selected for date : 2016-01-01T00:00:00.000000000 to 2016-12-31T00:00:00.000000000 lat: 31.5 to 38 and lon: 240 to 245


In [4]:
df = ds.shapemap.to_dataframe(dim_order=['time', 'lat', 'lon']).dropna()
df = df.reset_index()
df['time'] = pd.to_datetime(df['time'])
df['Guan_AR_Label'] = df['shapemap'].notna().astype(int)
all_times = pd.date_range(start=start_date, end=end_date, freq='3h') 
all_combinations = pd.MultiIndex.from_product(
    [all_times, df['lat'].unique(), df['lon'].unique()],
    names=['time', 'lat', 'lon']
).to_frame(index=False)
print(f"Generated all combinations of time, latitude, and longitude: {len(all_combinations)} rows")


Generated all combinations of time, latitude, and longitude: 368046 rows


In [5]:
merged_df = pd.merge(
    all_combinations,
    df,
    on=['time', 'lat', 'lon'],
    how='left'
).fillna({'Guan_AR_Label': 0})
print("Data merged with all combinations.")
merged_df.rename(columns={'time': 'Timestamp', 'lat': 'Latitude', 'lon': 'Longitude'}, inplace=True)
merged_df['Longitude'] = ((merged_df['Longitude'] + 180) % 360) - 180
df_final = merged_df[['Timestamp', 'Latitude', 'Longitude', 'Guan_AR_Label']]


Data merged with all combinations.


In [6]:
df_final

Unnamed: 0,Timestamp,Latitude,Longitude,Guan_AR_Label
0,2016-01-01,31.5,-120.000,0.0
1,2016-01-01,31.5,-119.375,0.0
2,2016-01-01,31.5,-118.750,0.0
3,2016-01-01,31.5,-118.125,0.0
4,2016-01-01,31.5,-117.500,0.0
...,...,...,...,...
368041,2016-12-31,38.0,-117.500,0.0
368042,2016-12-31,38.0,-116.875,0.0
368043,2016-12-31,38.0,-116.250,0.0
368044,2016-12-31,38.0,-115.000,0.0


In [7]:
df2 = pd.read_csv('/gnn/rrr/ES3-TACLS/AR/dataset/labels/2016.csv', index_col = False)
df2['time'] = pd.to_datetime(df2['time'])
all_times_2 = pd.date_range(start=start_date, end=end_date, freq='3h')  # 3-hour intervals
all_combinations_2 = pd.MultiIndex.from_product(
    [all_times_2, df2['lat'].unique(), df2['lon'].unique()],
    names=['time', 'lat', 'lon']
).to_frame(index=False)

In [8]:
print(f"Generated all combinations of time, latitude, and longitude: {len(all_combinations_2)} rows")
all_combinations_2['Guan_AR_Label'] = 0
concat_df = pd.concat([all_combinations_2, df2], ignore_index=True)
result_df = concat_df.drop_duplicates(subset=['time', 'lat', 'lon'], keep='last')
result_df['Guan_AR_Label'] = result_df['Guan_AR_Label'].fillna(0)

Generated all combinations of time, latitude, and longitude: 368046 rows


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df['Guan_AR_Label'] = result_df['Guan_AR_Label'].fillna(0)


In [9]:
result_df

Unnamed: 0.1,time,lat,lon,Guan_AR_Label,Unnamed: 0,Rutz_AR_Label
0,2016-01-01,31.5,-120.000,0,,
1,2016-01-01,31.5,-119.375,0,,
2,2016-01-01,31.5,-118.750,0,,
3,2016-01-01,31.5,-118.125,0,,
4,2016-01-01,31.5,-117.500,0,,
...,...,...,...,...,...,...
402691,2016-12-31,34.0,-115.000,1,34645.0,0.0
402692,2016-12-31,34.5,-115.000,1,34646.0,0.0
402693,2016-12-31,35.0,-115.000,1,34647.0,0.0
402694,2016-12-31,35.5,-115.625,1,34648.0,0.0


In [10]:
df_final['Guan_AR_Label'].value_counts()


Guan_AR_Label
0.0    352563
1.0     15483
Name: count, dtype: int64

In [11]:
result_df['Guan_AR_Label'].value_counts()

Guan_AR_Label
0    335368
1     32678
Name: count, dtype: int64

In [12]:
df_final['Guan_AR_Label'] = result_df['Guan_AR_Label']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['Guan_AR_Label'] = result_df['Guan_AR_Label']


In [13]:
df_final.reset_index(drop=True, inplace=True)
result_df.reset_index(drop=True, inplace=True)

df_final['Guan_AR_Label'] = result_df['Guan_AR_Label']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['Guan_AR_Label'] = result_df['Guan_AR_Label']


In [14]:
result_df['Guan_AR_Label'].value_counts()

Guan_AR_Label
0    335368
1     32678
Name: count, dtype: int64

In [None]:
result_df.to_csv('