In [7]:
import pandas as pd
### use xarray for extracting temperature data from .nc files
import xarray as xr 
import numpy as np
import geopandas as gpd
import datetime
import os
import metpy.calc as mpcalc
from metpy.units import units
import rioxarray
import zipfile
import os
# Directory containing the extracted ERA5-Land .nc files.
# Set the ERA5_EXTRACT_DIR environment variable to point somewhere else;
# the original author's local path is kept as the fallback so existing
# runs behave exactly as before.
extract_dir = os.environ.get(
    'ERA5_EXTRACT_DIR',
    '/Users/liangyuan/Desktop/ERA5_heatwave/extracted',
)

# Geometry of China: one row per location. The 'adm1' and 'adm2' columns and
# the geometry of each row are used later to clip the raster.
gdf = gpd.read_file('china_city.geojson')

## Preprocess

### Step 1: Get daily maximum THI values at each pixel cell

In [11]:
#Concatenate data from all 5 years along the time axis
#Change the time zone
#Convert temperature and dewpoint to Celsius
#Calculate relative humidity using metpy package
#Calculate heat index using metpy

# Resample to get the DAILY MAXIMUM heat index value at each pixel cell -- 'concat_daily_max'

In [13]:
# Build, for each year, the daily maximum of every variable (t2m, d2m, rh, thi)
# at each pixel, then stack the years along 'time' and save to disk.
data_arrays = []

for year in ['2019', '2020', '2021', '2022','2023']:

    # Three files per year, in this order: the main May-to-September file,
    # the extra April day ('_04') and the extra October day ('_10').
    nc_paths = [
        os.path.join(extract_dir, f'era5_land_dewpoint_China_{year}{suffix}.nc')
        for suffix in ('', '_04', '_10')
    ]

    # Open the files, pull the concatenated data into memory, and always
    # release the file handles -- even if one of the files fails to open.
    opened = []
    try:
        for nc_path in nc_paths:
            opened.append(xr.open_dataset(nc_path, engine='h5netcdf'))

        ds = xr.concat(opened, dim='valid_time')
        ds = ds.sortby('valid_time')
        # Materialise the values before the source files are closed.
        ds = ds.load()
    finally:
        for part in opened:
            part.close()

    # Rename 'valid_time' to 'time'
    ds = ds.rename({'valid_time': 'time'})

    # Shift the timestamps by a fixed +8 hours (China Standard Time) so that
    # the daily resampling below groups hours by local calendar day.
    # NOTE(review): assumes the source timestamps are UTC -- confirm.
    ds['time'] = ds['time'] + pd.Timedelta(hours=8)

    # Convert from Kelvin to Celsius
    ds['t2m'] = ds['t2m'] - 273.15
    ds['d2m'] = ds['d2m'] - 273.15

    # Relative humidity from temperature and dewpoint, rescaled by 100 so it
    # can be tagged as percent below.
    ds['rh'] = mpcalc.relative_humidity_from_dewpoint(ds['t2m'] * units.degC, ds['d2m'] * units.degC)
    ds['rh'] = ds['rh'] * 100

    # Calculate THI using metpy's heat_index function.
    # mask_undefined=False keeps values that MetPy would otherwise mask out.
    # NOTE(review): confirm which temperature unit the stored 'thi' values are
    # in before comparing them against thresholds.
    ds['thi'] = mpcalc.heat_index(ds['t2m'] * units.degC, ds['rh'] * units.percent, mask_undefined=False)

    # Daily maximum per pixel. This is applied to the whole Dataset, so every
    # variable (not only 'thi') is reduced to its own daily maximum.
    daily_max_thi = ds.resample(time='1D').max()

    data_arrays.append(daily_max_thi)

    print('appended')


# Stack all years along the time axis and persist the result.
concat_daily_max = xr.concat(data_arrays, dim='time')
concat_daily_max.to_netcdf('concat_array.nc')

appended
appended
appended
appended
appended


In [None]:
#Add coordinate projection to raster file

In [17]:
# Reload the saved daily maxima and tag them with EPSG:4326 so that rioxarray
# can clip them against the location geometries.
concat_daily_max = xr.open_dataset('concat_array.nc', engine='h5netcdf')
concat_daily_max = concat_daily_max.rio.write_crs(input_crs=4326)
concat_daily_max

In [19]:
# For each location in the country GeoDataFrame:
# clip all pixel cells that intersect the shape of the location,
# then take the mean over those pixels for each day (of the daily maximums calculated in step 1).
# The output is the mean of the daily maximums on each day for each location,
# saved into a dictionary keyed by '{adm1}_{adm2}'.

### Step 2: Clip all pixels that intersect each location, then take the mean of their daily maximum values

In [30]:

# Per-location daily means of the clipped daily maxima, keyed by '{adm1}_{adm2}'.
data_dic = {}
# Locations that could not be processed, kept as (key, reason) so that
# failures are visible instead of being silently dropped.
skipped = []

for _, row in gdf.iterrows():

    adm2 = row['adm2']
    adm1 = row['adm1']
    key = f'{adm1}_{adm2}'

    try:
        # Clip the raster to this location; all_touched=True keeps every pixel
        # the geometry touches, not only those whose centre falls inside it.
        clipped_thi = concat_daily_max.rio.clip([row.geometry], all_touched=True, crs=gdf.crs)

        # Flatten the clipped Dataset to one row per (time, pixel)
        clipped_thi_df = clipped_thi.to_dataframe().reset_index()

        # Average all numeric columns over the pixels for each day
        clipped_thi_df = clipped_thi_df.groupby('time').mean(numeric_only=True).reset_index()

        # Add the adm2 / adm1 information to the DataFrame
        clipped_thi_df['adm2'] = adm2
        clipped_thi_df['adm1'] = adm1

    except Exception as exc:
        # Best effort: one bad location must not stop the run, but record why
        # it was skipped. Catching Exception (not a bare except) lets
        # KeyboardInterrupt still abort the loop.
        skipped.append((key, repr(exc)))
        continue

    data_dic[key] = clipped_thi_df

    print(f'{adm1}_{adm2} appended')

if skipped:
    print(f'{len(skipped)} location(s) skipped:')
    for skipped_key, reason in skipped:
        print(f'  {skipped_key}: {reason}')

    


beijingshi_beijingshi appended
tianjinshi_tianjinshi appended
hebeisheng_shijiazhuangshi appended
hebeisheng_tangshanshi appended
hebeisheng_qinhuangdaoshi appended
hebeisheng_handanshi appended
hebeisheng_xingtaishi appended
hebeisheng_baodingshi appended
hebeisheng_zhangjiakoushi appended
hebeisheng_chengdeshi appended
hebeisheng_cangzhoushi appended
hebeisheng_langfangshi appended
hebeisheng_hengshuishi appended
shanxisheng_taiyuanshi appended
shanxisheng_datongshi appended
shanxisheng_yangquanshi appended
shanxisheng_zhangzhishi appended
shanxisheng_jinchengshi appended
shanxisheng_shuozhoushi appended
shanxisheng_jinzhongshi appended
shanxisheng_yunchengshi appended
shanxisheng_xinzhoushi appended
shanxisheng_linfenshi appended
shanxisheng_lvliangshi appended
neimengguzizhiqu_huhehaoteshi appended
neimengguzizhiqu_baotoushi appended
neimengguzizhiqu_wuhaishi appended
neimengguzizhiqu_chifengshi appended
neimengguzizhiqu_tongliaoshi appended
neimengguzizhiqu_eerduosishi appended
ne

In [None]:
#Convert dictionary to dataframe

In [32]:
# Stack every per-location frame into one long table with a fresh 0..n-1 index.
final_concat = pd.concat(objs=list(data_dic.values()), ignore_index=True)

In [34]:
# Drop the averaged pixel coordinates; they carry no meaning after the
# per-location mean. Rebinding with errors='ignore' (instead of an in-place
# drop) keeps this cell safe to re-run once the columns are already gone.
final_concat = final_concat.drop(columns=['latitude', 'longitude'], errors='ignore')

In [46]:
# Mean 'thi' per adm2 over all days, sorted ascending.
# A location whose values are all NaN gets NaN and is listed last.
final_concat['thi'].groupby(final_concat['adm2']).mean().sort_values()

adm2
alidiqu                  7.385272
yushuzangzuzizhizhou     7.634543
naqushi                  7.936676
rikazeshi                8.366720
lasashi                  9.156159
                          ...    
wenchangshi             38.389867
lingaoxian              38.476631
dinganxian              38.561873
haikoushi               38.712540
sanshashi                     NaN
Name: thi, Length: 375, dtype: float64

In [40]:
#Save as file

In [42]:
# Write the combined table to CSV. The row index is written as the first
# column, because pandas defaults to index=True.
final_concat.to_csv(path_or_buf='daily_max_THI.csv')