In [1]:
import pandas as pd
### use xarray for extracting temperature data from .nc files
import xarray as xr 
import numpy as np
import geopandas as gpd
import datetime
import os
import metpy.calc as mpcalc
from metpy.units import units
import rioxarray
import zipfile
import os
# Directory that the zipped ERA5-Land NetCDF files are extracted into
# (created and filled by the unzip cell, read back by the processing cell).
extract_dir = '/Users/shivyucel/Documents/projects/DPhil/Code_Data/data/dewpoint/france/extracted'
# Polygon layer for France. Later cells read its 'name_1' and 'adm2' columns
# and clip the gridded data with each row's geometry.
gdf = gpd.read_file('/Users/shivyucel/Documents/projects/DPhil/Code_Data/data/adm0_shp/france/clean_shp.shp')

In [2]:

# Ensure the extraction directory exists (safe to re-run).
os.makedirs(extract_dir, exist_ok=True)

# Folder holding the downloaded, still-zipped NetCDF archives.
unextracted_dir = '/Users/shivyucel/Documents/projects/DPhil/Code_Data/data/dewpoint/france/unextracted'


def _extract_and_rename(zip_path, target_dir, new_name):
    """Extract ``zip_path`` into ``target_dir`` and rename its ``.nc`` member.

    Parameters
    ----------
    zip_path : str
        Path of the zip archive to extract.
    target_dir : str
        Directory that receives the extracted members.
    new_name : str
        File name (inside ``target_dir``) given to the extracted ``.nc`` member.

    Notes
    -----
    ``os.replace`` overwrites an existing target on every platform, so the cell
    can be re-run without failing on a leftover file from a previous run.
    If the archive holds several ``.nc`` members, each one is moved onto the
    same target and only the last survives; a warning is printed in that case.
    """
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(target_dir)
        nc_members = [name for name in zip_ref.namelist() if name.endswith('.nc')]

    if len(nc_members) > 1:
        print(f'warning: {zip_path} contains {len(nc_members)} .nc files; only the last one is kept as {new_name}')

    for member in nc_members:
        os.replace(os.path.join(target_dir, member), os.path.join(target_dir, new_name))


# First the main archives, then the "_xtra" (leftover months) archives,
# in the same order as before.
for suffix in ['', '_xtra']:
    for year in ['2018', '2019', '2020', '2021', '2022']:
        stem = f'era5_land_dewpoint_us_{year}{suffix}'
        _extract_and_rename(f'{unextracted_dir}/{stem}.netcdf.zip', extract_dir, f'{stem}.nc')



In [3]:
# Years to process; each year has a main file and an "_xtra" file in extract_dir.
years = ['2018', '2019', '2020', '2021', '2022']
# Destination of the concatenated daily-maximum dataset.
thi_array_path = '/Users/shivyucel/Documents/projects/DPhil/Code_Data/data/THI/france/array/concat_array.nc'

data_arrays = []
for year in years:
    nc_file = os.path.join(extract_dir, f'era5_land_dewpoint_us_{year}.nc')
    nc_file_xtra = os.path.join(extract_dir, f'era5_land_dewpoint_us_{year}_xtra.nc')

    # Open both files in a context manager so their handles are closed again;
    # .load() pulls the combined data into memory before the files are closed.
    with xr.open_dataset(nc_file) as ds_main, xr.open_dataset(nc_file_xtra) as ds_xtra:
        ds = xr.concat([ds_main, ds_xtra], dim='valid_time').load()

    # The two files cover different time steps; restore chronological order.
    ds = ds.sortby('valid_time')

    # Rename 'valid_time' to 'time'
    ds = ds.rename({'valid_time': 'time'})

    # Shift the timestamps by a fixed +2 hours so that the daily bins below
    # follow local days (assumes 'valid_time' is UTC -- TODO confirm).
    # NOTE: this is a constant offset, not a real timezone conversion; it
    # matches French summer time (UTC+2) but not winter time (UTC+1).
    ds['time'] = ds['time'] + pd.Timedelta(hours=2)

    # Convert from Kelvin to Celsius
    ds['t2m'] = ds['t2m'] - 273.15
    ds['d2m'] = ds['d2m'] - 273.15

    # Relative humidity from temperature and dewpoint, rescaled to percent.
    ds['rh'] = mpcalc.relative_humidity_from_dewpoint(ds['t2m'] * units.degC, ds['d2m'] * units.degC)
    ds['rh'] = ds['rh'] * 100

    # The 'thi' variable is metpy's heat index; mask_undefined=False keeps
    # values for all inputs instead of masking them.
    ds['thi'] = mpcalc.heat_index(ds['t2m'] * units.degC, ds['rh'] * units.percent, mask_undefined=False)

    # Daily maximum of every variable (each variable is maximised on its own,
    # so the values in one daily row need not come from the same hour).
    daily_max_thi = ds.resample(time='1D').max()

    data_arrays.append(daily_max_thi)

    print('appended')


concat_daily_max = xr.concat(data_arrays, dim='time')
# Make sure the output folder exists before writing.
os.makedirs(os.path.dirname(thi_array_path), exist_ok=True)
concat_daily_max.to_netcdf(thi_array_path)

appended
appended
appended
appended
appended


In [4]:
# Reload the saved daily-maximum dataset from disk and tag it with CRS 4326
# (via rioxarray) so it can be clipped against the polygon geometries.
thi_array_path = '/Users/shivyucel/Documents/projects/DPhil/Code_Data/data/THI/france/array/concat_array.nc'
concat_daily_max = xr.open_dataset(thi_array_path)
concat_daily_max.rio.write_crs(4326, inplace=True)

In [7]:
# Expose the 'name_1' column under the name 'adm1' used by the clipping loop.
gdf = gdf.rename(columns={'name_1': 'adm1'})

In [8]:

# Build one daily time series per polygon in `gdf`: clip the gridded daily
# maxima to the polygon, then average over the grid cells that remain.
data_dic = {}
# (region key, error) for every polygon that could not be processed, so that
# failures are visible instead of silently dropped.
skipped_regions = []

for idx, row in gdf.iterrows():

    adm2 = row['adm2']
    adm1 = row['adm1']
    region_key = f'{adm1}_{adm2}'

    try:
        # Clip the gridded dataset using the polygon geometry
        clipped_thi = concat_daily_max.rio.clip([row.geometry], all_touched=True, crs=gdf.crs)

        # Flatten to one row per (time, grid cell) ...
        clipped_thi_df = clipped_thi.to_dataframe().reset_index()

        # ... and average the numeric columns over all grid cells for each day.
        clipped_thi_df = clipped_thi_df.groupby('time').mean(numeric_only=True).reset_index()
    except Exception as exc:
        # Best effort: keep going with the remaining polygons, but record the
        # failure. Catching Exception (not a bare except) lets Ctrl-C /
        # kernel interrupt still stop the loop.
        skipped_regions.append((region_key, repr(exc)))
        print(f'{region_key} skipped: {exc!r}')
        continue

    # Label the series with its administrative units
    clipped_thi_df['adm2'] = adm2
    clipped_thi_df['adm1'] = adm1

    data_dic[region_key] = clipped_thi_df

    print(f'{region_key} appended')

if skipped_regions:
    print(f'{len(skipped_regions)} region(s) skipped; see skipped_regions for details')

    


Alsace_Bas-Rhin appended
Alsace_Haut-Rhin appended
Aquitaine_Dordogne appended
Aquitaine_Gironde appended
Aquitaine_Landes appended
Aquitaine_Lot-et-Garonne appended
Aquitaine_Pyrénées-Atlantiques appended
Auvergne_Allier appended
Auvergne_Cantal appended
Auvergne_Haute-Loire appended
Auvergne_Puy-de-Dôme appended
Île-de-France_Essonne appended
Île-de-France_Hauts-de-Seine appended
Île-de-France_Paris appended
Île-de-France_Seine-et-Marne appended
Île-de-France_Seine-Saint-Denis appended
Île-de-France_Val-d'Oise appended
Île-de-France_Val-de-Marne appended
Île-de-France_Yvelines appended
Basse-Normandie_Calvados appended
Basse-Normandie_Manche appended
Basse-Normandie_Orne appended
Bourgogne_Côte-d'Or appended
Bourgogne_Nièvre appended
Bourgogne_Saône-et-Loire appended
Bourgogne_Yonne appended
Bretagne_Côtes-d'Armor appended
Bretagne_Finistère appended
Bretagne_Ille-et-Vilaine appended
Bretagne_Morbihan appended
Centre_Cher appended
Centre_Eure-et-Loir appended
Centre_Indre-et-Loire ap

In [9]:
# Stack the per-region frames into one long table with a fresh 0..n-1 index.
region_frames = list(data_dic.values())
final_concat = pd.concat(region_frames, ignore_index=True)

In [10]:
# Drop the averaged grid coordinates; they carry no meaning after the spatial
# mean. Rebinding (instead of inplace=True) together with errors='ignore'
# makes the cell idempotent: re-running it no longer raises KeyError once the
# columns are already gone.
final_concat = final_concat.drop(columns=['latitude', 'longitude'], errors='ignore')

In [11]:
# Display the combined table as a visual sanity check.
final_concat

Unnamed: 0,time,number,spatial_ref,t2m,d2m,rh,thi,adm2,adm1
0,2018-05-01,0.0,0.0,12.984543,3.212509,72.967628,11.467585,Bas-Rhin,Alsace
1,2018-05-02,0.0,0.0,16.846756,8.483494,82.404953,15.875832,Bas-Rhin,Alsace
2,2018-05-03,0.0,0.0,16.842527,7.163579,81.988007,15.847073,Bas-Rhin,Alsace
3,2018-05-04,0.0,0.0,20.003017,9.834668,72.436050,19.402456,Bas-Rhin,Alsace
4,2018-05-05,0.0,0.0,22.701967,11.183942,65.621696,22.267851,Bas-Rhin,Alsace
...,...,...,...,...,...,...,...,...,...
74395,2022-09-28,0.0,0.0,7.894247,6.730359,97.352921,7.502293,Savoie,Rhône-Alpes
74396,2022-09-29,0.0,0.0,6.146764,4.305479,97.177002,5.682936,Savoie,Rhône-Alpes
74397,2022-09-30,0.0,0.0,6.184298,3.022592,97.056587,5.607690,Savoie,Rhône-Alpes
74398,2022-10-01,0.0,0.0,7.066686,6.117462,96.861404,6.652348,Savoie,Rhône-Alpes


In [12]:
import seaborn as sns

In [14]:
# Number of distinct 'adm2' values in the final table (missing values, if
# any, count as one distinct value).
final_concat['adm2'].nunique(dropna=False)

96

In [15]:
# Persist the final table as CSV; with pandas' defaults the row index is
# written as the first column.
daily_max_csv_path = '/Users/shivyucel/Documents/projects/DPhil/Code_Data/data/THI/france/daily_max_THI.csv'
final_concat.to_csv(daily_max_csv_path)