<a href="https://colab.research.google.com/github/rjanow/Masterarbeit/blob/main/preprocessing_netCDF4_to_DF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Script um die netCDF4 Vorhersagewerte in einen Dataframe abzuspeichern

**Dokumentenname: netCDF4_to_CSV.ipynb**

Es werden die Vorhersagedaten von CAMS aus dem netCDF4-Format in einen Dataframe umgewandelt.

## Allgemeine Einstelllungen:

In [1]:
# Installation der netCDF4 Bibliothek, da diese nicht standardmäßig in Google-Colab implementiert ist.
!pip install netCDF4



## Import der benötigten Module und allgemeines Setup:

In [2]:
# Import der benötigten Module
import os
import pickle
import netCDF4 as nc
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt

Mount der Google-Drive:

In [3]:
from google.colab import drive

# Mounten des Google-Drive Kontos
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Zeit wird in Stunden seit 1.1.1900 00:00 angegeben
origin = dt.datetime(1900, 1, 1, 0, 0, 0, 0)

In [5]:
# Drive Ordner mit den Vorhersagewerten im NetCDF4-Format
folder_import = '/content/drive/My Drive/Colab_Notebooks/netCDF4_Wetterdaten/'

folder_export = '/content/drive/My Drive/Colab_Notebooks/CAMS_Vorhersage/'
name_export = 'CAMS_std_CSV'

## Einlesen der NetCDF4-Dateien:

In [6]:
def read_nc_files_in_google_drive(folder_path):
    nc_files = []  # Liste, um die NetCDF4-Datensätze abzuspeichern

    if os.path.exists(folder_path):
        # Durchsuchen des Drive-Ordners nach Dateien
        for file_name in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file_name)

            # Überprüfen, ob die Datei eine NetCDF4-Datei ist
            if file_name.endswith(".nc"):
                try:
                    # Öffnen der NetCDF4-Datei und huzfügen zur Liste
                    nc_file = nc.Dataset(file_path)
                    nc_files.append(nc_file)
                    print(nc_file)
                except Exception as e:
                    print(f"Fehler beim Öffnen von {file_name}: {str(e)}")

    return nc_files

In [7]:
# Funktionsaufruf zum Einlesen der NetCDF4-Dateien
netcdf_files = read_nc_files_in_google_drive(folder_import)

<class 'netCDF4._netCDF4.Dataset'>
root group (NETCDF3_64BIT_OFFSET data model, file format NETCDF3):
    Conventions: CF-1.6
    history: 2023-10-26 11:10:13 GMT by grib_to_netcdf-2.25.1: /opt/ecmwf/mars-client/bin/grib_to_netcdf.bin -S param -o /cache/tmp/845854f7-e797-43e6-a680-0f7b20869ac9-adaptor.mars_constrained.external-1698318597.7512727-2202-23-tmp.nc /cache/tmp/845854f7-e797-43e6-a680-0f7b20869ac9-adaptor.mars_constrained.external-1698318591.7085574-2202-22-tmp.grib
    dimensions(sizes): longitude(1), latitude(1), time(720)
    variables(dimensions): float32 longitude(longitude), float32 latitude(latitude), int32 time(time), int16 t2m(time, latitude, longitude), int16 hcc(time, latitude, longitude), int16 lcc(time, latitude, longitude), int16 sund(time, latitude, longitude), int16 ssrd(time, latitude, longitude), int16 aod469(time, latitude, longitude), int16 aod550(time, latitude, longitude), int16 tcc(time, latitude, longitude), int16 gtco3(time, latitude, longitude), int1

In [8]:
# netcdf_files

## Variablen Deklaration:

In [9]:
#Variablen Deklaration
latitudes = np.empty((1, 1))
longitudes = np.empty((1, 1))
time = np.empty((1, 1))
aod469 = np.empty((1, 1))   # Aerosol Optical Depth at 469
aod550 = np.empty((1, 1))   # Aerosol Optical Depth at 550
gtco3 = np.empty((1, 1))    # Total Ozon Column
uvbed = np.empty((1, 1))    # UVI All-Sky
uvbedcs = np.empty((1, 1))  # UVI Clear-Sky
hcc = np.empty((1, 1))      # High-Cloud-Cover
lcc = np.empty((1, 1))      # Low-Cloud-Cover
tcc = np.empty((1, 1))      # Total-Cloud-Cover
# neu
tp = np.empty((1, 1))       # Total Precipitation
d2m = np.empty((1, 1))      # 2 Metre Dewpoint Temperature
t2m = np.empty((1, 1))      # 2 Metre Temperature
sund = np.empty((1, 1))     # Sunshine Duration
ssrd = np.empty((1, 1))     # Surface Solar Radiation Downwards

In [10]:
# Zuordnen der Variabeln zu entsprechenden NP-Arrays
# fehlende Werte werden durch NaN Werte ersetzt, das mehrdimensionale Array wird zu einem eindimensonalen formatiert

for nc_file in netcdf_files:
    latitudes = np.append(latitudes, nc_file.variables['latitude'][0])
    longitudes = np.append(longitudes, nc_file.variables['longitude'][0])
    time = np.append(time, nc_file.variables['time'][:].filled(np.nan))
    aod469 = np.append(aod469, nc_file.variables['aod469'][:].filled(np.nan).reshape(-3))
    aod550 = np.append(aod550, nc_file.variables['aod550'][:].filled(np.nan).reshape(-3))
    gtco3 = np.append(gtco3, nc_file.variables['gtco3'][:].filled(np.nan).reshape(-3))
    uvbed = np.append(uvbed, nc_file.variables['uvbed'][:].filled(np.nan).reshape(-3))
    uvbedcs = np.append(uvbedcs, nc_file.variables['uvbedcs'][:].filled(np.nan).reshape(-3))
    hcc = np.append(hcc, nc_file.variables['hcc'][:].filled(np.nan).reshape(-3))
    lcc = np.append(lcc, nc_file.variables['lcc'][:].filled(np.nan).reshape(-3))
    tcc = np.append(tcc, nc_file.variables['tcc'][:].filled(np.nan).reshape(-3))
    # neu
    tp = np.append(uvbedcs, nc_file.variables['tp'][:].filled(np.nan).reshape(-3))
    d2m = np.append(hcc, nc_file.variables['d2m'][:].filled(np.nan).reshape(-3))
    t2m = np.append(lcc, nc_file.variables['t2m'][:].filled(np.nan).reshape(-3))
    sund = np.append(tcc, nc_file.variables['sund'][:].filled(np.nan).reshape(-3))
    ssrd = np.append(tcc, nc_file.variables['ssrd'][:].filled(np.nan).reshape(-3))

In [11]:
# Größe der verschiedenen Datensätze
print(len(time), len(aod469), len(aod550), len(uvbed), len(uvbedcs), len(hcc), len(lcc), len(tcc), len(tp), len(d2m), len(t2m), len(sund), len(ssrd))

10225 10225 10225 10225 10225 10225 10225 10225 10945 10945 10945 10945 10945


## Berechnen der richtigen DateTime-Werte und erstellen eines Dataframes:

In [12]:
# richtigen Zeitstempel berechnen
datetime = np.empty((1,1))
datetime = np.array([origin + dt.timedelta(hours=int(i)) for i in time])

In [13]:
# Zuorden der NP-Arrays in einen Dataframe
df_cams = pd.DataFrame({'Datetime': datetime, 'aod469': aod469, 'aod550': aod550, 'gtco3': gtco3,
                         'uvbed': uvbed,'uvbedcs': uvbedcs, 'hcc': hcc, 'lcc': lcc, 'tcc': tcc})

In [14]:
# UV-Index berechnen
df_cams['uvbed'] = df_cams['uvbed'] * 40
df_cams['uvbedcs'] = df_cams['uvbedcs'] * 40

In [15]:
# Zeitstempel in Pandas-DateTime umwandeln
df_cams['Datetime'] = pd.to_datetime(df_cams['Datetime'], format='%Y-%m-%d %H:%M:%S')
# Index des DF setzen
df_cams.set_index('Datetime', inplace=True)

In [16]:
# Sontieren des DataFrame in die richtige Reihenfolge
df_cams_sorted = pd.DataFrame()
df_cams_sorted = df_cams.sort_index().drop(df_cams.index[0])

In [17]:
# Ausgabe des sotierten Dataframes
df_cams_sorted

Unnamed: 0_level_0,aod469,aod550,gtco3,uvbed,uvbedcs,hcc,lcc,tcc
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2022-06-01 00:00:00,0.261836,0.211588,0.008161,-5.551115e-16,5.551115e-16,0.000000,0.000000,0.504647
2022-06-01 01:00:00,0.272371,0.220283,0.008044,-5.551115e-16,5.551115e-16,0.000000,0.000000,0.681122
2022-06-01 02:00:00,0.237297,0.190401,0.007992,-5.551115e-16,5.551115e-16,0.124487,0.015687,0.414249
2022-06-01 03:00:00,0.195020,0.155880,0.007971,-5.551115e-16,5.551115e-16,0.071994,0.005310,0.076343
2022-06-01 04:00:00,0.164975,0.131777,0.007959,7.204596e-02,7.198133e-02,0.000000,0.001114,0.001785
...,...,...,...,...,...,...,...,...
2023-07-31 19:00:00,0.131174,0.109557,0.006663,5.551115e-16,5.551115e-16,1.000000,0.259121,1.000000
2023-07-31 20:00:00,0.126791,0.106162,0.006666,5.551115e-16,5.551115e-16,1.000000,0.473365,1.000000
2023-07-31 21:00:00,0.133466,0.111809,0.006700,5.551115e-16,5.551115e-16,0.999878,0.934537,1.000000
2023-07-31 22:00:00,0.162170,0.136924,0.006741,5.551115e-16,5.551115e-16,1.000000,0.984985,1.000000


## Prüfen, ob der Dataframe alle Einträge enthält:

In [18]:
# Prüfen, ob der Dataframe alle Einträge enthält
for i in range(len(df_cams_sorted) - 1):
    time_diff = df_cams_sorted.index[i + 1] - df_cams_sorted.index[i]
    if time_diff != pd.to_timedelta('1H'):
        print(f"Index {i} und Index {i + 1} haben keinen Abstand von einer Stunde.")

## Abspeichern des Dataframes mit stündlicher Auflösung:

In [19]:
def save_dataframe_to_drive(dataframe, folder_path, filename):

    # Erstellen des vollen Pfades zur Datei
    full_path = os.path.join(folder_path, filename)

    # Speichern des DataFrame als CSV-Datei auf Google-Drive
    dataframe.to_csv(full_path)

    print(f'Der DataFrame wurde als {filename} in {folder_path} auf Google Drive gespeichert.')

In [20]:
# Abspeichern des konvertierten DF als CSV
save_dataframe_to_drive(df_cams_sorted, folder_export, name_export)

Der DataFrame wurde als CAMS_std_CSV in /content/drive/My Drive/Colab_Notebooks/CAMS_Vorhersage/ auf Google Drive gespeichert.


## Abspeichern des Dataframes mit 2 Min. Auflösung.

In [21]:
# Resampling der Zeitreihe auf 2 Min. und interpolation der fehlenden Messwerte
# df_cams_resampled = df_cams.resample('2T')
# df_cams_interpolated = df_cams_resampled.interpolate(method='polynomial', order = 1)

In [22]:
# Abspeichern des konvertierten DF als CSV
# dateiname = 'CSV_Cams_2M'
# save_dataframe_to_drive(df_cams_interpolated, '/content/drive/My Drive/Colab_Notebooks/CSV_Vorhersage', dateiname)

In [23]:
# dateiname = 'pickle_Cams_2M'
# df_cams_interpolated.to_pickle(pickle_path + dateiname)