<a href="https://colab.research.google.com/github/rjanow/Masterarbeit/blob/main/netCDF4_to_CSV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Script um die netCDF4 Vorhersagewerte in einen Dataframe abzuspeichern

## Allgemeine Einstelllungen:

In [1]:
!pip install netCDF4

Collecting netCDF4
  Downloading netCDF4-1.6.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m39.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cftime (from netCDF4)
  Downloading cftime-1.6.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m43.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: cftime, netCDF4
Successfully installed cftime-1.6.3 netCDF4-1.6.5


In [2]:
from google.colab import drive

# Mounten des Google-Drive Kontos
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Import der benötigten Module
import os
import pickle
import netCDF4 as nc
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt

In [4]:
# Zeit wird in Stunden seit 1.1.1900 00:00 angegeben
origin = dt.datetime(1900, 1, 1, 0, 0, 0, 0)

In [5]:
# Drive Ordner mit den Vorhersagewerten im NetCDF4-Format
drive_folder = '/content/drive/My Drive/Colab_Notebooks/netCDF4_Wetterdaten/'
pickle_path = '/content/drive/My Drive/Colab_Notebooks/CAMS_Vorhersage/'

## Einlesen der NetCDF4-Dateien:

In [6]:
def read_nc_files_in_google_drive(folder_path):
    nc_files = []  # Liste, um die NetCDF4-Datensätze abzuspeichern

    if os.path.exists(folder_path):
        # Durchsuchen des Drive-Ordners nach Dateien
        for file_name in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file_name)

            # Überprüfen, ob die Datei eine NetCDF4-Datei ist
            if file_name.endswith(".nc"):
                try:
                    # Öffnen der NetCDF4-Datei und huzfügen zur Liste
                    nc_file = nc.Dataset(file_path)
                    nc_files.append(nc_file)
                    print(nc_file)
                except Exception as e:
                    print(f"Fehler beim Öffnen von {file_name}: {str(e)}")

    return nc_files

In [7]:
# Funktionsaufruf zum Einlesen der NetCDF4-Dateien
netcdf_files = read_nc_files_in_google_drive(drive_folder)

<class 'netCDF4._netCDF4.Dataset'>
root group (NETCDF3_64BIT_OFFSET data model, file format NETCDF3):
    Conventions: CF-1.6
    history: 2023-10-12 13:31:19 GMT by grib_to_netcdf-2.25.1: /opt/ecmwf/mars-client/bin/grib_to_netcdf.bin -S param -o /cache/tmp/148d7980-8bd4-4e6e-a855-6462a6e89a10-adaptor.mars_constrained.external-1697117466.804518-29252-7-tmp.nc /cache/tmp/148d7980-8bd4-4e6e-a855-6462a6e89a10-adaptor.mars_constrained.external-1697117461.6633863-29252-6-tmp.grib
    dimensions(sizes): longitude(1), latitude(1), time(720)
    variables(dimensions): float32 longitude(longitude), float32 latitude(latitude), int32 time(time), int16 hcc(time, latitude, longitude), int16 lcc(time, latitude, longitude), int16 mcc(time, latitude, longitude), int16 aod469(time, latitude, longitude), int16 aod550(time, latitude, longitude), int16 aod670(time, latitude, longitude), int16 aod865(time, latitude, longitude), int16 tcc(time, latitude, longitude), int16 gtco3(time, latitude, longitude), i

In [8]:
netcdf_files

[<class 'netCDF4._netCDF4.Dataset'>
 root group (NETCDF3_64BIT_OFFSET data model, file format NETCDF3):
     Conventions: CF-1.6
     history: 2023-10-12 13:31:19 GMT by grib_to_netcdf-2.25.1: /opt/ecmwf/mars-client/bin/grib_to_netcdf.bin -S param -o /cache/tmp/148d7980-8bd4-4e6e-a855-6462a6e89a10-adaptor.mars_constrained.external-1697117466.804518-29252-7-tmp.nc /cache/tmp/148d7980-8bd4-4e6e-a855-6462a6e89a10-adaptor.mars_constrained.external-1697117461.6633863-29252-6-tmp.grib
     dimensions(sizes): longitude(1), latitude(1), time(720)
     variables(dimensions): float32 longitude(longitude), float32 latitude(latitude), int32 time(time), int16 hcc(time, latitude, longitude), int16 lcc(time, latitude, longitude), int16 mcc(time, latitude, longitude), int16 aod469(time, latitude, longitude), int16 aod550(time, latitude, longitude), int16 aod670(time, latitude, longitude), int16 aod865(time, latitude, longitude), int16 tcc(time, latitude, longitude), int16 gtco3(time, latitude, longitu

In [9]:
# Variablen Deklaration
latitudes = np.empty((1, 1))
longitudes = np.empty((1, 1))
time = np.empty((1, 1))
aod469 = np.empty((1, 1))   # Aerosol Optical Depth at 469
aod550 = np.empty((1, 1))   # Aerosol Optical Depth at 550
aod670 = np.empty((1, 1))   # Aerosol Optical Depth at 670
aod865 = np.empty((1, 1))   # Aerosol Optical Depth at 865
gtco3 = np.empty((1, 1))    # Total Ozon Column
uvbed = np.empty((1, 1))    # UVI All-Sky
uvbedcs = np.empty((1, 1))  # UVI Clear-Sky
hcc = np.empty((1, 1))      # High-Cloud-Cover
lcc = np.empty((1, 1))      # Low-Cloud-Cover
mcc = np.empty((1, 1))      # Medium-Cloud-Cover
tcc = np.empty((1, 1))      # Total-Cloud-Cover
cbh = np.empty((1, 1))      # Cloud-Base-Hight

In [10]:
# Zuordnen der Variabeln zu entsprechenden NP-Arrays
# fehlende Werte werden durch NaN Werte ersetzt, das mehrdimensionale Array wird zu einem eindimensonalen formatiert
for nc_file in netcdf_files:
    latitudes = np.append(latitudes, nc_file.variables['latitude'][0])
    longitudes = np.append(longitudes, nc_file.variables['longitude'][0])
    time = np.append(time, nc_file.variables['time'][:].filled(np.nan))
    aod469 = np.append(aod469, nc_file.variables['aod469'][:].filled(np.nan).reshape(-3))
    aod550 = np.append(aod550, nc_file.variables['aod550'][:].filled(np.nan).reshape(-3))
    aod670 = np.append(aod670, nc_file.variables['aod670'][:].filled(np.nan).reshape(-3))
    aod865 = np.append(aod865, nc_file.variables['aod865'][:].filled(np.nan).reshape(-3))
    gtco3 = np.append(gtco3, nc_file.variables['gtco3'][:].filled(np.nan).reshape(-3))
    uvbed = np.append(uvbed, nc_file.variables['uvbed'][:].filled(np.nan).reshape(-3))
    uvbedcs = np.append(uvbedcs, nc_file.variables['uvbedcs'][:].filled(np.nan).reshape(-3))
    hcc = np.append(hcc, nc_file.variables['hcc'][:].filled(np.nan).reshape(-3))
    lcc = np.append(lcc, nc_file.variables['lcc'][:].filled(np.nan).reshape(-3))
    mcc = np.append(mcc, nc_file.variables['mcc'][:].filled(np.nan).reshape(-3))
    tcc = np.append(tcc, nc_file.variables['tcc'][:].filled(np.nan).reshape(-3))
    cbh = np.append(cbh, nc_file.variables['cbh'][:].filled(np.nan).reshape(-3))

In [11]:
print(len(time), len(aod469), len(aod550), len(aod670), len(aod865), len(aod865), len(uvbed), len(uvbedcs), len(hcc), len(lcc), len(mcc), len(tcc), len(cbh))

11689 11689 11689 11689 11689 11689 11689 11689 11689 11689 11689 11689 11689


In [12]:
# richtige DateTime berechen
datetime = np.empty((1,1))
datetime = np.array([origin + dt.timedelta(hours=int(i)) for i in time])

In [13]:
# Zuorden der NP-Arrays in einen Dataframe
df_cams = pd.DataFrame({'Datetime': datetime, 'aod469': aod469, 'aod550': aod550,'aod670': aod670,'aod865': aod865,'aod865': gtco3,
                        'uvbed': uvbed,'uvbedcs': uvbedcs, 'hcc': hcc, 'lcc': lcc, 'mcc': mcc, 'tcc': tcc, 'cbh': cbh})

In [14]:
df_cams['uvbed'] = df_cams['uvbed'] * 40
df_cams['uvbedcs'] = df_cams['uvbedcs'] * 40

In [15]:
df_cams['Datetime'] = pd.to_datetime(df_cams['Datetime'], format='%Y-%m-%d %H:%M:%S')

In [16]:
# Index des DF setzen
df_cams.set_index('Datetime', inplace=True)

In [17]:
# Sontieren des DataFrame in die richtige Reihenfolge
df_cams_sorted = pd.DataFrame()
df_cams_sorted = df_cams.sort_index().drop(df_cams.index[0])

In [18]:
# Ausgabe des sotierten Dataframes
df_cams_sorted

Unnamed: 0_level_0,aod469,aod550,aod670,aod865,uvbed,uvbedcs,hcc,lcc,mcc,tcc,cbh
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2022-06-01 00:00:00,0.279180,0.224989,0.166234,0.008305,5.551115e-16,5.551115e-16,0.000000,0.044893,0.367693,0.371660,2119.024617
2022-06-01 01:00:00,0.298263,0.241389,0.179282,0.008164,5.551115e-16,5.551115e-16,0.000000,0.156517,0.360444,0.408237,1342.479134
2022-06-01 02:00:00,0.257216,0.207651,0.153842,0.008086,5.551115e-16,5.551115e-16,0.879633,0.198541,0.250774,0.914822,1363.938076
2022-06-01 03:00:00,0.204455,0.164444,0.121328,0.008019,5.551115e-16,5.551115e-16,0.771794,0.054888,0.028138,0.785864,2026.099732
2022-06-01 04:00:00,0.169221,0.136005,0.100293,0.007981,7.093021e-02,7.105593e-02,0.039751,0.005265,0.003006,0.047488,
...,...,...,...,...,...,...,...,...,...,...,...
2023-09-30 19:00:00,0.303813,0.246127,0.181648,0.005928,0.000000e+00,0.000000e+00,0.875284,0.000015,0.132758,0.892131,4631.659686
2023-09-30 20:00:00,0.288060,0.233771,0.172870,0.005911,0.000000e+00,0.000000e+00,0.311812,0.000000,0.002487,0.313766,11384.323732
2023-09-30 21:00:00,0.266354,0.216345,0.160130,0.005897,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,
2023-09-30 22:00:00,0.240483,0.195343,0.144486,0.005882,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000732,0.000732,


## Prüfen, ob der Dataframe alle Einträge enthält:

In [19]:
# Prüfen, ob der Dataframe alle Einträge enthält
for i in range(len(df_cams_sorted) - 1):
    time_diff = df_cams_sorted.index[i + 1] - df_cams_sorted.index[i]
    if time_diff != pd.to_timedelta('1H'):
        print(f"Index {i} und Index {i + 1} haben keinen Abstand von einer Stunde.")

## Abspeichern der Vorhersagewerte mit stündlicher Auflösung:

In [20]:
def save_dataframe_to_drive(dataframe, folder_path, filename):

    # Erstellen des vollen Pfades zur Datei
    full_path = os.path.join(folder_path, filename)

    # Speichern des DataFrame als CSV-Datei auf Google Drive
    dataframe.to_csv(full_path)

    print(f'Der DataFrame wurde als {filename} in {folder_path} auf Google Drive gespeichert.')

In [21]:
# Abspeichern des konvertierten DF als CSV
dateiname = 'CSV_Cams_std'
save_dataframe_to_drive(df_cams_sorted, pickle_path, dateiname)

Der DataFrame wurde als CSV_Cams_std in /content/drive/My Drive/Colab_Notebooks/CAMS_Vorhersage/ auf Google Drive gespeichert.


In [22]:
df_cams_sorted

Unnamed: 0_level_0,aod469,aod550,aod670,aod865,uvbed,uvbedcs,hcc,lcc,mcc,tcc,cbh
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2022-06-01 00:00:00,0.279180,0.224989,0.166234,0.008305,5.551115e-16,5.551115e-16,0.000000,0.044893,0.367693,0.371660,2119.024617
2022-06-01 01:00:00,0.298263,0.241389,0.179282,0.008164,5.551115e-16,5.551115e-16,0.000000,0.156517,0.360444,0.408237,1342.479134
2022-06-01 02:00:00,0.257216,0.207651,0.153842,0.008086,5.551115e-16,5.551115e-16,0.879633,0.198541,0.250774,0.914822,1363.938076
2022-06-01 03:00:00,0.204455,0.164444,0.121328,0.008019,5.551115e-16,5.551115e-16,0.771794,0.054888,0.028138,0.785864,2026.099732
2022-06-01 04:00:00,0.169221,0.136005,0.100293,0.007981,7.093021e-02,7.105593e-02,0.039751,0.005265,0.003006,0.047488,
...,...,...,...,...,...,...,...,...,...,...,...
2023-09-30 19:00:00,0.303813,0.246127,0.181648,0.005928,0.000000e+00,0.000000e+00,0.875284,0.000015,0.132758,0.892131,4631.659686
2023-09-30 20:00:00,0.288060,0.233771,0.172870,0.005911,0.000000e+00,0.000000e+00,0.311812,0.000000,0.002487,0.313766,11384.323732
2023-09-30 21:00:00,0.266354,0.216345,0.160130,0.005897,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,
2023-09-30 22:00:00,0.240483,0.195343,0.144486,0.005882,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000732,0.000732,


In [23]:
# dateiname = 'pickle_CAMS_std'
# df_cams_sorted.to_pickle(pickle_path + dateiname)

## Abspeichern des Dataframe mit 2 Min. Auflösung.

In [24]:
# Resampling der Zeitreihe auf 2 Min. und interpolation der fehlenden Messwerte
# df_cams_resampled = df_cams.resample('2T')
# df_cams_interpolated = df_cams_resampled.interpolate(method='polynomial', order = 1)

KeyboardInterrupt: ignored

In [None]:
# Abspeichern des konvertierten DF als CSV
# dateiname = 'CSV_Cams_2M'
# save_dataframe_to_drive(df_cams_interpolated, '/content/drive/My Drive/Colab_Notebooks/CSV_Vorhersage', dateiname)

In [None]:
# dateiname = 'pickle_Cams_2M'
# df_cams_interpolated.to_pickle(pickle_path + dateiname)