# Converting netCDF to CSV format
### Objective:
- netCDF files are large size and can take much more resource to compute for data analysis
- Filtering the netCDF file to extract only required attributes/features: 
    * sounding_id => DateTime
    * Xco2 => XCO2 ppm
    * Latitude, Longitude => coordinates
    * xco2_quality_flag => ( 0 =>good quality, 1 => bad quality)
- Convert netCDF to CSV format to reduce the size of data


### STEPS: 
* EXPLORE the Files from all directories and CONCATENATE as a single path
* Collect the files paths from different directories
* final output: csv files format

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import os
import netCDF4 as nc

# converting the datetime format
from datetime import datetime

## Path to NETCDF files
- Locate the downloaded netcdf files directory in pc directory

In [4]:
path_a= ('multiple_netcdf_files/')

# Collect the paths of each individual files
file_names= [multiple_netcdf_files]

for file in os.listdir(path_a):
    # Check whether file is in text format or not
    if file.endswith(".nc4"):
        file_path = f"{path_a}\{file}"
      
        # Store the path location of each individual files
        file_names.append(file_path)
        
        
# check first 10 files path
file_names[:10]

NameError: ignored

## If netCDF files are located on different directory paths:
### NOTE: Select the Root PATH for files on different folders
- Run this script below if the individual files are located in seperate folders
- How it works: 
    - 1. Provide the root direcory
    - 2. Loop searches individual files at end paths of each directories
    - 3. Concatenates the path from ROOT dir. to individual file path from each dir.

In [None]:
# # list fo FILES 2021
# file_path_2021= []

# for root, dirs, files in os.walk('../../../Clusters_DATA_oil/OCO-2/2018/'):
#     for filename in files:
#         print(os.path.join(root, filename))
        
#         # Append the files into list
#         file_path_2021.append(os.path.join(root, filename))

In [None]:
#files= os.listdir('../ENTIRE_datasets/OCO-2_datasets/2019_2020/')

# files= os.listdir('')
# # LISTING the path of FILES
# files

# Example: 
### Opening a single file in netCDF format

In [None]:
df_xco2= nc.Dataset('multiple_netcdf_files/oco2_LtCO2_190101_B10206Ar_200729172616s.nc4')

In [None]:
list(df_xco2.variables.keys())

['sounding_id',
 'levels',
 'bands',
 'vertices',
 'footprints',
 'date',
 'latitude',
 'longitude',
 'time',
 'solar_zenith_angle',
 'sensor_zenith_angle',
 'xco2_quality_flag',
 'xco2_qf_bitflag',
 'xco2_qf_simple_bitflag',
 'source_files',
 'file_index',
 'vertex_latitude',
 'vertex_longitude',
 'xco2',
 'xco2_uncertainty',
 'xco2_apriori',
 'pressure_levels',
 'co2_profile_apriori',
 'xco2_averaging_kernel',
 'pressure_weight']

### Filtering specific attributes from the netCDF files
- xco2
- xco2_quality_flag
- latitude
- longitude
- sounding_id (DateTime)

***********************************************************************************************
***********************************************************************************************
***********************************************************************************************

# DateTime format Change

In [None]:
# DATE time function
def conv_date(d):
    return datetime.strptime(str(d), '%Y%m%d%H%M%S%f')

# Check the total files in the DIRECTORY

In [None]:
countFiles=0

for j in file_names:
    if j.endswith(".nc4"):
        countFiles+=1
        #print(j)
        
print('\nTotalFiles: ', countFiles)


TotalFiles:  9


### Function:
* Function below takes individual path of files and converts to CSV/TXT format
* Converted files are created on the same dir. of the code

# Storing the files on specified directory: csv_files folder

In [None]:
#creatin g a FOLDER
current_directory= os.getcwd()
frames_folder= os.path.join(current_directory, r'csv_files')

if not os.path.exists(frames_folder):
    os.makedirs(frames_folder)

### NOTE:
- Refine the ENTIRE dataframe by GOOD quality_flag->0
- NOTE: REDUCES the size of the file

In [None]:
# FUNCTION to convert data

def convHdf(path_file, n=0):

    data= nc.Dataset(path_file)

    # get the HDF data and convert to CSV
    df_xco2= pd.DataFrame()

    df_xco2['Xco2']= data.variables['xco2'][:]
    df_xco2['Latitude']= data.variables['latitude'][:]
    df_xco2['Longitude']= data.variables['longitude'][:] 
    df_xco2['quality_flag']= data.variables['xco2_quality_flag'][:] 
    
    # Date
    df_xco2['DateTime']= data.variables['sounding_id'][:]
    
    #Convert soundingID to datetime format
    df_xco2['DateTime']= df_xco2['DateTime'].apply(conv_date)
    df_xco2['DateTime']= pd.to_datetime(df_xco2['DateTime'])
    
    # YEAR and month column
    df_xco2['Year']= df_xco2['DateTime'].dt.year
    df_xco2['Month']= df_xco2['DateTime'].dt.month
    df_xco2['Day']= df_xco2['DateTime'].dt.day
    
    # Refine the ENTIRE dataframe by GOOD quality_flag->0
    # NOTE: REDUCES the size of the file
    df_xco2= df_xco2[df_xco2['quality_flag'] == 0]   
    
   
    date= str(data.variables['sounding_id'][0])      
    
    # create a CSV and store on new folder: csv_files
    df_xco2.to_csv('csv_files'+'/'+ data.Sensor+'_xco2_'+ date+'_.txt', index= False)

# OCO3 for SIF conversion

In [None]:
# # FUNCTION to convert data
# def convOCO3(path_file, n=0):

#     #path= '../hdf_format/Los_angeles_GROUPED/'
#     data= nc.Dataset(path_file)

#     # get the HDF data and convert to CSV
#     df_sif= pd.DataFrame()

#     df_sif['sif_757nm']= data.variables['Daily_SIF_757nm'][:]
#     df_sif['Latitude']= data.variables['Latitude'][:]
#     df_sif['Longitude']= data.variables['Longitude'][:] 
#     df_sif['quality_flag']= data.variables['Quality_Flag'][:] 
    
#     # Date
#     # Date time not found 
# #     df_xco2['DateTime']= data.variables['sounding_id'][:]
    
# #     #Convert soundingID to datetime format
# #     df_xco2['DateTime']= df_xco2['DateTime'].apply(conv_date)
# #     df_xco2['DateTime']= pd.to_datetime(df_xco2['DateTime'])
    
# #     # YEAR and month column
# #     df_xco2['Year']= df_xco2['DateTime'].dt.year
# #     df_xco2['Month']= df_xco2['DateTime'].dt.month
# #     df_xco2['Day']= df_xco2['DateTime'].dt.day
    
    
#     # xco2 quality flag -> 0
#  #   df_sif= df_sif[df_sif['quality_flag'] == 0]
    
# #    date= str(data.variables['sounding_id'][0])                                   
#     # create a CSV
#     # OCO3 sensor
#     df_sif.to_csv(data.sensor[:5]+'_sif_'+str(n)+ '_.txt', index= False)
# #     df_xco2.to_feather(data.Sensor+'_xco2_'+ date+'_.txt')

# Testing: Single files transformation

In [None]:
convHdf(file_names[0])

## NOTE: Filtering XCO2 quality flag(0) to reduce the total size of file

In [None]:
# using Function to READ FILES from the direcotry and convert all netCDF files to csv/txt    

for j in range(0, len(file_names)):
  
       # EG to read FIRST dataset from THE DIRECTORY       
        convHdf(file_names[j], j)