# Conversion of NETCDF to CSV

# WHY conversion?
- Most scientifc datasets comes with netCDF/ HDF format 
- To store large number of records and datasets that has high number of Variables/Attributes

# Libraries reqd to install
- netCDF, HDF

In [1]:
import numpy as np
import pandas as pd
import xarray as xr
import h5py as h5
import matplotlib.pyplot as plt
import matplotlib
import os
import netCDF4 as netCDF

# netCDF
import netCDF4 as nc

# basemap
from mpl_toolkits.basemap import Basemap
from datetime import datetime

# Path to NETCDF files

# READING the pre-built PATH for files for OCO-2
- Here, Path File has been CREATED previously <b>2019_path_to_OCO-2.txt</b> where the files NAMES are collected from SPECIFIC path
- seperate code in the same directory to collect the filename

In [2]:
# path to netCdf Files
# MONTH: 8-12
with open('2019_path_to_OCO-2.txt') as f:
    path_oco_2= f.readlines()


# Collected PAths of the files

In [3]:
path_oco_2[0].strip('\n')

'../../Clusters_DATA_oil/OCO-2/2019/01\\01\\LtCO2\\oco2_LtCO2_190101_B10206Ar_200729172616s.nc4'

In [4]:
path_oco_2[4].strip('\n')

'../../Clusters_DATA_oil/OCO-2/2019/01\\05\\LtCO2\\oco2_LtCO2_190105_B10206Ar_200729173113s.nc4'

# TESTING: readint the netCDF file from the DIRECTORY

In [5]:
df_xco2_LITE= nc.Dataset(path_oco_2[3].strip('\n'))

In [6]:
df_xco2_LITE.variables

{'sounding_id': <class 'netCDF4._netCDF4.Variable'>
 uint64 sounding_id(sounding_id)
     units: YYYYMMDDhhmmssmf
     long_name: OCO2 Sounding ID
     missing_value: 0
     comment: from scan time in UTC
 unlimited dimensions: 
 current shape = (170580,)
 filling on, default _FillValue of 18446744073709551614 used,
 'levels': <class 'netCDF4._netCDF4.Variable'>
 int16 levels(levels)
 unlimited dimensions: 
 current shape = (20,)
 filling on, default _FillValue of -32767 used,
 'bands': <class 'netCDF4._netCDF4.Variable'>
 int16 bands(bands)
 unlimited dimensions: 
 current shape = (3,)
 filling on, default _FillValue of -32767 used,
 'vertices': <class 'netCDF4._netCDF4.Variable'>
 int16 vertices(vertices)
 unlimited dimensions: 
 current shape = (4,)
 filling on, default _FillValue of -32767 used,
 'footprints': <class 'netCDF4._netCDF4.Variable'>
 int16 footprints(footprints)
 unlimited dimensions: 
 current shape = (8,)
 filling on, default _FillValue of -32767 used,
 'date': <clas

# EXAMPLE
- READING netCDF file 
- LISTING the variables inside the file

# OPEN files of netCDF format

In [7]:
df_xco2= netCDF.Dataset('../../Clusters_DATA_oil/OCO-2/2019/01/01/LtCO2/oco2_LtCO2_190101_B10206Ar_200729172616s.nc4')

In [8]:
df_xco2.Sensor

'OCO-2'

# Xco2

In [10]:
df_xco2.variables['xco2'].shape

(190424,)

# Xco2 Quality_flag

In [11]:
df_xco2.variables['xco2_quality_flag'][:]

masked_array(data=[1, 1, 1, ..., 1, 1, 1],
             mask=False,
       fill_value=999999,
            dtype=int8)

# Sounding_ID

In [12]:
df_xco2.variables['sounding_id'][0]

masked_array(data=2019010100235138,
             mask=False,
       fill_value=999999,
            dtype=uint64)

# Date

In [13]:
df_xco2.variables['date'].shape

(190424, 7)

In [14]:
df_xco2.variables['date'][9]

masked_array(data=[2019,    1,    1,    0,   25,   42,  300],
             mask=False,
       fill_value=999999,
            dtype=int16)

In [15]:
df_date= pd.DataFrame()

# TIME

In [16]:
df_xco2.variables['time'][0:4]

masked_array(data=[1.54630223e+09, 1.54630228e+09, 1.54630234e+09,
                   1.54630234e+09],
             mask=False,
       fill_value=1e+20)

# LATITUDE

# xco2_uncertanity

In [17]:
df_xco2.variables['xco2_uncertainty'][0]*1000

528.3332467079163

In [18]:
df_xco2.variables['xco2'][:]

masked_array(data=[410.8217 , 408.63104, 407.17587, ..., 408.55084,
                   406.5148 , 407.6041 ],
             mask=False,
       fill_value=1e+20,
            dtype=float32)

# Footprints

In [19]:
df_xco2.variables['bands'][:]

masked_array(data=[1, 2, 3],
             mask=False,
       fill_value=999999,
            dtype=int16)

***************************************************************
***************************************************************
***************************************************************
***************************************************************

# Function to CONVERT 'sounding_id' to DateTime format

# Date TIme format Changing

In [20]:
# DATE time function
def conv_date(d):
    return datetime.strptime(str(d), '%Y%m%d%H%M%S%f')

# CONVERSION of NETCDF
- SAving the file as CSV in the same directory

In [21]:
# FUNCTION to convert data

def convHdf(data, n=0):

    #path= '../hdf_format/Los_angeles_GROUPED/'
    #data_hf= h5.File(path+ data,'r')
 

    # get the HDF data and convert to CSV
    df_xco2= pd.DataFrame()

    df_xco2['Xco2']= data.variables['xco2'][:]
    df_xco2['Latitude']= data.variables['latitude'][:]
    df_xco2['Longitude']= data.variables['longitude'][:] 
    df_xco2['quality_flag']= data.variables['xco2_quality_flag'][:] 
    
    df_xco2['DateTime']= data.variables['sounding_id'][:]
    #Convert soundingID to datetime format
    df_xco2['DateTime']= df_xco2['DateTime'].apply(conv_date)
    df_xco2['DateTime']= pd.to_datetime(df_xco2['DateTime'])
    
    # YEAR and month column
    df_xco2['Year']= df_xco2['DateTime'].dt.year
    df_xco2['Month']= df_xco2['DateTime'].dt.month
    df_xco2['Day']= df_xco2['DateTime'].dt.day
    
    date= str(data.variables['sounding_id'][0])                                   
    # create a CSV
    df_xco2.to_csv(data.Sensor+'_xco2_'+ date+'_.csv', index= False)

# READING files from the PATH list

In [22]:
len(path_oco_2)

355

In [24]:
path_oco_2

['../../Clusters_DATA_oil/OCO-2/2019/01\\01\\LtCO2\\oco2_LtCO2_190101_B10206Ar_200729172616s.nc4\n',
 '../../Clusters_DATA_oil/OCO-2/2019/01\\02\\LtCO2\\oco2_LtCO2_190102_B10206Ar_200729172942s.nc4\n',
 '../../Clusters_DATA_oil/OCO-2/2019/01\\03\\LtCO2\\oco2_LtCO2_190103_B10206Ar_200729173012s.nc4\n',
 '../../Clusters_DATA_oil/OCO-2/2019/01\\04\\LtCO2\\oco2_LtCO2_190104_B10206Ar_200729173043s.nc4\n',
 '../../Clusters_DATA_oil/OCO-2/2019/01\\05\\LtCO2\\oco2_LtCO2_190105_B10206Ar_200729173113s.nc4\n',
 '../../Clusters_DATA_oil/OCO-2/2019/01\\06\\LtCO2\\oco2_LtCO2_190106_B10206Ar_200729173145s.nc4\n',
 '../../Clusters_DATA_oil/OCO-2/2019/01\\07\\LtCO2\\oco2_LtCO2_190107_B10206Ar_200729173429s.nc4\n',
 '../../Clusters_DATA_oil/OCO-2/2019/01\\08\\LtCO2\\oco2_LtCO2_190108_B10206Ar_200729173516s.nc4\n',
 '../../Clusters_DATA_oil/OCO-2/2019/01\\09\\LtCO2\\oco2_LtCO2_190109_B10206Ar_200729173521s.nc4\n',
 '../../Clusters_DATA_oil/OCO-2/2019/01\\10\\LtCO2\\oco2_LtCO2_190110_B10206Ar_200729173607

# 2020
- Total files for 2020---> 297

# !!!!! Change the RANGE in FOR LOOP after calculating the TOTAL FILES in the PATH

In [27]:
total_files= len(path_oco_2)
total_files

355

In [28]:
# # # # using Function to READ directory and convert all HDF files to csv    
# # # # LISTING the path of FILES

for k in range(0, 356):
    data_hf= netCDF.Dataset(path_oco_2[k].strip('\n'))

    convHdf(data_hf)

IndexError: list index out of range