# Extracting files directly from OPENDAC website

### libraries : netCDF4, pydap.client (READ netCDF files)
### Library: 'BeautifulSoup' to read the contents of the webpage
* Reading the contents from the desired web page (here, OCO2)
* We pass the parameters: Instrument and year
* Save the links of different datasets in a list
* open up the lists using <b>netCDF</b>

# Data Retrieving from:
### Instrument and version: OCO2_L2_Lite_FP.10r
### Year: 2020

In [1]:
import pydap.client
import netCDF4 as nc

# for data-preprocessing
import pandas as pd
import numpy as np
import matplotlib.pyplot

In [2]:
# TESTING with netcdf file, SINGLE dataset
# read files from netCDF
# Reading Single file from the link

In [3]:
my_df= nc.Dataset('https://oco2.gesdisc.eosdis.nasa.gov/opendap/OCO2_L2_Lite_FP.10r/2020/oco2_LtCO2_200109_B10206Ar_200728203551s.nc4')

In [4]:
len(my_df.variables['xco2'])

148936

In [5]:
my_df.variables.keys()

dict_keys(['xco2_apriori', 'file_index', 'xco2_qf_simple_bitflag', 'pressure_levels', 'xco2', 'time', 'pressure_weight', 'Preprocessors_co2_ratio', 'Preprocessors_max_declocking_sco2', 'Preprocessors_max_declocking_o2a', 'Preprocessors_xco2_strong_idp', 'Preprocessors_max_declocking_wco2', 'Preprocessors_dp_abp', 'Preprocessors_co2_ratio_offset_per_footprint', 'Preprocessors_h2o_ratio', 'Preprocessors_xco2_weak_idp', 'Preprocessors_h2o_ratio_offset_per_footprint', 'solar_zenith_angle', 'longitude', 'xco2_qf_bitflag', 'latitude', 'sensor_zenith_angle', 'Meteorology_psurf_apriori_o2a', 'Meteorology_psurf_apriori_wco2', 'Meteorology_psurf_apriori_sco2', 'Meteorology_windspeed_u_met', 'Meteorology_windspeed_v_met', 'xco2_quality_flag', 'xco2_averaging_kernel', 'date', 'Retrieval_dp_o2a', 'Retrieval_dust_height', 'Retrieval_aod_water', 'Retrieval_s32', 'Retrieval_chi2_sco2', 'Retrieval_aod_dust', 'Retrieval_albedo_slope_wco2', 'Retrieval_aod_bc', 'Retrieval_aod_strataer', 'Retrieval_aod_sea

# Reading the contents from the WEB page and retrieving only the links
### Libraries: BeautifulSoup to read webElements
- Collecting the lists and using the links to open datasets using <b>NETCDF</b>

## WEB PAGE:
- Here, we can pass the user input for <b>Instrument and Year</b>
* url= 'https://oco2.gesdisc.eosdis.nasa.gov/opendap/'+str(instrument)+'/'+ str(year)+'/contents.html'

In [6]:
import requests

import urllib3
from bs4 import BeautifulSoup

In [7]:
url= 'https://oco2.gesdisc.eosdis.nasa.gov/opendap/OCO2_L2_Lite_FP.10r/2020/contents.html'

# USER INPUT: 
* Instrument and Version
* Year

In [8]:
# Pass the following parameters for this test
# Instrument: current testing(Pass) -> OCO2_L2_Lite_FP.10r
# year: 2020

instrument= input()
year=input()

OCO2_L2_Lite_FP.10r
2020


# Your Passed Parameters:
* Instrument
* Year

In [9]:
instrument,year

('OCO2_L2_Lite_FP.10r', '2020')

# READ contents from the URL:
* By passing the parameters from the USER input

In [10]:
my_url= 'https://oco2.gesdisc.eosdis.nasa.gov/opendap/'+str(instrument)+'/'+ str(year)+'/contents.html'

In [11]:
my_url

'https://oco2.gesdisc.eosdis.nasa.gov/opendap/OCO2_L2_Lite_FP.10r/2020/contents.html'

In [12]:
## Example
url

'https://oco2.gesdisc.eosdis.nasa.gov/opendap/OCO2_L2_Lite_FP.10r/2020/contents.html'

In [13]:
# Get the content from the webpage
reqs= requests.get(my_url)

# selecting the lxml parser
soup= BeautifulSoup(reqs.text, 'lxml')

In [14]:
# SOUP var, Here, returns the entire HTML contents

In [15]:
soup

<?xml version="1.0" encoding="UTF-8"?><html xmlns:bes="http://xml.opendap.org/ns/bes/1.0#">
<head>
<link href="/opendap/docs/css/contents.css" rel="stylesheet" type="text/css"/>
<title>OPeNDAP Hyrax: Contents of /OCO2_L2_Lite_FP.10r/2020/</title>
</head>
<body>
<img alt="OPeNDAP Logo" src="/opendap/docs/images/logo.png"/>
<h1>Contents of
                /OCO2_L2_Lite_FP.10r/2020/</h1>
<hr noshade="noshade" size="1"/>
<pre>
         </pre><table border="0" itemscope="" itemtype="http://schema.org/DataCatalog" width="100%">
<caption style="display:none">
<a href="#" itemprop="url">
<span itemprop="name">/OCO2_L2_Lite_FP.10r/2020/</span>
</a>
</caption>
<tr>
<th align="left">Name</th>
<th align="center">Last Modified</th>
<th align="center">Size</th>
<th align="center">DAP Response Links</th>
<th align="center">Dataset Viewers</th>
</tr>
<tr>
<td></td>
</tr>
<tr itemprop="dataset" itemscope="" itemtype="http://schema.org/Dataset">
<td align="left">
<b>
<a href="oco2_LtCO2_200101_B10206Ar_

# Filtering: to Get contents from the tag 
* "\<a>" only, which lists the contents
### Cleaning and saving the LINKS for the datasets only

In [16]:
# total links
oco2_links= []

for link in soup.find_all('a'):
    print(link.get('href'))
    oco2_links.append(link.get('href'))

#
oco2_LtCO2_200101_B10206Ar_200728183348s.nc4.html
oco2_LtCO2_200101_B10206Ar_200728183348s.nc4.ddx
oco2_LtCO2_200101_B10206Ar_200728183348s.nc4.dds
oco2_LtCO2_200101_B10206Ar_200728183348s.nc4.das
oco2_LtCO2_200101_B10206Ar_200728183348s.nc4.info
oco2_LtCO2_200101_B10206Ar_200728183348s.nc4.html
oco2_LtCO2_200101_B10206Ar_200728183348s.nc4.rdf
oco2_LtCO2_200101_B10206Ar_200728183348s.nc4.covjson
/opendap/viewers/viewers?dapService=/opendap/hyrax&datasetID=/OCO2_L2_Lite_FP.10r/2020/oco2_LtCO2_200101_B10206Ar_200728183348s.nc4
oco2_LtCO2_200101_B10206Ar_200728183348s.nc4.xml
oco2_LtCO2_200102_B10206Ar_200728203252s.nc4.html
oco2_LtCO2_200102_B10206Ar_200728203252s.nc4.ddx
oco2_LtCO2_200102_B10206Ar_200728203252s.nc4.dds
oco2_LtCO2_200102_B10206Ar_200728203252s.nc4.das
oco2_LtCO2_200102_B10206Ar_200728203252s.nc4.info
oco2_LtCO2_200102_B10206Ar_200728203252s.nc4.html
oco2_LtCO2_200102_B10206Ar_200728203252s.nc4.rdf
oco2_LtCO2_200102_B10206Ar_200728203252s.nc4.covjson
/opendap/viewers/vi

oco2_LtCO2_200427_B10206Ar_210918003822s.nc4.dds
oco2_LtCO2_200427_B10206Ar_210918003822s.nc4.das
oco2_LtCO2_200427_B10206Ar_210918003822s.nc4.info
oco2_LtCO2_200427_B10206Ar_210918003822s.nc4.html
oco2_LtCO2_200427_B10206Ar_210918003822s.nc4.rdf
oco2_LtCO2_200427_B10206Ar_210918003822s.nc4.covjson
/opendap/viewers/viewers?dapService=/opendap/hyrax&datasetID=/OCO2_L2_Lite_FP.10r/2020/oco2_LtCO2_200427_B10206Ar_210918003822s.nc4
oco2_LtCO2_200427_B10206Ar_210918003822s.nc4.xml
oco2_LtCO2_200428_B10206Ar_210513205811s.nc4.html
oco2_LtCO2_200428_B10206Ar_210513205811s.nc4.ddx
oco2_LtCO2_200428_B10206Ar_210513205811s.nc4.dds
oco2_LtCO2_200428_B10206Ar_210513205811s.nc4.das
oco2_LtCO2_200428_B10206Ar_210513205811s.nc4.info
oco2_LtCO2_200428_B10206Ar_210513205811s.nc4.html
oco2_LtCO2_200428_B10206Ar_210513205811s.nc4.rdf
oco2_LtCO2_200428_B10206Ar_210513205811s.nc4.covjson
/opendap/viewers/viewers?dapService=/opendap/hyrax&datasetID=/OCO2_L2_Lite_FP.10r/2020/oco2_LtCO2_200428_B10206Ar_210513

oco2_LtCO2_200730_B10206Ar_210521204125s.nc4.das
oco2_LtCO2_200730_B10206Ar_210521204125s.nc4.info
oco2_LtCO2_200730_B10206Ar_210521204125s.nc4.html
oco2_LtCO2_200730_B10206Ar_210521204125s.nc4.rdf
oco2_LtCO2_200730_B10206Ar_210521204125s.nc4.covjson
/opendap/viewers/viewers?dapService=/opendap/hyrax&datasetID=/OCO2_L2_Lite_FP.10r/2020/oco2_LtCO2_200730_B10206Ar_210521204125s.nc4
oco2_LtCO2_200730_B10206Ar_210521204125s.nc4.xml
oco2_LtCO2_200730_B10206Ar_210920060109s.nc4.html
oco2_LtCO2_200730_B10206Ar_210920060109s.nc4.ddx
oco2_LtCO2_200730_B10206Ar_210920060109s.nc4.dds
oco2_LtCO2_200730_B10206Ar_210920060109s.nc4.das
oco2_LtCO2_200730_B10206Ar_210920060109s.nc4.info
oco2_LtCO2_200730_B10206Ar_210920060109s.nc4.html
oco2_LtCO2_200730_B10206Ar_210920060109s.nc4.rdf
oco2_LtCO2_200730_B10206Ar_210920060109s.nc4.covjson
/opendap/viewers/viewers?dapService=/opendap/hyrax&datasetID=/OCO2_L2_Lite_FP.10r/2020/oco2_LtCO2_200730_B10206Ar_210920060109s.nc4
oco2_LtCO2_200730_B10206Ar_2109200601

oco2_LtCO2_201103_B10206Ar_210607214439s.nc4.info
oco2_LtCO2_201103_B10206Ar_210607214439s.nc4.html
oco2_LtCO2_201103_B10206Ar_210607214439s.nc4.rdf
oco2_LtCO2_201103_B10206Ar_210607214439s.nc4.covjson
/opendap/viewers/viewers?dapService=/opendap/hyrax&datasetID=/OCO2_L2_Lite_FP.10r/2020/oco2_LtCO2_201103_B10206Ar_210607214439s.nc4
oco2_LtCO2_201103_B10206Ar_210607214439s.nc4.xml
oco2_LtCO2_201103_B10206Ar_210921221348s.nc4.html
oco2_LtCO2_201103_B10206Ar_210921221348s.nc4.ddx
oco2_LtCO2_201103_B10206Ar_210921221348s.nc4.dds
oco2_LtCO2_201103_B10206Ar_210921221348s.nc4.das
oco2_LtCO2_201103_B10206Ar_210921221348s.nc4.info
oco2_LtCO2_201103_B10206Ar_210921221348s.nc4.html
oco2_LtCO2_201103_B10206Ar_210921221348s.nc4.rdf
oco2_LtCO2_201103_B10206Ar_210921221348s.nc4.covjson
/opendap/viewers/viewers?dapService=/opendap/hyrax&datasetID=/OCO2_L2_Lite_FP.10r/2020/oco2_LtCO2_201103_B10206Ar_210921221348s.nc4
oco2_LtCO2_201103_B10206Ar_210921221348s.nc4.xml
oco2_LtCO2_201104_B10206Ar_2106072144

# Get the links ending with 'html' only
### Rest of the links have different purpose
### NOTE: another important link ending with 'info' gives information on Product


In [17]:

# storing the html links
dataset_links=[]

for k in range(0, len(oco2_links)):
    if oco2_links[k].endswith(".html"):
        print(oco2_links[k])
        
        # Strip the 'html' from the links 
        dataset_links.append(oco2_links[k].strip('.html'))

oco2_LtCO2_200101_B10206Ar_200728183348s.nc4.html
oco2_LtCO2_200101_B10206Ar_200728183348s.nc4.html
oco2_LtCO2_200102_B10206Ar_200728203252s.nc4.html
oco2_LtCO2_200102_B10206Ar_200728203252s.nc4.html
oco2_LtCO2_200103_B10206Ar_200728203534s.nc4.html
oco2_LtCO2_200103_B10206Ar_200728203534s.nc4.html
oco2_LtCO2_200108_B10206Ar_200728203546s.nc4.html
oco2_LtCO2_200108_B10206Ar_200728203546s.nc4.html
oco2_LtCO2_200109_B10206Ar_200728203551s.nc4.html
oco2_LtCO2_200109_B10206Ar_200728203551s.nc4.html
oco2_LtCO2_200110_B10206Ar_200728203614s.nc4.html
oco2_LtCO2_200110_B10206Ar_200728203614s.nc4.html
oco2_LtCO2_200111_B10206Ar_200728203647s.nc4.html
oco2_LtCO2_200111_B10206Ar_200728203647s.nc4.html
oco2_LtCO2_200112_B10206Ar_200728203726s.nc4.html
oco2_LtCO2_200112_B10206Ar_200728203726s.nc4.html
oco2_LtCO2_200113_B10206Ar_200728203946s.nc4.html
oco2_LtCO2_200113_B10206Ar_200728203946s.nc4.html
oco2_LtCO2_200114_B10206Ar_200728204000s.nc4.html
oco2_LtCO2_200114_B10206Ar_200728204000s.nc4.html


oco2_LtCO2_200504_B10206Ar_210513211327s.nc4.html
oco2_LtCO2_200504_B10206Ar_210513211327s.nc4.html
oco2_LtCO2_200504_B10206Ar_210918220339s.nc4.html
oco2_LtCO2_200504_B10206Ar_210918220339s.nc4.html
oco2_LtCO2_200505_B10206Ar_210513211606s.nc4.html
oco2_LtCO2_200505_B10206Ar_210513211606s.nc4.html
oco2_LtCO2_200505_B10206Ar_210918220728s.nc4.html
oco2_LtCO2_200505_B10206Ar_210918220728s.nc4.html
oco2_LtCO2_200506_B10206Ar_210513211812s.nc4.html
oco2_LtCO2_200506_B10206Ar_210513211812s.nc4.html
oco2_LtCO2_200506_B10206Ar_210918220808s.nc4.html
oco2_LtCO2_200506_B10206Ar_210918220808s.nc4.html
oco2_LtCO2_200507_B10206Ar_210513212020s.nc4.html
oco2_LtCO2_200507_B10206Ar_210513212020s.nc4.html
oco2_LtCO2_200507_B10206Ar_210918221149s.nc4.html
oco2_LtCO2_200507_B10206Ar_210918221149s.nc4.html
oco2_LtCO2_200508_B10206Ar_210513212308s.nc4.html
oco2_LtCO2_200508_B10206Ar_210513212308s.nc4.html
oco2_LtCO2_200508_B10206Ar_210918221245s.nc4.html
oco2_LtCO2_200508_B10206Ar_210918221245s.nc4.html


In [18]:
# to avoid duplicate records
p=0
complete_oco2_links=[]

for i in range(0, len(dataset_links)):
    try:
        complete_oco2_links.append(dataset_links[i+p])
        p+=1
    # Ignoring the Out of Index error
    except IndexError as e:
        continue

In [19]:
# TESTING: for duplicate records, output: half
len(dataset_links),len(complete_oco2_links)

(1394, 697)

# Other Important links:
* Information on the product

# Using the lists to Retrieve datasets
* Using <b>netCDF</b> library to get the data

In [20]:
# CHECK: JAN to DEC dates on the filenames

complete_oco2_links[:5], print('****') ,complete_oco2_links[-4:]

****


(['oco2_LtCO2_200101_B10206Ar_200728183348s.nc4',
  'oco2_LtCO2_200102_B10206Ar_200728203252s.nc4',
  'oco2_LtCO2_200103_B10206Ar_200728203534s.nc4',
  'oco2_LtCO2_200108_B10206Ar_200728203546s.nc4',
  'oco2_LtCO2_200109_B10206Ar_200728203551s.nc4'],
 None,
 ['oco2_LtCO2_201230_B10206Ar_210614174239s.nc4',
  'oco2_LtCO2_201230_B10206Ar_210922004428s.nc4',
  'oco2_LtCO2_201231_B10206Ar_210614174346s.nc4',
  'oco2_LtCO2_201231_B10206Ar_210922004512s.nc4'])

### TEST
* Attaching the full link + dataset
* my_url= 'https://oco2.gesdisc.eosdis.nasa.gov/opendap/'+str(instrument)+'/'+ str(year)+ complete_oco2_links[0]

In [21]:
first_element_oco2= nc.Dataset('https://oco2.gesdisc.eosdis.nasa.gov/opendap/'+str(instrument)+'/'+ str(year)+'/'+complete_oco2_links[2])
first_element_oco2.variables.keys()

dict_keys(['xco2_apriori', 'file_index', 'xco2_qf_simple_bitflag', 'pressure_levels', 'xco2', 'time', 'pressure_weight', 'Preprocessors_co2_ratio', 'Preprocessors_max_declocking_sco2', 'Preprocessors_max_declocking_o2a', 'Preprocessors_xco2_strong_idp', 'Preprocessors_max_declocking_wco2', 'Preprocessors_dp_abp', 'Preprocessors_co2_ratio_offset_per_footprint', 'Preprocessors_h2o_ratio', 'Preprocessors_xco2_weak_idp', 'Preprocessors_h2o_ratio_offset_per_footprint', 'solar_zenith_angle', 'longitude', 'xco2_qf_bitflag', 'latitude', 'sensor_zenith_angle', 'Meteorology_psurf_apriori_o2a', 'Meteorology_psurf_apriori_wco2', 'Meteorology_psurf_apriori_sco2', 'Meteorology_windspeed_u_met', 'Meteorology_windspeed_v_met', 'xco2_quality_flag', 'xco2_averaging_kernel', 'date', 'Retrieval_dp_o2a', 'Retrieval_dust_height', 'Retrieval_aod_water', 'Retrieval_s32', 'Retrieval_chi2_sco2', 'Retrieval_aod_dust', 'Retrieval_albedo_slope_wco2', 'Retrieval_aod_bc', 'Retrieval_aod_strataer', 'Retrieval_aod_sea

# NEXT STEP:
* Retrieving all datasets into pandas DataFrame for analysis
* Rearrange the dateTime format

# Creating a Class object:
* Function that reads the variables from the dataset

In [None]:
# class readData():
    
#     # objects to store variables
#     xco2=[]
#     sounding_id=[]
#     latitude= []
#     longitude= []
    
    
#     # initialization
#     def __init__(self):
        