In [2]:
import os
import requests
import csv
import numpy as np
from pyhdf.SD import SD, SDC  # Used for MODIS HDF4 files
import time

In [3]:
secure = dict([e.split('=') for e in open('secure.txt', 'r').read().split('\n')])

In [4]:
modis_row_dir = 'modis_rows.txt'
rows = [line.strip() for line in open(modis_row_dir, 'r') if line.strip()]
download_dir = '/tmp/'

In [5]:
def get_earthdata_session():
    session = requests.Session()
    session.auth = (secure['username'], secure['password'])
    resp = session.get("https://urs.earthdata.nasa.gov", allow_redirects=True)
    if resp.status_code != 200:
        print("Failed to authenticate with Earthdata")
    return session

In [6]:
def extract_var_and_wr_csv(file_dir, output_csv_path, original_row):
    files = [f for f in os.listdir(file_dir) if f.endswith('.hdf')]

    dataset_names = [
        'LST_Day_1km', 'QC_Day', 'Day_view_time', 'Day_view_angl',
        'LST_Night_1km', 'QC_Night', 'Night_view_time', 'Night_view_angl',
        'Emis_31', 'Emis_32', 'Clear_sky_days', 'Clear_sky_nights'
    ]

    # Create header names for min, max, mean of each dataset
    stat_fields = []
    for name in dataset_names:
        stat_fields.extend([
            f"{name.lower()}_min",
            f"{name.lower()}_max",
            f"{name.lower()}_mean"
        ])

    headers = ['granule_id', 'original_row', 'product', 'location', 'split', 'granuleSize'] + stat_fields

    os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)

    with open(output_csv_path, 'a', newline='') as csvfile:
        csvwriter = csv.DictWriter(csvfile, fieldnames=headers)
        if os.stat(output_csv_path).st_size == 0:
            csvwriter.writeheader()

        for file_name in files:
            file_path = os.path.join(file_dir, file_name)
            row_data = {
                'granule_id': file_name,
                'original_row': original_row,
                'product': 'modis',
                'location': 'northeast',
                'split': 'train',
                'granuleSize': os.path.getsize(file_path),
            }

            try:
                hdf = SD(file_path, SDC.READ)

                for dataset in dataset_names:
                    try:
                        data = hdf.select(dataset)[:]
                        data = np.where(data == 0, np.nan, data)  # Mask zero if needed

                        row_data[f"{dataset.lower()}_min"] = np.nanmin(data)
                        row_data[f"{dataset.lower()}_max"] = np.nanmax(data)
                        row_data[f"{dataset.lower()}_mean"] = np.nanmean(data)
                    except Exception as e:
                        print(f"Couldn't process {dataset} in {file_name}: {e}")
                        row_data[f"{dataset.lower()}_min"] = "NA"
                        row_data[f"{dataset.lower()}_max"] = "NA"
                        row_data[f"{dataset.lower()}_mean"] = "NA"

                csvwriter.writerow(row_data)
                print(f"Written: {file_name}")

            except Exception as e:
                print(f"Error processing {file_name}: {e}")


In [7]:
def os_remove(tmp_dir=download_dir):
    files = [f for f in os.listdir(tmp_dir) if f.endswith('.hdf')]
    for filename in files:
        file_path = os.path.join(tmp_dir, filename)
        try:
            os.remove(file_path)
            print(f"Successfully deleted: {filename}")
        except FileNotFoundError:
            print(f"{filename} does not exist or was already deleted.")

In [None]:
def loadFileMODIS(row_txt_path, download_dir='C:/tmp/', output_csv_path='C:/oqg_proj1/data_tg/modis_features.csv'):
    os.makedirs(download_dir, exist_ok=True)

    with open(row_txt_path, 'r') as f:
        urls = f.read().strip().splitlines()

    session = get_earthdata_session()
    total_files = len(urls)
    times = []

    for i, row in enumerate(urls):
        try:
            outfile = os.path.basename(row)
            outfile_path = os.path.join(download_dir, outfile)
            print(f"\n[{i + 1}/{total_files}] Downloading {outfile}...")

            start_time = time.time()

            with session.get(row, stream=True) as r:
                r.raise_for_status()
                with open(outfile_path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=1024 * 1024):
                        f.write(chunk)

            print(f"Downloaded {outfile}")
            extract_var_and_wr_csv(download_dir, output_csv_path, row)

            end_time = time.time()
            elapsed = end_time - start_time
            times.append(elapsed)

            avg_time = np.mean(times)
            remaining = avg_time * (total_files - (i + 1))
            print(f"Done {i + 1}/{total_files} | Time: {elapsed:.2f}s | Est. remaining: {remaining:.1f}s ({remaining / 60:.1f} min)")

        except requests.RequestException as e:
            print(f"Error downloading {row}: {e}")
        finally:
            os_remove(download_dir)


In [20]:
if __name__ == "__main__":
    loadFileMODIS(modis_row_dir)


[1/5785] Downloading MOD11A2.A2025113.h13v04.061.2025125151119.hdf...
Downloaded MOD11A2.A2025113.h13v04.061.2025125151119.hdf
Written: MOD11A2.A2025057.h11v04.061.2025066040453.hdf
Written: MOD11A2.A2025057.h11v05.061.2025066040222.hdf
Written: MOD11A2.A2025089.h11v04.061.2025098211843.hdf
Written: MOD11A2.A2025097.h11v04.061.2025106042553.hdf
Written: MOD11A2.A2025097.h11v05.061.2025106042015.hdf
Written: MOD11A2.A2025097.h12v04.061.2025106043036.hdf
Written: MOD11A2.A2025097.h12v05.061.2025106042807.hdf
Written: MOD11A2.A2025097.h13v04.061.2025106042813.hdf
Written: MOD11A2.A2025105.h11v04.061.2025114042627.hdf
Written: MOD11A2.A2025105.h11v05.061.2025114041528.hdf
Written: MOD11A2.A2025105.h12v04.061.2025114041743.hdf
Successfully deleted: MOD11A2.A2025057.h11v04.061.2025066040453.hdf
Successfully deleted: MOD11A2.A2025057.h11v05.061.2025066040222.hdf
Successfully deleted: MOD11A2.A2025089.h11v04.061.2025098211843.hdf
Successfully deleted: MOD11A2.A2025097.h11v04.061.2025106042553

PermissionError: [WinError 32] The process cannot access the file because it is being used by another process: 'C:/tmp/MOD11A2.A2025105.h12v05.061.2025114043048.hdf'