## 📥 Download and Filter NASA GeneLab Omics and Non-Omics Datasets

This notebook automates the retrieval and pre‑processing of omics datasets from the NASA GeneLab Open Science Data Repository (OSDR) using the `genelab_utils` package. It supports both incremental and full updates, applies pre‑filters to reduce file size, and writes a manifest of downloaded files.

Author: Chisom Aniekwensi (sommaniekwensi@gmail.com)

In [1]:
# import modules and packages
import os
import json
import logging
import pandas as pd
import requests
import re
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List, Any, Optional
from datetime import datetime

In [3]:
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger("nasa_kg")

In [5]:
# File type patterns
OMICS_PATTERNS = ['rna-seq', 'rna_seq', 'transcript', 'microarray', 'methylation', 
                 'proteom', 'metabolom', 'gene', 'seq', 'diff_expr', 'deg']
NON_OMICS_PATTERNS = ['imaging', 'microscopy', 'tomography', 'pressure', 'temperature',
                      'tonometry', 'sensor', 'ultrasonography', 'biotelemetry', 'mri']

# GLDS to OSD mapping
GLDS_TO_OSD = {
    'GLDS-100': 'OSD-100', 'GLDS-162': 'OSD-162', 'GLDS-194': 'OSD-194',
    'GLDS-203': 'OSD-203', 'GLDS-255': 'OSD-255', 'GLDS-397': 'OSD-397',
    'GLDS-87': 'OSD-87'
}

In [7]:
# function to extract OSDR dataset ID from filename.
def extract_dataset_id(filename: str) -> Optional[str]:
    filename_lower = str(filename).lower()
    
    # OSD pattern
    osd_match = re.search(r'(osd-\d+)', filename_lower)
    if osd_match:
        osd_num = re.search(r'osd-(\d+)', osd_match.group(1)).group(1)
        return f"OSD-{osd_num}"
    
    # s_OSD pattern
    s_osd_match = re.search(r's_osd-(\d+)', filename_lower)
    if s_osd_match:
        return f"OSD-{s_osd_match.group(1)}"
    
    # ALSDA pattern
    alsda_match = re.search(r'(alsda[-_]\d+)', filename_lower)
    if alsda_match:
        alsda_num = re.search(r'alsda[-_](\d+)', alsda_match.group(1)).group(1)
        return f"ALSDA-{alsda_num}"
    
    # GLDS pattern and mapping
    glds_match = re.search(r'(glds-\d+)', filename_lower)
    if glds_match:
        glds_num = re.search(r'glds-(\d+)', glds_match.group(1)).group(1)
        glds_id = f"GLDS-{glds_num}"
        return GLDS_TO_OSD.get(glds_id)
    
    return None


In [9]:
# function to detect if file is omics, non-omics, or metadata

def detect_file_type(file_path: str) -> tuple:
    filename = str(file_path).lower()
    dataset_id = extract_dataset_id(filename)
    
    # Check if this is a metadata file (s_OSD)
    if 's_osd' in filename:
        return ('metadata', dataset_id, 's_file_pattern')
    
    # Check for omics patterns
    if any(pattern in filename for pattern in OMICS_PATTERNS):
        return ('omics', dataset_id, 'omics_pattern')
    
    # Check for non-omics patterns
    if any(pattern in filename for pattern in NON_OMICS_PATTERNS):
        return ('non_omics', dataset_id, 'non_omics_pattern')
    
    # ALSDA files are typically omics
    if 'alsda' in filename:
        return ('omics', dataset_id, 'alsda_file')
    
    return ('unknown', dataset_id, 'fallback')

In [11]:
# function to fetch basic metadata for an OSDR dataset

def get_osdr_metadata(accession: str) -> Dict:
    url = f"https://visualization.osdr.nasa.gov/biodata/api/v2/dataset/{accession}/?format=json"
    try:
        response = requests.get(url, timeout=60)
        if response.status_code != 200:
            logger.warning(f"Failed to get metadata for {accession}: Status {response.status_code}")
            return {"identifier": accession, "error": f"HTTP {response.status_code}"}
        
        data = response.json()
        ds = data.get(accession, {})
        meta = ds.get("metadata", {})
        
        # Create standardized metadata structure
        metadata = {
            "identifier": accession,
            "study_title": meta.get("project title", ""),
            "organism": meta.get("organism", ""),
            "mission": meta.get("mission", {}).get("name", "")
        }
        
        return metadata
    except Exception as e:
        logger.error(f"Error fetching metadata for {accession}: {str(e)}")
        return {"identifier": accession, "error": str(e)}

In [13]:
# function to safely read a file into a DataFrame, handling various formats and issues

def safe_read_file(file_path: Path) -> pd.DataFrame:
    try:
        if file_path.suffix.lower() in ['.csv', '.txt', '.tsv']:
            # First try with standard parameters
            try:
                return pd.read_csv(file_path, on_bad_lines='skip', low_memory=False)
            except Exception as e1:
                # If that fails, try with more flexible parameters
                try:
                    return pd.read_csv(file_path, sep=None, engine='python', on_bad_lines='skip')
                except Exception as e2:
                    # Last resort: try to read as text and parse manually
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                        lines = f.readlines()
                    
                    # Create a simple DataFrame with the text content
                    return pd.DataFrame({'content': lines})
                    
        elif file_path.suffix.lower() in ['.xlsx', '.xls']:
            # Try to read Excel files
            try:
                return pd.read_excel(file_path)
            except Exception as e:
                # If Excel reading fails, try to read as binary and create placeholder
                return pd.DataFrame({'filename': [file_path.name], 
                                    'error': [f'Could not parse Excel: {str(e)}']})
        else:
            # For unsupported formats, create a simple DataFrame
            return pd.DataFrame({'filename': [file_path.name], 
                                'format': [file_path.suffix]})
    except Exception as e:
        # If all else fails, return an empty DataFrame with error info
        return pd.DataFrame({'filename': [file_path.name], 
                            'error': [f'File reading error: {str(e)}']})

In [15]:
# function to DataFrame safely, handling MultiIndex and other issues

def add_metadata(df: pd.DataFrame, metadata_dict: Dict) -> pd.DataFrame:
    try:
        # If DataFrame is empty, return it with metadata as first row
        if df.empty:
            return pd.DataFrame([metadata_dict])
        
        # Check if df has MultiIndex
        if isinstance(df.index, pd.MultiIndex):
            # Reset index to avoid MultiIndex issues
            df_reset = df.reset_index()
            
            # Add metadata columns one by one
            for key, value in metadata_dict.items():
                df_reset[key] = value
                
            return df_reset
        else:
            # For regular DataFrames, create a metadata DataFrame with same length
            metadata_df = pd.DataFrame([metadata_dict] * len(df))
            
            # Add a temporary index column to both DataFrames
            df['_temp_idx'] = range(len(df))
            metadata_df['_temp_idx'] = range(len(df))
            
            # Merge on the temp index
            result = pd.merge(df, metadata_df, on='_temp_idx')
            
            # Remove the temporary index
            result = result.drop('_temp_idx', axis=1)
            
            return result
    except Exception as e:
        # If adding metadata fails, create a new DataFrame with original data and metadata
        logger.warning(f"Error adding metadata: {str(e)}. Creating new DataFrame.")
        
        # Create a simple summary of the original DataFrame
        summary = {
            'original_columns': list(df.columns),
            'row_count': len(df),
            'issue': str(e)
        }
        
        # Combine with metadata
        combined_dict = {**metadata_dict, **summary}
        
        # First row is metadata/summary, rest is empty
        result = pd.DataFrame([combined_dict])
        
        return result


In [17]:
# function to process a single file and save to output directory, handling various edge cases

def process_file(file_path: Path, output_dir: Path, metadata: Dict = None) -> Dict:
    # Detect file type
    file_type, dataset_id, detection_method = detect_file_type(str(file_path))
    
    # Create basic result structure
    result = {
        "file_path": str(file_path),
        "file_type": file_type,
        "dataset_id": dataset_id,
        "detection_method": detection_method,
    }
    
    try:
        # Read the file safely
        df = safe_read_file(file_path)
        
        # Create metadata dictionary
        metadata_dict = {
            "file_source": str(file_path.name),
            "file_type": file_type,
            "dataset_id": dataset_id,
        }
        
        # Add OSDR metadata if available
        if metadata and dataset_id in metadata:
            study_meta = metadata[dataset_id]
            metadata_dict["study_title"] = study_meta.get("study_title", "")
            metadata_dict["organism"] = study_meta.get("organism", "")
            metadata_dict["mission"] = study_meta.get("mission", "")
        
        # Add metadata to the DataFrame
        result_df = add_metadata(df, metadata_dict)
        
        # Create output path
        output_subdir = output_dir / file_type
        output_subdir.mkdir(exist_ok=True, parents=True)
        output_filename = f"{file_path.stem}_processed.csv"
        output_path = output_subdir / output_filename
        
        # Save processed file
        result_df.to_csv(output_path, index=False)
        
        # Update result with success info
        result.update({
            "output_path": str(output_path),
            "rows": len(df),
            "columns": len(df.columns),
            "processed": True
        })
        
        return result
        
    except Exception as e:
        logger.error(f"Error processing {file_path}: {str(e)}")
        result["error"] = str(e)
        return result

In [19]:
# function to process NASA OSDR data files with robust error handling

def process_nasa_data(data_dir=None, output_dir=None, max_files=None, parallel=True):
    # Set up directories
    data_dir = Path(data_dir) if data_dir else Path.cwd() / "data"
    output_dir = Path(output_dir) if output_dir else Path.cwd() / "processed"
    output_dir.mkdir(exist_ok=True, parents=True)
    
    logger.info(f"Processing NASA OSDR data from {data_dir}")
    
    # Scan for files
    extensions = ['.csv', '.txt', '.xlsx', '.xls', '.tsv']
    all_files = []
    for ext in extensions:
        all_files.extend(list(data_dir.glob(f"**/*{ext}")))
    
    # Sort files (metadata first, then data files)
    all_files.sort(key=lambda f: 0 if 's_osd' in str(f).lower() else 1)
    
    # Limit number of files if requested
    if max_files:
        all_files = all_files[:max_files]
    
    if not all_files:
        logger.warning(f"No files found in {data_dir}")
        return {"status": "warning", "message": "No files found"}
    
    logger.info(f"Found {len(all_files)} files to process")
    
    # Extract dataset IDs and fetch metadata
    dataset_ids = set()
    for file in all_files:
        dataset_id = extract_dataset_id(str(file))
        if dataset_id:
            dataset_ids.add(dataset_id)
    
    logger.info(f"Found {len(dataset_ids)} unique dataset IDs")
    
    # Fetch metadata for all datasets
    metadata = {}
    for dataset_id in dataset_ids:
        metadata[dataset_id] = get_osdr_metadata(dataset_id)
    
    # Create subdirectories for file types
    for dir_name in ['metadata', 'omics', 'non_omics', 'unknown', 'error']:
        (output_dir / dir_name).mkdir(exist_ok=True, parents=True)
    
    # Process files
    results = []
    
    # Parallel processing
    if parallel:
        with ThreadPoolExecutor(max_workers=min(os.cpu_count() or 4, 6)) as executor:
            futures = [executor.submit(process_file, file, output_dir, metadata) for file in all_files]
            for future in futures:
                try:
                    result = future.result()
                    results.append(result)
                except Exception as e:
                    logger.error(f"Error in thread: {str(e)}")
                    results.append({"file_path": "unknown", "file_type": "error", "error": str(e)})
    # Sequential processing (for easier debugging)
    else:
        for file in all_files:
            try:
                result = process_file(file, output_dir, metadata)
                results.append(result)
            except Exception as e:
                logger.error(f"Error processing file: {str(e)}")
                results.append({"file_path": str(file), "file_type": "error", "error": str(e)})
    
    # Create manifest DataFrame
    manifest_df = pd.DataFrame(results)
    
    # Add success/error counts
    success_count = sum(1 for result in results if not result.get('error'))
    error_count = sum(1 for result in results if result.get('error'))
    logger.info(f"Successfully processed {success_count} files. Errors: {error_count}")
    
    # Save manifest
    manifest_path = output_dir / "manifest.csv"
    manifest_df.to_csv(manifest_path, index=False)
    
    # Create summary
    summary = {
        "total_files": len(results),
        "success_count": success_count,
        "error_count": error_count,
        "dataset_ids": list(dataset_ids),
    }
    
    # Add file type counts if possible
    if 'file_type' in manifest_df.columns:
        file_types = manifest_df['file_type'].value_counts().to_dict()
        summary["file_type_counts"] = file_types
    
    # Save summary
    summary_path = output_dir / "summary.json"
    with open(summary_path, 'w') as f:
        json.dump(summary, f, indent=2)
    
    logger.info(f"Processing complete. Results saved to {output_dir}")
    
    return {
        "manifest_df": manifest_df,
        "summary": summary,
        "manifest_path": str(manifest_path),
        "summary_path": str(summary_path),
        "success_rate": f"{success_count}/{len(results)} ({success_count/len(results)*100:.1f}%)"
    }


In [21]:
# Print result
result = process_nasa_data(
    data_dir="C:/Users/QUCOON/Documents/Chisom_Personal_Doc/NASA_KnowHax_2025/data", 
    output_dir="C:/Users/QUCOON/Documents/Chisom_Personal_Doc/NASA_KnowHax_2025/processed",
    parallel=True,  # Set to False for debugging
    max_files=None  # Set to None to process all files
)
display(result["manifest_df"].head())
print(f"Success rate: {result['success_rate']}")

2025-05-06 14:45:02,763 - INFO - Processing NASA OSDR data from C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\data
2025-05-06 14:45:02,921 - INFO - Found 147 files to process
2025-05-06 14:45:02,921 - INFO - Found 17 unique dataset IDs
  warn("""Cannot parse header or footer so it will be ignored""")
2025-05-06 14:48:17,946 - INFO - Successfully processed 147 files. Errors: 0
2025-05-06 14:48:17,972 - INFO - Processing complete. Results saved to C:\Users\QUCOON\Documents\Chisom_Personal_Doc\NASA_KnowHax_2025\processed


Unnamed: 0,file_path,file_type,dataset_id,detection_method,output_path,rows,columns,processed
0,C:\Users\QUCOON\Documents\Chisom_Personal_Doc\...,metadata,OSD-100,s_file_pattern,C:\Users\QUCOON\Documents\Chisom_Personal_Doc\...,12,57,True
1,C:\Users\QUCOON\Documents\Chisom_Personal_Doc\...,metadata,OSD-162,s_file_pattern,C:\Users\QUCOON\Documents\Chisom_Personal_Doc\...,21,89,True
2,C:\Users\QUCOON\Documents\Chisom_Personal_Doc\...,metadata,OSD-194,s_file_pattern,C:\Users\QUCOON\Documents\Chisom_Personal_Doc\...,13,73,True
3,C:\Users\QUCOON\Documents\Chisom_Personal_Doc\...,metadata,OSD-203,s_file_pattern,C:\Users\QUCOON\Documents\Chisom_Personal_Doc\...,59,64,True
4,C:\Users\QUCOON\Documents\Chisom_Personal_Doc\...,metadata,OSD-255,s_file_pattern,C:\Users\QUCOON\Documents\Chisom_Personal_Doc\...,16,73,True


Success rate: 147/147 (100.0%)
