In [33]:
import os
import re
import shutil
from collections import defaultdict
from datetime import datetime

# Schema of the recognised header fields. Each key maps to a small spec dict:
#   'type'      -> 'string' or 'int' (how the raw header value is stored)
#   'mandatory' -> whether validation should complain when the field is blank
# The table below is listed in the order the keys must appear in the dict.
metadata_structure = {
    field_name: {'type': field_type, 'mandatory': is_mandatory}
    for field_name, field_type, is_mandatory in (
        # Computing centre
        ('cc_main_name', 'string', True),
        ('cc_main_full', 'string', False),
        ('cc_alt_name', 'string', False),
        ('cc_alt_full', 'string', False),
        # Supercomputer
        ('sc_main_name', 'string', True),
        ('sc_alt_name', 'string', False),
        # Execution date and project
        ('exec_date', 'string', False),
        ('proj_main_name', 'string', False),
        ('proj_alt_name', 'string', False),
        # Principal investigator
        ('pi_name', 'string', True),
        ('pi_email', 'string', False),
        # Simulation set-up
        ('sim_bibcode', 'string', False),
        ('sim_queue', 'string', True),
        ('sim_nnodes', 'int', True),
        ('sim_nmpi', 'int', True),
        ('sim_ncpu', 'int', True),
        ('sim_ngpu', 'int', False),
        ('sim_nthreads_total', 'int', True),
        ('sim_nvector', 'int', False),
        ('sim_cpu_compiler', 'string', True),
        ('sim_accel_compiler', 'string', False),
        ('sim_modules', 'string', False),
        # File bookkeeping
        ('ori_file_name', 'string', True),
        ('alt_file_name', 'string', False),
        ('db_file_name', 'string', False),
        ('db_path', 'string', False),
    )
}

def extract_metadata(file_path):
    """Parse the ``key = value`` header of a job script into a metadata dict.

    The header is the text between the ``#HEADER`` and ``#SCRIPT`` banner
    lines. Only keys listed in ``metadata_structure`` are kept.

    Args:
        file_path: Path of the script to read.

    Returns:
        A dict with one entry per key of ``metadata_structure``. Fields not
        found in the header keep their blank default: ``''`` for string
        fields and ``0`` for int fields.
    """
    # Pre-populate every known field so callers can index without checking.
    metadata = {}
    for field, spec in metadata_structure.items():
        metadata[field] = '' if spec['type'] == 'string' else 0

    with open(file_path, 'r') as handle:
        text = handle.read()

    header_match = re.search(r'########\s*#HEADER\s*########(.*?)#######\s*#SCRIPT\s*#######', text, re.DOTALL)
    if not header_match:
        # No recognisable header: return the blank defaults.
        return metadata

    for entry in header_match.group(1).strip().split('\n'):
        if '=' not in entry:
            continue
        # Split on the first '=' only, so values may themselves contain '='.
        key, _, raw_value = entry.partition('=')
        key = key.strip()
        raw_value = raw_value.strip()
        if key not in metadata_structure:
            continue
        if metadata_structure[key]['type'] != 'int':
            metadata[key] = raw_value
            continue
        try:
            metadata[key] = int(raw_value)
        except ValueError:
            # Keep the default 0 and tell the user which field was unreadable.
            print(f"Warning: Could not convert {key} to int in file {file_path}")
    return metadata

def validate_metadata(metadata, file_name):
    """Report mandatory metadata fields that were left blank.

    Args:
        metadata: Mapping of field name to value, as built by
            ``extract_metadata``.
        file_name: Name of the originating file; only used in the messages.

    Returns:
        A list of human-readable error strings, one per missing mandatory
        field. The list is empty when every mandatory field is filled in.
    """
    errors = []
    for key, spec in metadata_structure.items():
        if not spec['mandatory']:
            continue
        # .get() so that a key absent from `metadata` counts as missing
        # instead of raising KeyError.
        value = metadata.get(key)
        # extract_metadata leaves absent fields at '' (string) or 0 (int), so
        # a mandatory int that is still 0 was never set and must be reported
        # too; comparing against '' alone never catches int fields.
        is_blank = value is None or value == ''
        is_unset_int = spec['type'] == 'int' and value == 0
        if is_blank or is_unset_int:
            errors.append(f"Mandatory field '{key}' is missing in file '{file_name}'")
    return errors

def process_files(directory):
    """Collect header metadata from every ``.sh`` / ``.lsf`` file in a directory.

    Validation problems are printed but do not exclude a file from the result.

    Args:
        directory: Directory whose entries are scanned (not recursive).

    Returns:
        A ``defaultdict(list)`` holding, for each metadata key, the list of
        values across all processed files, plus a ``'files'`` entry mapping
        each file name to its full metadata dict.
    """
    per_file = {}
    columns = defaultdict(list)

    # str.endswith accepts a tuple of suffixes, so one call covers both.
    script_names = [name for name in os.listdir(directory) if name.endswith(('.sh', '.lsf'))]

    for filename in script_names:
        record = extract_metadata(os.path.join(directory, filename))

        problems = validate_metadata(record, filename)
        if problems:
            print(f"Errors in file '{filename}':")
            for error in problems:
                print(f"  - {error}")

        per_file[filename] = record
        # Column-wise view: one list of values per metadata key.
        for field, field_value in record.items():
            columns[field].append(field_value)

    columns['files'] = per_file
    return columns

def _safe_path_component(value, fallback='unknown'):
    """Turn a metadata value into a single, safe directory name."""
    text = '' if value is None else str(value).strip()
    # A path separator inside a value would add directory levels, and a
    # leading one would make os.path.join discard everything before it.
    for separator in ('/', '\\'):
        text = text.replace(separator, '_')
    # An empty component is dropped by os.path.join; '.' and '..' would point
    # at the current or parent directory instead of creating a new level.
    if text in ('', '.', '..'):
        return fallback
    return text


def organize_files(database, source_dir, target_dir):
    """Copy each processed file into a metadata-derived directory tree.

    Files are placed under
    ``target_dir/<cc_main_name>/<sc_main_name>/<year>/<sim_cpu_compiler>/``,
    where ``<year>`` comes from ``exec_date`` (``DD-MM-YYYY``). Any component
    that is blank or unusable becomes ``'unknown'``, and path separators
    inside a value are replaced by ``'_'``, so every file lands exactly four
    levels below ``target_dir``.

    Args:
        database: Mapping with a ``'files'`` entry of file name -> metadata
            dict, as returned by ``process_files``.
        source_dir: Directory the files are copied from.
        target_dir: Root of the directory tree to create.
    """
    for filename, metadata in database['files'].items():
        # Extract relevant metadata
        cc_name = _safe_path_component(metadata.get('cc_main_name'))
        computer_name = _safe_path_component(metadata.get('sc_main_name'))
        compiler = _safe_path_component(metadata.get('sim_cpu_compiler'))
        exec_date = metadata.get('exec_date', '')

        # Determine execution year; TypeError covers a non-string exec_date.
        try:
            exec_year = datetime.strptime(exec_date, '%d-%m-%Y').year
        except (ValueError, TypeError):
            print(f"Warning: Invalid execution date format for file {filename}. Using 'unknown' as year.")
            exec_year = 'unknown'

        # Create directory structure
        dir_path = os.path.join(target_dir, cc_name, computer_name, str(exec_year), compiler)
        os.makedirs(dir_path, exist_ok=True)

        # Copy file to new location (copy2 also preserves file metadata)
        source_file = os.path.join(source_dir, filename)
        target_file = os.path.join(dir_path, filename)
        shutil.copy2(source_file, target_file)
        print(f"Copied {filename} to {dir_path}")

# Main execution: scan the incoming directory, print a summary of the
# extracted metadata, then copy the scripts into the static directory tree.
if __name__ == "__main__":
    incoming_directory = "../../INCOMING"  # Adjust this path if needed
    database = process_files(incoming_directory)

    print("\nComputing centers represented in the sample:")
    print(database['cc_main_name'])

    print("\nAll supercomputer names:")
    print(database['sc_main_name'])

    print("\nAll metadata:")
    for key in metadata_structure:
        print(f"{key}: {database[key]}")

    print("\nNumber of files processed:", len(database['files']))

    if database['files']:
        print("\nExample of complete metadata for one file:")
        example_file = next(iter(database['files']))
        print(f"File: {example_file}")
        for key, value in database['files'][example_file].items():
            print(f"  {key}: {value}")

        # Organize files into static directory structure
        target_directory = "../../STATIC_DB"  # Adjust this path as needed
        organize_files(database, incoming_directory, target_directory)

        print("\nFiles have been organized into the static directory structure.")
    else:
        # With no files, next(iter(...)) would raise StopIteration and there
        # is nothing to copy, so say so instead of reporting success.
        print(f"\nNo .sh or .lsf files found in '{incoming_directory}'; nothing to organize.")


Computing centers represented in the sample:
['LUNARC', 'LUMI', 'Durham University', 'OLCF', 'BSC']

All supercomputer names:
['COSMOS', 'LUMI', 'COSMA8', 'Summit', 'Mare Nostrum']

All metadata:
cc_main_name: ['LUNARC', 'LUMI', 'Durham University', 'OLCF', 'BSC']
cc_main_full: ['The Centre for Scientific and Technical Computing at Lund University', 'LUMI', 'Durham University', 'Oak Ridge Leadership Computing Facility', 'Barcelona Supercomputing Center']
cc_alt_name: ['', '', '', '', '']
cc_alt_full: ['', '', '', '', '']
sc_main_name: ['COSMOS', 'LUMI', 'COSMA8', 'Summit', 'Mare Nostrum']
sc_alt_name: ['', '', '', '', '']
exec_date: ['14-10-2024', '14-10-2024', '14-10-2024', '21-01-2021', '14-10-2024']
proj_main_name: ['Vintergatan 2', 'Vintergatan 2', 'MEGATRON', 'Cosmic Dawn III', 'Vintergatan 2']
proj_alt_name: ['VG2', 'VG2', '', 'CoDaIII', 'VG2']
pi_name: ['Corentin Cadiou', 'Corentin Cadiou', 'Corentin Cadiou', 'Pierre OCVIRK', 'Corentin Cadiou']
pi_email: ['corentin.cadiou@iap.f