<!-- WARNING: THIS FILE WAS AUTOGENERATED! DO NOT EDIT! -->

In [3]:
import os
from dotenv import load_dotenv
load_dotenv()

def check_reproduceworkdir():
    """
    Check if REPRODUCEWORKDIR is defined and if not prompts the user to do so.
    Check if the following files exist: ${REPRODUCEWORKDIR}/config.toml and the required fieids: ['repro', 'repro.input','project']
    """
    # Check if REPRODUCEWORKDIR is defined
    if 'REPRODUCEWORKDIR' not in os.environ:
        print("REPRODUCEWORKDIR environment variable is not defined.")
        print("Please define it using the following command:")
        print("export REPRODUCEWORKDIR=/path/to/reproduceworkdir")
        return False
    
    # Check if the directory exists
    if not os.path.isdir(os.environ['REPRODUCEWORKDIR']):
        print(f"{os.environ['REPRODUCEWORKDIR']} does not exist.")
        return False
    
    # Check if the config.toml file exists
    config_file = os.path.join(os.environ['REPRODUCEWORKDIR'], 'config.toml')
    if not os.path.isfile(config_file):
        print(f"{config_file} does not exist.")
        return False
    
    # Check if the required fields exist in the config.toml file
    with open(config_file, 'r') as f:
        config = f.read()
    if 'repro' not in config:
        print("repro field is missing in the config.toml file.")
        return False
    if 'repro.input' not in config:
        print("repro.input field is missing in the config.toml file.")
        return False
    if 'project' not in config:
        print("project field is missing in the config.toml file.")
        return False
    
    return True

In [6]:
import yaml
import hashlib
import os
import toml

def verify_metadata():
    # Check if reproducibility.yml exists
    if not os.path.exists('reproducibility.yml'):
        return False, "reproducibility.yml not found."

    with open('reproducibility.yml', 'r') as f:
        metadata = yaml.safe_load(f)
    
    # Check software version (mocked for demonstration)
    software_name = metadata['software'][0]['name']
    
    # Check dataset checksum
    dataset_name = metadata['datasets'][0]['name']
    expected_checksum = metadata['datasets'][0]['checksum']
    
    if not os.path.exists(dataset_name):
        return False, f"Dataset {dataset_name} not found."
    
    with open(dataset_name, 'rb') as f:
        bytes = f.read()
        readable_hash = hashlib.sha256(bytes).hexdigest()
        
    if f"sha256:{readable_hash}" != expected_checksum:
        return False, "Dataset checksum mismatch."
    
    # Check presence of instructions
    instructions_path = metadata['instructions']['path']
    if not os.path.exists(instructions_path):
        return False, "Instructions for reproduction not found."
    
    # Check for report file
    report_file = metadata.get('report_file')
    if report_file and not os.path.exists(report_file):
        return False, f"Report file {report_file} not found."
    
    # Check for repro.input field in config.toml
    config_file = os.path.join(os.environ['REPRODUCEWORKDIR'], 'config.toml')
    if not os.path.isfile(config_file):
        return False, f"{config_file} does not exist."
    
    with open(config_file, 'r') as f:
        config = toml.load(f)
    
    if 'repro.input' not in config:
        return False, "repro.input field is missing in the config.toml file."
    
    # Check that all files in repro.input exist and are published in pubdata.toml
    pubdata_file = os.path.join(os.environ['REPRODUCEWORKDIR'], 'pubdata.toml')
    if not os.path.isfile(pubdata_file):
        return False, f"{pubdata_file} does not exist."
    
    with open(pubdata_file, 'r') as f:
        pubdata = toml.load(f)
    
    for file in config['repro.input']:
        if not os.path.exists(file):
            return False, f"{file} not found in repro.input."
        
        if file not in pubdata:
            return False, f"{file} not found in pubdata.toml."
        
        file_metadata = pubdata[file]
        if 'hash' not in file_metadata or 'timed_hash' not in file_metadata:
            return False, f"{file} missing hash or timed_hash in pubdata.toml."
        
        generating_script = file_metadata.get('generating_script')
        if generating_script and not os.path.exists(generating_script):
            return False, f"Generating script {generating_script} not found for {file}."
    
    return True, "Verification successful."

ModuleNotFoundError: No module named 'yaml'

In [7]:
# Mocking the presence of metadata and dataset for testing
with open('reproducibility.yml', 'w') as f:
    yaml.dump({
        'software': [{'name': 'Python', 'version': '3.8.5'}],
        'datasets': [{'name': 'sample_data.csv', 'checksum': 'sha256:abcd1234'}],
        'execution_environment': {'docker_image': 'repo_name:latest'},
        'instructions': {'path': 'REPRODUCE.md'}
    }, f)

# Creating a mock dataset and instructions file
with open('sample_data.csv', 'w') as f:
    f.write('sample,data')
with open('REPRODUCE.md', 'w') as f:
    f.write('# Reproduction Instructions\nFollow these steps...')

# Testing the verification script
verify_metadata()


def docker_verification():
    """Check for the presence of Dockerfile or docker-compose.yml."""
    if os.path.exists('Dockerfile') or os.path.exists('docker-compose.yml'):
        return True
    return False

def further_enhanced_verify_metadata():
    # Basic metadata checks
    success, message = verify_metadata()
    if not success:
        return False, message
    
    # Docker verification
    if not docker_verification():
        return False, "Neither Dockerfile nor docker-compose.yml found. Execution environment cannot be verified."
    
    return True, "Verification successful."

# Mocking the presence of Dockerfile for testing (you can also mock docker-compose.yml similarly)
#with open('Dockerfile', 'w') as f:
#    f.write('# Sample Dockerfile\nFROM python:3.8.5')

# Testing the further enhanced verification function
further_enhanced_verify_metadata()

NameError: name 'yaml' is not defined

In [None]:
import re
import toml

def parse_insert_patterns(report_file):
    """Parse the report.reproduce file to identify all instances of the \INSERT{*} pattern."""
    
    with open(report_file, 'r') as f:
        content = f.read()
    
    # Find all \INSERT patterns
    patterns = re.findall(r'\\INSERT\{(.*?)\}', content)
    
    return patterns

# Mocking the report.reproduce file for testing
with open('report.reproduce', 'w') as f:
    f.write("This is a sample report with some variables like \\INSERT{var1} and \\INSERT{var2}.")

# Testing the parsing function
insert_patterns = parse_insert_patterns('report.reproduce')
insert_patterns

In [None]:
def verify_variables_against_pubdata(variables, pubdata_file):
    """Verify that each variable from the INSERT patterns is present in the pubdata.toml file."""
    
    # Load the pubdata.toml file
    pubdata = toml.load(pubdata_file)
    
    missing_variables = [var for var in variables if var not in pubdata]
    
    return len(missing_variables) == 0, missing_variables

In [None]:
# Testing the verification function
success, missing_vars = verify_variables_against_pubdata(insert_patterns, 'pubdata.toml')
success, missing_vars

In [None]:
def replace_patterns_with_values(report_file, pubdata_file, output_file):
    """Replace the INSERT patterns in report.reproduce with corresponding values from pubdata.toml."""
    
    # Load the pubdata.toml file
    pubdata = toml.load(pubdata_file)
    
    with open(report_file, 'r') as f:
        content = f.read()
    
    # Replace each pattern with its value from pubdata
    for var, value in pubdata.items():
        content = content.replace(f'\\INSERT{{{var}}}', str(value))
    
    # Save the replaced content to output_file
    with open(output_file, 'w') as f:
        f.write(content)

In [None]:
# Testing the replacement function
replace_patterns_with_values('report.reproduce', 'pubdata.toml', 'report.tmp')

# Displaying the report.tmp for verification
with open('report.tmp', 'r') as f:
    replaced_content = f.read()

replaced_content

In [None]:
def compare_hashes(file1, file2):
    """Compare the SHA-256 hashes of two files."""
    
    # Calculate hash for file1
    with open(file1, 'rb') as f:
        bytes = f.read()
        hash_file1 = hashlib.sha256(bytes).hexdigest()
    
    # Calculate hash for file2
    with open(file2, 'rb') as f:
        bytes = f.read()
        hash_file2 = hashlib.sha256(bytes).hexdigest()
    
    return hash_file1 == hash_file2

# Mocking the report.auto and original report_file for testing
with open('report.auto', 'w') as f:
    f.write(replaced_content)
with open('original_report_file', 'w') as f:
    f.write(replaced_content)

# Testing the hash comparison function
hashes_match = compare_hashes('report.auto', 'original_report_file')
hashes_match