# Task 1.1: Verify Downloaded BeatAML Data Files

**Project:** AML Multi-Omics Integration  
**Date:** 2025-10-02  
**Objective:** Verify that all expected BeatAML data files are present, check their integrity, and report file sizes and basic properties.

---

## 1. Setup and Imports

In [1]:
# Standard library
# NOTE(review): `sys` is not referenced in the cells visible here -- confirm before removing.
import os
import sys
from pathlib import Path
from datetime import datetime
import hashlib
# Third-party: pandas builds the results table, IPython's display() renders it.
# NOTE(review): `Markdown` is not referenced in the cells visible here.
import pandas as pd
from IPython.display import display, Markdown

# The project paths defined later are derived from Path.cwd(), so echo the
# working directory to make a wrong launch location easy to spot.
print("Libraries imported successfully")
print(f"Current working directory: {os.getcwd()}")

Libraries imported successfully
Current working directory: D:\Projects\Project_AML\02_Scripts\01_Data_Processing


## 2. Define Expected Files and Functions

In [2]:
# Catalogue of the files this notebook expects to find, keyed by file name.
# Each value is the file's approximate size in megabytes.
EXPECTED_FILES = {
    "beataml_expression.txt": 269,
    "beataml_drug_auc.txt": 19,
    "beataml_clinical.xlsx": 0.5,
    "beataml_mutations.txt": 3.5,
    "beataml_raw_inhibitor.txt": 48,
    "beataml_drug_families.xlsx": 0.1,
}

# All locations hang off the project root, taken to be two levels above the
# current working directory (adjust as needed when running from elsewhere).
project_root = Path.cwd().parent.parent
data_dir = project_root.joinpath("01_Data", "BeatAML_Downloaded_Data")
output_log = project_root.joinpath("06_Documentation", "Data_Analysis_Log.txt")

# Echo the resolved locations, one per line.
print(
    f"Project root: {project_root}",
    f"Data directory: {data_dir}",
    f"Output log: {output_log}",
    sep="\n",
)

Project root: D:\Projects\Project_AML
Data directory: D:\Projects\Project_AML\01_Data\BeatAML_Downloaded_Data
Output log: D:\Projects\Project_AML\06_Documentation\Data_Analysis_Log.txt


In [3]:
def get_file_size_mb(filepath):
    """Return the size of ``filepath`` in megabytes (1 MB = 1024 * 1024 bytes).

    Returns ``None`` when ``os.path.getsize`` raises ``OSError``.
    """
    bytes_per_mb = 1024 * 1024
    try:
        return os.path.getsize(filepath) / bytes_per_mb
    except OSError:
        return None

def check_file_readable(filepath):
    """Report whether ``filepath`` can be opened in binary mode and read.

    Reads at most the first 1024 bytes. Returns ``True`` on success and
    ``False`` if opening or reading raises any ``Exception``.
    """
    try:
        with open(filepath, 'rb') as handle:
            handle.read(1024)
    except Exception:
        # Best-effort probe: any failure simply means "not readable".
        return False
    else:
        return True

def calculate_md5(filepath, chunk_size=8192):
    """Return the hexadecimal MD5 digest of the file at ``filepath``.

    The file is read in pieces of ``chunk_size`` bytes, so it is never held
    in memory all at once. Returns ``None`` if hashing or reading raises any
    ``Exception``.
    """
    try:
        digest = hashlib.md5()
        with open(filepath, 'rb') as stream:
            # iter() with a sentinel keeps calling read() until it returns b"".
            for block in iter(lambda: stream.read(chunk_size), b""):
                digest.update(block)
        return digest.hexdigest()
    except Exception:
        return None

# Printed after the helper definitions above have executed, so the cell shows output.
print("Functions defined successfully")

Functions defined successfully


## 3. Verify Files

In [4]:
# Verify every entry of EXPECTED_FILES inside data_dir: existence, size within
# tolerance, readability and (for smaller files) an MD5 checksum. Prints a
# per-file report and collects one dict per file in `results`.
print("=" * 80)
print("BeatAML Data Files Verification")
print("=" * 80)
print(f"Data Directory: {data_dir}")
print(f"Verification Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("=" * 80)
print()

# One dict per expected file, with the keys: filename, exists,
# expected_size_mb, actual_size_mb, readable, checksum, status.
results = []
# Becomes False as soon as any file is missing, unreadable or off-size.
all_passed = True

for filename, expected_size_mb in EXPECTED_FILES.items():
    filepath = data_dir / filename

    print(f"Checking: {filename}")
    print("-" * 60)

    # Check existence
    exists = filepath.exists()
    print(f"  Exists: {'✓ YES' if exists else '✗ NO'}")

    if not exists:
        results.append({
            'filename': filename,
            'exists': False,
            'expected_size_mb': expected_size_mb,
            'actual_size_mb': None,
            'readable': False,
            'checksum': None,
            'status': 'MISSING'
        })
        all_passed = False
        print("  Status: MISSING")
        print()
        continue

    # Check size. get_file_size_mb() returns None when the size cannot be
    # read (OSError), so guard before formatting or comparing the value.
    actual_size_mb = get_file_size_mb(filepath)
    size_known = actual_size_mb is not None
    print(f"  Expected Size: ~{expected_size_mb} MB")
    if size_known:
        print(f"  Actual Size: {actual_size_mb:.2f} MB")
    else:
        print("  Actual Size: N/A")

    # Size tolerance check (±20%); an unknown size never passes.
    size_ok = (size_known and
               expected_size_mb * 0.8 <= actual_size_mb <= expected_size_mb * 1.2)
    print(f"  Size Check: {'✓ PASS' if size_ok else '⚠ WARNING'}")

    # Check readability
    readable = check_file_readable(filepath)
    print(f"  Readable: {'✓ YES' if readable else '✗ NO'}")

    # Calculate checksum (for smaller files only)
    checksum = None
    if size_known and actual_size_mb < 50:  # Only for files < 50MB
        print("  Calculating MD5 checksum...")
        checksum = calculate_md5(filepath)
        if checksum:
            print(f"  MD5 Checksum: {checksum[:16]}...")
        else:
            print("  MD5 Checksum: Failed")

    # Overall status: readability problems outrank size problems.
    if readable and size_ok:
        status = "OK"
        print("  Status: ✓ OK")
    elif readable:
        status = "WARNING"
        reason = "size mismatch" if size_known else "size unavailable"
        print(f"  Status: ⚠ WARNING ({reason})")
        all_passed = False
    else:
        status = "ERROR"
        print("  Status: ✗ ERROR (not readable)")
        all_passed = False

    results.append({
        'filename': filename,
        'exists': exists,
        'expected_size_mb': expected_size_mb,
        'actual_size_mb': actual_size_mb,
        'readable': readable,
        'checksum': checksum,
        'status': status
    })

    print()

print("File verification complete!")

BeatAML Data Files Verification
Data Directory: D:\Projects\Project_AML\01_Data\BeatAML_Downloaded_Data
Verification Time: 2025-10-26 17:12:29

Checking: beataml_expression.txt
------------------------------------------------------------
  Exists: ✓ YES
  Expected Size: ~269 MB
  Actual Size: 268.30 MB
  Size Check: ✓ PASS
  Readable: ✓ YES
  Status: ✓ OK

Checking: beataml_drug_auc.txt
------------------------------------------------------------
  Exists: ✓ YES
  Expected Size: ~19 MB
  Actual Size: 18.16 MB
  Size Check: ✓ PASS
  Readable: ✓ YES
  Calculating MD5 checksum...
  MD5 Checksum: e17bee701ee42b84...
  Status: ✓ OK

Checking: beataml_clinical.xlsx
------------------------------------------------------------
  Exists: ✓ YES
  Expected Size: ~0.5 MB
  Actual Size: 0.47 MB
  Size Check: ✓ PASS
  Readable: ✓ YES
  Calculating MD5 checksum...
  MD5 Checksum: e66c8e191b7446ab...
  Status: ✓ OK

Checking: beataml_mutations.txt
------------------------------------------------------

## 4. Display Results Summary

In [5]:
# Tabulate the per-file results collected by the verification loop.
results_df = pd.DataFrame(results)

print("=" * 80)
print("VERIFICATION SUMMARY")
print("=" * 80)

# Tally the outcomes; missing files are counted together with errors.
statuses = [entry['status'] for entry in results]
ok_count = statuses.count('OK')
warning_count = statuses.count('WARNING')
error_count = statuses.count('ERROR') + statuses.count('MISSING')

# The trailing empty string yields the blank line that follows the counts.
print(
    f"Total Files Checked: {len(EXPECTED_FILES)}",
    f"  ✓ OK: {ok_count}",
    f"  ⚠ WARNING: {warning_count}",
    f"  ✗ ERROR/MISSING: {error_count}",
    "",
    sep="\n",
)

print(
    "Overall Status: ✓ ALL CHECKS PASSED"
    if all_passed
    else "Overall Status: ⚠ SOME ISSUES DETECTED"
)

print("=" * 80)

# Show the table without the checksum column.
display(results_df.loc[:, ['filename', 'exists', 'expected_size_mb', 'actual_size_mb', 'readable', 'status']])

VERIFICATION SUMMARY
Total Files Checked: 6
  ✓ OK: 5
  ✗ ERROR/MISSING: 0

Overall Status: ⚠ SOME ISSUES DETECTED


Unnamed: 0,filename,exists,expected_size_mb,actual_size_mb,readable,status
0,beataml_expression.txt,True,269.0,268.304043,True,OK
1,beataml_drug_auc.txt,True,19.0,18.164709,True,OK
2,beataml_clinical.xlsx,True,0.5,0.465658,True,OK
3,beataml_mutations.txt,True,3.5,3.499027,True,OK
4,beataml_raw_inhibitor.txt,True,48.0,47.07687,True,OK
5,beataml_drug_families.xlsx,True,0.1,0.056581,True,WARNING


## 5. Save Verification Log

In [6]:
# Write the per-file verification results and the summary counts to
# output_log as plain text.

# Create output directory if needed
output_log.parent.mkdir(parents=True, exist_ok=True)

# Write log file. Mode 'w' replaces any existing file at this path. The
# encoding is set explicitly so the result does not depend on the platform's
# default text encoding.
with open(output_log, 'w', encoding='utf-8') as f:
    f.write("BeatAML Data Files Verification Log\n")
    f.write("=" * 80 + "\n")
    f.write(f"Verification Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
    f.write(f"Data Directory: {data_dir}\n")
    f.write("=" * 80 + "\n\n")

    for result in results:
        f.write(f"File: {result['filename']}\n")
        f.write(f"  Exists: {result['exists']}\n")
        f.write(f"  Expected Size: {result['expected_size_mb']} MB\n")
        # Compare against None rather than relying on truthiness: an empty
        # file has size 0.0, which is falsy but is still a real measurement.
        if result['actual_size_mb'] is not None:
            f.write(f"  Actual Size: {result['actual_size_mb']:.2f} MB\n")
        else:
            f.write("  Actual Size: N/A\n")
        f.write(f"  Readable: {result['readable']}\n")
        # The MD5 line is written only when a checksum was recorded.
        if result.get('checksum'):
            f.write(f"  MD5: {result['checksum']}\n")
        f.write(f"  Status: {result['status']}\n")
        f.write("\n")

    f.write("\nSummary:\n")
    f.write(f"  Total Files: {len(EXPECTED_FILES)}\n")
    f.write(f"  OK: {ok_count}\n")
    f.write(f"  WARNING: {warning_count}\n")
    f.write(f"  ERROR/MISSING: {error_count}\n")
    f.write(f"  Overall: {'PASSED' if all_passed else 'ISSUES DETECTED'}\n")

print(f"\n✓ Log file saved to: {output_log}")
print("\n✓ Verification complete!")


✓ Log file saved to: D:\Projects\Project_AML\06_Documentation\Data_Analysis_Log.txt

✓ Verification complete!


## 6. Export Results to CSV (Optional)

In [7]:
# Persist the verification table as CSV for future reference.
csv_output = project_root.joinpath(
    "03_Results", "02_QC_Reports", "file_verification_results.csv"
)
# Create the destination folder (and any missing parents) before writing.
csv_output.parent.mkdir(parents=True, exist_ok=True)

# index=False leaves the DataFrame's row index out of the file.
results_df.to_csv(csv_output, index=False)
print(f"✓ Results saved to CSV: {csv_output}")

✓ Results saved to CSV: D:\Projects\Project_AML\03_Results\02_QC_Reports\file_verification_results.csv
