# Task 1.2: Inspect BeatAML Data Files in Detail

**Project:** AML Multi-Omics Integration  
**Date:** 2025-10-02  
**Objective:** Comprehensive inspection of each downloaded BeatAML file

This notebook will analyze:
- File format and structure
- Dimensions (rows × columns)
- Column names and data types
- First 10 rows preview
- Basic statistics
- Missing data analysis

---

## 1. Setup and Imports

In [1]:
import os
import sys
from pathlib import Path
from datetime import datetime
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from IPython.display import display, Markdown, HTML

# Report the library versions in use so the run is reproducible.
print(
    "‚úì Libraries imported successfully",
    f"Pandas version: {pd.__version__}",
    f"NumPy version: {np.__version__}",
    sep="\n",
)

‚úì Libraries imported successfully
Pandas version: 2.3.2
NumPy version: 2.3.1


In [2]:
# Resolve project directories relative to the current working directory:
# the project root is two levels above wherever the kernel was started.
project_root = Path.cwd().parent.parent
data_dir = project_root.joinpath("01_Data", "BeatAML_Downloaded_Data")
output_dir = project_root.joinpath("03_Results", "02_QC_Reports")

# Create the report directory (and any missing parents) if it is absent.
output_dir.mkdir(parents=True, exist_ok=True)

for label, location in (
    ("Project root", project_root),
    ("Data directory", data_dir),
    ("Output directory", output_dir),
):
    print(f"{label}: {location}")

Project root: D:\Projects\Project_AML
Data directory: D:\Projects\Project_AML\01_Data\BeatAML_Downloaded_Data
Output directory: D:\Projects\Project_AML\03_Results\02_QC_Reports


## 2. Define Helper Functions

In [3]:
def format_bytes(size):
    """Render a byte count as a two-decimal string with a unit suffix.

    The value is divided by 1024 until it drops below 1024 or the units
    B, KB, MB, GB are exhausted; anything larger is reported in TB.
    """
    units = ('B', 'KB', 'MB', 'GB')
    idx = 0
    while idx < len(units):
        if size < 1024.0:
            return f"{size:.2f} {units[idx]}"
        size = size / 1024.0
        idx += 1
    # Fell through every smaller unit: report whatever remains as terabytes.
    return f"{size:.2f} TB"

def get_basic_stats(df, col):
    """Get basic statistics for a column (best effort).

    Args:
        df: Object indexable by ``col`` that yields a pandas Series.
        col: Column label to summarise.

    Returns:
        dict: For numeric columns the keys ``min``, ``max``, ``mean`` and
        ``median`` (all four or none). For other columns the key ``unique``
        and, when the column has at least one counted value, ``top_value``
        and ``top_count``. If the statistics cannot be computed (including a
        missing column), whatever was gathered so far is returned, possibly
        an empty dict.
    """
    stats = {}
    try:
        series = df[col]
        if pd.api.types.is_numeric_dtype(series):
            # Build the numeric stats in one expression so that a failure in
            # any of them leaves `stats` without a partial numeric entry;
            # callers index 'max'/'mean' whenever 'min' is present.
            numeric_stats = {
                'min': series.min(),
                'max': series.max(),
                'mean': series.mean(),
                'median': series.median(),
            }
            stats.update(numeric_stats)
        else:
            stats['unique'] = series.nunique()
            # value_counts() sorts by frequency, so the first entry is the mode.
            counts = series.value_counts()
            if len(counts) > 0:
                stats['top_value'] = counts.index[0]
                stats['top_count'] = counts.values[0]
    except Exception:
        # Deliberate best effort: a column that cannot be summarised simply
        # yields fewer stats. Only Exception is caught, so KeyboardInterrupt
        # and SystemExit still propagate and the user can stop a long run.
        pass
    return stats

# Emit a confirmation so running this definition-only cell produces visible output.
print("‚úì Helper functions defined")

‚úì Helper functions defined


## 3. File Inspection Function

In [None]:
def inspect_file(filepath, filename):
    """Inspect a single data file and display results.

    The file is loaded into a DataFrame and the following are shown: file
    size, detected format, dimensions, a per-column table (dtype, missing
    count, summary info), the first ten rows, the overall missing-data rate,
    and a short list of data-quality findings (duplicate rows, columns that
    are entirely missing, columns with more than 50% missing values).

    Args:
        filepath: Path of the file to read.
        filename: File name; used for the heading and to choose the reader.
            Names ending in ``.xlsx`` are read with ``pd.read_excel``; names
            ending in ``.txt`` are read with ``pd.read_csv`` using a tab
            delimiter if the first line contains a tab, otherwise a comma.

    Returns:
        The loaded DataFrame, or ``None`` if the extension is not recognised
        or reading the file raised an exception (the error is printed).
    """
    display(Markdown(f"\n---\n## üìä {filename}\n---"))

    # File size
    file_size = os.path.getsize(filepath)
    print(f"üìÅ File Size: {format_bytes(file_size)}")

    # Determine file type and read
    try:
        if filename.endswith('.xlsx'):
            print("üìã File Format: Excel (.xlsx)")
            df = pd.read_excel(filepath)
        elif filename.endswith('.txt'):
            # Sniff the delimiter from the header line only. Decode as UTF-8
            # (the default used by read_csv below) instead of the platform
            # locale, and replace undecodable bytes so that sniffing cannot
            # fail on its own; only the presence of a tab matters here.
            with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
                first_line = f.readline()
            delimiter = '\t' if '\t' in first_line else ','

            format_name = "Tab-delimited" if delimiter == '\t' else "Comma-separated"
            print(f"üìã File Format: {format_name} text")
            df = pd.read_csv(filepath, sep=delimiter, low_memory=False)
        else:
            print("‚ùå Unknown file format")
            return None

    except Exception as e:
        print(f"‚ùå ERROR reading file: {e}")
        return None

    # Dimensions
    n_rows, n_cols = df.shape
    print(f"üìê Dimensions: {n_rows:,} rows √ó {n_cols:,} columns")
    print()

    # Count missing values once per column and reuse the counts everywhere
    # below instead of rescanning each column for every check.
    missing_counts = df.isna().sum()
    column_missing = [
        (col, int(count))
        for col, count in zip(df.columns, missing_counts.to_numpy())
    ]
    column_dtypes = list(df.dtypes)

    # Column information table
    display(Markdown("### Column Information"))

    col_info = []
    for (col, n_missing), col_dtype in zip(column_missing, column_dtypes):
        # Guard against empty frames so the percentage is 0 rather than nan.
        pct_missing = (n_missing / n_rows) * 100 if n_rows > 0 else 0

        stats = get_basic_stats(df, col)

        # Require every key that is formatted below, so an incomplete stats
        # dict falls through to "-" instead of raising KeyError.
        if all(key in stats for key in ('min', 'max', 'mean')):
            info = f"Range: [{stats['min']:.2f}, {stats['max']:.2f}], Mean: {stats['mean']:.2f}"
        elif 'unique' in stats:
            info = f"{stats['unique']:,} unique values"
        else:
            info = "-"

        # Column labels are not guaranteed to be strings (e.g. numeric or
        # date headers in a spreadsheet), so stringify before truncating.
        label = str(col)
        col_info.append({
            'Column': label[:40] + '...' if len(label) > 40 else label,
            'Type': str(col_dtype),
            'Missing': f"{n_missing:,} ({pct_missing:.1f}%)",
            'Info': info
        })

    col_df = pd.DataFrame(col_info)
    display(col_df)

    # First 10 rows preview
    display(Markdown("### First 10 Rows Preview"))
    display(df.head(10))

    # Overall missing data
    total_cells = n_rows * n_cols
    total_missing = sum(n_missing for _, n_missing in column_missing)
    pct_missing = (total_missing / total_cells) * 100 if total_cells > 0 else 0

    print(f"\nüìä Overall Missing Data: {total_missing:,} / {total_cells:,} ({pct_missing:.2f}%)")

    # Data quality issues
    display(Markdown("### Data Quality Check"))

    quality_issues = []

    # Check for duplicate rows
    n_duplicates = df.duplicated().sum()
    if n_duplicates > 0:
        quality_issues.append(f"‚ö†Ô∏è {n_duplicates:,} duplicate rows found")
    else:
        quality_issues.append("‚úÖ No duplicate rows")

    # Check for columns with all missing data (every row is NA)
    all_missing_cols = [col for col, n_missing in column_missing if n_missing == n_rows]
    if all_missing_cols:
        quality_issues.append(f"‚ö†Ô∏è {len(all_missing_cols)} columns with all missing data")
    else:
        quality_issues.append("‚úÖ No columns with all missing data")

    # Check for columns with >50% missing
    high_missing_cols = [
        (col, n_missing)
        for col, n_missing in column_missing
        if n_rows > 0 and (n_missing / n_rows) > 0.5
    ]
    if high_missing_cols:
        quality_issues.append(f"‚ö†Ô∏è {len(high_missing_cols)} columns with >50% missing data")
        for col, n_missing in high_missing_cols[:5]:  # Show first 5
            pct = (n_missing / n_rows) * 100
            quality_issues.append(f"  ‚Ä¢ {col} ({pct:.1f}% missing)")
        if len(high_missing_cols) > 5:
            quality_issues.append(f"  ‚Ä¢ ... and {len(high_missing_cols)-5} more")
    else:
        quality_issues.append("‚úÖ No columns with >50% missing data")

    for issue in quality_issues:
        print(issue)

    return df

# Emit a confirmation so running this definition-only cell produces visible output.
print("‚úì Inspection function defined")

## 4. Inspect All Files

In [None]:
# BeatAML files expected inside data_dir, in the order they are inspected.
files_to_inspect = [
    'beataml_expression.txt',
    'beataml_drug_auc.txt',
    'beataml_clinical.xlsx',
    'beataml_mutations.txt',
    'beataml_raw_inhibitor.txt',
    'beataml_drug_families.xlsx'
]

# Report header: title, timestamp of this run, and number of files.
for header_line in (
    "# BeatAML Data Files - Detailed Inspection",
    f"**Inspection Time:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    f"**Total Files:** {len(files_to_inspect)}",
):
    display(Markdown(header_line))

# Successfully loaded DataFrames, keyed by a shortened dataset name.
data_dict = {}

for filename in files_to_inspect:
    filepath = data_dir / filename

    if not filepath.exists():
        display(Markdown(f"\n---\n## ‚ùå {filename}\n**Status:** File not found\n---"))
        continue

    df = inspect_file(filepath, filename)
    if df is None:
        # inspect_file already reported why the file could not be loaded.
        continue

    # Shorten the name: drop the common prefix and the file extensions.
    key = filename
    for token in ('beataml_', '.txt', '.xlsx'):
        key = key.replace(token, '')
    data_dict[key] = df

print(f"\n‚úÖ Inspection complete! Loaded {len(data_dict)} datasets.")

## 5. Summary Statistics Across All Files

In [None]:
display(Markdown("## üìà Summary Across All Files"))

summary_data = []

for key, df in data_dict.items():
    n_rows, n_cols = df.shape
    total_cells = n_rows * n_cols
    total_missing = df.isna().sum().sum()
    pct_missing = (total_missing / total_cells) * 100 if total_cells > 0 else 0
    
    summary_data.append({
        'Dataset': key,
        'Rows': f"{n_rows:,}",
        'Columns': n_cols,
        'Total Cells': f"{total_cells:,}",
        'Missing (%)': f"{pct_missing:.2f}%"
    })

summary_df = pd.DataFrame(summary_data)
display(summary_df)

print(f"\n‚úÖ Total datasets analyzed: {len(data_dict)}")

## 6. Save Summary Report

In [None]:
# Only the summary table is written here, as CSV without the index column.
# The detailed text report is generated separately by the Python script.
summary_csv = output_dir.joinpath("data_files_summary.csv")
summary_df.to_csv(summary_csv, index=False)

print(
    f"‚úÖ Summary saved to: {summary_csv}",
    "\nüìù For detailed inspection report, run: python 02_inspect_data.py",
    sep="\n",
)

## 7. Data Access

All datasets are now loaded and available in the `data_dict` dictionary:

- `data_dict['expression']` - Gene expression data
- `data_dict['drug_auc']` - Drug response AUC values
- `data_dict['clinical']` - Clinical annotations
- `data_dict['mutations']` - Mutation calls
- `data_dict['raw_inhibitor']` - Raw drug response data
- `data_dict['drug_families']` - Drug family information

In [None]:
# Example: list every loaded dataset with its (rows, columns) shape
print("Available datasets:")
for name, frame in data_dict.items():
    print(f"  - {name}: {frame.shape}")