# Test Normalization Function
This notebook tests `norm_ip()` for data normalization and imputation.

**Input:** Loads `data_after_qc.pkl` (QC-approved samples)  
**Output:** Saves `data_after_norm.pkl` (normalized data)

In [None]:
# Cell 1: Imports
import sys
# Add the parent directory to the import path before importing `ipms`
# (presumably the package lives one level above this notebook -- confirm).
sys.path.append('..')

from ipms.analysis import load_data, norm_ip
import pandas as pd
import numpy as np

print("✓ All imports successful!")




# Cell 2: Load QC-Approved Data
# Load the dataset saved after QC; the following cells operate on this `data`.
data = load_data('../results/data_after_qc.pkl')

meta = data['metadata']
print(f"\n✓ Loaded {meta['n_proteins']} proteins")
print(f"Samples: {meta['n_samples']}")

# Mention dropped samples only when the entry exists and is non-empty.
dropped = meta.get('samples_dropped')
if dropped:
    print(f"\nNote: {len(dropped)} samples were dropped during QC")


    




# Cell 3: Check Data Before Normalization
# Prints per-condition min / max / median of the intensities currently in
# data['df'] and the total count of missing values across all intensity
# columns, as a baseline to compare with the post-normalization check.
df = data['df']
intensity_cols = data['intensity_cols']

# Flatten the {condition: [columns]} mapping into one list of columns
all_cols = [col for cols in intensity_cols.values() for col in cols]

print("="*60)
print("DATA BEFORE NORMALIZATION")
print("="*60)

print("\nIntensity ranges (raw values):")
for condition, cols in intensity_cols.items():
    values = df[cols].values.flatten()
    values = values[~pd.isna(values)]  # keep observed (non-missing) values only
    print(f"  {condition}:")
    if values.size == 0:
        # min()/max() raise ValueError on a zero-size array, so a condition
        # with no observed values is reported instead of crashing the cell.
        print("    No observed values")
        continue
    print(f"    Min: {values.min():.1f}")
    print(f"    Max: {values.max():.1f}")
    print(f"    Median: {np.median(values):.1f}")

print(f"\nMissing values: {df[all_cols].isna().sum().sum()} total")


# Cell 4: Run Normalization (Default - Recommended)
# Settings: log2 + mindet
# NOTE: `data` is rebound to norm_ip()'s return value, so re-running this cell
# passes the previous result back into norm_ip(). Re-run Cell 2 first to start
# again from the QC-approved data.
norm_settings = {'method': 'log2', 'imputation': 'mindet'}
data = norm_ip(data, **norm_settings)

print(
    "\n✓ Normalization complete!",
    "\nData automatically saved to: results/data_after_norm.pkl",
    sep="\n",
)



# Cell 6: Check Data After Normalization
# Prints the method/imputation recorded in data['normalization'], the
# per-condition min / max / median / std of the values now in data['df'],
# and how many missing values remain across all intensity columns.
df = data['df']
intensity_cols = data['intensity_cols']

# Flatten the {condition: [columns]} mapping into one list of columns
all_cols = [col for cols in intensity_cols.values() for col in cols]

print("="*60)
print("DATA AFTER NORMALIZATION")
print("="*60)

print(f"\nMethod used: {data['normalization']['method']}")
print(f"Imputation used: {data['normalization']['imputation']}")

print("\nIntensity ranges (normalized):")
for condition, cols in intensity_cols.items():
    values = df[cols].values.flatten()
    values = values[~pd.isna(values)]  # ignore anything still missing
    print(f"  {condition}:")
    if values.size == 0:
        # min()/max() raise ValueError on a zero-size array, so a condition
        # with no usable values is reported instead of crashing the cell.
        print("    No observed values")
        continue
    print(f"    Min: {values.min():.2f}")
    print(f"    Max: {values.max():.2f}")
    print(f"    Median: {np.median(values):.2f}")
    print(f"    Std: {np.std(values):.2f}")

missing_after = df[all_cols].isna().sum().sum()
print(f"\nMissing values after imputation: {missing_after}")

if missing_after == 0:
    print("✓ All missing values imputed!")
else:
    print(f"⚠ Still {missing_after} missing values")







# Cell 7: Check Normalization Plots
import os

plot_path = '../results/figures/qc/normalization_comparison.pdf'

# Check whether the comparison PDF exists at the expected path; if it does,
# report its location and size, otherwise flag that it is missing.
if not os.path.exists(plot_path):
    print(f"✗ Plot not found: {plot_path}")
else:
    size_kb = os.path.getsize(plot_path) / 1024
    print("✓ Normalization comparison plot created!")
    print(f"  Location: {plot_path}")
    print(f"  Size: {size_kb:.1f} KB")
    print("\nOpen this plot to see before/after distributions!")








# Cell 8: Summary Statistics
# Recap of dataset size, the normalization settings recorded in `data`,
# and the output locations, followed by a pointer to the next notebook.
banner = "=" * 60
meta = data['metadata']
norm_info = data['normalization']

print(banner)
print("NORMALIZATION SUMMARY")
print(banner)

print(f"\nProteins: {meta['n_proteins']}")
print(f"Samples: {meta['n_samples']}")
print(f"Conditions: {meta['conditions']}")

print("\nNormalization applied:")
print(f"  Method: {norm_info['method']}")
print(f"  Imputation: {norm_info['imputation']}")

print("\nData saved to: results/data_after_norm.pkl")
print("Plots saved to: results/figures/qc/normalization_comparison.pdf")

print("\n" + banner)
print("NEXT STEP: 04_test_stat.ipynb for statistical analysis")
print(banner)

In [None]:
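#### Alternative Normalization Methods (Optional)
#### Uncomment ONE of these to try a different method.
#### NOTE: `data` was already replaced by the normalized result in Cell 4, so
#### re-run Cell 2 first to reload the QC-approved data before trying an alternative.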

#### Z-score normalization with KNN imputation
# data = norm_ip(data, method='zscore', imputation='knn')

#### Quantile normalization with median imputation
# data = norm_ip(data, method='quantile', imputation='median')

#### Median normalization with zero imputation
# data = norm_ip(data, method='median', imputation='zero')



## About the Methods

### Normalization Methods:
- **log2** (default): Log2 transformation - standard for proteomics, reduces dynamic range
- **zscore**: Z-score normalization - mean=0, std=1 per sample
- **quantile**: Quantile normalization - makes distributions identical
- **median**: Median normalization - centers samples on global median

### Imputation Methods:
- **mindet** (default): Minimum detection - imputes with (min - 1.8×std) per sample. Standard for proteomics (assumes missing = low abundance)
- **zero**: Replace with 0 - conservative, treats missing as absent
- **median**: Replace with sample median - middle ground
- **knn**: K-nearest neighbors - uses similar proteins to estimate values

### Recommendation:
**For IP-MS: Use `method='log2', imputation='mindet'`** (the default)

This is the standard in the field and assumes that missing values represent low-abundance proteins below the detection limit.

---

## Next Steps

After normalization:
1. ✓ Review the normalization_comparison.pdf plot
2. ✓ Check that distributions look reasonable
3. ✓ Proceed to **04_test_stat.ipynb** for statistical analysis