# Exploratory Data Analysis

This notebook helps you understand the dataset and the features generated by the pipeline.

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import yaml

# Location of the pipeline's processed outputs. This is a relative path, so it
# is resolved against the kernel's current working directory.
PROCESSED_DIR = Path('../data/processed')

# Apply seaborn's "whitegrid" style to the figures drawn in later cells.
sns.set_style(style="whitegrid")

### Load Processed Data

First, make sure you have run the training pipeline (`python main.py --mode train`) so that `features.csv`, `element_labels.csv`, and `molecule_labels.csv` exist in `../data/processed`.

In [None]:
# Read the three processed CSV tables from PROCESSED_DIR into DataFrames.
features_df = pd.read_csv(PROCESSED_DIR.joinpath('features.csv'))
element_labels_df = pd.read_csv(PROCESSED_DIR.joinpath('element_labels.csv'))
molecule_labels_df = pd.read_csv(PROCESSED_DIR.joinpath('molecule_labels.csv'))

# Print the (rows, columns) shape of each table, one line per table.
for caption, frame in (
    ("Features shape:", features_df),
    ("Element labels shape:", element_labels_df),
    ("Molecule labels shape:", molecule_labels_df),
):
    print(caption, frame.shape)

### Inspect the Data

In [None]:
# Show the first rows of each table, each preceded by its caption.
# `display` is the rich-output helper that IPython/Jupyter makes available in
# notebook cells without an import.
for heading, table in (
    ("Features Head:", features_df),
    ("\nElement Labels Head:", element_labels_df),
    ("\nMolecule Labels Head:", molecule_labels_df),
):
    print(heading)
    display(table.head())

### Visualize a Binned Spectrum

In [None]:
# Plot the binned representation of the first spectrum in features_df.

# Select the binned-intensity columns (those whose name starts with 'bin_') and
# take the first row as the example spectrum.
binned_cols = [col for col in features_df.columns if col.startswith('bin_')]
first_spectrum = features_df[binned_cols].iloc[0]

# Rebuild the m/z axis from the feature-engineering settings in the config file.
# The encoding is given explicitly so the read does not depend on the platform's
# default locale encoding.
with open('../config/config.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)
mz_bins = config['feature_engineering']['mz_bins']
mz_range = config['feature_engineering']['mz_range']

# mz_bins equal-width bins over [mz_range[0], mz_range[1]] have mz_bins + 1
# edges; each bin is plotted at its midpoint.
# NOTE(review): assumes the pipeline binned spectra on this same evenly spaced
# grid and wrote the 'bin_' columns in ascending m/z order — confirm against the
# feature-engineering code.
bin_edges = np.linspace(mz_range[0], mz_range[1], mz_bins + 1)
bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2

# Fail with a readable message when the config and the feature table disagree,
# rather than letting the plot call raise an opaque shape-mismatch error.
if len(bin_centers) != len(first_spectrum):
    raise ValueError(
        f"Config defines {len(bin_centers)} m/z bins but the features table has "
        f"{len(first_spectrum)} 'bin_' columns; they must match to plot the spectrum."
    )

fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(bin_centers, first_spectrum)
ax.set_title('Binned Representation of a Mass Spectrum')
ax.set_xlabel('m/z')
ax.set_ylabel('Normalized Intensity')
plt.show()

### Analyze Label Distribution

In [None]:
# Element distribution: column-wise totals of the element label table (with
# 'spectrum_id' excluded), ordered from largest to smallest.
# NOTE(review): the y-axis label reads these totals as "number of spectra",
# which presumably relies on each column being a 0/1 indicator — confirm.
element_totals = element_labels_df.drop(columns=['spectrum_id']).sum()
element_counts = element_totals.sort_values(ascending=False)

# One bar per element column, drawn on an explicitly created Axes.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=element_counts.index, y=element_counts.values, ax=ax)
ax.set_title('Frequency of Elements in Spectra')
ax.set_ylabel('Number of Spectra with Element')
plt.show()

In [None]:
# Molecule (peptide) distribution: column-wise totals of the molecule label
# table (with 'spectrum_id' excluded), ordered from largest to smallest.
molecule_label_columns = molecule_labels_df.drop(columns=['spectrum_id'])
peptide_counts = molecule_label_columns.sum().sort_values(ascending=False)

# The reported "unique peptides" figure is the number of label columns.
print(f"Total unique peptides: {len(peptide_counts)}")
print("Top 10 most frequent peptides:")
print(peptide_counts.iloc[:10])