# Exploratory Data Analysis - Lung Cancer Detection

This notebook performs EDA on chest X-ray datasets for lung cancer detection.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import cv2

from src.config import config

# Global plot appearance: seaborn's white grid theme and a default
# figure size of 12x6 inches for figures that do not pass their own figsize.
sns.set_style("whitegrid")
plt.rcParams.update({"figure.figsize": (12, 6)})

## 1. Load Dataset Metadata

In [None]:
# Read the three split manifests (one CSV per split) from the configured
# splits directory.
splits_dir = config.data.splits_dir
train_df = pd.read_csv(splits_dir / 'train.csv')
val_df = pd.read_csv(splits_dir / 'val.csv')
test_df = pd.read_csv(splits_dir / 'test.csv')

# Report how many rows each split contains.
for split_name, split_df in (('Train', train_df), ('Val', val_df), ('Test', test_df)):
    print(f"{split_name}: {len(split_df)} samples")

# Preview the first rows of the training manifest (rendered as the cell output).
train_df.head()

## 2. Class Distribution

In [None]:
# Class balance of each split, drawn side by side.
#
# Bars are always drawn in a fixed label order (0 -> Negative, 1 -> Positive).
# value_counts() orders its result by frequency, so using it directly would
# put the wrong tick label on each bar whenever positives outnumber
# negatives, and would fail on a length mismatch when a split contains only
# one class. Reindexing pins the order and fills absent classes with 0.
class_names = {0: 'Negative', 1: 'Positive'}

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for ax, (name, df) in zip(axes, [('Train', train_df), ('Val', val_df), ('Test', test_df)]):
    counts = df['label'].value_counts().reindex(list(class_names), fill_value=0)
    ax.bar(list(class_names.values()), counts.values)
    ax.set_title(f'{name} Set')
    ax.set_ylabel('Count')

    # Annotate each bar with its count and its share of the split.
    total = len(df)
    for i, v in enumerate(counts.values):
        pct = v / total * 100 if total else 0.0  # guard against an empty split
        ax.text(i, v, f'{v}\n({pct:.1f}%)', ha='center', va='bottom')

plt.tight_layout()
plt.show()

## 3. Patient-Level Analysis

In [None]:
# Verify the splits are disjoint at the patient level (expected overlap: zero).
# A patient appearing in more than one split would leak information between
# training and evaluation.
train_patients = set(train_df['patient_id'].unique())
val_patients = set(val_df['patient_id'].unique())
test_patients = set(test_df['patient_id'].unique())

# Compute each pairwise intersection once so the report and the assertions
# are guaranteed to look at the same sets.
overlaps = {
    'Train-Val': train_patients & val_patients,
    'Train-Test': train_patients & test_patients,
    'Val-Test': val_patients & test_patients,
}

print(f"Unique patients - Train: {len(train_patients)}, Val: {len(val_patients)}, Test: {len(test_patients)}")
for pair, shared in overlaps.items():
    print(f"{pair} overlap: {len(shared)}")

# Stop the notebook on any shared patient and say which pair of splits leaked.
for pair, shared in overlaps.items():
    assert not shared, f"LEAKAGE DETECTED! {len(shared)} patient(s) shared between {pair}"

print("\n✓ No patient leakage detected")

## 4. Sample Images Visualization

In [None]:
# Show a few randomly chosen training images per class:
# top row = positive (label 1), bottom row = negative (label 0).
N_PER_CLASS = 4
SAMPLE_SEED = 42  # fixed seed so a fresh Restart & Run All shows the same images


def _sample_rows(df, label, n, seed):
    """Return up to `n` rows of `df` whose 'label' equals `label`, sampled with a fixed seed.

    The sample size is capped at the number of matching rows so the call does
    not raise when a class has fewer than `n` examples.
    """
    subset = df[df['label'] == label]
    return subset.sample(n=min(n, len(subset)), random_state=seed)


def _show_samples(row_axes, samples, class_name):
    """Draw one image per row of `samples` onto the axes in `row_axes`.

    Axes without a matching sample are left blank. An image that cannot be
    read is replaced by a text placeholder instead of aborting the figure.
    """
    for ax in row_axes:
        ax.axis('off')

    for ax, (_, row) in zip(row_axes, samples.iterrows()):
        # cv2.imread signals failure by returning None rather than raising.
        img = cv2.imread(str(row['image_path']))
        if img is None:
            ax.text(0.5, 0.5, f"Could not read\n{row['image_path']}",
                    ha='center', va='center', transform=ax.transAxes)
            continue
        # OpenCV loads images as BGR; matplotlib expects RGB.
        ax.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        ax.set_title(f"{class_name} - {row['patient_id']}")


# squeeze=False keeps `axes` two-dimensional regardless of N_PER_CLASS.
fig, axes = plt.subplots(2, N_PER_CLASS, figsize=(16, 8), squeeze=False)

# Get samples
positive_samples = _sample_rows(train_df, 1, N_PER_CLASS, SAMPLE_SEED)
negative_samples = _sample_rows(train_df, 0, N_PER_CLASS, SAMPLE_SEED)

_show_samples(axes[0], positive_samples, 'Positive')
_show_samples(axes[1], negative_samples, 'Negative')

plt.tight_layout()
plt.show()

## 5. Metadata Analysis (if available)

In [None]:
# Box plots of each available clinical covariate in the training split,
# grouped by label. Each column is checked independently, so a missing 'age'
# column no longer hides the other plots, and only as many panels are created
# as there are columns to draw (no blank axes).
metadata_titles = {
    'age': 'Age Distribution by Label',
    'smoking_pack_years': 'Smoking Pack-Years by Label',
    'symptom_score': 'Symptom Score by Label',
}
available_columns = [col for col in metadata_titles if col in train_df.columns]

if available_columns:
    # 5 inches per panel reproduces the original 15x4 figure when all three
    # columns are present; squeeze=False keeps `axes` 2-D for a single panel.
    fig, axes = plt.subplots(
        1, len(available_columns),
        figsize=(5 * len(available_columns), 4),
        squeeze=False,
    )

    for ax, col in zip(axes[0], available_columns):
        train_df.boxplot(column=col, by='label', ax=ax)
        ax.set_title(metadata_titles[col])

    # DataFrame.boxplot(by=...) adds its own figure-level
    # "Boxplot grouped by label" suptitle; clear it so it does not collide
    # with the per-panel titles.
    fig.suptitle('')
    plt.tight_layout()
    plt.show()
else:
    print("No clinical metadata columns found in train_df; skipping metadata plots.")

## 6. Summary Statistics

In [None]:
# Headline numbers for the whole dataset and the positive rate of each split.
rule = "=" * 60
all_splits = {'Train': train_df, 'Val': val_df, 'Test': test_df}
patient_sets = (train_patients, val_patients, test_patients)

print("Dataset Summary:")
print(rule)
print(f"Total samples: {sum(len(split_df) for split_df in all_splits.values())}")
# Sum of the per-split patient counts; this equals the number of distinct
# patients only when the splits share no patient.
print(f"Total patients: {sum(len(ids) for ids in patient_sets)}")
print("\nPositive rate:")
# The mean of the 'label' column is reported as the positive rate.
for split_name, split_df in all_splits.items():
    print(f"  {split_name}: {split_df['label'].mean():.3f}")
print(rule)