# Advanced Exploratory Data Analysis for Pediatric Appendicitis

This notebook provides an in-depth exploratory data analysis of the pediatric appendicitis dataset, focusing on complex relationships between features, statistical insights, and visualizations that can inform model development.

In [None]:
# Import necessary libraries
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import missingno as msno
from statsmodels.graphics.mosaicplot import mosaic
from statsmodels.stats.proportion import proportions_ztest

# Add project root to path so that `src.*` modules can be imported.
# Guarded so that re-running this cell does not append duplicate entries.
if '..' not in sys.path:
    sys.path.append('..')

# Import project modules
from src.data_processing.preprocess import load_data, handle_missing_values, optimize_memory

# Global plotting defaults applied to every figure created after this cell
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('viridis')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'axes.titlesize': 18,
    'axes.labelsize': 14,
})

## 1. Data Loading and Initial Inspection

Let's start by loading the dataset and examining its basic structure, including data types, summary statistics, and missing values.

In [None]:
# Location of the CSV, relative to the notebook's working directory
data_path = '../DATA/synthetic_appendicitis_data.csv'

# Read the raw dataset into `df`, the frame used throughout the notebook
print("Loading data...")
df = pd.read_csv(data_path)

n_rows_cols = df.shape
print(f"Dataset shape: {n_rows_cols}")

# Column dtypes, non-null counts and memory usage
print("\nDataset Overview:")
df.info()

In [None]:
# Descriptive statistics, transposed so each described column is one row
print("\nSummary Statistics:")
summary_stats = df.describe().transpose()
summary_stats

In [None]:
# Number of missing entries per column
print("\nMissing Values:")
missing_per_column = df.isna().sum()
missing_per_column

In [None]:
# Visualize missing values.
# msno.matrix creates its own figure, so the size is passed to it directly
# instead of opening a separate (and otherwise empty) plt.figure first.
missing_ax = msno.matrix(df, figsize=(10, 6))
# Title the returned matrix axes explicitly; the "current" pyplot axes after
# msno.matrix is not guaranteed to be the main matrix panel.
missing_ax.set_title("Missing Value Matrix")
plt.tight_layout()

## 2. Target Variable Analysis

Now let's examine the distribution of our target variable (appendicitis) to understand the class balance.

In [None]:
# Plot appendicitis distribution.
# The bar order is pinned to [0, 1] so that the 'Negative'/'Positive' tick
# labels below are guaranteed to line up with the right class.
plt.figure(figsize=(10, 6))
ax = sns.countplot(x='Appendicitis', data=df, order=[0, 1])
ax.bar_label(ax.containers[0])
plt.title('Distribution of Appendicitis Cases')
plt.xlabel('Appendicitis')
plt.xticks([0, 1], ['Negative', 'Positive'])
plt.ylabel('Count')
plt.show()

# Calculate class distribution percentages.
# reindex makes both classes present (0% if one is absent) instead of
# raising KeyError on the lookups below.
appendicitis_counts = (
    df['Appendicitis']
    .value_counts(normalize=True)
    .reindex([0, 1], fill_value=0.0)
    * 100
)
print("\nClass distribution:")
print(f"Negative (no appendicitis): {appendicitis_counts[0]:.1f}%")
print(f"Positive (appendicitis): {appendicitis_counts[1]:.1f}%")

## 3. Advanced Feature Analysis

Let's perform a detailed analysis of the features, examining their distributions and relationships with the target variable.

### 3.1 Numerical Features Distribution by Outcome

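We'll use violin plots to compare the distribution of each numerical feature between positive and negative appendicitis cases. Each panel is annotated with the p-value of a Mann-Whitney U test comparing the two groups; an asterisk marks p < 0.05.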

In [None]:
# Numerical columns analysed throughout the rest of the notebook
numerical_features = [
    'Age', 'Temperature', 'WBC', 'CRP', 'Pain_Duration', 'Neutrophil_Percent',
]

# One violin panel per numerical feature on a 3x2 grid
fig, axes = plt.subplots(3, 2, figsize=(16, 18))
axes = axes.flatten()

# Row masks for the two outcome groups, computed once and reused per feature
is_negative = df['Appendicitis'] == 0
is_positive = df['Appendicitis'] == 1

for panel, feature in zip(axes, numerical_features):
    sns.violinplot(x='Appendicitis', y=feature, data=df, ax=panel, inner='quartile')
    panel.set_title(f'Distribution of {feature} by Outcome')
    panel.set_xlabel('Appendicitis')
    panel.set_ylabel(feature)
    panel.set_xticklabels(['Negative', 'Positive'])

    # Mann-Whitney U test between the groups, ignoring missing values
    neg_vals = df.loc[is_negative, feature].dropna()
    pos_vals = df.loc[is_positive, feature].dropna()
    stat, p_val = stats.mannwhitneyu(neg_vals, pos_vals)

    # A trailing asterisk flags p < 0.05
    significance_mark = "*" if p_val < 0.05 else ""
    panel.annotate(
        f'Mann-Whitney U Test\np-value: {p_val:.4f}{significance_mark}',
        xy=(0.5, 0.95),
        xycoords='axes fraction',
        ha='center',
    )

plt.tight_layout()
plt.show()

### 3.2 Feature Correlations and Heatmap

Let's examine the pairwise correlations among the numerical features and between each feature and the target variable.

In [None]:
# Pairwise correlations of the numerical features plus the target
heatmap_columns = [*numerical_features, 'Appendicitis']
correlation_matrix = df[heatmap_columns].corr()

# Hide the upper triangle (diagonal included) so each pair is shown once
mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool))

heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    mask=mask,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    center=0,
    linewidths=2,
    ax=heatmap_ax,
)
heatmap_ax.set_title('Feature Correlation Heatmap')
plt.tight_layout()
plt.show()

### 3.3 Pair Plots of Selected Features

Pair plots provide a comprehensive view of relationships between multiple features.

In [None]:
# Scatter matrix of the numerical features, coloured by outcome,
# with KDE curves on the diagonal
pairplot_columns = [*numerical_features, 'Appendicitis']
pairplot_data = df[pairplot_columns]

sns.pairplot(
    pairplot_data,
    hue='Appendicitis',
    diag_kind='kde',
    height=2.5,
    plot_kws={'alpha': 0.6},
)
plt.suptitle('Pair Plot of Numerical Features', y=1.02, fontsize=20)
plt.show()

## 4. Feature Engineering Insights

### 4.1 Derived Features

Let's create and analyze some derived features that might improve model performance.

In [None]:
print("\nCreating derived features...")

# WBC to Neutrophil Ratio.
# A neutrophil percentage of exactly 0 would turn the ratio into +/-inf;
# treat such rows as missing instead so downstream statistics stay finite.
neutrophil_percent = df['Neutrophil_Percent'].replace(0, np.nan)
df['WBC_Neutrophil_Ratio'] = df['WBC'] / neutrophil_percent

# Pain Duration Categories.
# include_lowest=True keeps a duration of exactly 0 in the first bin;
# with the default right-closed bins it would silently become NaN.
df['Pain_Duration_Category'] = pd.cut(df['Pain_Duration'],
                                      bins=[0, 12, 24, 48, float('inf')],
                                      labels=['<12h', '12-24h', '24-48h', '>48h'],
                                      include_lowest=True)

# Temperature Categories.
# include_lowest=True keeps a reading of exactly 35.0 in the 'Normal' bin.
# Readings below 35 still fall outside every bin and are left as NaN.
df['Fever_Category'] = pd.cut(df['Temperature'],
                              bins=[35, 37.5, 38, 38.5, float('inf')],
                              labels=['Normal', 'Low Fever', 'Moderate Fever', 'High Fever'],
                              include_lowest=True)

# Analyze derived features: share of rows (in percent) per pain-duration bin
print("\nDistribution of Pain Duration Categories:")
pain_duration_counts = df['Pain_Duration_Category'].value_counts(normalize=True) * 100
print(pain_duration_counts)

In [None]:
# Plot pain duration by outcome.
# Rows: pain-duration category; columns: outcome. Each row is normalised to
# percentages so the bars show the outcome mix within each duration bin.
contingency = pd.crosstab(df['Pain_Duration_Category'], df['Appendicitis'])
contingency_norm = contingency.div(contingency.sum(axis=1), axis=0) * 100

# Plot stacked bar chart.
# The axes is created explicitly and passed to DataFrame.plot; otherwise
# pandas opens its own figure, ignoring the requested size and leaving an
# empty figure behind.
fig, ax = plt.subplots(figsize=(12, 6))
contingency_norm.plot(kind='bar', stacked=True, colormap='viridis', ax=ax)
ax.set_title('Appendicitis Rate by Pain Duration')
ax.set_xlabel('Pain Duration')
ax.set_ylabel('Percentage')
ax.legend(['Negative', 'Positive'])
plt.show()

### 4.2 Feature Importance Analysis using Decision Trees

Let's use a simple decision tree to identify important features for appendicitis prediction.

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Feature matrix and target for a quick, shallow decision tree
X = df[numerical_features]
y = df['Appendicitis']

# 70/30 split with a fixed seed.
# NOTE(review): X_test / y_test are created here but not used in this cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit a depth-limited tree on the training portion
dt = DecisionTreeClassifier(max_depth=4, random_state=42)
dt.fit(X_train, y_train)

# Importances labelled by column name, largest first
feature_importance = (
    pd.Series(dt.feature_importances_, index=X.columns)
    .sort_values(ascending=False)
)

# Horizontal bar chart of the importances
importance_fig, importance_ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=feature_importance,
    y=feature_importance.index,
    palette='viridis',
    ax=importance_ax,
)
importance_ax.set_title('Feature Importance for Appendicitis Prediction')
importance_ax.set_xlabel('Importance')
plt.tight_layout()
plt.show()

## 5. Principal Component Analysis (PCA)

Let's apply PCA to understand the underlying structure of our data and potential dimensionality reduction.

In [None]:
# Standardize the numerical features before PCA
# NOTE(review): presumably these columns contain no missing values at this
# point; confirm, since PCA is not expected to accept NaN input.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df[numerical_features])

# Fit PCA keeping all components and project the data onto them
pca = PCA()
pca_result = pca.fit_transform(X_scaled)

# Cumulative share of variance explained by the first k components
component_counts = np.arange(1, pca.n_components_ + 1)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

variance_fig, variance_ax = plt.subplots(figsize=(10, 6))
variance_ax.plot(component_counts, cumulative_variance, marker='o', linestyle='-')
variance_ax.set_title('Explained Variance by Components')
variance_ax.set_xlabel('Number of Components')
variance_ax.set_ylabel('Cumulative Explained Variance')
variance_ax.grid(True)
plt.show()

In [None]:
# Plot PCA components
pca_df = pd.DataFrame(data=pca_result[:, 0:2], columns=['PC1', 'PC2'])
pca_df['Appendicitis'] = df['Appendicitis']

plt.figure(figsize=(12, 8))
sns.scatterplot(x='PC1', y='PC2', hue='Appendicitis', data=pca_df, palette='viridis', s=80, alpha=0.8)
plt.title('PCA: First Two Principal Components')
plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]*100:.1f}%)')
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]*100:.1f}%)')
plt.legend(title='Appendicitis', loc='best', labels=['Negative', 'Positive'])
plt.grid(True)
plt.tight_layout()
plt.show()

## 6. Feature Interaction Analysis

Let's examine interactions between key features to identify important patterns.

### 6.1 WBC and CRP Interaction

In [None]:
# Joint distribution of WBC and CRP, coloured by outcome.
# JointGrid is used directly (instead of plt.figure + sns.jointplot) because:
#   * the grid creates its own figure, so a preceding plt.figure() only
#     leaves an empty figure behind;
#   * with `hue` set, jointplot draws KDE marginals, which do not accept the
#     histogram-only `bins` argument. Plotting the marginals explicitly with
#     histplot keeps the intended 20-bin histograms.
joint_plot = sns.JointGrid(
    data=df, x="WBC", y="CRP",
    hue="Appendicitis",
    height=10, ratio=3
)
joint_plot.plot_joint(sns.scatterplot, alpha=0.7, s=80)
joint_plot.plot_marginals(sns.histplot, bins=20, alpha=0.6)
plt.suptitle('Relationship between WBC Count and CRP by Outcome', y=1.02, fontsize=16)
plt.tight_layout()
plt.show()

### 6.2 Temperature and Pain Duration

In [None]:
# Joint distribution of temperature and pain duration, coloured by outcome.
# JointGrid is used directly (instead of plt.figure + sns.jointplot) because:
#   * the grid creates its own figure, so a preceding plt.figure() only
#     leaves an empty figure behind;
#   * with `hue` set, jointplot draws KDE marginals, which do not accept the
#     histogram-only `bins` argument. Plotting the marginals explicitly with
#     histplot keeps the intended 20-bin histograms.
joint_plot = sns.JointGrid(
    data=df, x="Temperature", y="Pain_Duration",
    hue="Appendicitis",
    height=10, ratio=3
)
joint_plot.plot_joint(sns.scatterplot, alpha=0.7, s=80)
joint_plot.plot_marginals(sns.histplot, bins=20, alpha=0.6)
plt.suptitle('Relationship between Temperature and Pain Duration by Outcome', y=1.02, fontsize=16)
plt.tight_layout()
plt.show()

### 6.3 Age Distribution Analysis

In [None]:
# Age histogram (with KDE overlay) split by outcome.
# The outcome is mapped to readable labels *before* plotting so seaborn
# builds a correct legend. Calling plt.legend(['Negative', 'Positive'])
# afterwards attaches the labels to the first artists on the axes (the KDE
# lines, in draw order), which does not reliably match the hue levels.
outcome = df['Appendicitis'].map({0: 'Negative', 1: 'Positive'})

fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=df, x='Age', hue=outcome, hue_order=['Negative', 'Positive'],
             kde=True, bins=20, element="step", ax=ax)
ax.set_title('Age Distribution by Appendicitis Status')
ax.set_xlabel('Age (years)')
ax.set_ylabel('Count')
plt.show()

### 6.4 Advanced Statistical Tests

In [None]:
# Age vs Appendicitis
age_neg = df[df['Appendicitis'] == 0]['Age']
age_pos = df[df['Appendicitis'] == 1]['Age']
stat, p_val = stats.ttest_ind(age_neg, age_pos, equal_var=False)
print("\nIndependent t-test for Age by Appendicitis Outcome:")
print(f"t-statistic: {stat:.3f}, p-value: {p_val:.4f}")

# WBC vs Appendicitis
wbc_neg = df[df['Appendicitis'] == 0]['WBC']
wbc_pos = df[df['Appendicitis'] == 1]['WBC']
stat, p_val = stats.ttest_ind(wbc_neg, wbc_pos, equal_var=False)
print("\nIndependent t-test for WBC by Appendicitis Outcome:")
print(f"t-statistic: {stat:.3f}, p-value: {p_val:.4f}")

## 7. Key Findings and Insights

Let's summarize the key findings from this exploratory data analysis.

In [None]:
# Printed summary of the analysis. Items 3 and 5 are computed from the data;
# the remaining lines are fixed narrative text.
print("\n\n=== Key Findings from Exploratory Data Analysis ===")
print("\n1. Demographics and Clinical Features:")
print("   - Age shows differences between positive and negative appendicitis cases")
print("   - Pain duration has a relationship with appendicitis outcomes")

print("\n2. Laboratory Values:")
print("   - WBC count is significantly higher in appendicitis cases")
print("   - CRP levels show strong correlation with appendicitis")
print("   - Neutrophil percentage differences are observed between outcomes")

print("\n3. Feature Importance:")
print("   - The most predictive features for appendicitis diagnosis (from decision tree):")
# Series.items() returns an iterator and cannot be sliced; take the top three
# rows first (feature_importance is already sorted in descending order).
for i, (feature, importance) in enumerate(feature_importance.head(3).items()):
    print(f"     {i+1}. {feature}: {importance:.3f}")

print("\n4. Feature Interactions:")
print("   - Combined WBC and CRP values provide stronger discrimination")
print("   - Temperature and pain duration show interesting patterns")

print("\n5. Potential Data Challenges:")
# Columns derived earlier in the notebook can contain NaN even when the
# loaded data is complete (e.g. values outside the category bins), so they
# are excluded from the missing-value check.
derived_columns = ['WBC_Neutrophil_Ratio', 'Pain_Duration_Category', 'Fever_Category']
raw_columns_df = df.drop(columns=derived_columns, errors='ignore')
if raw_columns_df.isnull().sum().sum() > 0:
    print("   - Missing values present in the dataset require handling")
else:
    print("   - No missing values in the dataset")

print("\n6. Next Steps:")
print("   - Feature engineering based on clinical knowledge")
print("   - Decision on feature transformations and scaling")
print("   - Model selection and training")