# 01 - Generate Synthetic Clinical Data

This notebook demonstrates:
- Generating realistic synthetic patient data
- Understanding the data structure
- Exploring data quality issues
- Saving the data for use in the feature store

In [None]:
import os
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from data_generator import ClinicalDataGenerator

# Notebook-wide display settings: never truncate DataFrame columns when
# rendering, and draw seaborn/matplotlib figures on a white grid
pd.options.display.max_columns = None
sns.set_style('whitegrid')

print("Libraries imported successfully!")

## 1. Generate Synthetic Dataset

We'll create 1000 synthetic patients with clinical and genomic features.

In [None]:
# A fixed seed keeps the synthetic cohort reproducible across kernel restarts
generator = ClinicalDataGenerator(seed=42)

# introduce_errors=True asks the generator to include intentional data
# quality issues, which later sections of this notebook look for
df = generator.generate_dataset(n_patients=1000, introduce_errors=True)

n_patients, n_features = len(df), len(df.columns)
print(f"Generated dataset with {n_patients} patients and {n_features} features")
print(f"\nDataset shape: {df.shape}")

## 2. Explore Dataset Structure

In [None]:
# Preview the first rows (head() returns five by default) as a quick sanity
# check of the column names and value formats
df.head()

In [None]:
# Print a concise schema summary: column names, non-null counts, dtypes,
# and memory usage
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max); for a
# mixed-type frame, describe() covers only the numeric columns by default
df.describe()

## 3. Analyze Missing Data Patterns

Lab values include realistic missingness (~10–15%), reflecting that not every test is ordered for every patient.

In [None]:
# Tally nulls once per column and derive the percentage from the same counts
null_counts = df.isnull().sum()
null_percent = (null_counts / len(df) * 100).round(2)

# One row per source column, worst offenders first
missing_data = pd.DataFrame({
    'Column': df.columns,
    'Missing_Count': null_counts,
    'Missing_Percent': null_percent,
})
missing_data = missing_data.sort_values('Missing_Count', ascending=False)

# Only columns that actually contain nulls are listed
has_missing = missing_data['Missing_Count'] > 0
print("Missing Data Summary:")
print(missing_data[has_missing])

In [None]:
# Horizontal bar chart of missing-data percentage, restricted to the columns
# that contain at least one null
missing_cols = missing_data.loc[missing_data['Missing_Count'] > 0]

fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(missing_cols['Column'], missing_cols['Missing_Percent'])
ax.set(xlabel='Missing Data (%)', title='Missing Data by Column')

# Dashed reference line at 15% missing
ax.axvline(x=15, color='red', linestyle='--', label='15% threshold')
ax.legend()

fig.tight_layout()
plt.show()

## 4. Explore Demographic Features

In [None]:
# Two side-by-side panels: age histogram (left) and raw sex value counts (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
age_ax, sex_ax = axes

# Age histogram with a dashed marker at the mean
age_mean = df['age'].mean()
age_ax.hist(df['age'], bins=30, edgecolor='black')
age_ax.set(xlabel='Age', ylabel='Count', title='Age Distribution')
age_ax.axvline(age_mean, color='red', linestyle='--', label=f'Mean: {age_mean:.1f}')
age_ax.legend()

# One bar per distinct raw value of `sex`; the values are plotted as-is so
# that inconsistently formatted categories show up as separate bars
sex_counts = df['sex'].value_counts()
sex_ax.bar(sex_counts.index, sex_counts.values)
sex_ax.set(
    xlabel='Sex',
    ylabel='Count',
    title='Sex Distribution (Note: Inconsistent Formatting)',
)
sex_ax.tick_params(axis='x', rotation=45)

fig.tight_layout()
plt.show()

print(f"\nAge statistics: Mean={age_mean:.1f}, Std={df['age'].std():.1f}")
print(f"Sex values (showing formatting inconsistency): {df['sex'].unique()}")

## 5. Explore Genomic Features

In [None]:
# Per-gene mutation rate as a percentage: column mean scaled by 100
# (presumably the mutation columns are 0/1 flags — confirm against the generator)
mutation_cols = ['tp53_mutation', 'kras_mutation', 'egfr_mutation']
mutation_rates = df[mutation_cols].mean() * 100

# Tick labels come from the column names so they stay in step with
# mutation_cols: 'tp53_mutation' -> 'TP53'
gene_labels = [col.replace('_mutation', '').upper() for col in mutation_cols]
bar_positions = range(len(mutation_rates))

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
rate_ax, tmb_ax = axes

# Left panel: mutation rate per gene on a fixed 0-100% scale
rate_ax.bar(bar_positions, mutation_rates.values)
rate_ax.set_xticks(bar_positions)
rate_ax.set_xticklabels(gene_labels)
rate_ax.set(ylabel='Mutation Rate (%)', title='Driver Mutation Rates')
rate_ax.set_ylim(0, 100)

# Right panel: TMB histogram with a dashed marker at 20, labelled as the
# high-TMB threshold
tmb_ax.hist(df['tmb_score'], bins=30, edgecolor='black')
tmb_ax.set(
    xlabel='TMB Score (mutations/Mb)',
    ylabel='Count',
    title='Tumor Mutational Burden Distribution',
)
tmb_ax.axvline(20, color='red', linestyle='--', label='High TMB threshold')
tmb_ax.legend()

fig.tight_layout()
plt.show()

print("\nMutation Rates:")
for column_name, percent in mutation_rates.items():
    print(f"  {column_name}: {percent:.1f}%")

## 6. Explore Clinical Outcomes

In [None]:
# Two side-by-side panels: treatment response counts (left) and survival
# time histogram (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
response_ax, survival_ax = axes

# Left panel: number of patients per treatment response category
response_counts = df['treatment_response'].value_counts()
response_ax.barh(response_counts.index, response_counts.values)
response_ax.set(xlabel='Count', title='Treatment Response Distribution')

# Right panel: survival histogram with a dashed marker at the median
median_survival = df['survival_months'].median()
survival_ax.hist(df['survival_months'], bins=30, edgecolor='black')
survival_ax.set(
    xlabel='Survival (months)',
    ylabel='Count',
    title='Survival Time Distribution',
)
survival_ax.axvline(
    median_survival,
    color='red',
    linestyle='--',
    label=f'Median: {median_survival:.1f} mo',
)
survival_ax.legend()

fig.tight_layout()
plt.show()

# Response rate is the share of rows whose response_status equals 1
response_rate = df['response_status'].eq(1).mean() * 100
print(f"\nMedian survival: {median_survival:.1f} months")
print(f"Response rate: {response_rate:.1f}%")

## 7. Identify Data Quality Issues

The dataset has intentional quality issues that should be caught by validation.

In [None]:
def _report_invalid(rows, header, column):
    """Print how many rows failed a range check and preview the offenders.

    rows   -- the subset of df that violated the check
    header -- text printed before the count (may start with a newline)
    column -- the offending column, shown next to patient_id
    """
    print(f"{header}: {len(rows)}")
    if len(rows) > 0:
        print(rows[['patient_id', column]].head())


# Ages below 0 or above 120 are treated as invalid
invalid_ages = df.loc[df['age'].lt(0) | df['age'].gt(120)]
_report_invalid(invalid_ages, "Invalid ages found", 'age')

# TMB scores above 100 are treated as out of range
invalid_tmb = df.loc[df['tmb_score'].gt(100)]
_report_invalid(invalid_tmb, "\nInvalid TMB scores (>100)", 'tmb_score')

# Comorbidity counts above 10 are treated as invalid
invalid_comorb = df.loc[df['comorbidity_count'].gt(10)]
_report_invalid(invalid_comorb, "\nInvalid comorbidity counts (>10)", 'comorbidity_count')

## 8. Save Dataset

Save the generated data for use in subsequent notebooks.

In [None]:
# Persist the generated dataset for use by subsequent notebooks
output_path = '../data/raw/synthetic_patients.csv'

# A fresh checkout may not contain data/raw/ yet; create it up front so the
# save cannot fail on a missing directory (no-op when it already exists)
os.makedirs(os.path.dirname(output_path), exist_ok=True)
generator.save_dataset(df, output_path)

print(f"\nDataset saved to: {output_path}")
print("\nReady to proceed to 02_feature_engineering.ipynb!")

In [None]:
## End of Notebook ##