# COVID-19 Classification Dataset Preparation

This notebook prepares a master dataset for the COVID-19 classification stage of our pipeline, combining data from:

1. **CDC COVID-19 Case Surveillance Data**: For confirmed and probable COVID-19 cases (confirmed cases form the positive class)
2. **MIMIC-IV Clinical Data**: For hospital and ICU data

In [None]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Create the output directory for processed datasets up front; exist_ok makes
# re-running this cell a no-op when the directory is already there.
processed_dir = '../data/processed'
os.makedirs(processed_dir, exist_ok=True)

## 1. Load CDC Data

First, let's load the CDC COVID-19 case surveillance data.

In [None]:
# File path
cdc_file = '../data/external/covid19_case_surveillance.csv'

# Default to None so that later cells can detect a missing file instead of
# failing with a NameError on an undefined `cdc_df`.
cdc_df = None

if os.path.exists(cdc_file):
    # skipinitialspace drops blanks that directly follow a delimiter, which
    # otherwise end up inside column names and values
    cdc_df = pd.read_csv(cdc_file, skipinitialspace=True)

    # Strip whitespace from column names to be safe
    cdc_df.columns = cdc_df.columns.str.strip()

    print(f"Loaded {len(cdc_df)} CDC records")
    # A bare expression inside an `if` block is not rendered by the notebook,
    # so show the preview explicitly.
    display(cdc_df.head())
else:
    print(f"CDC file not found: {cdc_file}")

## 2. Check CDC Data Structure

In [None]:
# Inspect the CDC data only when it was actually loaded; `cdc_df` may be
# undefined or None when the source file is missing.
if globals().get('cdc_df') is not None:
    # Display basic info
    print("CDC data columns:")
    for col in cdc_df.columns:
        print(f"- {col}")

    # Check COVID status distribution
    print("\nCOVID-19 Status Distribution:")
    display(cdc_df['current_status'].value_counts())

    # Basic visualizations
    plt.figure(figsize=(10, 6))
    sns.countplot(x='current_status', data=cdc_df)
    plt.title('COVID-19 Status')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
else:
    print("No CDC data available to inspect")

## 3. Load MIMIC Data

Now let's load MIMIC-IV data.

In [None]:
# Location of the MIMIC-IV extracts and the individual files read below
mimic_dir = '../data/external/mimic'
patients_file = os.path.join(mimic_dir, 'patients_sample.csv')
diagnoses_file = os.path.join(mimic_dir, 'relevant_diagnoses.csv')
icd_file = os.path.join(mimic_dir, 'd_icd_diagnoses.csv')
labs_file = os.path.join(mimic_dir, 'relevant_labevents.csv')


def _read_if_present(csv_path, description):
    """
    Read ``csv_path`` into a DataFrame, or return None when the file is absent.

    On success the row count is printed (labelled with ``description``) and
    the first rows are displayed.
    """
    if not os.path.exists(csv_path):
        return None
    frame = pd.read_csv(csv_path)
    print(f"Loaded {len(frame)} {description}")
    display(frame.head())
    return frame


# Each table is optional: the variable stays None when its file is missing.
mimic_patients = _read_if_present(patients_file, "MIMIC patient records")
mimic_diagnoses = _read_if_present(diagnoses_file, "MIMIC diagnosis records")
mimic_icd = _read_if_present(icd_file, "ICD code descriptions")
mimic_labs = _read_if_present(labs_file, "lab result records")

## 4. Prepare CDC Data for Classification

Let's process the CDC data into a format suitable for classification.

In [None]:
def prepare_cdc_data(df):
    """
    Process CDC case surveillance data for classification.

    Parameters
    ----------
    df : pandas.DataFrame or None
        Raw CDC records. Must contain a ``current_status`` column. The
        columns ``sex``, ``age_group``, ``hosp_yn``, ``icu_yn``, ``death_yn``
        and ``medcond_yn`` are used when present. The input is not modified.

    Returns
    -------
    pandas.DataFrame or None
        None when ``df`` is None. Otherwise a frame with, in order:
        ``record_id`` ("CDC_<row position>"), ``covid_positive`` (1 when
        ``current_status`` contains "confirmed", case-insensitively, else 0),
        the available Yes/No columns encoded as 1/0 (anything else becomes
        NaN), and 0/1 indicator columns ``sex_*`` and ``age_*``.
    """
    if df is None:
        return None

    # Create a copy to avoid modifying the original
    result = df.copy()

    # Target variable: 1 for confirmed case, 0 for probable or missing.
    # na=False keeps missing statuses at 0 regardless of the string dtype.
    status_text = result['current_status'].astype(str).str.lower()
    result['covid_positive'] = (
        status_text.str.contains('confirmed', regex=False, na=False).astype('int64')
    )

    yes_no_cols = ['hosp_yn', 'icu_yn', 'death_yn', 'medcond_yn']
    yes_no_present = [col for col in yes_no_cols if col in result.columns]

    # Treat the literal 'Missing' as a real missing value in the categorical
    # columns so that it does not become its own dummy category.
    for col in ('sex', 'age_group'):
        if col in result.columns:
            result[col] = result[col].where(result[col] != 'Missing')

    # Convert Yes/No columns to 1/0. Values not in the mapping ('Unknown',
    # 'Missing', NaN, ...) become NaN.
    for col in yes_no_present:
        result[col] = result[col].map({'Yes': 1, 'No': 0})

    # Create dummy variables for categorical columns. The generated names are
    # recorded explicitly: selecting by prefix afterwards would also pick up
    # the raw 'age_group' text column, because it starts with 'age_'.
    dummy_cols = []
    for source_col, prefix in (('sex', 'sex'), ('age_group', 'age')):
        if source_col in result.columns:
            # int64 keeps the indicators numeric (0/1) like the Yes/No flags
            dummies = pd.get_dummies(result[source_col], prefix=prefix, dtype='int64')
            result = pd.concat([result, dummies], axis=1)
            dummy_cols.extend(dummies.columns)

    # Record ID is positional, independent of the input index
    result['record_id'] = [f'CDC_{i}' for i in range(len(result))]

    cols_to_keep = ['record_id', 'covid_positive'] + yes_no_present + dummy_cols

    # Return dataset with selected columns
    return result[cols_to_keep]

# Process CDC data if available. `cdc_df` may be undefined or None when the
# source file is missing, in which case prepare_cdc_data returns None.
cdc_processed = prepare_cdc_data(globals().get('cdc_df'))
if cdc_processed is not None:
    print(f"Processed CDC data: {len(cdc_processed)} records with {len(cdc_processed.columns)} features")
    # Inside an `if` block a bare expression is not rendered, so display it
    display(cdc_processed.head())
else:
    print("No CDC data available")

## 5. Prepare MIMIC Data for Classification

Now let's process the MIMIC data.

In [None]:
def prepare_mimic_data(patients, diagnoses, icd_codes, labs):
    """
    Process MIMIC data for classification.

    Parameters
    ----------
    patients : pandas.DataFrame or None
        Patient table with a ``subject_id`` column; ``gender`` is used when
        present. The input is not modified.
    diagnoses : pandas.DataFrame or None
        Diagnosis rows with ``subject_id`` and ``icd_code`` columns.
    icd_codes : pandas.DataFrame or None
        ICD dictionary with ``icd_code`` and ``long_title`` columns.
    labs : pandas.DataFrame or None
        Lab rows with ``subject_id``, ``itemid`` and ``valuenum`` columns.

    Returns
    -------
    pandas.DataFrame or None
        None when ``patients`` is None. Otherwise one row per patient row
        with ``record_id`` ("MIMIC_<subject_id>"), ``covid_positive``, 0/1
        ``gender_*`` indicators and ``lab_<itemid>`` columns holding the
        per-patient mean of ``valuenum``.
    """
    if patients is None:
        return None

    # Start with patient data
    result = patients.copy()

    # Add COVID-19 flag based on diagnoses if available
    if diagnoses is not None and icd_codes is not None:
        # Find COVID-related entries in the ICD dictionary.
        # NOTE(review): 'coronavirus' presumably also matches titles for
        # non-COVID-19 coronaviruses - confirm this is intended.
        covid_title = icd_codes['long_title'].str.contains(
            'COVID|coronavirus|SARS-CoV', case=False, na=False
        )

        # Join on the ICD version too when both tables carry it, so that a
        # diagnosis is never matched to a title from another ICD revision
        # that happens to share the same code string.
        join_keys = ['icd_code']
        if 'icd_version' in diagnoses.columns and 'icd_version' in icd_codes.columns:
            join_keys.append('icd_version')
        covid_icd = icd_codes.loc[covid_title, join_keys].drop_duplicates()

        # Get patients with COVID diagnoses
        covid_diagnoses = pd.merge(diagnoses, covid_icd, on=join_keys, how='inner')
        covid_patients = covid_diagnoses['subject_id'].unique()

        # Add flag to result
        result['covid_positive'] = result['subject_id'].isin(covid_patients).astype(int)
        print(f"Found {len(covid_patients)} patients with COVID-19 diagnoses")
    else:
        # If no diagnoses data, assume all negative
        result['covid_positive'] = 0
        print("No diagnoses data available, assuming all patients are COVID-19 negative")

    # Add lab data if available
    if labs is not None:
        # Lab features are optional: on any failure report it and continue
        # without them rather than aborting the whole preparation.
        try:
            # One column per lab item, averaged per patient
            lab_pivot = labs.pivot_table(
                index='subject_id',
                columns='itemid',
                values='valuenum',
                aggfunc='mean'
            )
            lab_feature_count = len(lab_pivot.columns)

            # Rename columns to lab_<itemid> and expose subject_id for the merge
            lab_pivot.columns = [f'lab_{col}' for col in lab_pivot.columns]
            lab_pivot = lab_pivot.reset_index()

            # Left merge keeps patients without any lab rows (NaN features)
            result = pd.merge(result, lab_pivot, on='subject_id', how='left')
            print(f"Added {lab_feature_count} lab features")
        except Exception as e:
            print(f"Error adding lab data: {e}")

    # Process demographic features
    if 'gender' in result.columns:
        # int64 keeps the indicators numeric (0/1) rather than boolean
        gender_dummies = pd.get_dummies(result['gender'], prefix='gender', dtype='int64')
        result = pd.concat([result, gender_dummies], axis=1)

    # Create record ID
    result['record_id'] = ['MIMIC_' + str(subject) for subject in result['subject_id']]

    # Select columns to keep: identifiers, then demographics, then labs
    cols_to_keep = ['record_id', 'covid_positive']
    cols_to_keep.extend(col for col in result.columns if col.startswith('gender_'))
    cols_to_keep.extend(col for col in result.columns if col.startswith('lab_'))

    return result[cols_to_keep]

# Run the MIMIC preparation only when the patient table was loaded;
# mimic_processed stays None otherwise.
mimic_processed = None
if mimic_patients is None:
    print("No MIMIC patient data available")
else:
    mimic_processed = prepare_mimic_data(
        mimic_patients,
        mimic_diagnoses,
        mimic_icd,
        mimic_labs,
    )
    record_count = len(mimic_processed)
    feature_count = len(mimic_processed.columns)
    print(f"Processed MIMIC data: {record_count} records with {feature_count} features")
    display(mimic_processed.head())

## 6. Create Master Classification Dataset

Now let's combine both datasets into a master classification dataset.

In [None]:
# Collect whichever processed sources are available, reporting each one
datasets = []
for source_label, source_frame in (('CDC', cdc_processed), ('MIMIC', mimic_processed)):
    if source_frame is not None:
        datasets.append(source_frame)
        print(f"Adding {len(source_frame)} {source_label} records")

if not datasets:
    print("No data available to create master dataset")
else:
    # Stack the sources row-wise with a fresh 0..n-1 index; columns that
    # exist in only one source are filled with NaN for the other.
    master_df = pd.concat(datasets, axis=0, ignore_index=True)
    n_records, n_features = len(master_df), len(master_df.columns)
    print(f"Created master dataset with {n_records} records and {n_features} features")

    # Preview the combined data
    display(master_df.head())

    # Persist the combined data for the downstream modelling steps
    master_file = '../data/processed/covid_classification_dataset.csv'
    master_df.to_csv(master_file, index=False)
    print(f"Saved master dataset to {master_file}")

## 7. Analyze Master Dataset

Let's examine the features and target distribution in our master dataset.

In [None]:
# Summarise the master dataset: target distribution, missing values,
# feature/target correlations and class balance.
if 'master_df' in locals() and master_df is not None:
    # Target distribution
    print("COVID-19 Status Distribution:")
    covid_dist = master_df['covid_positive'].value_counts()
    display(covid_dist)

    # Visualize
    plt.figure(figsize=(8, 6))
    sns.countplot(x='covid_positive', data=master_df)
    plt.title('COVID-19 Positive vs Negative Cases')
    plt.xlabel('COVID-19 Status (1=Positive, 0=Negative)')
    plt.show()

    # Check missing values (only columns that actually have some)
    missing = master_df.isnull().sum()
    missing = missing[missing > 0]

    if not missing.empty:
        print("\nMissing values:")
        display(missing)

        # Plot missing values percentages
        plt.figure(figsize=(10, 6))
        missing_pct = (missing / len(master_df) * 100).sort_values(ascending=False)
        sns.barplot(x=missing_pct.values, y=missing_pct.index)
        plt.title('Percentage of Missing Values')
        plt.xlabel('% Missing')
        plt.tight_layout()
        plt.show()
    else:
        print("\nNo missing values in the dataset!")

    # Check numeric correlations with target. Include every numeric width
    # and booleans so that indicator (dummy) columns are not silently left
    # out; cast to float so they are all treated alike by corr().
    numeric_cols = master_df.select_dtypes(include=['number', 'bool']).columns
    if len(numeric_cols) > 1:
        corr = (
            master_df[numeric_cols]
            .astype(float)
            .corr()['covid_positive']
            .sort_values(ascending=False)
        )
        print("\nFeature correlations with COVID-19 status:")
        display(corr)

        # Plot correlations
        plt.figure(figsize=(10, 8))
        corr = corr[corr.index != 'covid_positive']  # Remove self-correlation
        top_corr = corr.head(15)  # Show top 15
        sns.barplot(x=top_corr.values, y=top_corr.index)
        plt.title('Top Feature Correlations with COVID-19')
        plt.tight_layout()
        plt.show()

    # Check class balance. A class that does not occur at all is absent from
    # value_counts(), so fall back to 0% instead of raising KeyError.
    class_balance = master_df['covid_positive'].value_counts(normalize=True) * 100
    positive_pct = class_balance.get(1, 0.0)
    negative_pct = class_balance.get(0, 0.0)
    print(f"\nClass balance: {positive_pct:.1f}% positive, {negative_pct:.1f}% negative")

    # Suggest balance handling if needed
    if abs(negative_pct - positive_pct) > 20:
        print("\nNote: The dataset is imbalanced. Consider using class weights, sampling techniques,")
        print("or specialized metrics when training classification models.")
else:
    print("No master dataset available for analysis")

## 8. Basic Model Example (Optional) 

Here's a simple model demonstration. It requires scikit-learn; if the package is not installed, the cell prints an installation hint and skips training.

In [None]:
# Train and evaluate a demonstration random forest on the master dataset.
# Only the imports are guarded, so an ImportError raised anywhere else is
# not mistaken for "scikit-learn is missing".
try:
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import classification_report, confusion_matrix
    from sklearn.impute import SimpleImputer
    from sklearn.pipeline import Pipeline
except ImportError:
    print("scikit-learn not installed. Skip model training.")
    print("Install with: pip install scikit-learn")
else:
    if 'master_df' in locals() and master_df is not None:
        # Prepare features and target
        X = master_df.drop(['record_id', 'covid_positive'], axis=1)
        y = master_df['covid_positive']

        # Mean imputation needs numeric input: coerce every feature to a
        # number (text becomes NaN) and drop features left without any value.
        X = X.apply(pd.to_numeric, errors='coerce').astype(float)
        X = X.dropna(axis=1, how='all')

        # Split data, keeping the class proportions in both parts
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Create pipeline with imputer and model
        pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
        ])

        # Train model
        pipeline.fit(X_train, y_train)

        # Make predictions
        y_pred = pipeline.predict(X_test)

        # Evaluate
        print("Classification Report:")
        print(classification_report(y_test, y_pred))

        # Confusion matrix
        cm = confusion_matrix(y_test, y_pred)
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
        plt.title('Confusion Matrix')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.show()

        # Feature importance
        classifier = pipeline.named_steps['classifier']
        if hasattr(classifier, 'feature_importances_'):
            importances = classifier.feature_importances_

            # The imputer discards features that are entirely missing in the
            # training split (their fitted statistic is NaN), so keep only
            # the names of the features the classifier actually saw.
            imputer = pipeline.named_steps['imputer']
            features = X_train.columns[~np.isnan(imputer.statistics_)]
            if len(features) != len(importances):
                # e.g. an imputer configured to keep empty features
                features = X_train.columns

            # Create DataFrame
            importance_df = pd.DataFrame({'Feature': features, 'Importance': importances})
            importance_df = importance_df.sort_values('Importance', ascending=False)

            # Display top features
            print("\nTop 10 Important Features:")
            display(importance_df.head(10))

            # Plot
            plt.figure(figsize=(10, 6))
            sns.barplot(x='Importance', y='Feature', data=importance_df.head(10))
            plt.title('Feature Importance')
            plt.tight_layout()
            plt.show()

            # Save feature importance
            importance_file = '../data/processed/feature_importance.csv'
            importance_df.to_csv(importance_file, index=False)
            print(f"Saved feature importance to {importance_file}")
    else:
        print("No master dataset available for modeling")

## 9. Summary and Next Steps

In this notebook, we:
1. Loaded and processed CDC Case Surveillance data
2. Loaded and processed MIMIC-IV clinical data
3. Combined them into a master classification dataset
4. Analyzed features and relationships with COVID-19 status
5. Created a simple demonstration model (if scikit-learn was available)

The master dataset is now saved at `../data/processed/covid_classification_dataset.csv` and is ready for further model development.

Next steps could include:
- Feature engineering to create more predictive variables
- Hyperparameter tuning for classification models
- Integration with the NER pipeline to incorporate text-derived features
- Deployment of the model in a production pipeline