# Feature Engineering for Disease Prediction

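This notebook builds engineered features from the raw patient data to improve disease-prediction performance. Until the real data file is wired in, it runs on a randomly generated placeholder dataframe, so the scores and plots below are illustrative only.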

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
import sys
import os

# Add the src directory to path
sys.path.append(os.path.abspath('../src'))
from data_processing import load_data, identify_column_types

# Visualization settings
# NOTE(review): sns.set() runs after plt.style.use(), so seaborn's theme is
# applied on top of the ggplot style -- confirm the ggplot call is still wanted.
plt.style.use('ggplot')
sns.set(style="whitegrid")
# None lifts the column display limit so wide frames are printed in full.
pd.set_option('display.max_columns', None)

## Load the Data

In [None]:
# TODO: Update the path to your actual data file
# data_path = '../data/your_data_file.csv'
# df = load_data(data_path)

# For now, create a placeholder dataframe with more features.
# The values come from a seeded generator so the synthetic data -- and every
# score and plot derived from it -- is identical on each
# "Restart Kernel -> Run All".
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

n_samples = 100
df = pd.DataFrame({
    'patient_id': range(1, n_samples+1),
    'age': rng.integers(18, 90, n_samples),  # upper bound is exclusive: ages 18-89
    'gender': rng.choice(['M', 'F'], n_samples),
    'bmi': rng.normal(26, 5, n_samples),
    'blood_pressure_systolic': rng.normal(120, 15, n_samples),
    'blood_pressure_diastolic': rng.normal(80, 10, n_samples),
    'cholesterol': rng.normal(200, 30, n_samples),
    'glucose': rng.normal(100, 20, n_samples),
    'smoking': rng.choice(['never', 'former', 'current'], n_samples),
    'alcohol': rng.choice(['none', 'moderate', 'heavy'], n_samples),
    'physical_activity': rng.choice(['low', 'moderate', 'high'], n_samples),
    'family_history': rng.choice([0, 1], n_samples),
    # Roughly 30% of the synthetic patients are marked as having the disease
    'disease_status': rng.choice([0, 1], n_samples, p=[0.7, 0.3])
})

# Display sample data
df.head()

## Basic Feature Engineering

Let's create some new features that might be informative for disease prediction.

In [None]:
# Create a copy of the dataframe for feature engineering, so the engineered
# columns are added to df_features and df itself is left as loaded
df_features = df.copy()

# 1. BMI Category
def bmi_category(bmi):
    """Map a BMI value to a weight-status label.

    Bands (lower bound inclusive, upper bound exclusive): below 18.5 is
    'underweight', 18.5 to 25 is 'normal', 25 to 30 is 'overweight', and
    30 or above is 'obese'.

    Parameters
    ----------
    bmi : float
        Body-mass index; may be missing (NaN or None).

    Returns
    -------
    str or float
        The category label, or ``np.nan`` when ``bmi`` is missing.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # BMI would fall through to the last branch and be labelled 'obese'.
    if pd.isna(bmi):
        return np.nan
    if bmi < 18.5:
        return 'underweight'
    if bmi < 25:
        return 'normal'
    if bmi < 30:
        return 'overweight'
    return 'obese'
    
# Label every patient with the BMI band their value falls into
df_features['bmi_category'] = df_features['bmi'].map(bmi_category)

# 2. Blood Pressure Category
def bp_category(systolic, diastolic):
    """Classify a blood-pressure reading into one of four bands.

    The bands are checked from most to least severe so that a reading which
    meets a higher band on either number is never reported as a lower one:

    * 'stage2'   -- systolic >= 140 or diastolic >= 90
    * 'stage1'   -- systolic >= 130 or diastolic >= 80
    * 'elevated' -- systolic >= 120 (diastolic is below 80 at this point)
    * 'normal'   -- systolic < 120 and diastolic < 80

    Parameters
    ----------
    systolic : float
        Systolic pressure; may be missing (NaN or None).
    diastolic : float
        Diastolic pressure; may be missing (NaN or None).

    Returns
    -------
    str or float
        The band label, or ``np.nan`` when either reading is missing.
    """
    # Comparisons against NaN are all False, which would otherwise push a
    # missing reading into the most severe band by default.
    if pd.isna(systolic) or pd.isna(diastolic):
        return np.nan
    # Stage 2 is tested first: e.g. 150/85 has a stage-1 diastolic value but a
    # stage-2 systolic value and must be reported as stage 2.
    if systolic >= 140 or diastolic >= 90:
        return 'stage2'
    if systolic >= 130 or diastolic >= 80:
        return 'stage1'
    if systolic >= 120:
        return 'elevated'
    return 'normal'
    
def _bp_category_for_row(row):
    """Classify one row from its systolic and diastolic readings."""
    return bp_category(row['blood_pressure_systolic'], row['blood_pressure_diastolic'])


# axis=1 hands each row to the helper, which reads both pressure columns
df_features['bp_category'] = df_features.apply(_bp_category_for_row, axis=1)

# 3. Age Groups
# Five consecutive bands between the edges below, one label per band
age_bin_edges = [0, 30, 45, 60, 75, 100]
age_bin_labels = ['young_adult', 'adult', 'middle_age', 'senior', 'elderly']
df_features['age_group'] = pd.cut(df_features['age'], bins=age_bin_edges, labels=age_bin_labels)

# 4. Risk Score (simplified example)
# Each risk factor is a 0/1 flag; the score is the weighted sum of the flags.
over_60 = (df_features['age'] > 60).astype(int)
bmi_over_30 = (df_features['bmi'] > 30).astype(int)
current_smoker = (df_features['smoking'] == 'current').astype(int)
has_family_history = (df_features['family_history'] == 1).astype(int)
systolic_over_140 = (df_features['blood_pressure_systolic'] > 140).astype(int)

df_features['risk_score'] = (
    over_60 * 2
    + bmi_over_30 * 2
    + current_smoker * 3
    + has_family_history * 2
    + systolic_over_140 * 2
)

# Display the new features
preview_columns = [
    'age', 'age_group', 'bmi', 'bmi_category',
    'bp_category', 'risk_score', 'disease_status',
]
df_features[preview_columns].head(10)

## Encoding Categorical Features

In [None]:
# Split the engineered columns into numeric and categorical groups
numeric_columns, categorical_columns = identify_column_types(df_features)

# The target and the row identifier are not predictors: take them out of the
# numeric list if they were picked up
for non_feature in ('disease_status', 'patient_id'):
    if non_feature in numeric_columns:
        numeric_columns.remove(non_feature)

print(f"Numeric columns: {numeric_columns}")
print(f"Categorical columns: {categorical_columns}")

# One-hot encode categorical variables, dropping the first level of each
df_encoded = pd.get_dummies(
    df_features,
    columns=categorical_columns,
    drop_first=True,
)

# Display the encoded dataframe
encoded_shape = df_encoded.shape
print(f"\nShape after encoding: {encoded_shape}")
df_encoded.head()

## Feature Selection

In [None]:
# Separate features and target
X = df_encoded.drop(columns=['patient_id', 'disease_status'])
y = df_encoded['disease_status']

# Select top k features based on ANOVA F-value.
# Cap k at the number of available columns: asking SelectKBest for more
# features than X has errors or warns (depending on the scikit-learn version),
# and the "Top k" report below would otherwise misstate how many were kept.
k = min(10, X.shape[1])
selector = SelectKBest(f_classif, k=k)
X_selected = selector.fit_transform(X, y)

# Get selected feature names
selected_indices = selector.get_support(indices=True)
selected_features = X.columns[selected_indices]

# Display selected features and their scores (all columns, best first)
feature_scores = pd.DataFrame({
    'Feature': X.columns,
    'Score': selector.scores_
}).sort_values('Score', ascending=False)

print(f"Top {k} features by ANOVA F-value:")
print(feature_scores.head(k))

# Plot feature importance (the 15 highest-scoring columns)
plt.figure(figsize=(12, 8))
sns.barplot(x='Score', y='Feature', data=feature_scores.head(15))
plt.title('Feature Importance')
plt.tight_layout()
plt.show()

## Dimensionality Reduction with PCA (Optional)

In [None]:
# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA
n_components = 5  # Reduce to 5 components
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X_scaled)

# Explained variance
variance_ratios = pca.explained_variance_ratio_
print("Explained variance ratio:", variance_ratios)
print("Total explained variance:", sum(variance_ratios))

# Plot explained variance: bars per component, red line for the running total
component_numbers = range(1, n_components + 1)
plt.figure(figsize=(10, 6))
plt.bar(component_numbers, variance_ratios)
plt.plot(component_numbers, np.cumsum(variance_ratios), 'r-')
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.title('PCA Explained Variance')
plt.show()

## Correlation Analysis

In [None]:
# Correlate the numeric features with each other and with the target
correlation_columns = numeric_columns + ['disease_status']
numeric_df = df_encoded[correlation_columns]
corr_matrix = numeric_df.corr()

# Mask the upper triangle (diagonal included) so each pair is drawn once
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Plot correlation heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    mask=mask,
    cmap='coolwarm',
    annot=True,
    fmt='.2f',
    linewidths=0.5,
)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

# Show correlations with target variable, strongest positive first
target_corr = corr_matrix['disease_status'].sort_values(ascending=False)
print("Correlations with disease_status:")
print(target_corr)

## Save Processed Features

In [None]:
# Put the selected feature columns and the target side by side
selected_feature_frame = X.loc[:, selected_features]
df_final = pd.concat((selected_feature_frame, y), axis='columns')

# Save to CSV (commented out until real data is used)
# df_final.to_csv('../data/processed_features.csv', index=False)

final_shape = df_final.shape
print(f"Final dataset shape: {final_shape}")
df_final.head()

## Next Steps

1. Apply these feature engineering techniques to the real dataset
2. Explore additional domain-specific features based on medical knowledge
3. Evaluate feature importance in the context of different modeling approaches
4. Proceed to model building and evaluation