In [3]:
# Phase 3 - Step 1: Feature Selection
#
# Reads the cleaned CKD dataset, keeps only the chosen predictor columns plus
# the target column, writes that subset to disk for the following steps, and
# prints a short preview.
import pandas as pd

# File locations used by this step.
# NOTE(review): these are absolute paths on one machine; consider a shared,
# configurable data directory so the notebook runs elsewhere.
cleaned_csv_path = '/Users/shubhmehta/Desktop/programming/Data Science Projects/Chronic kidney disease EHRs Abu Dhabi/data/ckd_cleaned.csv'
basic_features_csv_path = '/Users/shubhmehta/Desktop/programming/Data Science Projects/Chronic kidney disease EHRs Abu Dhabi/data/phase3_features_basic.csv'

# Predictors chosen from the Phase 2 analysis.
selected_features = [
    'eGFRBaseline', 'CreatinineBaseline', 'AgeBaseline',
    'HistoryDiabetes', 'DMmeds', 'HistoryCHD', 'ACEIARB',
]

# Candidate extra predictors, kept here for later experimentation.
# They are not used anywhere in this step.
optional_features = [
    'HistoryHTN', 'HTNmeds', 'DLDmeds',
    'CholesterolBaseline', 'sBPBaseline',
]

# Outcome column, held as a one-element list so it can be concatenated with
# the predictor list below.
target = ['EventCKD35']

# Load the cleaned dataset.
data = pd.read_csv(cleaned_csv_path)

# Predictors first, target last: this fixes the column order of the output file.
columns_to_keep = [*selected_features, *target]
final_feature_set = data[columns_to_keep]

# Persist the subset (without the index) for the next steps.
final_feature_set.to_csv(basic_features_csv_path, index=False)

# Quick visual check of the result.
print("✅ Final Feature Set Preview:")
print(final_feature_set.head())

print("\n✅ Shape of dataset:", final_feature_set.shape)


✅ Final Feature Set Preview:
   eGFRBaseline  CreatinineBaseline  AgeBaseline  HistoryDiabetes  DMmeds  \
0          93.3                59.0           64                0       0   
1         105.8                52.0           52                0       0   
2          99.8                57.0           56                0       0   
3          90.3                65.0           58                0       0   
4          79.7                70.0           63                1       1   

   HistoryCHD  ACEIARB  EventCKD35  
0           0        0           0  
1           0        0           0  
2           0        0           0  
3           0        0           0  
4           0        1           0  

✅ Shape of dataset: (491, 8)


In [4]:
# Phase 3 - Step 2: Create Risk Scores
#
# Reads the basic feature file, adds a `comorbidity_sum` column that is the
# row-wise total of four history/medication columns, writes the enlarged table
# to disk, and prints a short preview.
import pandas as pd

# File locations used by this step.
basic_features_csv_path = '/Users/shubhmehta/Desktop/programming/Data Science Projects/Chronic kidney disease EHRs Abu Dhabi/data/phase3_features_basic.csv'
riskscore_csv_path = '/Users/shubhmehta/Desktop/programming/Data Science Projects/Chronic kidney disease EHRs Abu Dhabi/data/phase3_features_with_riskscore.csv'

# Load the selected feature dataset.
feature_data = pd.read_csv(basic_features_csv_path)

# Columns that are added together to form the score.
# NOTE(review): assumes each of these is a 0/1 indicator (the preview rows
# suggest so) -- confirm against the data dictionary.
comorbidity_columns = ['HistoryDiabetes', 'DMmeds', 'HistoryCHD', 'ACEIARB']

# Accumulate left to right with plain `+`, exactly as a chained
# a + b + c + d expression would evaluate.
running_total = feature_data[comorbidity_columns[0]]
for flag_column in comorbidity_columns[1:]:
    running_total = running_total + feature_data[flag_column]
feature_data['comorbidity_sum'] = running_total

# Persist the enhanced feature set (without the index).
feature_data.to_csv(riskscore_csv_path, index=False)

# Preview the inputs side by side with the new score.
print("✅ Feature Set with Comorbidity Sum Preview:")
print(feature_data[comorbidity_columns + ['comorbidity_sum']].head())

print("\n✅ Shape after adding comorbidity_sum:", feature_data.shape)


✅ Feature Set with Comorbidity Sum Preview:
   HistoryDiabetes  DMmeds  HistoryCHD  ACEIARB  comorbidity_sum
0                0       0           0        0                0
1                0       0           0        0                0
2                0       0           0        0                0
3                0       0           0        0                0
4                1       1           0        1                3

✅ Shape after adding comorbidity_sum: (491, 9)


In [5]:
# Phase 3 - Step 3: Normalizing Continuous Features
#
# Reads the feature file that already contains `comorbidity_sum`, min-max
# scales the three continuous columns in place, writes the result to disk,
# and prints a short preview.
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# File locations used by this step.
riskscore_csv_path = '/Users/shubhmehta/Desktop/programming/Data Science Projects/Chronic kidney disease EHRs Abu Dhabi/data/phase3_features_with_riskscore.csv'
final_features_csv_path = '/Users/shubhmehta/Desktop/programming/Data Science Projects/Chronic kidney disease EHRs Abu Dhabi/data/phase3_features_final.csv'

# Load dataset with risk score.
feature_data = pd.read_csv(riskscore_csv_path)

# Only these columns are rescaled; every other column is left untouched.
continuous_features = ['eGFRBaseline', 'CreatinineBaseline', 'AgeBaseline']

# MinMaxScaler with default settings.
scaler = MinMaxScaler()

# NOTE(review): the scaler is fit on every row of the file. If a train/test
# split is performed later, consider fitting on the training rows only so the
# test rows do not influence the scaling.
continuous_block = feature_data[continuous_features]
scaled_block = scaler.fit_transform(continuous_block)
feature_data[continuous_features] = scaled_block

# Persist the normalized dataset (without the index).
feature_data.to_csv(final_features_csv_path, index=False)

# Preview the rescaled columns.
print("✅ Normalized Feature Set Preview (first 5 rows):")
print(feature_data[continuous_features].head())

print("\n✅ Shape after normalization:", feature_data.shape)

✅ Normalized Feature Set Preview (first 5 rows):
   eGFRBaseline  CreatinineBaseline  AgeBaseline
0      0.182366            0.452991     0.621212
1      0.250821            0.393162     0.439394
2      0.217963            0.435897     0.500000
3      0.165936            0.504274     0.530303
4      0.107886            0.547009     0.606061

✅ Shape after normalization: (491, 9)
