In [1]:
# Experiment 1: Clean, Integrate and Transform EHR Data

# 1. Import Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder

# 2. Load Dataset
DATA_PATH = 'diabetes.csv'  # Replace with your EHR dataset
df = pd.read_csv(DATA_PATH)

# Quick overview
print(df.head())
print(df.info())
print(df.describe())

# 3. Data Cleaning
# Replace zero values in numerical columns with NaN (invalid entries)
columns_to_check = ['Glucose', 'BloodPressure', 'BMI', 'Insulin']
for col in columns_to_check:
    df[col] = df[col].replace(0, np.nan)

# Fill missing values with median
for col in columns_to_check:
    df[col].fillna(df[col].median(), inplace=True)

print(df.isnull().sum())

# 4. Handling Outliers (example: using IQR method)
for col in columns_to_check:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    df[col] = np.where(df[col] < lower, lower, df[col])
    df[col] = np.where(df[col] > upper, upper, df[col])

# 5. Data Integration (example: merging another dataset)
# df2 = pd.read_csv('additional_patient_data.csv')
# df = pd.merge(df, df2, on='PatientID', how='inner')

# 6. Data Transformation
# Standardize numerical features
numerical_cols = ['Glucose', 'BloodPressure', 'BMI', 'Insulin', 'Age']
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Encode categorical variables
if 'Gender' in df.columns:
    le = LabelEncoder()
    df['Gender'] = le.fit_transform(df['Gender'])

# 7. Final Cleaned Dataset
print(df.head())
df.to_csv('ehr_cleaned_transformed.csv', index=False)
print("Dataset cleaned, transformed, and saved successfully!")


   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values