# Sadaa Vijayee Project 1

### Data Cleaning

In [1]:
import pandas as pd

In [2]:
hospital = pd.read_csv("rawDataset/healthcare_dataset.csv")
cardio = pd.read_csv("rawDataset/cardio_dataset.csv", sep=";")

In [3]:
cardio["Age_d"] = (cardio["age"] // 365).astype(int)

cardio["Gender_d"] = cardio["gender"].map({1: "Male", 2: "Female"})

cardio_clean = cardio[
    ["Age_d", "Gender_d", "height", "weight", "ap_hi", "ap_lo",
     "cholesterol", "gluc", "smoke", "alco", "active", "cardio"]
]

In [4]:
hospital_male = hospital[hospital["Gender"] == "Male"].sort_values("Age").reset_index(drop=True)
hospital_female = hospital[hospital["Gender"] == "Female"].sort_values("Age").reset_index(drop=True)

cardio_male = cardio_clean[cardio_clean["Gender_d"] == "Male"].sort_values("Age_d").reset_index(drop=True)
cardio_female = cardio_clean[cardio_clean["Gender_d"] == "Female"].sort_values("Age_d").reset_index(drop=True)


In [5]:

cardio_male = cardio_male.iloc[:len(hospital_male)]
cardio_female = cardio_female.iloc[:len(hospital_female)]

merged_male = pd.concat([hospital_male, cardio_male], axis=1)
merged_female = pd.concat([hospital_female, cardio_female], axis=1)
merged_final = pd.concat([merged_male, merged_female], axis=0).reset_index(drop=True)


In [6]:
merged_final.to_csv("merged_healthcare_dataset.csv", index=False)

print("Perfect merged file created successfully!")
print("Final shape:", merged_final.shape)

Perfect merged file created successfully!
Final shape: (55500, 27)


In [7]:
df = pd.read_csv("merged_healthcare_dataset.csv")

df = df.drop(columns=["Gender_d", "Age_d","Medication"])

df = df.iloc[:52244]

df = df.sample(frac=1, random_state=42).reset_index(drop=True)

rename_dict = {
    "Blood Type": "Blood_Type",
    "Medical Condition": "Medical_Condition",
    "Date of Admission": "Date_of_Admission",
    "Insurance Provider": "Insurance_Provider",
    "Billing Amount": "Billing_Amount",
    "Room Number": "Room_Number",
    "Admission Type": "Admission_Type",
    "Discharge Date": "Discharge_Date",
    "Test Results": "Test_Results",
    "height": "Height",
    "weight": "Weight",
    "cholesterol": "Cholesterol",
    "smoke": "Smoke",
    "ap_hi": "Systolic_BP",
    "ap_lo": "Diastolic_BP",
    "gluc": "Glucose",
    "alco": "Alcohol_Intake",
    "active": "Physically_Active",
    "cardio": "Heart_Disease_Risk"
}
df = df.rename(columns=rename_dict)


In [8]:

binary_map = {0: "No", 1: "Yes"}

df["Smoke"] = df["Smoke"].map(binary_map)
df["Alcohol_Intake"] = df["Alcohol_Intake"].map(binary_map)
df["Physically_Active"] = df["Physically_Active"].map(binary_map)
df["Heart_Disease_Risk"] = df["Heart_Disease_Risk"].map(binary_map)

level_map = {
    1: "Normal",
    2: "Moderate",
    3: "Severe"
}

df["Cholesterol"] = df["Cholesterol"].map(level_map)
df["Glucose"] = df["Glucose"].map(level_map)

In [9]:
df["Billing_Amount"] = df["Billing_Amount"].apply(lambda x: f"{x:.2f}")
df["Billing_Amount"] = pd.to_numeric(df["Billing_Amount"], errors="coerce")
df["Name"] = df["Name"].str.title()


df["Date_of_Admission"] = pd.to_datetime(df["Date_of_Admission"], errors="coerce").dt.strftime("%Y-%m-%d")
df["Discharge_Date"] = pd.to_datetime(df["Discharge_Date"], errors="coerce").dt.strftime("%Y-%m-%d")

if 'Height' in df.columns and 'Weight' in df.columns:
    df['BMI'] = (df['Weight'] / ((df['Height']/100)**2)).apply(lambda x: f"{x:.2f}")
df["BMI"] = pd.to_numeric(df["BMI"], errors="coerce")


In [10]:
'''{
  Smoking: (Yes = 3, No = 0),
  Alcohol_Intake: (Yes = 2, No = 0),
  Physically_Active: (Yes = 0, No = 2),

  Cholesterol: (Normal = 0, Moderate = 1, Severe = 2),
  Glucose: (Normal = 0, Moderate = 1, Severe = 2),

  BMI: (
        <18.5 = 1,
        18.5-24.9 = 0,
        25-29.9 = 1,
        â‰¥30 = 2
      )
}'''
df['smoke_score'] = df['Smoke'].map({'Yes': 3, 'No': 0})

df['alcohol_score'] = df['Alcohol_Intake'].map({'Yes': 2, 'No': 0})

df['active_score'] = df['Physically_Active'].map({'Yes': 0, 'No': 2})

def bmi_score(bmi):
    if pd.isna(bmi):
        return 0
    if bmi < 18.5:
        return 1
    elif bmi < 25:
        return 0
    elif bmi < 30:
        return 1
    else:
        return 2

df['bmi_score'] = df['BMI'].apply(bmi_score)

level_score = {'Normal': 0, 'Moderate': 1, 'Severe': 2}

df['cholesterol_score'] = df['Cholesterol'].map(level_score)
df['glucose_score'] = df['Glucose'].map(level_score)

df['Lifestyle_Index'] = (
    (df['smoke_score'] +
    df['alcohol_score'] +
    df['active_score'] +
    df['bmi_score'] +
    df['cholesterol_score'] +
    df['glucose_score'])
)

print(df[['BMI','Lifestyle_Index']].head())
def bmi_category(bmi):
    if bmi < 18.5:
        return "Underweight"
    elif bmi < 25:
        return "Normal"
    elif bmi < 30:
        return "Overweight"
    else:
        return "Obese"

df["BMI_Category"] = df["BMI"].apply(bmi_category)
df = df.drop(columns=[
    "smoke_score",
    "alcohol_score",
    "active_score",
    "bmi_score",
    "cholesterol_score",
    "glucose_score"
])
ordered_cols = [
    "Name","Age","Gender",
    "Blood_Type","Medical_Condition",
    "Date_of_Admission","Discharge_Date","Admission_Type","Room_Number",
    "Doctor","Hospital","Insurance_Provider","Billing_Amount",
    "Test_Results",
    "Height","Weight","BMI","BMI_Category",
    "Systolic_BP","Diastolic_BP","Cholesterol","Glucose",
    "Smoke","Alcohol_Intake","Physically_Active","Lifestyle_Index",
    "Heart_Disease_Risk"
]

df = df[ordered_cols]



     BMI  Lifestyle_Index
0  24.61                0
1  30.37                5
2  24.00                3
3  22.68                0
4  26.89                2


In [11]:
df = df[(df['Systolic_BP'] >= 70) & (df['Systolic_BP'] <= 250)]
df = df[(df['Diastolic_BP'] >= 40) & (df['Diastolic_BP'] <= 150)]

df = df[(df['Height'] >= 120) & (df['Height'] <= 220)]
df = df[(df['Weight'] >= 30) & (df['Weight'] <= 250)]
df = df[(df['Billing_Amount'] >0)]

df = df[(df['BMI'] >= 10) & (df['BMI'] <= 60)]

df['Lifestyle_Index'] = df['Lifestyle_Index'].clip(0, 12)


In [12]:
df.to_csv("cleaned_healthcare_dataset.csv", index=False)
print("Final shape after cleaning:", df.shape)
print(df.head())
print(df.describe())

Final shape after cleaning: (51173, 27)
               Name  Age  Gender Blood_Type Medical_Condition  \
0    Robert Nichols   61    Male         O+            Cancer   
1    William Zavala   53  Female         A+           Obesity   
2  Danielle Gregory   57  Female         A+            Asthma   
3     Michael Smith   17  Female         A-           Obesity   
4  Kimberly Bridges   68  Female         B+      Hypertension   

  Date_of_Admission Discharge_Date Admission_Type  Room_Number  \
0        2019-05-18     2019-05-22      Emergency          353   
1        2021-01-04     2021-02-03      Emergency          369   
2        2024-04-21     2024-04-24       Elective          353   
3        2020-07-17     2020-07-31         Urgent          128   
4        2019-07-31     2019-08-24      Emergency          366   

            Doctor  ... BMI_Category Systolic_BP  Diastolic_BP Cholesterol  \
0        Aaron Fox  ...       Normal       120.0          80.0      Normal   
1       Mark All