In [25]:
import pandas as pd
import numpy as np

# -----------------------------------------
# 1. Load dataset
# -----------------------------------------
df = pd.read_csv("raw/chemotherapy_patient_data.csv")

# -----------------------------------------
# 2. Normalize column names
# -----------------------------------------
df.columns = (
    df.columns.str.strip()
              .str.replace(" ", "_")
              .str.replace("(", "")
              .str.replace(")", "")
              .str.replace("/", "_")
)

# -----------------------------------------
# 3. Standardize categorical fields
# -----------------------------------------
df["Sex"] = df["Sex"].str.strip().str.title()        # male → Male
df["Smoking_Status"] = df["Smoking_Status"].str.title()  # former → Former

# -----------------------------------------
# 4. Convert Yes/No fields to 1/0
# -----------------------------------------
binary_cols = ["Neutropenia", "Metastasis_Status"]
for col in binary_cols:
    df[col] = df[col].map({"Yes": 1, "No": 0})

# -----------------------------------------
# 5. Convert Tumor_Stage (I, II, III, IV) → stage I, stage II, stage III, stage IV
# -----------------------------------------
stage_map = {"I": "Stage I", "II":  "Stage II", "III": "Stage III", "IV": "Stage IV"}
df["Tumor_Stage"] = df["Tumor_Stage"].map(stage_map)

# -----------------------------------------
# 6. Clean numeric fields
# -----------------------------------------
df["BMI"] = pd.to_numeric(df["BMI"], errors="coerce")
df["Tumor_Size"] = pd.to_numeric(df["Tumor_Size"], errors="coerce")
df["Dosage_mg_mA2"] = pd.to_numeric(df["Dosage_mg_mA2"], errors="coerce")
df["Cycles_Completed"] = pd.to_numeric(df["Cycles_Completed"], errors="coerce")
df["Overall_Survival_Months"] = pd.to_numeric(df["Overall_Survival_Months"], errors="coerce")


# -----------------------------------------
# 7. Standardize Tumor Response categories
# -----------------------------------------
df["Tumor_Response"] = (
    df["Tumor_Response"]
    .str.strip()
    .str.title()
)  # stable, partial, complete, progressive



# -----------------------------------------
# 9. Create combined phenotype text column (for embeddings)
# -----------------------------------------
df["Phenotype_Text"] = (
    "Cancer: " + df["Cancer_Type"] + "; "
    "Stage " + df["Tumor_Stage"].astype(str) + "; "
    "Nausea=" + df["Nausea_Severity"].astype(str) + "; "
    "Neutropenia=" + df["Neutropenia"].astype(str)
)

# -----------------------------------------
# 10. Export preprocessed file
# -----------------------------------------
df.to_csv("chemotherapy_patient_data_preprocessed.csv", index=False)

df.head()
df = df. drop(columns=["Patient_ID"])

In [26]:
import pandas as pd

# Load dataset
df = pd.read_csv("raw/Disease precaution.csv")

# 1. Strip whitespace
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

# 2. Standardize Disease names (title case)
df["Disease"] = df["Disease"].str.title()

# Fix known typos
df["Disease"] = df["Disease"].str.replace("Diseae", "Disease", regex=False)

# Remove parentheses formatting
df["Disease"] = df["Disease"].str.replace(r"[\(\)]", "", regex=True).str.replace("  ", " ")

# 3. Replace missing precautions with "None"
prec_cols = ["Precaution_1", "Precaution_2", "Precaution_3", "Precaution_4"]
df[prec_cols] = df[prec_cols].fillna("None")

# 4. Ensure text in all precaution columns
df[prec_cols] = df[prec_cols].astype(str)

# 5. Create combined text field for embeddings
df["Precautions_Text"] = (
    df["Precaution_1"] + "; " +
    df["Precaution_2"] + "; " +
    df["Precaution_3"] + "; " +
    df["Precaution_4"]
)

# 6. Remove duplicates
df = df.drop_duplicates(subset=["Disease"])

# 7. Lowercase + normalize spacing (optional)
df["Precautions_Text"] = (
    df["Precautions_Text"]
    .str.lower()
    .str.replace("  ", " ")
)
df.head()
# 8. Save cleaned file
df.to_csv("Disease precaution_cleaned.csv", index=False)




  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


In [27]:
import pandas as pd
import re

# -----------------------------------------------------
# 1. LOAD DATA
# -----------------------------------------------------
df = pd.read_csv("raw/Diseases_Symptoms.csv")

# Standardize column names
df.columns = df.columns.str.strip().str.replace(" ", "_").str.replace("-", "_").str.lower()


# -----------------------------------------------------
# 2. CLEANER NAME COLUMN
# -----------------------------------------------------
def clean_text(x):
    if pd.isna(x):
        return ""
    x = x.lower()
    x = x.strip()
    x = re.sub(r"[^a-z0-9\s_]", "", x)
    x = x.replace(" ", "_")
    return x

df["name_clean"] = df["name"].apply(clean_text)


# -----------------------------------------------------
# 3. PROCESS SYMPTOMS LIST
# -----------------------------------------------------
def make_list(x):
    if pd.isna(x):
        return []
    x = x.lower()
    parts = re.split(r",|\band\b", x)
    parts = [re.sub(r"[^a-z0-9\s]", "", p).strip().replace(" ", "_") for p in parts]
    parts = [p for p in parts if p != ""]
    return parts

df["symptoms_list"] = df["symptoms"].apply(make_list)
df["treatments_list"] = df["treatments"].apply(make_list)


# -----------------------------------------------------
# 4. CREATE TEXT COLUMNS
# -----------------------------------------------------
df["symptoms_text"] = df["symptoms_list"].apply(lambda x: " ".join(x))
df["treatments_text"] = df["treatments_list"].apply(lambda x: " ".join(x))


# -----------------------------------------------------
# 5. CREATE FINAL FUSED FULL-TEXT COLUMN
# -----------------------------------------------------
df["full_text"] = (
    df["name_clean"] + " " +
    df["symptoms_text"] + " " +
    df["treatments_text"]
).str.strip()


# -----------------------------------------------------
# 6. DROP UNNEEDED COLUMNS
# -----------------------------------------------------
cols_to_drop = [
    "symptoms",
    "treatments",
    "name_clean"  # name already exists & full_text includes clean version
]

df_final = df.drop(columns=cols_to_drop)


# -----------------------------------------------------
# 7. SAVE CLEAN DATA
# -----------------------------------------------------
df_final.to_csv("Diseases_Symptoms_processed.csv", index=False)

df_final.head()


Unnamed: 0,code,name,symptoms_list,treatments_list,symptoms_text,treatments_text,full_text
0,1,Panic disorder,"[palpitations, sweating, trembling, shortness_...","[antidepressant_medications, cognitive_behavio...",palpitations sweating trembling shortness_of_b...,antidepressant_medications cognitive_behaviora...,panic_disorder palpitations sweating trembling...
1,2,Vocal cord polyp,"[hoarseness, vocal_changes, vocal_fatigue]","[voice_rest, speech_therapy, surgical_removal]",hoarseness vocal_changes vocal_fatigue,voice_rest speech_therapy surgical_removal,vocal_cord_polyp hoarseness vocal_changes voca...
2,3,Turner syndrome,"[short_stature, gonadal_dysgenesis, webbed_nec...","[growth_hormone_therapy, estrogen_replacement_...",short_stature gonadal_dysgenesis webbed_neck l...,growth_hormone_therapy estrogen_replacement_th...,turner_syndrome short_stature gonadal_dysgenes...
3,4,Cryptorchidism,"[absence_or_undescended_testicles, empty_scrot...","[observation, monitoring_in_cases_of_mild_or_t...",absence_or_undescended_testicles empty_scrotum...,observation monitoring_in_cases_of_mild_or_tra...,cryptorchidism absence_or_undescended_testicle...
4,5,Ethylene glycol poisoning-1,"[nausea, vomiting, abdominal_pain, general_mal...","[supportive_measures, gastric_decontamination,...",nausea vomiting abdominal_pain general_malaise...,supportive_measures gastric_decontamination an...,ethylene_glycol_poisoning1 nausea vomiting abd...


In [28]:
import pandas as pd

# Load your file
df = pd.read_csv("raw/Disease_symptom_and_patient_profile_dataset.csv")

# List of symptom columns containing Yes/No
symptom_cols = ["Fever", "Cough", "Fatigue", "Difficulty Breathing"]

# 1️⃣ Convert Yes/No → column name OR "-"
for col in symptom_cols:
    df[col] = df[col].apply(lambda x: col if str(x).strip().lower() == "yes" else "-")

# 2️⃣ Convert Outcome Variable
# Positive  → disease name
# Negative  → "-"
df["Outcome Variable"] = df.apply(
    lambda row: row["Disease"] if str(row["Outcome Variable"]).strip().lower() == "positive" else "-",
    axis=1
)

# 3️⃣ Convert Blood Pressure to BP_low / BP_high / BP_normal
df["Blood Pressure"] = df["Blood Pressure"].apply(
    lambda x: f"{str(x).strip().lower()} Blood Pressure"
)

# 4️⃣ Convert Cholesterol Level → Cholesterol_low / Cholesterol_high / Cholesterol_normal
df["Cholesterol Level"] = df["Cholesterol Level"].apply(
    lambda x: f"{str(x).strip().lower()} Cholesterol"
)

# 5️⃣ Save processed dataset
output_path = "disease_symptoms_cleaned.csv"
df.to_csv(output_path, index=False)

print("✔ Preprocessing complete!")
print("✔ File saved as:", output_path)

df.head()


✔ Preprocessing complete!
✔ File saved as: disease_symptoms_cleaned.csv


Unnamed: 0,Disease,Fever,Cough,Fatigue,Difficulty Breathing,Age,Gender,Blood Pressure,Cholesterol Level,Outcome Variable
0,Influenza,Fever,-,Fatigue,Difficulty Breathing,19,Female,low Blood Pressure,normal Cholesterol,Influenza
1,Common Cold,-,Cough,Fatigue,-,25,Female,normal Blood Pressure,normal Cholesterol,-
2,Eczema,-,Cough,Fatigue,-,25,Female,normal Blood Pressure,normal Cholesterol,-
3,Asthma,Fever,Cough,-,Difficulty Breathing,25,Male,normal Blood Pressure,normal Cholesterol,Asthma
4,Asthma,Fever,Cough,-,Difficulty Breathing,25,Male,normal Blood Pressure,normal Cholesterol,Asthma


In [29]:
import pandas as pd
import re

df = pd.read_csv("raw/Healthcare.csv")

# ----------------------------------------
# Clean helper
# ----------------------------------------
def clean_token(x):
    x = x.lower().strip()
    x = re.sub(r"[^a-z0-9\s]", "", x)
    x = x.replace(" ", "_")
    return x

def process_symptoms(x):
    parts = re.split(r",", x)
    parts = [clean_token(p) for p in parts]
    parts = [p for p in parts if p != ""]
    parts = list(dict.fromkeys(parts))  # remove duplicates
    return parts

# ----------------------------------------
# Clean columns
# ----------------------------------------
df["gender_clean"] = df["Gender"].str.title()

df["disease_clean"] = df["Disease"].apply(clean_token)

df["symptoms_list"] = df["Symptoms"].apply(process_symptoms)
df["symptoms_text"] = df["symptoms_list"].apply(lambda x: " ".join(x))

# ----------------------------------------
# Final knowledge-base input
# ----------------------------------------
df["full_text"] = (
    df["disease_clean"] + " " +
    df["symptoms_text"] + " " +
    "age_" + df["Age"].astype(str) + " " +
    df["gender_clean"].apply(clean_token)
)
df= df. drop(columns=["Patient_ID"])
# ----------------------------------------
# Save file
# ----------------------------------------
# df.to_csv("processed_patient_symptoms.csv", index=False)
df.head()
df.head()


Unnamed: 0,Age,Gender,Symptoms,Symptom_Count,Disease,gender_clean,disease_clean,symptoms_list,symptoms_text,full_text
0,29,Male,"fever, back pain, shortness of breath",3,Allergy,Male,allergy,"[fever, back_pain, shortness_of_breath]",fever back_pain shortness_of_breath,allergy fever back_pain shortness_of_breath ag...
1,76,Female,"insomnia, back pain, weight loss",3,Thyroid Disorder,Female,thyroid_disorder,"[insomnia, back_pain, weight_loss]",insomnia back_pain weight_loss,thyroid_disorder insomnia back_pain weight_los...
2,78,Male,"sore throat, vomiting, diarrhea",3,Influenza,Male,influenza,"[sore_throat, vomiting, diarrhea]",sore_throat vomiting diarrhea,influenza sore_throat vomiting diarrhea age_78...
3,58,Other,"blurred vision, depression, weight loss, muscl...",4,Stroke,Other,stroke,"[blurred_vision, depression, weight_loss, musc...",blurred_vision depression weight_loss muscle_pain,stroke blurred_vision depression weight_loss m...
4,55,Female,"swelling, appetite loss, nausea",3,Heart Disease,Female,heart_disease,"[swelling, appetite_loss, nausea]",swelling appetite_loss nausea,heart_disease swelling appetite_loss nausea ag...


In [34]:
import pandas as pd

# -----------------------------------------
# 1. Load all six datasets
# -----------------------------------------
card = pd.read_csv("cardiology-sample-2025-11-18.csv")
card = card.drop(columns=["patient_id"])
onco = pd.read_csv("oncology-sample-2025-11-18.csv")
onco = onco.drop(columns=["patient_id"])

chemo = pd.read_csv("chemotherapy_patient_data_preprocessed.csv")


ds1 = pd.read_csv("disease_symptoms_cleaned.csv")


ds2 = pd.read_csv("Diseases_Symptoms_processed.csv")


prec = pd.read_csv("Disease precaution_cleaned.csv")


# -----------------------------------------
# 2. Normalize column names (lowercase only)
# -----------------------------------------
datasets = [card, onco, chemo, ds1, ds2, prec]
for df in datasets:
    df.columns = df.columns.str.lower()

# -----------------------------------------
# 3. Add source label to each dataset
# -----------------------------------------
card["source"] = "cardiology_sample"
onco["source"] = "oncology_sample"
chemo["source"] = "chemotherapy_preprocessed"
ds1["source"] = "disease_symptoms_cleaned"
ds2["source"] = "diseases_symptoms_processed"
prec["source"] = "disease_precaution_cleaned"

# -----------------------------------------
# 4. Merge all datasets into one long table
# -----------------------------------------
unified = pd.concat(
    [card, onco, chemo, ds1, ds2, prec],
    ignore_index=True,
    sort=False
)

# -----------------------------------------
# 5. Remove exact duplicate rows
#    (Only rows where *every* column is 100% identical)
# -----------------------------------------
unified = unified.drop_duplicates(keep="first").reset_index(drop=True)

# -----------------------------------------
# 6. Save output
# -----------------------------------------
output_path = "Unified_Medical_Knowledge.csv"
unified.to_csv(output_path, index=False)

print("Unified dataset saved as:", output_path)


Unified dataset saved as: Unified_Medical_Knowledge.csv


In [35]:
unified

Unnamed: 0,age,gender,chief_complaint,symptoms,risk_factors,ejection_fraction,troponin_level,diagnosis,treatment_plan,estimated_cost,...,symptoms_list,treatments_list,symptoms_text,treatments_text,full_text,precaution_1,precaution_2,precaution_3,precaution_4,precautions_text
0,75.0,M,Syncope,"rapid heart rate, substernal chest pressure, a...","hyperlipidemia, diabetes mellitus, advanced age",51.0,0.00,Ventricular Tachycardia,Heart failure optimization therapy,29738.0,...,,,,,,,,,,
1,67.0,M,Dizziness,"rapid heart rate, nausea, nocturnal dyspnea, i...","obesity, advanced age",49.0,0.03,Ventricular Tachycardia,CABG surgery,5822.0,...,,,,,,,,,,
2,61.0,F,Peripheral edema,"dyspnea on exertion, irregular heartbeat, left...","smoking history, family history of CAD, sedent...",63.0,7.05,Unstable Angina,Heart failure optimization therapy,19466.0,...,,,,,,,,,,
3,41.0,F,Palpitations,"substernal chest pressure, ankle swelling","smoking history, smoking history, family histo...",52.0,6.99,Acute Myocardial Infarction,Heart failure optimization therapy,49976.0,...,,,,,,,,,,
4,55.0,M,Palpitations,"ankle swelling, left arm pain, substernal ches...","obesity, obesity, smoking history, hyperlipidemia",62.0,0.01,Ventricular Tachycardia,Antiarrhythmic medication,40205.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54557,,,,,,,,,,,...,,,,,,call ambulance,chew or swallow asprin,keep calm,,call ambulance; chew or swallow asprin; keep c...
54558,,,,,,,,,,,...,,,,,,consult doctor,medication,rest,follow up,consult doctor; medication; rest; follow up
54559,,,,,,,,,,,...,,,,,,exercise,use hot and cold therapy,try acupuncture,massage,exercise; use hot and cold therapy; try acupun...
54560,,,,,,,,,,,...,,,,,,stop eating solid food for while,try taking small sips of water,rest,ease back into eating,stop eating solid food for while; try taking s...


In [51]:
import pandas as pd

# -----------------------------------------
# 1. Load all six datasets
# -----------------------------------------
card = pd.read_csv("cardiology-sample-2025-11-18.csv")
card = card.drop(columns=["patient_id"])
onco = pd.read_csv("oncology-sample-2025-11-18.csv")
onco = onco.drop(columns=["patient_id"])
# chemo = pd.read_csv("chemotherapy_patient_data_preprocessed.csv")
ds1 = pd.read_csv("disease_symptoms_cleaned.csv")
ds2 = pd.read_csv("Diseases_Symptoms_processed.csv")
# prec = pd.read_csv("Disease precaution_cleaned.csv")

# -----------------------------------------
# 2. Normalize column names (lowercase only)
# -----------------------------------------
datasets = [card, onco,  ds1]
for df in datasets:
    df.columns = df.columns.str.lower()

# -----------------------------------------
# 3. Add source label to each dataset
# -----------------------------------------
card["source"] = "cardiology_sample"
onco["source"] = "oncology_sample"
# chemo["source"] = "chemotherapy_preprocessed"
ds1["source"] = "disease_symptoms_cleaned"
ds2["source"] = "diseases_symptoms_processed"
# prec["source"] = "disease_precaution_cleaned"

# -----------------------------------------
# 4. Merge all datasets into one long table
# -----------------------------------------
unified = pd.concat(
    [card, onco,  ds1 , ds2],
    # ignore_index=True,
    sort=False
)

# -----------------------------------------
# 5. Remove exact duplicate rows
#    (Only rows where *every* column is 100% identical)
# -----------------------------------------
unified = unified.drop_duplicates(keep="first").reset_index(drop=True)

# -----------------------------------------
# 6. Save output
# -----------------------------------------
output_path = "Unified_Medical_Knowledge.csv"
unified.to_csv(output_path, index=False)

print("Unified dataset saved as:", output_path)


Unified dataset saved as: Unified_Medical_Knowledge.csv


In [52]:
unified

Unnamed: 0,age,gender,chief_complaint,symptoms,risk_factors,ejection_fraction,troponin_level,diagnosis,treatment_plan,estimated_cost,...,blood pressure,cholesterol level,outcome variable,code,name,symptoms_list,treatments_list,symptoms_text,treatments_text,full_text
0,75.0,M,Syncope,"rapid heart rate, substernal chest pressure, a...","hyperlipidemia, diabetes mellitus, advanced age",51.0,0.00,Ventricular Tachycardia,Heart failure optimization therapy,29738.0,...,,,,,,,,,,
1,67.0,M,Dizziness,"rapid heart rate, nausea, nocturnal dyspnea, i...","obesity, advanced age",49.0,0.03,Ventricular Tachycardia,CABG surgery,5822.0,...,,,,,,,,,,
2,61.0,F,Peripheral edema,"dyspnea on exertion, irregular heartbeat, left...","smoking history, family history of CAD, sedent...",63.0,7.05,Unstable Angina,Heart failure optimization therapy,19466.0,...,,,,,,,,,,
3,41.0,F,Palpitations,"substernal chest pressure, ankle swelling","smoking history, smoking history, family histo...",52.0,6.99,Acute Myocardial Infarction,Heart failure optimization therapy,49976.0,...,,,,,,,,,,
4,55.0,M,Palpitations,"ankle swelling, left arm pain, substernal ches...","obesity, obesity, smoking history, hyperlipidemia",62.0,0.01,Ventricular Tachycardia,Antiarrhythmic medication,40205.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2195,,,,,,,,,,,...,,,,396.0,Urinary Stones (Kidney Stones),"['severe_abdominal_or_back_pain', 'blood_in_ur...","['pain_management', 'increased_fluid_intake', ...",severe_abdominal_or_back_pain blood_in_urine f...,pain_management increased_fluid_intake medicat...,urinary_stones_kidney_stones severe_abdominal_...
2196,,,,,,,,,,,...,,,,397.0,Osteoporosis,"['fragile_bones', 'loss_of_height_over_time', ...","['calcium', 'vitamin_d_supplements', 'regular_...",fragile_bones loss_of_height_over_time back_pa...,calcium vitamin_d_supplements regular_exercise...,osteoporosis fragile_bones loss_of_height_over...
2197,,,,,,,,,,,...,,,,398.0,Rheumatoid Arthritis,"['joint_pain', 'stiffness', 'swelling', 'fatig...",['medications_nonsteroidal_antiinflammatory_dr...,joint_pain stiffness swelling fatigue loss_of_...,medications_nonsteroidal_antiinflammatory_drug...,rheumatoid_arthritis joint_pain stiffness swel...
2198,,,,,,,,,,,...,,,,399.0,Type 1 Diabetes,"['frequent_urination', 'increased_thirst', 'we...","['insulin_therapy', 'blood_sugar_monitoring', ...",frequent_urination increased_thirst weight_loss,insulin_therapy blood_sugar_monitoring healthy...,type_1_diabetes frequent_urination increased_t...
