In [None]:
# We begin by importing all the necessary libraries


import pandas as pd
import re
from datetime import datetime
import nltk
from nltk.corpus import stopwords
from IPython.display import display



In [31]:
# I made a custom dataset file with missing values called raw_medical_notes.txt 
# Load the dataset from file

with open("raw_medical_notes.txt" , 'r') as file:
    raw_medical_notes = json.load(file)

df = pd.DataFrame(raw_medical_notes)

display(df)


Unnamed: 0,patient_id,name,dob,age,gender,symptoms,diagnosis,medications,timestamp,doctor,email,phone,notes,address
0,1,Alice Johnson,1992-03-14,Thirty-Two,FEMALE,"[headache, Nausea, None, Blurred Vision]",Migraine,"[sumatriptan, ibuprofen , None, Naproxen]",2024/02/09 15:45,Dr. Smith,alicej@example.com,+1 (555) 987-6543,Patient reported severe pain in head and sensi...,
1,2,Bob Williams,1987-07-29,37,male,"fever, cough, body ache",,"[ Tylenol, amoxicillin , ibuprofen]",09-02-2024 9:30 AM,Dr. Adams,,,Patient has persistent fever for the past 3 da...,"123 Maple St, Toronto, ON"
2,3,Charlie O'Connor,05-15-1995,29,M,"[fatigue, joint pain, low appetite]",Rheumatoid arthritis,"[methotrexate, prednisone, None]","February 9, 2024, 17:20",Dr. Miller,charlieoc@example.com,555-777-1234,complained about stiffness in joints and lack ...,
3,4,,Unknown,,Female,"[sore throat, Cough, sore throat]",Strep throat,"[penicillin, , azithromycin]",02/09/2024 12:00 PM,Dr. Brown,,,Patient experiencing throat pain for several d...,
4,5,David Patel,1991-06-21,Thirty-three,Male,"shortness of breath, dizziness, fatigue",Anemia,"[iron supplements, None, folic acid]",2024-09-02T20:10:00,Dr. White,davidp@hospital.com,,"reported frequent dizziness, low energy, recom...",


In [6]:
#Ensuring stopwords in english have dowloaded

print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [8]:
# Step 1: Remove Personally Identifiable Information (PII)


def load_clean_medical_data(file_path):
    """
    Loads a medical dataset from a JSON file and removes Personally Identifiable Information (PII).
    
    Steps:
    - Reads the file and converts it into a DataFrame.
    - Removes columns containing PII (e.g., 'name', 'email', 'phone', 'address').
    - Ensures the original dataset remains unchanged.
    
    Parameters:
    file_path (str): Path to the JSON file.

    Returns:
    pd.DataFrame: A cleaned DataFrame with PII removed.
    """

    # Open and Load the File:
    with open(file_path, 'r') as file:
        raw_medical_notes = json.load(file)

    # Convert JSON data into a Pandas DataFrame
    df = pd.DataFrame(raw_medical_notes)

    # Remove PII columns
    df_cleaned = df.drop(columns=["name", "email", "phone", "address"], errors="ignore")

    return df_cleaned

df_removePII = load_clean_medical_data('raw_medical_notes.txt')
display(df_removePII)

Unnamed: 0,patient_id,dob,age,gender,symptoms,diagnosis,medications,timestamp,doctor,notes
0,1,1992-03-14,Thirty-Two,FEMALE,"[headache, Nausea, None, Blurred Vision]",Migraine,"[sumatriptan, ibuprofen , None, Naproxen]",2024/02/09 15:45,Dr. Smith,Patient reported severe pain in head and sensi...
1,2,1987-07-29,37,male,"fever, cough, body ache",,"[ Tylenol, amoxicillin , ibuprofen]",09-02-2024 9:30 AM,Dr. Adams,Patient has persistent fever for the past 3 da...
2,3,05-15-1995,29,M,"[fatigue, joint pain, low appetite]",Rheumatoid arthritis,"[methotrexate, prednisone, None]","February 9, 2024, 17:20",Dr. Miller,complained about stiffness in joints and lack ...
3,4,Unknown,,Female,"[sore throat, Cough, sore throat]",Strep throat,"[penicillin, , azithromycin]",02/09/2024 12:00 PM,Dr. Brown,Patient experiencing throat pain for several d...
4,5,1991-06-21,Thirty-three,Male,"shortness of breath, dizziness, fatigue",Anemia,"[iron supplements, None, folic acid]",2024-09-02T20:10:00,Dr. White,"reported frequent dizziness, low energy, recom..."


In [47]:
# Step 2: convert all timestamps to a single standardized format: YYYY-MM-DD HH:MM:SS

def standardize_timestamp(timestamp):
    """
    Converts various timestamp formats into the standard format: YYYY-MM-DD HH:MM:SS.
    Ensures the original DataFrame remains unchanged.

    Parameters:
    timestamp (str): The input timestamp as a string.

    Returns:
    str or None: Standardized timestamp in 'YYYY-MM-DD HH:MM:SS' format or None if invalid.
    """
    
    if not isinstance(timestamp, str) or timestamp.lower() == "unknown":
        return None  # Handle non-string values or "unknown" timestamps
    
    # List of possible timestamp formats to check
    possible_formats = [
        "%Y/%m/%d %H:%M",       # Example: 2024/02/14 14:30
        "%m-%d-%Y %I:%M %p",    # Example: 02-14-2024 02:30 PM
        "%B %d, %Y, %H:%M",     # Example: February 14, 2024, 14:30
        "%Y-%m-%dT%H:%M:%S"     # Example: 2024-02-14T14:30:00 (ISO format) 
    ]
    for fmt in possible_formats:
        try:
            parsed_time = datetime.strptime(timestamp, fmt)  # Try to parse the timestamp
            return parsed_time.strftime("%Y-%m-%d %H:%M:%S")  # Convert to standard format
        except ValueError:
            continue # Move to the next format if fails
            
    return None  # If all formats fail, return None


df_standardtime = df_removePII.copy()
df_standardtime["timestamp"] = df_standardtime["timestamp"].apply(standardize_timestamp)

display(df_standardtime)


Unnamed: 0,patient_id,dob,age,gender,symptoms,diagnosis,medications,timestamp,doctor,notes
0,1,1992-03-14,Thirty-Two,FEMALE,"[headache, Nausea, None, Blurred Vision]",Migraine,"[sumatriptan, ibuprofen , None, Naproxen]",2024-02-09 15:45:00,Dr. Smith,Patient reported severe pain in head and sensi...
1,2,1987-07-29,37,male,"fever, cough, body ache",,"[ Tylenol, amoxicillin , ibuprofen]",2024-09-02 09:30:00,Dr. Adams,Patient has persistent fever for the past 3 da...
2,3,05-15-1995,29,M,"[fatigue, joint pain, low appetite]",Rheumatoid arthritis,"[methotrexate, prednisone, None]",2024-02-09 17:20:00,Dr. Miller,complained about stiffness in joints and lack ...
3,4,Unknown,,Female,"[sore throat, Cough, sore throat]",Strep throat,"[penicillin, , azithromycin]",,Dr. Brown,Patient experiencing throat pain for several d...
4,5,1991-06-21,Thirty-three,Male,"shortness of breath, dizziness, fatigue",Anemia,"[iron supplements, None, folic acid]",2024-09-02 20:10:00,Dr. White,"reported frequent dizziness, low energy, recom..."


In [49]:
#Step 3: Convert all text into lowercase for consistency


def convert_text_to_lowercase(df):
    """
    Converts all text (string) columns in a DataFrame to lowercase, including nested lists.
    Ensures the original DataFrame remains unchanged.

    Parameters:
    df (pd.DataFrame): The input DataFrame.

    Returns:
    pd.DataFrame: DataFrame with all text converted to lowercase.
    """
    df_copy = df.copy()  # Create a copy to avoid modifying the original DataFrame

    def to_lower(value): #helper function
        """
        Helper function to convert text to lowercase.
        Handles strings directly and applies conversion to lists of strings recursively.
        """
        if isinstance(value, str):
            return value.strip().lower()  # Trim spaces & convert to lowercase
        elif isinstance(value, list):
            return [to_lower(item) for item in value if isinstance(item, str)]  # Process lists of strings
        return value  # Return unchanged if not a string or list

    # Apply lowercase conversion to all columns values
    df_copy = df_copy.apply(lambda col: col.map(to_lower) if col.dtype == "object" else col)

    return df_copy
    

df_lowercase = convert_text_to_lowercase(df_standardtime)

display(df_lowercase)

Unnamed: 0,patient_id,dob,age,gender,symptoms,diagnosis,medications,timestamp,doctor,notes
0,1,1992-03-14,thirty-two,female,"[headache, nausea, blurred vision]",migraine,"[sumatriptan, ibuprofen, naproxen]",2024-02-09 15:45:00,dr. smith,patient reported severe pain in head and sensi...
1,2,1987-07-29,37,male,"fever, cough, body ache",,"[tylenol, amoxicillin, ibuprofen]",2024-09-02 09:30:00,dr. adams,patient has persistent fever for the past 3 da...
2,3,05-15-1995,29,m,"[fatigue, joint pain, low appetite]",rheumatoid arthritis,"[methotrexate, prednisone]",2024-02-09 17:20:00,dr. miller,complained about stiffness in joints and lack ...
3,4,unknown,,female,"[sore throat, cough, sore throat]",strep throat,"[penicillin, , azithromycin]",,dr. brown,patient experiencing throat pain for several d...
4,5,1991-06-21,thirty-three,male,"shortness of breath, dizziness, fatigue",anemia,"[iron supplements, folic acid]",2024-09-02 20:10:00,dr. white,"reported frequent dizziness, low energy, recom..."


In [51]:
# Step 4 remove stopwords to improve text processing

stop_words = set(stopwords.words('english'))


def remove_stopwords(df):
    """
    Removes stopwords from all string columns in a DataFrame, including lists of strings.
    Ensures the original DataFrame remains unchanged.

    Parameters:
    df (pd.DataFrame): The input DataFrame.

    Returns:
    pd.DataFrame: A new DataFrame with stopwords removed.
    """

    df_copy = df.copy() # Create a copy to avoid modifying the original DataFrame

    def clean_text(value): #helper function
        """Removes stopwords from a string or a list of strings."""
        if isinstance(value, str):
            # Split the string into words and filter out stopwords
            return ' '.join([word for word in value.split() if word.lower() not in stop_words])
        elif isinstance(value, list):
            # If the value is a list, clean each string item recursively
            return [clean_text(item) for item in value if isinstance(item, str)]
        return value  # # Return unchanged if it's not a string or list

    # Apply stopword removal only to text columns
    df_copy = df_copy.apply(lambda col: col.apply(clean_text) if col.dtype == "object" else col)

    return df_copy

df_stopwords = remove_stopwords(df_lowercase)

display(df_stopwords)


Unnamed: 0,patient_id,dob,age,gender,symptoms,diagnosis,medications,timestamp,doctor,notes
0,1,1992-03-14,thirty-two,female,"[headache, nausea, blurred vision]",migraine,"[sumatriptan, ibuprofen, naproxen]",2024-02-09 15:45:00,dr. smith,patient reported severe pain head sensitivity ...
1,2,1987-07-29,37,male,"fever, cough, body ache",,"[tylenol, amoxicillin, ibuprofen]",2024-09-02 09:30:00,dr. adams,"patient persistent fever past 3 days, advised ..."
2,3,05-15-1995,29,,"[fatigue, joint pain, low appetite]",rheumatoid arthritis,"[methotrexate, prednisone]",2024-02-09 17:20:00,dr. miller,complained stiffness joints lack energy
3,4,unknown,,female,"[sore throat, cough, sore throat]",strep throat,"[penicillin, , azithromycin]",,dr. brown,patient experiencing throat pain several days.
4,5,1991-06-21,thirty-three,male,"shortness breath, dizziness, fatigue",anemia,"[iron supplements, folic acid]",2024-09-02 20:10:00,dr. white,"reported frequent dizziness, low energy, recom..."


In [53]:
# Step 5 Identify and Handle missing or duplicated entries

def clean_dataframe(df):
    """
    Cleans a DataFrame by:
    - Handling missing values properly
    - Filling empty 'gender' values with 'No Gender'
    - Replacing empty strings with NaN and handling them appropriately
    - Removing duplicate rows
    
    Parameters:
    df (pd.DataFrame): The input DataFrame.

    Returns:
    pd.DataFrame: A cleaned version of the DataFrame.
    """
    
    df_cleaned = df.copy() # Create a copy to avoid modifying the original DataFrame

    # Convert lists -> tuples (Fix error when checking duplicates)
    for col in df_cleaned.columns:
        df_cleaned[col] = df_cleaned[col].apply(lambda x: tuple(x) if isinstance(x, list) else x)

    # Handle missing values in the 'gender' column
    if 'gender' in df_cleaned.columns:
        df_cleaned['gender'] = df_cleaned['gender'].replace("", pd.NA)  # Treat empty strings as missing
        df_cleaned['gender'] = df_cleaned['gender'].fillna('No Gender')  # Replace missing gender values with 'No Gender'

    # Replace empty strings in all object (text) columns with NaN
    text_columns = df_cleaned.select_dtypes(include='object').columns
    df_cleaned[text_columns] = df_cleaned[text_columns].replace("", pd.NA)

    # Fill missing values in text columns with "Unknown" for better clarity
    df_cleaned[text_columns] = df_cleaned[text_columns].fillna("Unknown")

    # Fill missing values in numeric columns with the median of the column
    numeric_columns = df_cleaned.select_dtypes(include='number').columns
    for col in numeric_columns:
        df_cleaned[col] = df_cleaned[col].fillna(df_cleaned[col].median())

    # Remove duplicate rows
    df_cleaned = df_cleaned.drop_duplicates()

    return df_cleaned




df_cleaned = clean_dataframe(df_stopwords)
display(df_cleaned)

Unnamed: 0,patient_id,dob,age,gender,symptoms,diagnosis,medications,timestamp,doctor,notes
0,1,1992-03-14,thirty-two,female,"(headache, nausea, blurred vision)",migraine,"(sumatriptan, ibuprofen, naproxen)",2024-02-09 15:45:00,dr. smith,patient reported severe pain head sensitivity ...
1,2,1987-07-29,37,male,"fever, cough, body ache",Unknown,"(tylenol, amoxicillin, ibuprofen)",2024-09-02 09:30:00,dr. adams,"patient persistent fever past 3 days, advised ..."
2,3,05-15-1995,29,No Gender,"(fatigue, joint pain, low appetite)",rheumatoid arthritis,"(methotrexate, prednisone)",2024-02-09 17:20:00,dr. miller,complained stiffness joints lack energy
3,4,unknown,Unknown,female,"(sore throat, cough, sore throat)",strep throat,"(penicillin, , azithromycin)",Unknown,dr. brown,patient experiencing throat pain several days.
4,5,1991-06-21,thirty-three,male,"shortness breath, dizziness, fatigue",anemia,"(iron supplements, folic acid)",2024-09-02 20:10:00,dr. white,"reported frequent dizziness, low energy, recom..."


In [55]:
# Main Function: Calls All Helper Functions 

def process_medical_notes(file_path):
    """
    Loads and cleans medical notes by:
    - Removing PII
    - Standardizing timestamps
    - Converting text to lowercase
    - Removing stopwords
    - Handling missing & duplicate entries

    Returns:
    Cleaned DataFrame
    saves the cleaned data in csv file called cleaned_medical_notes.txt
    """

    # Load dataset and remove PII
    df = load_clean_medical_data(file_path)
    if df is None:
        return None  # Return early if file is missing

    # Standardize timestamp if column exists
    if 'timestamp' in df.columns:
        df['timestamp'] = df['timestamp'].apply(standardize_timestamp)

    # Apply text preprocessing functions
    df = convert_text_to_lowercase(df)
    df = remove_stopwords(df)

    # Handle missing values and duplicates
    df_cleaned = clean_dataframe(df)
    df_cleaned.to_csv('cleaned_medical_notes.txt')

    return df_cleaned

processed_notes = process_medical_notes('raw_medical_notes.txt')
display(processed_notes)


Unnamed: 0,patient_id,dob,age,gender,symptoms,diagnosis,medications,timestamp,doctor,notes
0,1,1992-03-14,thirty-two,female,"(headache, nausea, blurred vision)",migraine,"(sumatriptan, ibuprofen, naproxen)",2024-02-09 15:45:00,dr. smith,patient reported severe pain head sensitivity ...
1,2,1987-07-29,37,male,"fever, cough, body ache",Unknown,"(tylenol, amoxicillin, ibuprofen)",2024-09-02 09:30:00,dr. adams,"patient persistent fever past 3 days, advised ..."
2,3,05-15-1995,29,No Gender,"(fatigue, joint pain, low appetite)",rheumatoid arthritis,"(methotrexate, prednisone)",2024-02-09 17:20:00,dr. miller,complained stiffness joints lack energy
3,4,unknown,Unknown,female,"(sore throat, cough, sore throat)",strep throat,"(penicillin, , azithromycin)",Unknown,dr. brown,patient experiencing throat pain several days.
4,5,1991-06-21,thirty-three,male,"shortness breath, dizziness, fatigue",anemia,"(iron supplements, folic acid)",2024-09-02 20:10:00,dr. white,"reported frequent dizziness, low energy, recom..."
