# In [None]:
import numpy as np
import pandas as pd

def _split_in_place(df, column, sep=";#"):
    """Replace ``column`` with its ``sep``-split pieces at the same position.

    Every value is split at most once. The text before the first separator
    lands in ``<column>_0``; the remainder lands in ``<column>_1``, a column
    that pandas only creates when at least one value actually contains
    ``sep``. Callers must therefore not assume ``<column>_1`` exists.

    Args:
        df: Frame holding ``column``.
        column: Name of the text column to split.
        sep: Literal separator to split on.

    Returns:
        A new frame in which ``column`` has been replaced by the split pieces.
    """
    position = df.columns.get_loc(column)
    pieces = (
        df[column].str.split(pat=sep, n=1, expand=True).add_prefix(f"{column}_")
    )
    return pd.concat(
        [df.iloc[:, :position], pieces, df.iloc[:, position + 1 :]], axis=1
    )


def clean_data(df):
    """Return a cleaned, normalised copy of the raw patient table.

    The function keeps a fixed set of source columns, renames them to
    snake_case, fills missing values, merges duplicated source columns
    (age / approximate age, insurance / Insurance2, pcp_agency /
    pcp_agency_1 / secondary_agencies, zipcode / Zip Code), keeps only the
    first ``;#``-separated token of multi-valued cells and harmonises the
    category labels.

    Args:
        df: Raw patient frame. It must contain every column listed in
            ``source_columns`` below.

    Returns:
        A new DataFrame; the selection on the first step copies the data, so
        the frame passed in is not modified.

    Raises:
        KeyError: If one of the expected source columns is missing.
        ValueError: If 'age' is still missing after the approximate age has
            been merged in, because missing values cannot be cast to int64.
    """
    source_columns = [
        "ID",
        "3C Client",
        "Age",
        "Approximate Age",
        "Case Management",
        "gender",
        "insurance",
        "Insurance2",
        "patient_sex",
        "pcp_agency_category",
        "pcp_agency",
        "pcp_agency_1",
        "secondary_agencies",
        "race",
        "Sex",
        "SUD",
        "Zip Code",
        "zipcode",
        "Created",
        "Modified",
        "marital_status",
        "veteran_status",
    ]
    df = df.loc[:, source_columns]

    # 'ID' -> 'id'; '3C Client' -> '3c_client' (missing means 0, stored as int64)
    df = df.rename(columns={"ID": "id"})
    df = df.fillna({"3C Client": 0})
    df = df.astype({"3C Client": "int64"})
    df = df.rename(columns={"3C Client": "3c_client"})

    # 'age': a non-zero approximate age overrides the recorded age, and ages
    # above 103 are replaced by the median age.
    df = df.rename(columns={"Age": "age", "Approximate Age": "approx_age"})
    df = df.fillna({"approx_age": 0})
    df = df.astype({"approx_age": "int64"})
    df.loc[df["approx_age"] != 0, "age"] = df["approx_age"]
    df = df.drop(columns=["approx_age"])
    median_age = df["age"].median()
    df["age"] = df["age"].mask(df["age"] > 103, median_age)
    df = df.astype({"age": "int64"})

    # Remaining straight renames to snake_case
    df = df.rename(
        columns={
            "Case Management": "case_management",
            "Insurance2": "insurance_2",
            "Sex": "sex",
            "SUD": "sud",
            "Zip Code": "zip_code",
            "Created": "created_date",
            "Modified": "modified_date",
        }
    )

    # 'case_management': missing means 0, stored as int64
    df = df.fillna({"case_management": 0})
    df = df.astype({"case_management": "int64"})

    # 'gender': placeholders and missing values become "Not disclosed",
    # non-binary spellings are unified
    df = df.fillna({"gender": "Not disclosed"})
    df["gender"] = df["gender"].replace(
        {
            "Select Gender": "Not disclosed",
            "Select Sex": "Not disclosed",
            "Nonbinary": "Non-binary or Other",
            "Nonbinary or Other": "Non-binary or Other",
        }
    )
    df = df.astype({"gender": "string"})

    # 'insurance': fall back to 'insurance_2' where nothing was disclosed,
    # then keep only the first ';#'-separated token
    df = df.fillna({"insurance": "Not disclosed"})
    df.loc[df["insurance"] == "Not disclosed", "insurance"] = df["insurance_2"]
    df = _split_in_place(df, "insurance")
    df = df.fillna({"insurance_0": "Not disclosed"})
    df["insurance_0"] = df["insurance_0"].str.replace(
        "Unknown", "Not disclosed", case=False, regex=False
    )
    df["insurance_0"] = df["insurance_0"].str.replace(
        "Not Applicable", "Not disclosed", case=False, regex=False
    )
    df = df.rename(columns={"insurance_0": "insurance"})
    df["insurance"] = df["insurance"].replace({"Other": "Private"})
    df = df.astype({"insurance": "string"})

    # 'insurance_1' only exists when some value contained ';#', so its
    # absence must not abort the run.
    df = df.drop(columns=["insurance_1"], errors="ignore")
    df = df.drop(columns=["insurance_2", "patient_sex", "pcp_agency_category"])

    # 'pcp_agency': fill from 'pcp_agency_1', then mark the rest as
    # "Not disclosed"
    df["pcp_agency"] = df["pcp_agency"].fillna(df["pcp_agency_1"])
    df = df.fillna({"pcp_agency": "Not disclosed"})
    df["pcp_agency"] = df["pcp_agency"].str.replace(
        "Medical N", "NOHN - Medical", case=False, regex=False
    )

    # Fall back to 'secondary_agencies' when no primary agency is known.
    # Missing agencies were just filled with "Not disclosed", so that value
    # has to be matched here; the older "No data" marker is still honoured.
    lacks_agency = df["pcp_agency"].isin(["No data", "Not disclosed"])
    has_secondary = df["secondary_agencies"].notna()
    df["pcp_agency"] = df["pcp_agency"].mask(
        lacks_agency & has_secondary, df["secondary_agencies"]
    )
    df = df.drop(columns=["pcp_agency_1", "secondary_agencies"])

    # Keep the first ';#'-separated token as 'pcp_agency'.
    # NOTE(review): when any value contains ';#' the remainder stays in the
    # result as a new 'pcp_agency_1' column (unlike insurance/zipcode, where
    # the remainder is dropped) - confirm whether that column is wanted.
    df = _split_in_place(df, "pcp_agency")
    df = df.rename(columns={"pcp_agency_0": "pcp_agency"})
    df["pcp_agency"] = df["pcp_agency"].replace(
        {
            "Behavioral N": "NOHN - Behavioral",
            "CCHHS HRC": "CCHHS - HRC",
            "O - MOUD": "OPCC - MOUD",
            "O- LEAD FIRE": "OPCC - LEAD FIRE",
            "O- Outreach": "OPCC Outreach",
            "OBOT N": "NOHN - OBOT",
            "Medical N": "NOHN - Medical",
            "Unknown": "Not disclosed",
        }
    )
    df = df.astype({"pcp_agency": "string"})

    # 'race': missing, "Select Race" and "Unknown" become "Not disclosed";
    # both spellings of Black are unified
    df = df.fillna({"race": "Not disclosed"})
    df["race"] = df["race"].replace(
        {
            "Select Race": "Not disclosed",
            "Unknown": "Not disclosed",
            "Black": "Black/African American",
            "Black or African American": "Black/African American",
        }
    )
    df = df.astype({"race": "string"})

    # 'sex': gaps are filled forward from the previous row's value
    df["sex"] = df["sex"].ffill()

    # The "Select Sex" placeholder is replaced by a random 'Female'/'Male'.
    # NOTE(review): the generator is not seeded here, so these rows can
    # differ between runs.
    df["sex"] = df["sex"].apply(
        lambda x: np.random.choice(["Female", "Male"]) if x == "Select Sex" else x
    )
    df = df.astype({"sex": "string"})

    # 'sud': missing means 0, stored as bool
    df = df.fillna({"sud": 0})
    df = df.astype({"sud": "bool"})

    # 'zipcode': fill from 'zip_code', keep the first ';#'-separated token
    # and publish the result under the name 'zip_code'
    df["zipcode"] = df["zipcode"].fillna(df["zip_code"])
    df = df.drop(columns=["zip_code"])
    df = df.fillna({"zipcode": "Not disclosed"})
    df = _split_in_place(df, "zipcode")
    # 'zipcode_1' only exists when some value contained ';#'
    df = df.drop(columns=["zipcode_1"], errors="ignore")
    df = df.rename(columns={"zipcode_0": "zip_code"})
    df["zip_code"] = df["zip_code"].replace(
        {
            "Experiencing Homelessness, no current ZIP Code": "Homeless/Transient",
            "Unknown": "Not disclosed",
            "Other": "Not disclosed",
        }
    )
    df = df.astype({"zip_code": "string"})

    # 'marital_status': missing becomes "Not disclosed", "Single" becomes
    # "Not Married/Widowed"
    df = df.fillna({"marital_status": "Not disclosed"})
    df["marital_status"] = df["marital_status"].replace(
        {"Single": "Not Married/Widowed"}
    )
    df = df.astype({"marital_status": "string"})

    # 'veteran_status': missing becomes "Not disclosed"
    df = df.fillna({"veteran_status": "Not disclosed"})
    df = df.astype({"veteran_status": "string"})

    return df

# Read the raw patient export from patients.xlsx into 'df'
df = pd.read_excel(io="patients.xlsx")

# Run the cleaning pipeline on a copy of the raw frame
df_clean = df.copy().pipe(clean_data)
df_clean.head()

# Write the cleaned data to CSV and Excel files without the index column
df_clean.to_csv(path_or_buf="patients_clean.csv", index=False)
df_clean.to_excel(excel_writer="patients_clean.xlsx", index=False)