# In [None]:
import pandas as pd
import numpy as np

# Cleaning steps for the 'df' loaded from referrals_port.xlsx (the read_excel call is at the bottom of this file)

# Name and birth-date columns removed before any other processing.
_NAME_COLUMNS = ['Title', 'firstname', 'nickname', 'lastname', 'birthdate']

# Columns that are not carried over into the cleaned output.
_DISCARDED_COLUMNS = [
    'referral_type',
    'referral_personnel',
    'referral_agency_category',
    'rp_details',
    'ref_status',
    'referral_closed_reason',
    'referral_status_note',
    'encounter_type_cat1',
    'encounter_type_cat2',
    'encounter_type_cat3',
    'incident_number',
    'cows1time',
    'cows1',
    'cows2time',
    'cows2',
    'cows3time',
    'cows3',
    'Item Type',
    'Path',
    'Created',
    'Modified',
    'real_team_roi',
    'Item Type2',
    'Path3',
    'referral_narrative',
    'referral_status',
    'no_referral_reason',
    'contact_level_other',
    'no_accepting_agency_reason',
    'referral_other',
    'accepted_other',
]

# ZIP code -> text label used in the cleaned 'patient_zipcode' column.
_ZIP_LABELS = {
    "98363": "PA West",
    "98362": "PA East",
    "98331": "Forks",
    "98382": "Sequim",
    "98364": "Homeless",
    "98381": "Sekiu",
    "98365": "Homeless",
    "98386": "Homeless",
    "98357": "Neah Bay",
    "Other": "Homeless",
}

# Insurance spellings that are collapsed into plain "Medicaid".
_MEDICAID_VARIANTS = {
    'MedicaidOther': 'Medicaid',
    'Medicaid, Other': 'Medicaid',
    'Private, Medicaid': 'Medicaid',
}

# On-scene head counts; "Unknown" is rewritten to "No data" in these.
_ONSCENE_COUNT_COLUMNS = [
    'number_of_nonems_onscene',
    'number_of_ems_onscene',
    'number_of_peers_onscene',
    'number_of_police_onscene',
]

# Columns whose missing cells become "No data" and which end up as 'string'.
_TEXT_FILLED_COLUMNS = [
    'referral_agency',
    'referral_source',
    'od_address',
    'engagement_location',
    *_ONSCENE_COUNT_COLUMNS,
    'suspected_drug',
    'cpr_administered',
    'police_ita',
    'disposition',
    'transport_to_location',
    'transported_by',
    'harmreduce_supplies_type',
    'contact_level_rediscovery',
    'contact_level_reflections',
    'contact_level_pbh',
    'bup_not_indicated_reason',
    'bup_already_prescribed',
    'referral_to_wm_agency',
    'wm_accepting_agency',
    'overdose_recent',
]

# Column -> value used for missing cells; every one ends up as 'int64'.
_INT_FILL_VALUES = {
    'narcan_prior_to_ems_dosage': 0,
    'narcan_by_ems_dosage': 0,
    'leave_behind_narcan_amount': 0,
    'persons_trained': 1,
    'bup_doses_admin': 0,
    'bup_doses_dosage': 0,
    'last_opiate_use_days': 0,
    'hours_withdrawing': 0,
    'supplies_provided': 0,
    'referral_to_sud_agency': 0,
    'referral_rediscovery': 0,
    'referral_reflections': 0,
    'referral_pbh': 0,
    'accepted_rediscovery': 0,
    'accepted_reflections': 0,
    'accepted_pbh': 0,
    'is_bup_indicated': 0,
    'bup_admin': 0,
    'client_agrees_to_mat': 0,
    'referral_to_wd_management': 0,
    'withdrawals_only': 0,
    'withdrawals_from_narcan': 0,
}

# Dose-count columns that arrive as text ("None Given", "> 6", ...).
_NARCAN_DOSE_COUNT_COLUMNS = ['narcan_doses_prior_to_ems', 'narcan_doses_by_ems']


def _combine_date_and_time(df, date_col, time_col):
    """Fold the integer time in `time_col` into `date_col`, then drop `time_col`.

    Missing times count as 0. A time of 0 leaves the date cell untouched;
    any other value is zero-padded to four digits, appended to the calendar
    date and re-parsed with ``pd.to_datetime``. Rows whose date is missing
    keep their missing date instead of failing the parse.
    """
    times = df[time_col].fillna(0).astype('int64')

    def merge(date, hhmm):
        if hhmm == 0 or pd.isna(date):
            return date
        return pd.to_datetime(str(date.date()) + ' ' + str(hhmm).zfill(4))

    merged = [merge(date, hhmm) for date, hhmm in zip(df[date_col], times)]
    # Re-parse so the column is a datetime column even when every row is NaT
    # or the frame is empty.
    df[date_col] = pd.to_datetime(pd.Series(merged, index=df.index, dtype='object'))
    return df.drop(columns=[time_col])


def _replace_in_text(values, old, new):
    """Substring-replace `old` with `new` after forcing every cell to text.

    ``.str.replace`` on an object column returns NaN for non-string cells,
    so numeric cells are converted to their text form first.
    """
    return values.astype(str).str.replace(old, new, case=False, regex=False)


def _referral_delay_labels(df):
    """Return one delay label per row from 'referral_date' - 'overdose_date'."""
    hours = (df['referral_date'] - df['overdose_date']).dt.total_seconds() / 3600
    # np.select keeps the FIRST matching condition for each row.
    # NOTE(review): as ordered, negative gaps fall into '< 24hrs' and
    # 'Call cancelled' is only reached when no time bucket matched (e.g. a
    # missing date) - confirm this priority is intended.
    conditions = [
        (df['referral_date'] == df['overdose_date']),
        (hours < 24),
        (hours >= 24) & (hours < 72),
        (hours >= 72) & (hours < 168),
        (hours >= 168) & (hours < 672),
        (hours >= 672),
        (df['cpm_disposition'] == 'Cancelled'),
    ]
    values = [
        'CPM responded',
        '< 24hrs',
        '24-72hrs',
        '3-7 days',
        '1-4 weeks',
        '> 1 month',
        'Call cancelled',
    ]
    return np.select(conditions, values, default='Other')


def clean_data(df):
    """Return a cleaned copy of the referral DataFrame.

    Steps, in order:
      * drop the name/birth-date columns;
      * fill 'patient_age' from 'patient_apx_age' and cast it to int64;
      * relabel 'patient_zipcode' (blank, missing and the two long labels
        become 'Homeless'; known ZIP codes become area names);
      * fill 'patient_insurance' with 'Uninsured' and collapse the Medicaid
        variants - in that column only;
      * merge 'referral_time' / 'overdose_time' into their date columns;
      * rebuild 'delay_in_referral' as a bucket label (it becomes the last
        column of the result);
      * drop unused columns and 'ID';
      * fill the remaining text columns with 'No data' and numeric columns
        with their defaults, then cast them to 'string' / 'int64'.

    Args:
        df: DataFrame holding every column referenced above.

    Returns:
        A new DataFrame. The rebuilt frame is produced by ``drop``/``astype``
        calls, so the rewrite happens on the result of the first ``drop``.

    Raises:
        KeyError: if an expected column is missing.
        ValueError / TypeError: if a value cannot be cast to its target type.
    """
    df = df.drop(columns=_NAME_COLUMNS)

    # Exact age first; approximate age where the exact one is blank/missing.
    df['patient_age'] = df['patient_age'].replace('', pd.NA).fillna(df['patient_apx_age'])
    df = df.drop(columns=['patient_apx_age'])
    df = df.astype({'patient_age': 'int64'})

    # Blank, missing and the two long labels all collapse to 'Homeless'.
    df['patient_zipcode'] = (
        df['patient_zipcode']
        .replace(
            ['', 'Experiencing Homelessness, no current ZIP Code', 'Non-Clallam County ZIP Code'],
            pd.NA,
        )
        .fillna('Homeless')
    )
    df = df.astype({'patient_sex': 'string', 'patient_zipcode': 'string'})
    df['patient_zipcode'] = df['patient_zipcode'].replace(_ZIP_LABELS)

    # Only the insurance column is relabelled, so identical text in any other
    # column is left alone.
    df = df.fillna({'patient_insurance': 'Uninsured'})
    df['patient_insurance'] = df['patient_insurance'].replace(_MEDICAID_VARIANTS)

    df = df.astype({'living_situation': 'string'})

    df = _combine_date_and_time(df, 'referral_date', 'referral_time')
    df = _combine_date_and_time(df, 'overdose_date', 'overdose_time')

    # Replace the incoming 'delay_in_referral' with the computed bucket.
    # Dropping and re-adding places it as the last column.
    delay_labels = _referral_delay_labels(df)
    df = df.drop(columns=['delay_in_referral'])
    df['delay_in_referral'] = delay_labels
    df = df.astype({'delay_in_referral': 'string'})

    df = df.astype({"cpm_notification": "string", "cpm_disposition": "string"})

    df = df.drop(columns=_DISCARDED_COLUMNS)

    # Text columns: fill gaps, normalise a few spellings, cast to 'string'.
    df = df.fillna({column: "No data" for column in _TEXT_FILLED_COLUMNS})
    for column in _ONSCENE_COUNT_COLUMNS:
        df[column] = _replace_in_text(df[column], "Unknown", "No data")
    # ";#" separates multiple drugs in the source; use ", " instead.
    df['suspected_drug'] = _replace_in_text(df['suspected_drug'], ";#", ", ")
    df = df.astype({column: 'string' for column in _TEXT_FILLED_COLUMNS})

    # Numeric columns: fill gaps, turn the text dose counts into digits,
    # cast to 'int64'.
    df = df.fillna(
        {
            **{column: "0" for column in _NARCAN_DOSE_COUNT_COLUMNS},
            **_INT_FILL_VALUES,
        }
    )
    prior_doses = _replace_in_text(df['narcan_doses_prior_to_ems'], "None Given", "0")
    df['narcan_doses_prior_to_ems'] = _replace_in_text(prior_doses, "> 6", "6")
    df['narcan_doses_by_ems'] = _replace_in_text(df['narcan_doses_by_ems'], "None Given", "0")
    df = df.astype(
        {column: 'int64' for column in [*_NARCAN_DOSE_COUNT_COLUMNS, *_INT_FILL_VALUES]}
    )

    df = df.drop(columns=['ID'])

    return df

# Load the raw referral export from referrals_port.xlsx
df = pd.read_excel('referrals_port.xlsx')

# Clean a copy so the raw frame `df` stays available unchanged
df_clean = clean_data(df.copy())

# Show a preview of the cleaned data. A bare `df_clean.head()` here is
# discarded (it is not the last expression), so print it explicitly.
print(df_clean.head())

# Save the cleaned data to referrals_port_clean.csv and referrals_port_clean.xlsx
df_clean.to_csv("referrals_port_clean.csv", index=False)
df_clean.to_excel("referrals_port_clean.xlsx", index=False)