# In [1]:
import pandas as pd
import random

def clean_data(df):
    """Return a cleaned frame built from the encounter columns of ``df``.

    ``df`` must contain every column listed in ``kept_columns`` below; any
    other columns are ignored. All work happens on a column subset, so the
    frame passed in is left as it was.

    Cleaning steps:

    * ``referral_ID``, ``port_referral_ID``, ``patient_ID`` and
      ``encounter_time``: missing values become 0, then cast to ``int64``.
    * ``encounter_date``: cast to ``datetime64[ns]``.
    * ``pcp_agency``: missing values are taken from ``encounter_agency``;
      the literal placeholder ``'[]'`` is replaced by an agency picked with
      ``random.choice`` (seed ``random`` beforehand for repeatable output);
      abbreviated labels are expanded to their full names.
    * ``encounter_type_cat1``..``cat3``: missing values become ``"No data"``.
    * ``pcp_agency_category`` and ``encounter_agency`` are dropped.
    * ``pcp_agency`` and the three category columns end up as pandas
      ``string`` dtype.

    Returns:
        A new DataFrame with the ten remaining columns, in their original
        relative order.
    """
    kept_columns = [
        'ID',
        'referral_ID',
        'port_referral_ID',
        'patient_ID',
        'encounter_date',
        'encounter_time',
        'pcp_agency_category',
        'pcp_agency',
        'encounter_agency',
        'encounter_type_cat1',
        'encounter_type_cat2',
        'encounter_type_cat3',
    ]
    zero_filled = ['referral_ID', 'port_referral_ID', 'patient_ID', 'encounter_time']
    category_columns = [
        'encounter_type_cat1',
        'encounter_type_cat2',
        'encounter_type_cat3',
    ]

    # Agencies drawn from when 'pcp_agency' holds the '[]' placeholder.
    placeholder_pool = [
        'NOHN - Medical',
        'PBH - Medical',
        'PBH - Behavioral',
        'OMC - Primary',
        'OMC - Case Management',
    ]

    # Abbreviated agency labels and the full names they stand for.
    full_agency_names = {
        "Case Management N": "NOHN - Case Management",
        "Medical N": "NOHN - Medical",
        "Medical Respite": "OPCC - Medical Respite",
        "O - MOUD": "OPCC - MOUD",
        "O- Outreach": "OPCC - Outreach",
        "O- LEAD FIRE": "OPCC - LEAD FIRE",
        "O- Medical Respite": "OPCC - Medical Respite",
        "O3A": "Olympic Area Agency on Aging",
        "Other": "Other Organization",
        "REdisCOVERY": "OPCC - REdisCOVERY",
        "Reflections": "Reflections Counseling Services",
        "Station Walk-In": "911 Call/Walk-In",
    }

    def _resolve_placeholder(value):
        # Only the exact string '[]' is treated as a placeholder.
        if value == '[]':
            return random.choice(placeholder_pool)
        return value

    # Numeric / date columns: fill the gaps with 0 first so the integer
    # casts cannot fail on NaN, then convert all of them in one pass.
    column_types = dict.fromkeys(zero_filled, 'int64')
    column_types['encounter_date'] = 'datetime64[ns]'
    cleaned = (
        df.loc[:, kept_columns]
        .fillna(dict.fromkeys(zero_filled, 0))
        .astype(column_types)
    )

    # Fall back to the encounter agency where no PCP agency was recorded,
    # before the helper columns are discarded.
    agency = cleaned['pcp_agency'].fillna(cleaned['encounter_agency'])
    cleaned = cleaned.drop(columns=['pcp_agency_category', 'encounter_agency'])
    cleaned['pcp_agency'] = agency.apply(_resolve_placeholder)

    cleaned = cleaned.fillna(dict.fromkeys(category_columns, "No data"))
    cleaned['pcp_agency'] = cleaned['pcp_agency'].replace(full_agency_names)

    text_columns = ['pcp_agency'] + category_columns
    return cleaned.astype(dict.fromkeys(text_columns, 'string'))

# Load the raw encounter data from 'encounters.csv' (path is relative to the
# current working directory) into 'df'.
df = pd.read_csv('encounters.csv')

# Clean a copy of the loaded frame so that 'df' keeps the data exactly as read.
df_clean = clean_data(df.copy())
# NOTE(review): the result of head() is neither assigned nor printed here; in a
# notebook a value is only displayed when it is the last expression of a cell.
# Confirm whether this preview is still wanted.
df_clean.head()

# Save clean data to files 'encounters_clean.csv' and 'encounters_clean.xlsx';
# index=False leaves the row index out of both files.
# NOTE(review): to_excel relies on an Excel writer engine (e.g. openpyxl) being
# installed — confirm it is available in the target environment.
df_clean.to_csv('encounters_clean.csv', index=False)
df_clean.to_excel('encounters_clean.xlsx', index=False)