# Initial Preprocessing

Cleans up the original ACP dataset files and exports them as Pandas DataFrames stored in HDF5 files.

AEData:
 - Raw data must be unzipped into a directory.
 - Default path for the raw data is `data/AEdata`. This can be modified from the `Notebook` class.

ICD10: 
 - Original spreadsheet for ICD-10 (March 2021) can be found [here](https://www.health.gov.za/icd-10-master-industry-table/).
 - This should be placed in the `data/ICD10` directory. 

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from multiprocessing import Pool
import logging
from collections import defaultdict
pd.set_option("display.max_columns", None)

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s"
)

from dataset import SCIData, SCICols
%load_ext autoreload
%autoreload 1
%aimport dataset


In [2]:
class Notebook:
    """Notebook-wide settings read by the conversion cells below."""

    # Root folder: extra inputs are read from here and HDF5 outputs land here
    DATA_DIR = Path("data")
    # Folder holding the unzipped raw AE data
    RAW_DIR = DATA_DIR / "AEdata"

    # When True, the cells guarded by this flag run their conversion
    RUN_ALL = False

    # Enable ONLY if running as a script
    # (the admissions cell then fans out over a multiprocessing Pool)
    MULTITHREADING = False


## Profiling Functions

In [3]:
def columns_per_dtype(df):
    """Print the column names of ``df`` grouped by dtype.

    One line is printed per distinct dtype, in order of first appearance,
    formatted as ``<dtype>: [<column>, ...]``.
    """
    columns_by_dtype = defaultdict(list)
    # Series.items() is used because Series.iteritems() was removed in pandas 2.0
    for column, dtype in df.dtypes.items():
        columns_by_dtype[dtype].append(column)
    for dtype, columns in columns_by_dtype.items():
        print(f"{dtype}: {columns}")


def col_counts_topn(df, n=10):
    """Print columns grouped by how many distinct non-null values they hold.

    Only the ``n`` smallest distinct-value counts are printed, one line per
    count, formatted as ``<count>: [<column>, ...]``.
    """
    columns_by_cardinality = defaultdict(list)
    for column in df.columns:
        distinct = df[column].value_counts().size
        columns_by_cardinality[distinct].append(column)

    smallest = sorted(columns_by_cardinality)[:n]
    for cardinality in smallest:
        print(f"{cardinality}: {columns_by_cardinality[cardinality]}")


## Event Dates File

In [4]:
def hdf5_correct_strings(df: pd.DataFrame) -> pd.DataFrame:
    """Convert every value in the object-dtype columns of ``df`` to ``str``.

    ``df`` is modified in place and also returned. Missing values are
    stringified like any other value (``None`` becomes ``"None"``, NaN
    becomes ``"nan"``).
    """
    # Series.map is applied per column because DataFrame.applymap is
    # deprecated since pandas 2.1.
    for column in df.select_dtypes(include="object").columns:
        df[column] = df[column].map(str)
    return df


In [5]:
def convert_eddates(df: pd.DataFrame):
    """Stack the repeated (serial, arrival date) column pairs into two columns.

    Columns are consumed in groups of three: the first two columns of each
    group form a pair and the third is skipped. Rows with a missing value in
    a pair are dropped before the pairs are stacked vertically.

    Args:
        df: Sheet contents; must contain an ``ArrivalDtm`` column. The frame
            passed in is not modified.

    Returns:
        DataFrame with the columns ``AESerial`` and ``ArrivalDtm``.
    """
    # Manual Correction: The first ArrivalDtm column has values in
    # excel-style integer format (see https://stackoverflow.com/a/65460255/7662085)
    # Any datetime64 resolution counts as "already converted", not only [ns].
    if not pd.api.types.is_datetime64_any_dtype(df["ArrivalDtm"]):
        df = df.assign(
            ArrivalDtm=pd.to_datetime(
                df["ArrivalDtm"], unit="D", origin="1899-12-30"
            )
        )

    # Iterate columns pairwise, and stack them vertically into a single DF
    pairs = zip(df.columns[::3], df.columns[1::3])
    stacked = np.concatenate([df[list(pair)].dropna().values for pair in pairs])
    return pd.DataFrame(stacked, columns=["AESerial", "ArrivalDtm"])


def convert_mainfiledates(df: pd.DataFrame):
    """Return ``df`` after passing it through ``hdf5_correct_strings``."""
    corrected = hdf5_correct_strings(df)
    return corrected


def convert_event_dates(infile="Each event dateDONE.xlsx", outfile="event_dates.h5"):
    """Convert the event-dates workbook and save each sheet to an HDF5 store.

    Args:
        infile: Workbook file name, resolved against ``Notebook.RAW_DIR``.
        outfile: HDF5 file name, resolved against ``Notebook.DATA_DIR``.

    Returns:
        A view over the converted DataFrames, in the order EDDATES,
        MainFileDates.
    """
    convert_sheets = {
        "EDDATES": convert_eddates,
        "MainFileDates": convert_mainfiledates,
    }

    # The context manager closes the workbook handle once all sheets are read
    with pd.ExcelFile(Notebook.RAW_DIR / infile) as xlsx:
        result = {
            sheet: converter(pd.read_excel(xlsx, sheet_name=sheet, index_col=None))
            for sheet, converter in convert_sheets.items()
        }

    # Each converted sheet is stored under its sheet name
    with pd.HDFStore(Notebook.DATA_DIR / outfile) as store:
        for name, df in result.items():
            store[name] = df

    return result.values()


if Notebook.RUN_ALL:
    convert_event_dates()


## SCI File

In [14]:
def process_SCI(xlsx):
    """Clean the raw SCI spreadsheet and return it in the canonical column order.

    The steps, in order: normalise column names, drop redundant and
    low-information ``c_`` columns, turn placeholder strings into NaN,
    binarise two-valued string columns, convert CFS minute offsets and NEWS
    timestamps to datetimes, coerce blood results to numbers, de-duplicate
    spells and filter admissions by date and admission method.

    Args:
        xlsx: DataFrame as read from the SCI Excel file. It is copied first,
            so the caller's frame is not modified.

    Returns:
        DataFrame restricted to ``SCICols.ordered()`` with a fresh
        0..n-1 index.
    """
    df = xlsx.copy()

    # Remove spaces from column names
    df.columns = df.columns.str.replace(" ", "")

    # Drop redundant columns
    df = df.drop(SCICols.redundant, axis=1)

    # Drop c_ prefixed columns holding at most two distinct non-null values.
    # This runs BEFORE the placeholder strings below become NaN, so those
    # strings still count as values here (presumably such columns contain
    # nothing but placeholders -- TODO confirm against the data).
    df = df.drop(
        [
            col
            for col in df.columns
            if col.startswith("c_") and df[col].value_counts().size <= 2
        ],
        axis=1,
    )

    # Replace the placeholder strings with NaN in all applicable columns
    # (np.nan rather than np.NAN: the upper-case alias was removed in NumPy 2.0)
    df = df.replace({"NoNSw2d": np.nan, "NoObW2D": np.nan, "Noobw2d": np.nan})

    # Keep the raw device description before c_O2_device_or_air is binarised
    df["c_Breathing_device"] = df["c_O2_device_or_air"].copy()

    # Turn these 2-value string columns into binary
    binarise = {
        "AdmissionType": "Elective",
        "AdmissionArea": "Medical Assessment Area",
        "DischargeArea": "Assessment Area Discharge",
        "c_Nausea": "1 - Nausea present",
        "c_Vomiting": "1 - Vomiting since last round",
        "Gender": "Female",
        "c_O2_device_or_air": "A - Air",
        "c_Patient_Position": "Lying",
        "c_Level_of_consciousness": "A - Alert",
        **{
            _: "Yes"
            for _ in [
                "Over7Days",
                "Over14Days",
                "CareHome",
                "DiedDuringStay",
                "DiedWithin30Days",
            ]
        },
    }

    # Strings become True/False depending on whether they equal the positive
    # label; anything that is not a string (e.g. NaN) becomes NaN.
    for col, positive in binarise.items():
        df[col] = df[col].apply(
            lambda value: (value == positive) if isinstance(value, str) else np.nan
        )

    # The column was binarised as "is air"; flip it so True means a device
    # other than air (it is renamed to c_AssistedBreathing below).
    df["c_O2_device_or_air"] = df["c_O2_device_or_air"].replace(
        {True: False, False: True}
    )

    # Convert CFS dates: minute offsets are added to the admission time
    df["Noofminsafteradmission"] = df["AdmissionDateTime"] + pd.to_timedelta(
        df["Noofminsafteradmission"], unit="m"
    )
    df["NoMinsBeforeadmission"] = df["AdmissionDateTime"] + pd.to_timedelta(
        df["NoMinsBeforeadmission"], unit="m"
    )

    # Rename some of the binarised columns for better clarity
    df = df.rename(
        columns={
            "AdmissionType": "AdmittedAfterAEC",
            "AdmissionArea": "AssessmentAreaAdmission",
            "DischargeArea": "AssessmentAreaDischarge",
            "c_Vomiting": "VomitingSinceLastRound",
            "SpellDischargeDate": "DischargeDateTime",
            "Gender": "Female",
            "DischargeDestinationDescription": "DischargeDestination",
            "c_O2_device_or_air": "c_AssistedBreathing",
            "c_Patient_Position": "c_LyingDown",
            "c_Breathing_device": "c_BreathingDevice",
            "c_Level_of_consciousness": "AVCPU_Alert",
            "c_Heart_rate": "HeartRate",
            "c_BP_Diastolic": "DiastolicBP",
            "c_BP_Systolic": "SystolicBP",
            "Noofminsafteradmission": "CFSAfterAdmissionDate",
            "NoMinsBeforeadmission": "CFSBeforeAdmissionDate",
            "CFSBeforeadmission": "CFSBeforeAdmission",
            "AdmissionWardLOS": "AdmitWardLOS",
            # NextWardLOS2..NextWardLOS9 -> NextWard2LOS..NextWard9LOS
            **{f"NextWardLOS{i}": f"NextWard{i}LOS" for i in range(2, 10)},
            "Urea(serum)": "Urea_serum",
            "Sodium(serum)": "Sodium_serum",
            "Potassium(serum)": "Potassium_serum",
        }
    )

    # Strip the c_ prefix from whatever columns still carry it
    df.columns = [_[2:] if _.startswith("c_") else _ for _ in df.columns]

    # Pain descriptions become their leading integer; other values become NaN
    df["Pain"] = df["Pain"].map(
        {
            "0 - No pain": 0,
            "1 - Mild pain": 1,
            "2 - Moderate pain": 2,
            "3 - Severe pain": 3,
        }
    )

    # Convert NEWS dates (unparseable entries become NaT)
    datetimes = ["NewsCreatedWhen", "NewsTouchedWhen", "NewsAuthoredDtm"]
    df[datetimes] = df[datetimes].apply(pd.to_datetime, errors="coerce")

    # Convert blood results
    # Ignore certain non-numeric entries as they make up less than 0.001%
    numeric = [
        "Urea_serum",
        "Sodium_serum",
        "Potassium_serum",
        "Creatinine",
        "pO2(POC)Venous",
    ]
    df[numeric] = df[numeric].apply(pd.to_numeric, errors="coerce")

    # Drop duplicates based on serial code, keeping the row with the highest SEQ
    df = df.sort_values("SEQ", ascending=False).drop_duplicates("SpellSerial")

    # Literal "nan" strings are treated as missing values
    df = df.replace("nan", np.nan)

    # Keep admissions from 2015 onwards
    df = df[df.AdmissionDateTime >= pd.Timestamp("2015-01-01")]

    # Exclude these admission methods
    df = df[
        ~df.AdmissionMethodDescription.isin(
            ["TRAUMA ELECTIVE ADM", "MATERNITY ANTE NATAL", "WAITING LIST"]
        )
    ]

    return df[SCICols.ordered()].reset_index(drop=True)


In [8]:
# Convert the raw SCI spreadsheet and write it to sci.h5 under the key "table".
# NOTE(review): `or True` makes this guard always pass, so the conversion runs
# on every execution regardless of Notebook.RUN_ALL -- confirm this is intended.
if Notebook.RUN_ALL or True:
    infile = Notebook.RAW_DIR / "Copy of SCI11868 Delivered 7 Ian Browne.xlsx"
    outfile = Notebook.DATA_DIR / "sci.h5"

    logging.info(f"Reading file: {infile}")
    xlsx = pd.read_excel(infile)

    logging.info(f"Processing file: {infile}")
    df = process_SCI(xlsx)

    logging.info(f"Writing to: {outfile}/table")
    df.to_hdf(outfile, key="table")


2022-10-05 12:56:34,044 [INFO] Reading file: data\AEdata\Copy of SCI11868 Delivered 7 Ian Browne.xlsx


## Admissions Files

In [9]:
def reconstruct_datetime(df, old, new):
    """Build a datetime column from separate date-component columns.

    Args:
        df: DataFrame to update in place; column ``new`` is added or
            overwritten.
        old: Component column names. Their values, joined by spaces in this
            order, must match the format ``"%Y %m %W %A %H"`` (year, month,
            week of year, weekday name, hour).
        new: Name of the datetime column to write.

    Rows with any missing component are left out of the parse and therefore
    end up as NaT in ``new``. Nothing is returned.
    """
    parts = df[old].dropna(how="any").astype(str)
    # Numeric components stringify as e.g. "2015.0"; strip that trailing ".0".
    # A raw string is required: "\." is an invalid escape in a plain literal.
    parts = parts.replace(r"\.0$", "", regex=True)
    df[new] = pd.to_datetime(
        parts.apply(" ".join, axis=1),
        format="%Y %m %W %A %H",
    )


def process_AD(xlsx):
    """Clean one admissions spreadsheet.

    Works on a copy of ``xlsx``: rebuilds the admission (and, if needed, the
    discharge) timestamp from its component columns, binarises the two-valued
    string columns, renames some of them, drops the component and extraneous
    columns, and keeps one row per ``SpellSerial``.

    Returns the cleaned DataFrame with a fresh 0..n-1 index.
    """
    df = xlsx.copy()

    # Component columns describing the admission / discharge date
    admit_parts = ["YearAdmit", "MonthAdmit", "AdmitWeek", "AdmitDay", "AdmitHour"]
    discharge_parts = ["YearDisch", "MonthDisch", "DischWeek", "DischDay", "DischHour"]

    reconstruct_datetime(df, admit_parts, "AdmissionDateTime")
    # A ready-made discharge timestamp wins over reconstructing one
    if "SpellDischargeDate" not in df.columns:
        reconstruct_datetime(df, discharge_parts, "DischargeDateTime")
    else:
        df = df.rename(columns={"SpellDischargeDate": "DischargeDateTime"})

    # Column -> the string value that should map to True
    positive_labels = {
        "AdmissionType": "Elective",
        "AdmissionArea": "Medical Assessment Area",
        "DischargeArea": "Assessment Area Discharge",
    }
    positive_labels.update(
        dict.fromkeys(
            [
                "Over7Days",
                "Over14Days",
                "CareHome",
                "DiedDuringStay",
                "DiedWithin30Days",
            ],
            "Yes",
        )
    )

    for column, label in positive_labels.items():
        # str.__eq__ yields NotImplemented for non-string cells (e.g. NaN);
        # those cells are turned into NaN in the second pass.
        matched = df[column].apply(label.__eq__)
        df[column] = matched.apply(
            lambda hit: np.nan if hit == NotImplemented else hit
        )

    # Clearer names for some of the binarised columns
    clearer_names = {
        "AdmissionType": "ElectiveAdmission",
        "AdmissionArea": "AssessmentAreaAdmission",
        "DischargeArea": "AssessmentAreaDischarge",
    }
    df = df.rename(columns=clearer_names)

    # Date components plus some extraneous columns; absent ones are ignored
    unneeded = admit_parts + discharge_parts + [
        "DischargeFYear",
        "AdmissionFYear",
        "AdmissionFYMonth",
        "AdmissionConsultant",
        "LastConsultant",
        "Area",
        "PCT",
        "GPPractice",
        "AdmissionWardEndDate",
    ]
    df = df.drop(columns=unneeded, errors="ignore")

    # One row per spell serial code (first occurrence kept)
    deduplicated = df.drop_duplicates("SpellSerial")

    return deduplicated.reset_index(drop=True)


In [10]:
def process_AD_single(infile):
    """Read one admissions Excel file and return it cleaned by ``process_AD``."""
    logging.info(f"Reading file: {infile}")
    workbook = pd.read_excel(infile)

    logging.info(f"Processing file: {infile}")
    cleaned = process_AD(workbook)
    return cleaned


# Convert every file in the raw "AD" folder and write the concatenated
# result to AD.h5 under the key "table".
if Notebook.RUN_ALL:
    indir = "AD"
    outfile = Notebook.DATA_DIR / "AD.h5"
    results = []

    if Notebook.MULTITHREADING:
        # Despite the flag's name this uses a multiprocessing Pool
        # (three worker processes), one task per input file.
        with Pool(3) as p:
            results = p.map(
                process_AD_single, list((Notebook.RAW_DIR / indir).iterdir())
            )
    else:
        # Sequential fallback: process the files one after another
        for infile in (Notebook.RAW_DIR / indir).iterdir():
            results.append(process_AD_single(infile))

    logging.info(f"Writing all to {outfile}")
    # Per-file indices are kept as-is by the concatenation
    r = pd.concat(results)
    r.to_hdf(outfile, key="table")


## ICD-10 File

In [11]:
def process_icd10(xlsx):
    """Split the ICD-10 master table into two lookup tables.

    Returns:
        A pair ``(ICD10_3_Codes, ICD10_Codes)``:

        * the chapter / group / description columns, one row per
          ``ICD10_3_Code`` (first occurrence kept), indexed by that code;
        * ``ICD10_3_Code`` and ``WHO_Full_Desc`` indexed by ``ICD10_Code``.
    """
    three_char_fields = [
        "Chapter_No",
        "Chapter_Desc",
        "Group_Code",
        "Group_Desc",
        "ICD10_3_Code",
        "ICD10_3_Code_Desc",
    ]
    three_char = xlsx[three_char_fields].drop_duplicates("ICD10_3_Code")
    three_char = three_char.set_index("ICD10_3_Code")

    full_codes = xlsx[["ICD10_Code", "ICD10_3_Code", "WHO_Full_Desc"]]
    full_codes = full_codes.set_index("ICD10_Code")

    return three_char, full_codes


# Convert the ICD-10 master spreadsheet into icd10.h5, which holds the two
# tables returned by process_icd10.
if Notebook.RUN_ALL:
    infile = Notebook.DATA_DIR / "ICD10/ICD-10_MIT_2021_Excel_16-March_2021.xlsx"
    outfile = Notebook.DATA_DIR / "icd10.h5"

    logging.info(f"Reading file: {infile}")
    xlsx = pd.read_excel(infile, sheet_name="SA ICD-10 MIT 2021")

    logging.info(f"Processing file: {infile}")
    # tc: table indexed by ICD10_3_Code, c: table indexed by ICD10_Code
    tc, c = process_icd10(xlsx)

    logging.info(f"Writing to: {outfile}")
    with pd.HDFStore(outfile) as store:
        store["ICD10_3_Codes"], store["ICD10_Codes"] = tc, c

In [12]:
# Copy the Birkmeyer CSV into birkmeyer_icd10.h5 under the key "table".
if Notebook.RUN_ALL:
    infile = Notebook.DATA_DIR / "ICD10/Birkmeyer.csv"
    outfile = Notebook.DATA_DIR / "birkmeyer_icd10.h5"
    logging.info(f'Reading file: {infile}')
    df = pd.read_csv(infile)

    logging.info(f"Processing file: {infile}")
    # NOTE(review): no processing is applied here; the former call to
    # process_birkmeyer(df) was disabled and no such function is visible
    # in this notebook.

    logging.info(f"Writing to: {outfile}")
    # `key` is passed by keyword: positional use is deprecated in pandas and
    # the other cells already write with key="table".
    df.to_hdf(outfile, key='table')

## CCS Files

In [13]:
def process_CCS(xlsx):
    """Reduce the ICD-10-CM to CCSR mapping sheet to its inpatient defaults.

    Keeps only the rows whose "Inpatient Default CCSR (Y/N/X)" value is "Y",
    renames the code/category/rationale columns, drops the description and
    default-flag columns, and indexes the result by ``ICD10Code``.
    """
    is_inpatient_default = xlsx["Inpatient Default CCSR (Y/N/X)"] == "Y"
    df = xlsx[is_inpatient_default].copy()

    df = df.rename(
        columns={
            "ICD-10-CM Code": "ICD10Code",
            "CCSR Category": "CCSCode",
            "CCSR Category Description": "CCSDescription",
            "Rationale": "CCSRationale",
        }
    )

    dropped = [
        "ICD-10-CM Code Description",
        "Inpatient Default CCSR (Y/N/X)",
        "Outpatient Default CCSR (Y/N/X)",
    ]
    df = df.drop(columns=dropped)

    # ICD-10 codes are kept exactly as given: no "." is inserted after the
    # third character.
    return df.set_index("ICD10Code")


In [14]:
def process_CCS2():
    """Load the HSMR and SHMI grouping workbooks into three lookup tables.

    Both workbooks are read from the ``ICD10`` folder under
    ``Notebook.DATA_DIR``.

    Returns:
        A tuple ``(codes, aggregate, shmi)``:

        * ``codes``: the "ICD10" sheet indexed by ``ICD10Code``;
        * ``aggregate``: the "Aggregate" sheet indexed by ``CCSGroup``,
          reduced to the aggregate-group columns;
        * ``shmi``: the SHMI descriptions with one row per CCS group,
          indexed by numeric ``CCSGroup`` and sorted by it.
    """
    icd10_dir = Notebook.DATA_DIR / "ICD10"

    # Both HSMR sheets come from one workbook, so read them in a single pass
    hsmr = pd.read_excel(
        icd10_dir / "Classification of variables HSMR 2018.xlsx",
        sheet_name=["ICD10", "Aggregate"],
    )
    codes, aggregate = hsmr["ICD10"], hsmr["Aggregate"]
    shmi = pd.read_excel(
        icd10_dir / "SHMI diagnosis group breakdown, Mar21-Feb22.xlsx",
        sheet_name="Diagnosis group descriptions",
    )

    # Column names are assigned by position
    codes.columns = ["ICD10Code", "CCSGroup", "CCSGroupDescription"]
    aggregate.columns = [
        "CCSGroup",
        "CCSGroupDescription",
        "DiagnosisGroup",
        "DiagnosisGroupDescription",
        "AggregateGroup",
        "AggregateGroupDescription",
    ]
    shmi.columns = [
        "SHMIGroup",
        "CCSGroup",
        "CCSGroupDescription",
        "SHMIGroupDescription",
    ]

    codes = codes.set_index("ICD10Code")
    aggregate = aggregate.set_index("CCSGroup")

    # A SHMI row lists its CCS groups as "a, b, c": split into one row each
    shmi["CCSGroup"] = shmi["CCSGroup"].str.split(", ")
    shmi = shmi.explode("CCSGroup").set_index("CCSGroup")
    shmi.index = pd.to_numeric(shmi.index)
    shmi["SHMIGroup"] = pd.to_numeric(shmi["SHMIGroup"])

    aggregate = aggregate.drop(
        ["DiagnosisGroup", "DiagnosisGroupDescription", "CCSGroupDescription"],
        axis=1,
    )
    shmi = shmi.sort_index().drop("CCSGroupDescription", axis=1)

    return codes, aggregate, shmi

In [15]:
# Write the three tables returned by process_CCS2 into ccs.h5.
if Notebook.RUN_ALL:
    codes, aggregate, shmi = process_CCS2()
    with pd.HDFStore(Notebook.DATA_DIR / 'ccs.h5') as store:
        store['codes'] = codes
        store['shmi'] = shmi
        # The aggregate-group table is stored under the key "hsmr"
        store['hsmr'] = aggregate
    

In [16]:
# Convert the CCSR reference mapping and write it to ccs_icd10.h5 under the
# key "table".
if Notebook.RUN_ALL:
    infile = Notebook.DATA_DIR / "ICD10/DXCCSR-Reference-File-v2022-1.xlsx"
    outfile = Notebook.DATA_DIR / "ccs_icd10.h5"
    xlsx = pd.read_excel(infile, sheet_name="DX_to_CCSR_Mapping")

    df = process_CCS(xlsx)

    # `key` is passed by keyword: positional use is deprecated in pandas and
    # the other cells already write with key="table".
    df.to_hdf(outfile, key='table')

