# Feature Extraction

In [161]:
import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)

import numpy as np
import pandas as pd

pd.set_option("display.max_columns", None)


In [162]:
# Load the three source tables from their HDF5 stores.
sci = pd.read_hdf("data/sci.h5", key="table")
icd10 = pd.read_hdf("data/icd10.h5", key="ICD10_3_Codes")
birk = pd.read_hdf("data/birkmeyer_icd10.h5", key="table")


### COVID-19

COVID-19 is recorded using the following ICD-10 codes:
 - `U07.1`: COVID-19, virus identified
 - `J12.8`: Other viral pneumonia
 - `B97.2`: Coronavirus as the cause of diseases classified to other chapters

[(Source)](https://hscic.kahootz.com/t_c_home/view?objectID=28993424)

In [163]:
def derive_covid(df):
    """Flag admissions that carry a COVID-19 diagnosis code.

    A row is flagged when its ``AdmissionDateTime`` is on or after
    2020-01-01 *and* at least one of the main / secondary diagnosis
    columns holds ``U07.1``, ``J12.8`` or ``B97.2``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``AdmissionDateTime``, ``MainICD10`` and
        ``SecDiag1`` .. ``SecDiag6``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with an additional boolean ``Covid`` column.
    """
    covid_codes = ["U07.1", "J12.8", "B97.2"]
    diagnosis_columns = ["MainICD10"] + [f"SecDiag{i}" for i in range(1, 7)]

    # The date cut-off excludes pre-2020 rows that carry one of the same codes.
    admitted_since_2020 = df["AdmissionDateTime"] >= "2020-01-01"
    has_covid_code = df[diagnosis_columns].isin(covid_codes).any(axis=1)

    flagged = df.copy()
    flagged["Covid"] = admitted_since_2020 & has_covid_code
    return flagged



In [175]:
def derive_readmission(df):
    """Add the gap since each patient's previous admission, raw and banded.

    Rows are ordered by ``PatientNumber`` and ``AdmissionDateTime``; the
    difference between consecutive admission times of the same patient is
    stored in ``Readmission`` (missing for a patient's first admission).
    ``ReadmissionBand`` buckets that gap into right-closed intervals with
    edges at 0, 1, 2, 7, 14, 30 and 60 days; gaps outside those edges, and
    missing gaps, are left as NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``PatientNumber`` and ``AdmissionDateTime``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``Readmission`` and ``ReadmissionBand`` added.
    """
    edges = [pd.Timedelta(days=d) for d in (0, 1, 2, 7, 14, 30, 60)]
    band_names = ["24 Hrs", "48 Hrs", "1 Week", "2 Weeks", "1 Month", "2 Months"]

    out = df.copy()

    # Compute the gaps on a sorted view; assigning back aligns on the index,
    # so the original row order of ``out`` is kept.
    ordered = out.sort_values(["PatientNumber", "AdmissionDateTime"])
    gaps = ordered.groupby("PatientNumber")["AdmissionDateTime"].diff()
    out["Readmission"] = gaps.dropna()

    # Go through ``str`` so the result is plain labels rather than a
    # categorical, then turn the stringified missing marker back into NaN.
    banded = pd.cut(out["Readmission"], edges, labels=band_names)
    out["ReadmissionBand"] = banded.astype(str).replace("nan", np.nan)
    return out



In [165]:
def derive_icd10_3_code(df):
    """Add the first three characters of ``MainICD10`` as ``MainICD10_3_Code``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string-valued ``MainICD10`` column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the ``MainICD10_3_Code`` column added.
    """
    three_char_code = df["MainICD10"].str.slice(stop=3)

    out = df.copy()
    out["MainICD10_3_Code"] = three_char_code
    return out

# Frequency of each 3-character ICD-10 code across all rows.
derive_icd10_3_code(sci)["MainICD10_3_Code"].value_counts()

In [166]:
def derive_birk_grouping(df, birk):
    """Attach a ``Birk_Grouping`` column looked up from ``MainICD10``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``MainICD10``.
    birk : pandas.DataFrame
        Lookup table with ``ICD10Code_Closest`` and ``MedicalCondition``
        columns. When a code appears more than once, only its first row
        is used.

    Returns
    -------
    pandas.DataFrame
        ``df`` left-joined with the lookup; rows whose ``MainICD10`` has no
        match get a missing ``Birk_Grouping``.
    """
    # De-duplicate first so the join can never multiply rows of ``df``.
    code_to_condition = (
        birk.drop_duplicates(subset="ICD10Code_Closest")
        .set_index("ICD10Code_Closest")["MedicalCondition"]
        .rename("Birk_Grouping")
    )
    return df.join(code_to_condition, on="MainICD10")

# Frequency of each Birk grouping across all rows.
derive_birk_grouping(sci, birk)["Birk_Grouping"].value_counts()

In [167]:
def derive_mortality(df):
    """Collapse the two death flags into a single ``Mortality`` label.

    Each row receives exactly one of ``"DiedDuringStay"``,
    ``"DiedWithin30Days"`` or ``"DidNotDie"``. When both flags are set,
    ``"DiedDuringStay"`` takes precedence, so a row can never end up with
    a concatenation of both labels. Missing flag values are treated as
    not set.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the flag columns ``DiedDuringStay`` and
        ``DiedWithin30Days``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with a string ``Mortality`` column added.
    """
    died_during_stay = df["DiedDuringStay"].fillna(False).astype(bool).to_numpy()
    died_within_30 = df["DiedWithin30Days"].fillna(False).astype(bool).to_numpy()

    r = df.copy()
    # np.select picks the first matching condition, which gives the
    # precedence described above.
    r["Mortality"] = np.select(
        [died_during_stay, died_within_30],
        ["DiedDuringStay", "DiedWithin30Days"],
        default="DidNotDie",
    )
    return r


# Row counts per mortality outcome.
derive_mortality(sci)["Mortality"].value_counts()


DidNotDie           159537
DiedDuringStay        6507
DiedWithin30Days      4789
Name: Mortality, dtype: int64

In [168]:
def derive_critical(df):
    """Flag rows where any ward column equals ``"CCU"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``AdmitWard`` and ``NextWard2`` .. ``NextWard9``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with a boolean ``Critical_Care`` column that is
        True when at least one of those ward columns is exactly ``"CCU"``.
    """
    ward_columns = ["AdmitWard"] + [f"NextWard{n}" for n in range(2, 10)]

    out = df.copy()
    out["Critical_Care"] = out[ward_columns].eq("CCU").any(axis=1)
    return out


# Row counts with and without a CCU ward.
derive_critical(sci)["Critical_Care"].value_counts()


False    169680
True       1153
Name: Critical_Care, dtype: int64

In [197]:
# Per-column value counts: one row per distinct ward code, one column per
# ward column of `sci`.
all_wards = sci.loc[
    :, ["AdmitWard", *(f"NextWard{i}" for i in range(2, 10))]
].apply(pd.Series.value_counts)

In [224]:
# Ward codes beginning with "S".
[ward for ward in all_wards.index if ward.startswith("S")]

['SAL', 'SAL1', 'SBU', 'SRU', 'STU']

In [221]:
# Number of rows that list at least one of the HH* wards in any ward column.
# This reads from `sci` rather than `scii`: `scii` is only created in a later
# cell, so referencing it here fails when the notebook is run top to bottom.
# The ward columns are raw columns that the derive_* steps leave untouched,
# so the count is the same.
sci[
    [
        "AdmitWard",
        "NextWard2",
        "NextWard3",
        "NextWard4",
        "NextWard5",
        "NextWard6",
        "NextWard7",
        "NextWard8",
        "NextWard9",
    ]
].isin(
    ["HH1M", "HH1R", "HH2", "HH3", "HH4", "HH5", "HH6", "HH6M", "HH7", "HH8"]
).any(axis=1).sum()

10369

In [178]:
def derive_all(df):
    """Run every ``derive_*`` step in sequence and return the enriched frame.

    Steps, in order: 3-character ICD-10 code, Birk grouping, COVID flag,
    critical-care flag, mortality label, readmission gap and band.

    Note: the Birk lookup table is taken from the module-level ``birk``
    variable rather than from an argument.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns each ``derive_*`` step reads.

    Returns
    -------
    pandas.DataFrame
        The result of applying all steps to ``df``.
    """
    return (
        df.pipe(derive_icd10_3_code)
        .pipe(derive_birk_grouping, birk)
        .pipe(derive_covid)
        .pipe(derive_critical)
        .pipe(derive_mortality)
        .pipe(derive_readmission)
    )


# Enriched frame: `sci` with every derive_* step applied.
scii = derive_all(sci)


In [181]:
# Inspect the derived 3-character ICD-10 code column.
scii["MainICD10_3_Code"]

0         T43
1         M25
2         R07
3         T44
4         T43
         ... 
170828    R07
170829    G40
170830    J18
170831    M54
170832    R10
Name: MainICD10_3_Code, Length: 170833, dtype: object

In [179]:
# Persist the enriched frame. `key` is passed by keyword because pandas has
# deprecated passing it positionally to `to_hdf` and makes it keyword-only
# in 3.0.
scii.to_hdf("data/sci_2.h5", key="table")

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->block4_values] [items->Index(['SpellSerial', 'PatientType', 'IntendedManagement',
       'AdmissionMethodDescription', 'AdmissionSpecialty', 'LastSpecialty',
       'AdmitWard', 'NextWard2', 'NextWard3', 'NextWard4', 'NextWard5',
       'NextWard6', 'NextWard7', 'NextWard8', 'NextWard9', 'DischargeWard',
       'LOSBand', 'Gender', 'AgeBand', 'AESerial', 'AandEPresentingComplaint',
       'AandEMainDiagnosis', 'AandELocation', 'AandEPatientGroupDescription',
       'MainICD10', 'MainDiagnosis', 'SecDiag1', 'SecDiag2', 'SecDiag3',
       'SecDiag4', 'SecDiag5', 'SecDiag6', 'MainOPCS4', 'MainProcedure',
       'SecOper1', 'SecOper2', 'SecOper3', 'SecOper4', 'SecOper5', 'SecOper6',
       'PrimarySpecialtyLocalCode', 'SpellHRG', 'HRGDesc',
       'DischargeDestinationDescription', 'AllCFS', 'PatientNoSeq', 'AllCFS.1',
       'AllDatesofCFSReadings', 'N