In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime

DATA_DIR = "/voyager/datasets/liver_transplant/SRTR/"
MELD_CONVERSION_DIR = "mas/data/reference_tables/meld_conversion.csv"

In [2]:
cand_liin = pd.read_sas(DATA_DIR + "cand_liin.sas7bdat")
mpexcept = pd.read_sas(DATA_DIR + "mpexcept.sas7bdat")
tx_li = pd.read_sas(DATA_DIR + "tx_li.sas7bdat")

In [4]:
txf_li = pd.read_sas(DATA_DIR + "txf_li.sas7bdat")

In [18]:
len(tx_li.PX_ID.unique())

183721

In [14]:
sum(pd.isna(tx_li.PX_ID))

0

In [7]:
len(txf_li.PX_ID.unique())

161757

In [8]:
def filter_candidates(df, mpexcept, tx_li):
    """
    Performs filtering of a dataframe of transplant candidates using inclusion/
    exclusion criteria determined in conjunction with our clinical collaborator.

    This function performs the following exclusion operations.
    1. Exclude patients listed oufside the time of interest (February 27, 2002 
    - December 1, 2021).
    2. Remove patients who received transplants but were never put on the
    waitlist or who were listed in error.
    3. Remove patients with a previous transplant.
    4. Remove patients with a multi-organ transplantation.
    5. Remove patients multi-listed, refused transplant, transferred, unable 
    to contact candidate, transplant in other country, removed in error.
    6. Remove non-adult patients.
    """
    df_rows = df.shape[0]
    print(f"Commencing filtering. DataFrame contains {df.shape[0]} rows.")
    # 1. Exclude patients outside the timeframe of interest.
    df = df.loc[(df['CAN_ACTIVATE_DT'] >= pd.to_datetime(datetime.date(2002, 2, 27))) 
                    & (df['CAN_ACTIVATE_DT'] <= pd.to_datetime(datetime.date(2021, 12, 1)))]
    
    print(f"EC 1: Dropping {df_rows - df.shape[0]} rows (outside the timeframe of interest).")
    df_rows = df.shape[0]
    
    # 2. Remove patients who received transplants but were never put on the
    # waitlist or who were listed in error. (N = 0)
    df = df.loc[(df['CAN_SOURCE'] != b'L') | # Live donor - so never on waitlist
                (df['CAN_REM_CD'] == 10) # Listed in error
                ]
    print(f"EC 2: Dropping {df_rows - df.shape[0]} rows (received transplants but never waitlisted, or waitlisted in error).")
    df_rows = df.shape[0]

    # 3. Remove patients with a previous transplant
    df = df.loc[(df['CAN_PREV_TX'] == 0) & 
                (df['CAN_PREV_HL'] == 0) &
                (df['CAN_PREV_HR'] == 0) &
                (df['CAN_PREV_IN'] == 0) &
                (df['CAN_PREV_KI'] == 0) &
                (df['CAN_PREV_KP'] == 0) &
                (df['CAN_PREV_LI'] == 0) &
                (df['CAN_PREV_LU'] == 0) &
                (df['CAN_PREV_PA'] == 0)]
    print(f"Dropping {df_rows - df.shape[0]} rows (previous transplant).")
    df_rows = df.shape[0]

    # 4. Remove patients with a multi-organ transplantation
    multi_organ_patients = set(
        tx_li.loc[tx_li["REC_TX_ORG_TY"] != b"LI"]["PX_ID"]
    )
    df = df[~df["PX_ID"].isin(multi_organ_patients)]
    print(f"Dropping {df_rows - df.shape[0]} rows (multi-organ transplant).")
    df_rows = df.shape[0]

    # 5. Remove patients multi-listed, refused transplant, transferred, unable 
    # to contact candidate, transplant in other country, removed in error.
    df = df.loc[(df["CAN_REM_CD"] != 14) &  # Multi-listed
                (df["CAN_REM_CD"] != 6) & # Refused transplant
                (df["CAN_REM_CD"] != 7) & # Transferred to another center
                (df["CAN_REM_CD"] != 24) & # Unable to contact candidate
                (df["CAN_REM_CD"] != 22) & # Transplant in another country
                (df["CAN_REM_CD"] != 16) # Candidate removed in error
                ]
    print(f"Dropping {df_rows - df.shape[0]} rows (multi-listed, refused transplant, transferred, unable to contact candidate, transplant in other country, removed in error).")
    df_rows = df.shape[0]

    # 6. Remove non-adult patients.
    # TODO - 18 is the cutoff, presumably?
    df = df.loc[df["CAN_AGE_IN_MONTHS_AT_LISTING"] >= 18*12]

    print(f"Dropping {df_rows - df.shape[0]} rows (non-adult patient).")
    df_rows = df.shape[0]

    print(f"Filtering complete. Filtered DataFrame contains {df.shape[0]} rows.")

    return df

In [22]:
def filter_candidates_unique(df, mpexcept, tx_li):
    """
    Performs filtering of a dataframe of transplant candidates using inclusion/
    exclusion criteria determined in conjunction with our clinical collaborator.

    This function performs the following exclusion operations.
    1. Exclude patients listed oufside the time of interest (February 27, 2002 
    - December 1, 2021).
    2. Remove patients who received transplants but were never put on the
    waitlist or who were listed in error.
    3. Remove patients with a previous transplant.
    4. Remove patients with a multi-organ transplantation.
    5. Remove patients multi-listed, refused transplant, transferred, unable 
    to contact candidate, transplant in other country, removed in error.
    6. Remove non-adult patients.
    """
    df_rows = len(tx_li.PX_ID.unique())
    print(f"begin with {df_rows} unique patients")
    print(f"Commencing filtering. DataFrame contains {df.shape[0]} rows.")
    # 1. Exclude patients outside the timeframe of interest.
    df = df.loc[(df['CAN_ACTIVATE_DT'] >= pd.to_datetime(datetime.date(2002, 2, 27))) 
                    & (df['CAN_ACTIVATE_DT'] <= pd.to_datetime(datetime.date(2021, 12, 1)))]
    
    print(f"EC 1: Dropping {df_rows - df.shape[0]} rows (outside the timeframe of interest).")
    df_rows = len(tx_li[tx_li.PX_ID.isin(df.PX_ID.unique())].PX_ID.unique())
    print(f"after EC 1, {df_rows} unique patients")
    
    # 2. Remove patients who received transplants but were never put on the
    # waitlist or who were listed in error. (N = 0)
    df = df.loc[(df['CAN_SOURCE'] != b'L') | # Live donor - so never on waitlist
                (df['CAN_REM_CD'] == 10) # Listed in error
                ]
    print(f"EC 2: Dropping {df_rows - df.shape[0]} rows (received transplants but never waitlisted, or waitlisted in error).")
    df_rows = len(tx_li[tx_li.PX_ID.isin(df.PX_ID.unique())].PX_ID.unique())
    print(f"after EC 2, {df_rows} unique patients")

    # 3. Remove patients with a previous transplant
    df = df.loc[(df['CAN_PREV_TX'] == 0) & 
                (df['CAN_PREV_HL'] == 0) &
                (df['CAN_PREV_HR'] == 0) &
                (df['CAN_PREV_IN'] == 0) &
                (df['CAN_PREV_KI'] == 0) &
                (df['CAN_PREV_KP'] == 0) &
                (df['CAN_PREV_LI'] == 0) &
                (df['CAN_PREV_LU'] == 0) &
                (df['CAN_PREV_PA'] == 0)]
    print(f"Dropping {df_rows - df.shape[0]} rows (previous transplant).")
    df_rows = len(tx_li[tx_li.PX_ID.isin(df.PX_ID.unique())].PX_ID.unique())
    print(f"after EC 3, {df_rows} unique patients")

    # 4. Remove patients with a multi-organ transplantation
    multi_organ_patients = set(
        tx_li.loc[tx_li["REC_TX_ORG_TY"] != b"LI"]["PX_ID"]
    )
    df = df[~df["PX_ID"].isin(multi_organ_patients)]
    print(f"Dropping {df_rows - df.shape[0]} rows (multi-organ transplant).")
    df_rows = len(tx_li[tx_li.PX_ID.isin(df.PX_ID.unique())].PX_ID.unique())
    print(f"after EC 4, {df_rows} unique patients")

    # 5. Remove patients multi-listed, refused transplant, transferred, unable 
    # to contact candidate, transplant in other country, removed in error.
    df = df.loc[(df["CAN_REM_CD"] != 14) &  # Multi-listed
                (df["CAN_REM_CD"] != 6) & # Refused transplant
                (df["CAN_REM_CD"] != 7) & # Transferred to another center
                (df["CAN_REM_CD"] != 24) & # Unable to contact candidate
                (df["CAN_REM_CD"] != 22) & # Transplant in another country
                (df["CAN_REM_CD"] != 16) # Candidate removed in error
                ]
    print(f"Dropping {df_rows - df.shape[0]} rows (multi-listed, refused transplant, transferred, unable to contact candidate, transplant in other country, removed in error).")
    df_rows = len(tx_li[tx_li.PX_ID.isin(df.PX_ID.unique())].PX_ID.unique())
    print(f"after EC 5, {df_rows} unique patients")

    # 6. Remove non-adult patients.
    # TODO - 18 is the cutoff, presumably?
    df = df.loc[df["CAN_AGE_IN_MONTHS_AT_LISTING"] >= 18*12]

    print(f"Dropping {df_rows - df.shape[0]} rows (non-adult patient).")
    df_rows = len(tx_li[tx_li.PX_ID.isin(df.PX_ID.unique())].PX_ID.unique())
    print(f"after EC 6, {df_rows} unique patients")

    print(f"Filtering complete. Filtered DataFrame contains {df.shape[0]} rows.")

    return df

In [9]:
df = filter_candidates(cand_liin, mpexcept, tx_li)

Commencing filtering. DataFrame contains 327212 rows.
EC 1: Dropping 99868 rows (outside the timeframe of interest).
EC 2: Dropping 0 rows (received transplants but never waitlisted, or waitlisted in error).
Dropping 15222 rows (previous transplant).
Dropping 9507 rows (multi-organ transplant).
Dropping 13481 rows (multi-listed, refused transplant, transferred, unable to contact candidate, transplant in other country, removed in error).
Dropping 12925 rows (non-adult patient).
Filtering complete. Filtered DataFrame contains 176209 rows.


In [23]:
df = filter_candidates_unique(cand_liin, mpexcept, tx_li)

begin with 183721 unique patients
Commencing filtering. DataFrame contains 327212 rows.
EC 1: Dropping -43623 rows (outside the timeframe of interest).
after EC 1, 126349 unique patients
EC 2: Dropping -100995 rows (received transplants but never waitlisted, or waitlisted in error).
after EC 2, 126349 unique patients
Dropping -85773 rows (previous transplant).
after EC 3, 117979 unique patients
Dropping -84636 rows (multi-organ transplant).
after EC 4, 108472 unique patients
Dropping -80662 rows (multi-listed, refused transplant, transferred, unable to contact candidate, transplant in other country, removed in error).
after EC 5, 108472 unique patients
Dropping -67737 rows (non-adult patient).
after EC 6, 100001 unique patients
Filtering complete. Filtered DataFrame contains 176209 rows.


In [27]:
tx_li.CAN_LISTING_DT.max()

Timestamp('2021-02-27 00:00:00')

In [None]:
# Construct training, validation, and test sets - these will be text files
# wherein each line comprises a single PX_ID. Upon loading the data, the table
# can then be filtered by training, valdiation, and test PX_IDs in order to 
# obtain the desired table.

# We perform a 70%-15%-15% train-val-test split.
train_splt, val_splt = 0.7, 0.85

np.random.seed(42)

patient_identifiers = np.array(df["PX_ID"])
np.random.shuffle(patient_identifiers)
train, val, test = np.split(patient_identifiers,
                [int(train_splt*len(patient_identifiers)), 
                int(val_splt*len(patient_identifiers))]
                )

with open("mas/data/data_splits/train_split.txt", "w") as f:
    f.write("\n".join(train.astype('str')))

with open("mas/data/data_splits/val_split.txt", "w") as f:
    f.write("\n".join(val.astype('str')))

with open("mas/data/data_splits/test_split.txt", "w") as f:
    f.write("\n".join(test.astype('str')))