In [88]:
# Imports
import pandas as pd
import numpy as np
import json
import re

pd.set_option('display.max_rows', 400)
pd.set_option('display.max_columns', 400)

#### Functions

In [4]:
# Helper functions from helper_functions.py (couldn't import for some reason)

def set_nulls(data):
    """
   @param data: dataframe
   
   @return dataframe with -99 replaced with NaN
    """
    data.replace(to_replace = -99, value = np.nan)
            
    return data

def map_cpt(data, column, replace, name):
    """
    @param data: dataframe
    @param column: string, column name
    @param replace: list of variables holding the values to be replaced by that particular variable name
    @ param name: string or integer of what will replace the values in replacements

    """
    for r in replace:
        idx = np.where(data[column] == r)[0]
        data[column].loc[idx] = name
    
    return data

#### Data

In [5]:
data = pd.read_csv('../data/monet_output.csv')
data.drop(['Unnamed: 0', 'X'], axis = 1, inplace = True)
integer_cols = data.dtypes == int
int_cols = data.columns[integer_cols]
df = set_nulls(data)
float_flag = df.dtypes == float
float_cols = df.columns[float_flag]

op1 = ['COLCT TOT ABDL W/O PRCTECT W/CONTINENT ILEOST']
op2 = ['COLCT TOT ABDL W/O PRCTECT W/ILEOST/ILEOPXTS', 'LAPS COLECTOMY TOT W/O PRCTECT W/ILEOST/ILEOPXTS']
op3 = ['COLECTOMY PARTIAL W/ANASTOMOSIS', 'LAPAROSCOPY COLECTOMY PARTIAL W/ANASTOMOSIS']
op4 = ['COLECTOMY PRTL ABDOMINAL & TRANSANAL APPROACH', 'COLECTOMY PRTL ABDOMINAL & TRANSANAL APPR']
op5 = ['COLECTOMY PRTL W/COLOPROCTOSTOMY', 'LAPS COLECTOMY PRTL W/COLOPXTSTMY LW ANAST']
op6 = ['COLECTOMY PRTL W/COLOPROCTOSTOMY & COLOSTOMY', 'LAPS COLECTMY PRTL W/COLOPXTSTMY LW ANAST W/CLST']
op7 = ['COLECTOMY PRTL W/COLOST/ILEOST & MUCOFISTULA']
op8 = ['COLECTOMY PRTL W/END COLOSTOMY & CLSR DSTL SGMT', 'COLECTOMY PRTL W/END COLOSTOMY&CLSR DSTL SGMT', 'LAPS COLECTOMY PRTL W/END CLST & CLSR DSTL SGM', 'LAPS COLECTOMY PRTL W/END CLST&CLSR DSTL SGM']
op9 = ['COLECTOMY PRTL W/RMVL TERMINAL ILEUM & ILEOCOLOS', 'COLECTOMY PRTL W/RMVL TERMINAL ILEUM&ILEOCOLOST', 'LAPS COLECTOMY PRTL W/RMVL TERMINAL ILEUM', 'COLECTOMY PRTL W/RMVL TERMINAL ILEUM & ILEOCOLOST']
op10 = ['COLECTOMY PRTL W/SKIN LEVEL CECOST/COLOSTOMY']
num_replacements = [op1, op2, op3, op4, op5, op6, op7, op8, op9, op10]
for i in range(len(num_replacements)):
    df_clean = map_cpt(df, 'PRNCPTX', num_replacements[i], i+1)

MIS = ['Laparoscopic', 'Endoscopic w/ unplanned conversion to open', 'Hybrid', 'Hybrid w/ open assist', 'Laparoscopic Hand Assisted', 'Laparoscopic w/ open assist', 'Laparoscopic w/ unplanned conversion to open', 'Laparoscopic w/ unplanned conversion to Open', 'Other MIS approach', 'Robotic', 'Robotic w/ open assist', 'Robotic w/ unplanned conversion to open', 'SILS', 'SILS w/ open assist', 'SILS w/ unplanned conversion to open', 'Hybrid w/ unplanned conversion to open', 'Endoscopic w/ open assist', 'Other MIS approach w/ open assist', 'Endoscopic', 'NOTES', 'NOTES w/ open assist', 'Other MIS approach w/ unplanned conversion to open', 'NOTES w/ unplanned conversion to open']
Open = ['Open', 'Open (planned)']
options = [MIS, Open]
names = ['MIS', 'open']
for i in range(len(options)):
    df_clean = map_cpt(df_clean, 'COL_APPROACH', options[i], names[i])

#convert unknowns to NAs
nulls = np.where(df_clean.COL_APPROACH == 'Unknown')[0]
df_clean.COL_APPROACH.loc[nulls] = np.nan

unplanned = [c for c in df_clean if "UNPLANNEDREADMISSION" in c]
df_clean['num_unplanned'] = df_clean[unplanned].sum(axis=1)
df_clean['target'] = [1 if x>0 else 0 for x in df_clean['num_unplanned']]

othercpt = [c for c in df_clean if "OTHERCPT" in c]
df_clean['num_other_procs'] = df_clean[othercpt].count(axis=1)
concurrcpt = [c for c in df_clean if "CONCURR" in c]
df_clean['num_concurr_procs'] = df_clean[concurrcpt].count(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


#### Comparing procedure types

In [217]:
## Check procedures within OTHERPROC
procvec = ["OTHERPROC", "CONCURR"]

for proctype in procvec:
    proc = data.filter(like = proctype)
    proc.index = data.CASEID.astype('int')

    ## Get aggregate stats on OTHERPROC
    varnames = []
    unique_vals = []
    proportion = []

    all_procs = []

    for i in range(proc.shape[1]):
        var = proctype + str(i + 1)

        varnames.append(var)
        unique_vals.append(proc[var].nunique())
        proportion.append(1 - (proc[var].isna().sum()/len(proc[var])))

        all_procs = all_procs + proc[var].value_counts().index.tolist()

    proc_stats = pd.DataFrame({"unique_vals":unique_vals, "proportion":proportion}, index = varnames)

    all_procs = set(all_procs)

    ## Confirm that no patient has other procedures out of order
     # i.e. no one has OTHERPROC2 without also having OTHERPROC1

    ## Number of greatest OTHERPROC
    proc_num = proc.isna()

    for col in proc_num.columns:
        proc_num[col] = np.where(proc_num[col] == False, int(re.findall("[0-9]+", col)[0]), 0)

    maxproc = proc_num.max(axis = 1)

    ## Number of procedures with non-null values
    numproc = (proc.isna()==False).sum(axis = 1)

    ## maxproc and numproc should be equal for each observation--they aren't


    print("-"*10 + "\n" + str(proctype) + "\n" + "-"*10)

    print(proc_stats)
    print("\nThe total number of unique procedures is", len(all_procs), "\n")

    print(len(proc[maxproc != numproc]), "observations have 'out of order' procedures\n\n")

----------
OTHERPROC
----------
             unique_vals  proportion
OTHERPROC1          1087    0.505806
OTHERPROC2           970    0.213805
OTHERPROC3           764    0.080899
OTHERPROC4           595    0.029502
OTHERPROC5           405    0.010837
OTHERPROC6           271    0.003982
OTHERPROC7           177    0.001644
OTHERPROC8           129    0.000768
OTHERPROC9            62    0.000341
OTHERPROC10           32    0.000171

The total number of unique procedures is 1603 

89 observations have 'out of order' procedures


----------
CONCURR
----------
           unique_vals  proportion
CONCURR1          1105    0.139733
CONCURR2           776    0.039397
CONCURR3           525    0.009654
CONCURR4           267    0.003133
CONCURR5           147    0.001198
CONCURR6            84    0.000492
CONCURR7            43    0.000229
CONCURR8            22    0.000124
CONCURR9            10    0.000066
CONCURR10            7    0.000050

The total number of unique procedures is 1617 
