In [1]:
# Imports
import pandas as pd
import numpy as np
import json
import re

# Raise pandas' display caps so long/wide frames are rendered without truncation.
pd.options.display.max_rows = 400
pd.options.display.max_columns = 400

#### Functions

In [2]:
# Helper functions from helper_functions.py (couldn't import for some reason)

def set_nulls(data):
    """
    Replace every -99 entry in a dataframe with NaN.

    @param data: dataframe

    @return a new dataframe with -99 replaced with NaN (the input is left unmodified)
    """
    # DataFrame.replace is not in-place by default: it returns a new frame.
    # The result must therefore be returned; discarding it would hand the
    # caller back the untouched input with every -99 still present.
    return data.replace(to_replace=-99, value=np.nan)

def map_cpt(data, column, replace, name):
    """
    Recode a set of values in one column to a single replacement value.

    The dataframe is modified in place and is also returned.

    @param data: dataframe
    @param column: string, column name
    @param replace: list of values in `column` that should be replaced
    @param name: string or integer that replaces every value listed in `replace`

    @return the same dataframe object, with matching entries of `column` set to `name`
    """
    # Assign through a single .loc call with a boolean mask. Chained indexing
    # (data[column].loc[...] = ...) may write to a temporary copy and raises
    # SettingWithCopyWarning; it also treated positional indices from np.where
    # as index labels, which is only correct for a default RangeIndex.
    matches = data[column].isin(replace)
    data.loc[matches, column] = name

    return data

#### Data

In [3]:
# Load the raw extract and drop the two leftover index-like columns.
data = pd.read_csv('../data/monet_output.csv').drop(['Unnamed: 0', 'X'], axis = 1)

# Column groups by dtype, recorded before and after null handling.
integer_cols = data.dtypes == int
int_cols = data.columns[integer_cols]
df = set_nulls(data)
float_flag = df.dtypes == float
float_cols = df.columns[float_flag]

# Principal-procedure descriptions, grouped so that each group collapses to one
# integer code (1-10). Several groups list spelling variants of the same label.
op1 = ['COLCT TOT ABDL W/O PRCTECT W/CONTINENT ILEOST']
op2 = ['COLCT TOT ABDL W/O PRCTECT W/ILEOST/ILEOPXTS', 'LAPS COLECTOMY TOT W/O PRCTECT W/ILEOST/ILEOPXTS']
op3 = ['COLECTOMY PARTIAL W/ANASTOMOSIS', 'LAPAROSCOPY COLECTOMY PARTIAL W/ANASTOMOSIS']
op4 = ['COLECTOMY PRTL ABDOMINAL & TRANSANAL APPROACH', 'COLECTOMY PRTL ABDOMINAL & TRANSANAL APPR']
op5 = ['COLECTOMY PRTL W/COLOPROCTOSTOMY', 'LAPS COLECTOMY PRTL W/COLOPXTSTMY LW ANAST']
op6 = ['COLECTOMY PRTL W/COLOPROCTOSTOMY & COLOSTOMY', 'LAPS COLECTMY PRTL W/COLOPXTSTMY LW ANAST W/CLST']
op7 = ['COLECTOMY PRTL W/COLOST/ILEOST & MUCOFISTULA']
op8 = ['COLECTOMY PRTL W/END COLOSTOMY & CLSR DSTL SGMT', 'COLECTOMY PRTL W/END COLOSTOMY&CLSR DSTL SGMT', 'LAPS COLECTOMY PRTL W/END CLST & CLSR DSTL SGM', 'LAPS COLECTOMY PRTL W/END CLST&CLSR DSTL SGM']
op9 = ['COLECTOMY PRTL W/RMVL TERMINAL ILEUM & ILEOCOLOS', 'COLECTOMY PRTL W/RMVL TERMINAL ILEUM&ILEOCOLOST', 'LAPS COLECTOMY PRTL W/RMVL TERMINAL ILEUM', 'COLECTOMY PRTL W/RMVL TERMINAL ILEUM & ILEOCOLOST']
op10 = ['COLECTOMY PRTL W/SKIN LEVEL CECOST/COLOSTOMY']
num_replacements = [op1, op2, op3, op4, op5, op6, op7, op8, op9, op10]
for code, labels in enumerate(num_replacements, start=1):
    df_clean = map_cpt(df, 'PRNCPTX', labels, code)

# Collapse the surgical-approach labels into two categories.
MIS = ['Laparoscopic', 'Endoscopic w/ unplanned conversion to open', 'Hybrid', 'Hybrid w/ open assist', 'Laparoscopic Hand Assisted', 'Laparoscopic w/ open assist', 'Laparoscopic w/ unplanned conversion to open', 'Laparoscopic w/ unplanned conversion to Open', 'Other MIS approach', 'Robotic', 'Robotic w/ open assist', 'Robotic w/ unplanned conversion to open', 'SILS', 'SILS w/ open assist', 'SILS w/ unplanned conversion to open', 'Hybrid w/ unplanned conversion to open', 'Endoscopic w/ open assist', 'Other MIS approach w/ open assist', 'Endoscopic', 'NOTES', 'NOTES w/ open assist', 'Other MIS approach w/ unplanned conversion to open', 'NOTES w/ unplanned conversion to open']
Open = ['Open', 'Open (planned)']
options = [MIS, Open]
names = ['MIS', 'open']
for labels, category in zip(options, names):
    df_clean = map_cpt(df_clean, 'COL_APPROACH', labels, category)

#convert unknowns to NAs
# Assign with one .loc call and a boolean mask: the chained form
# (df_clean.COL_APPROACH.loc[...] = ...) can write to a temporary copy and
# triggers SettingWithCopyWarning.
unknown_mask = df_clean['COL_APPROACH'] == 'Unknown'
nulls = np.where(unknown_mask)[0]  # positional indices of the 'Unknown' rows
df_clean.loc[unknown_mask, 'COL_APPROACH'] = np.nan

# Row-wise sum over the UNPLANNEDREADMISSION* columns; target is 1 when that sum is positive.
unplanned = [c for c in df_clean if "UNPLANNEDREADMISSION" in c]
df_clean['num_unplanned'] = df_clean[unplanned].sum(axis=1)
df_clean['target'] = (df_clean['num_unplanned'] > 0).astype(int)

# Number of non-null entries per row among the OTHERCPT* and CONCURR* columns.
othercpt = [c for c in df_clean if "OTHERCPT" in c]
df_clean['num_other_procs'] = df_clean[othercpt].count(axis=1)
concurrcpt = [c for c in df_clean if "CONCURR" in c]
df_clean['num_concurr_procs'] = df_clean[concurrcpt].count(axis=1)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


#### Comparing procedure types

In [59]:
## Profile each family of procedure-name columns (OTHERPROC*, CONCURR*)
procvec = ["OTHERPROC", "CONCURR"]

for proctype in procvec:
    proc = data.filter(like = proctype)
    proc.index = data.CASEID.astype('int')

    ## Per-column stats; columns are looked up as <proctype>1 .. <proctype>N
    varnames = [proctype + str(k) for k in range(1, proc.shape[1] + 1)]
    unique_vals = [proc[var].nunique() for var in varnames]
    proportion = [1 - (proc[var].isna().sum()/len(proc[var])) for var in varnames]

    ## Every distinct value seen in any column of this family
    all_procs = set()
    for var in varnames:
        all_procs.update(proc[var].value_counts().index.tolist())

    proc_stats = pd.DataFrame(
        {
            "unique_vals": unique_vals,
            "prop_total_unique_proc": np.array(unique_vals)/len(all_procs),
            "prop_not_na": proportion,
        },
        index = varnames,
    )

    ## Check whether any case has columns filled "out of order",
     # i.e. a later-numbered column filled while an earlier one is empty

    ## Highest column number that is filled in for each case (0 when none is)
    proc_num = proc.notna()
    for col in proc_num.columns:
        slot = int(re.findall("[0-9]+", col)[0])
        proc_num[col] = np.where(proc_num[col], slot, 0)

    maxproc = proc_num.max(axis = 1)

    ## Number of filled-in columns for each case
    numproc = proc.notna().sum(axis = 1)

    ## maxproc and numproc should be equal for each observation--they aren't
    n_out_of_order = int((maxproc != numproc).sum())

    print("-"*10 + "\n" + str(proctype) + "\n" + "-"*10)

    print(proc_stats)
    print("\nThe total number of unique procedures is", len(all_procs), "\n")

    print(n_out_of_order, "observations have 'out of order' procedures\n\n")

----------
OTHERPROC
----------
             unique_vals  prop_total_unique_proc  prop_not_na
OTHERPROC1          1087                0.678104     0.505806
OTHERPROC2           970                0.605115     0.213805
OTHERPROC3           764                0.476606     0.080899
OTHERPROC4           595                0.371179     0.029502
OTHERPROC5           405                0.252651     0.010837
OTHERPROC6           271                0.169058     0.003982
OTHERPROC7           177                0.110418     0.001644
OTHERPROC8           129                0.080474     0.000768
OTHERPROC9            62                0.038677     0.000341
OTHERPROC10           32                0.019963     0.000171

The total number of unique procedures is 1603 

89 observations have 'out of order' procedures


----------
CONCURR
----------
           unique_vals  prop_total_unique_proc  prop_not_na
CONCURR1          1105                0.683364     0.139733
CONCURR2           776                

In [77]:
# Side-by-side copy of the OTHERPROC* and CONCURR* columns, indexed by case id.
proc = pd.concat(
    [data.filter(like = family) for family in ("OTHERPROC", "CONCURR")],
    axis=1,
).copy()
proc.index = data["CASEID"].astype('int')

In [78]:
# Pool the distinct values of every column in each family, then in all columns.
oprocs = [
    value
    for col in proc.filter(like = "OTHERPROC").columns
    for value in proc[col].value_counts().index.tolist()
]

cprocs = [
    value
    for col in proc.filter(like = "CONCURR").columns
    for value in proc[col].value_counts().index.tolist()
]

all_procs = [
    value
    for col in proc.columns
    for value in proc[col].value_counts().index.tolist()
]

# One summary row ("Total") holding the number of distinct values in each pool.
results = pd.DataFrame(
    {
        "unique_other": len(set(oprocs)),
        "unique_concurr": len(set(cprocs)),
        "unique_both": len(set(all_procs)),
    },
    index=["Total"],
)

In [79]:
# For each slot k = 1..10, count the distinct non-null values in OTHERPROCk,
# in CONCURRk, and in the union of the two columns.
slot_labels = []
slot_rows = []
for i in range(10):
    slot = str(i + 1)
    oprocs = set(proc["OTHERPROC" + slot].dropna())
    cprocs = set(proc["CONCURR" + slot].dropna())
    all_procs = oprocs | cprocs

    slot_labels.append("PROC" + slot)
    slot_rows.append({
        "unique_other": len(oprocs),
        "unique_concurr": len(cprocs),
        "unique_both": len(all_procs),
    })

# Build the per-slot rows as one frame and append them with a single concat;
# calling pd.concat inside the loop re-copies the growing frame on every pass.
toappend = pd.DataFrame(
    slot_rows,
    index=slot_labels,
    columns=["unique_other", "unique_concurr", "unique_both"],
)
results = pd.concat([results, toappend])

In [80]:
# Inclusion-exclusion: values counted in both families = |other| + |concurr| - |union|.
listed_total = results[["unique_other", "unique_concurr"]].sum(axis = 1)
results["overlap"] = listed_total - results["unique_both"]
# Overlap expressed as a share of the union.
results["prop_overlap"] = results["overlap"] / results["unique_both"]
results

Unnamed: 0,unique_other,unique_concurr,unique_both,overlap,prop_overlap
Total,1603,1617,2345,875,0.373134
PROC1,1087,1105,1614,578,0.358116
PROC2,970,776,1346,400,0.297177
PROC3,764,525,1007,282,0.28004
PROC4,595,267,722,140,0.193906
PROC5,405,147,450,102,0.226667
PROC6,271,84,310,45,0.145161
PROC7,177,43,196,24,0.122449
PROC8,129,22,141,10,0.070922
PROC9,62,10,64,8,0.125


In [92]:
# Per-case number of filled-in OTHERPROC* / CONCURR* columns, plus the operation year.
proc["otherproc_cnt"] = proc.filter(like= "OTHERPROC").notna().sum(axis = 1)
proc["concurr_cnt"] = proc.filter(like= "CONCURR").notna().sum(axis = 1)
# Assigned by position (raw values), since proc is indexed by case id rather than data's index.
proc["yr"] = data["OPERYR"].values

In [126]:
# For each year, the share of cases with only CONCURR entries, only OTHERPROC
# entries, both, or neither.
yr_vec = []
only_concurr = []
only_otherproc = []
both = []
neither = []

has_concurr = proc["concurr_cnt"] != 0
has_otherproc = proc["otherproc_cnt"] != 0

for year in np.sort(proc["yr"].unique()):
    in_year = proc["yr"] == year
    n = int(in_year.sum())

    yr_vec.append(year)
    only_concurr.append(int((in_year & has_concurr & ~has_otherproc).sum()) / n)
    only_otherproc.append(int((in_year & has_otherproc & ~has_concurr).sum()) / n)
    both.append(int((in_year & has_concurr & has_otherproc).sum()) / n)
    neither.append(int((in_year & ~has_concurr & ~has_otherproc).sum()) / n)

In [127]:
# One row per year, one column per category of share.
share_columns = {
    "only_concurr": only_concurr,
    "only_otherproc": only_otherproc,
    "both": both,
    "neither": neither,
}
results = pd.DataFrame(share_columns, index = yr_vec)

In [130]:
# Display the per-year share table (rows: year; columns: only_concurr, only_otherproc, both, neither).
results

Unnamed: 0,only_concurr,only_otherproc,both,neither
2012,0.041635,0.43213,0.088923,0.437312
2013,0.043106,0.422692,0.098628,0.435573
2014,0.039862,0.422849,0.099042,0.438247
2015,0.044495,0.408886,0.105727,0.440892
2016,0.043556,0.403336,0.100145,0.452963
2017,0.043237,0.411101,0.102467,0.443195
2018,0.038854,0.401948,0.091698,0.467501
2019,0.041328,0.391207,0.094591,0.472873


## PODIAG10

In [146]:
# Percentage of rows with a non-missing PODIAG10.
# Scale to percent first and round last: rounding the fraction and then
# multiplying by 100 can reintroduce binary floating-point noise in the
# printed value (e.g. 0.29 * 100 evaluates to 28.999999999999996).
podiag10_share = data.PODIAG10.notna().sum() / len(data)
print("PODIAG10 recorded for",
    round(100 * podiag10_share, 2),
    "% of the data")

PODIAG10 recorded for 65.7 % of the data


In [148]:
# Frequency of each distinct PODIAG10 value (missing values are not counted).
vc = data["PODIAG10"].value_counts()

In [151]:
# Count per PODIAG10 value, plus its share of ALL rows in data
# (the denominator includes rows where PODIAG10 is missing).
df = vc.to_frame(name="cnt")
df["prop"] = df["cnt"] / len(data)

In [159]:
# How many PODIAG10 values reach each share-of-rows threshold.
print("5% cutoff: top", int((df["prop"] >= 0.05).sum()))
print("3% cutoff: top", int((df["prop"] >= 0.03).sum()))
print("1% cutoff: top", int((df["prop"] >= 0.01).sum()))

# Share of the last row within the first 20 rows of df.
print("Top 20 % cutoff:", df["prop"].head(20).iloc[-1])

5% cutoff: top 2
3% cutoff: top 5
1% cutoff: top 14
Top 20 % cutoff: 0.006409137965127776


In [154]:
# Display the first 21 rows of the PODIAG10 count/proportion table.
df.head(21)

Unnamed: 0,cnt,prop
K57.20,14585,0.05655
K57.32,14548,0.056407
C18.2,11888,0.046093
C18.7,11239,0.043577
C20,9168,0.035547
C18.0,7250,0.02811
C19,4921,0.01908
C18.4,4868,0.018875
C18.9,4585,0.017777
K56.2,4330,0.016789
