In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
import os
import re
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.style as style
%matplotlib inline

In [2]:
# Threshold value; in this cell it is only used to name the output directory
# ("<THRESHOLD>-mm-threshold").
THRESHOLD = 2.0

# Cut-off applied to the 'Max(4,5,6) Mx Exp' column when the master
# spreadsheet is loaded: rows below this value are dropped.
MAX_EXP_THRESHOLD = 3.0

# Create the output tree. os.makedirs builds missing parents (the top-level
# "output" folder) and exist_ok=True makes the cell safe to re-run, including
# when the threshold directory already exists but a sub-folder is missing.
OUTPUT_DIR = "output/%s-mm-threshold/" % THRESHOLD
os.makedirs(os.path.join(OUTPUT_DIR, "histograms"), exist_ok=True)
os.makedirs(os.path.join(OUTPUT_DIR, "boxplots"), exist_ok=True)
os.makedirs(os.path.join(OUTPUT_DIR, "tables"), exist_ok=True)

# Display label for each antimere (tooth-pair) code. The 'xx-xx' codes are the
# prefixes of the arch-width column names in the spreadsheet.
antimere_dict = {
    '13-23': 'Upper Canine',
    '14-24': 'Upper 1st Premolar',
    '15-25': 'Upper 2nd Premolar',
    '16-26': 'Upper 1st Molar',
    '17-27': 'Upper 2nd Molar',
    '33-43': 'Lower Canine',
    '34-44': 'Lower 1st Premolar',
    '35-45': 'Lower 2nd Premolar',
    '36-46': 'Lower 1st Molar',
    '37-47': 'Lower 2nd Molar',
}
# Antimere codes in the order they are listed above (dicts keep insertion order).
ANTIMERES = list(antimere_dict)

# Display label for each measurement-method suffix used in the column names.
measure_type_dict = {
    '.cc': 'ClinCheck',
    '.ctd': 'Centroid',
    '.ct': 'Cusp Tip',
    '.gm': 'Gingival Margin',
}
# Measurement suffixes in the order they are listed above.
MEASURE_TYPES = list(measure_type_dict)

In [3]:
# Load the master spreadsheet: the first line of the file is skipped and the
# "Patient ID" column becomes the index.
df = pd.read_csv("master-spreadsheet.csv", index_col="Patient ID", skiprows=1)

# Keep only cases that are flagged for inclusion ('Include?' == 'Y') and whose
# maximum predicted expansion is at least MAX_EXP_THRESHOLD.
# .copy() makes the result an independent frame, so the derived columns that
# are assigned to df further down do not raise SettingWithCopyWarning.
df = df[(df['Max(4,5,6) Mx Exp'] >= MAX_EXP_THRESHOLD) & (df['Include?'] == 'Y')].copy()

In [4]:
# Export the contact-list columns of the filtered patients to CSV; the
# "Patient ID" index is written as the first column.
# NOTE(review): the file contains patient names - confirm it is kept out of
# version control and shared outputs.
df.to_csv('pt-contact-list-UNF.csv', columns=['Treatment Start Date', 'Aligner Material', 'Patient Name', 'Patient Age'])

In [5]:
def percentAccuracy(row, antimere, measure_type):
    '''Percentage of the predicted arch-width change that was achieved.

    The row is indexed with three keys built from the arguments:
    antimere + 'i' / 'p' / 'f' + measure_type, holding the initial,
    predicted and final widths (e.g. '13-23p.cc').

    Args:
        row: mapping / Series containing the three width values
        antimere: tooth-pair code such as '13-23' or '33-43'
        measure_type: suffix such as '.cc', '.ctd', '.ct' or '.gm'

    Returns:
        100 * (final - initial) / (predicted - initial).
        This is not a true accuracy: it exceeds 100 when the final
        width is larger than the predicted one.
        None when the predicted change is 0 or when any of the three
        widths is 0.
    '''
    predicted = row[antimere+'p'+measure_type]
    final = row[antimere+'f'+measure_type]
    initial = row[antimere+'i'+measure_type]

    planned_change = predicted - initial
    actual_change = final - initial

    # A zero width or a zero planned change makes the ratio meaningless.
    for value in (planned_change, predicted, final, initial):
        if value == 0:
            return None
    return 100*actual_change/planned_change

def predictedExp(row, antimere, measure_type):
    '''Predicted change in arch width (predicted - initial) for one row.

    Reads row[antimere + 'p' + measure_type] and
    row[antimere + 'i' + measure_type]; returns None if either is 0.
    '''
    predicted = row[antimere+'p'+measure_type]
    initial = row[antimere+'i'+measure_type]
    if predicted != 0 and initial != 0:
        return predicted - initial
    return None

def achievedExp(row, antimere, measure_type):
    '''Achieved change in arch width (final - initial) for one row.

    Reads row[antimere + 'f' + measure_type] and
    row[antimere + 'i' + measure_type]; returns None if either is 0.
    '''
    final = row[antimere+'f'+measure_type]
    initial = row[antimere+'i'+measure_type]
    if initial != 0 and final != 0:
        return final - initial
    return None

# def getColumn(df, column_name, antimere, threshold, measure_type, aligner_material):
#     '''Returns Series of specified parameters based on measure_type predicted width
#     NB: If no threshold desired, set threshold = False, NOT threshold = 0.'''
    
#     if aligner_material != False:
#         df = df[df['Aligner Material'] == aligner_material]
#     if type(threshold) == bool and threshold == False:
#         return df[column_name]
#     return df[df['pΔ%s%s' % (antimere, measure_type)] > threshold][column_name]

def getColumn(df, column_name, threshold, aligner_material, screen_by_cc=True):
    '''Returns one column of df, optionally filtered by aligner material and
    by a predicted-expansion threshold.

    Args:
        df: DataFrame of patients & arch width measurements
        column_name: Name of column requested from df. It must contain an
            antimere code ('xx-xx') and a measure-type suffix ('.xx'),
            e.g. '14-24.ctd %acc'.
        threshold: Only returns rows where the screening column
            (pΔxx-xx.xx) is > threshold.
            NB: For no screening by threshold, pass False (0 is treated
            as a real threshold).
        aligner_material: value matched against the 'Aligner Material'
            column (e.g. 'LD30' or 'EX30'); pass False to keep all
            materials.
        screen_by_cc: bool
            True: screens by ClinCheck predicted change in arch width,
                i.e. pΔxx-xx.cc > threshold (pΔxx-xx.ctd when the
                antimere contains '7', i.e. 2nd molars),
                regardless of measure type in column_name
            False: screens by the measure type found in column_name,
                i.e. pΔ<antimere><measure type> > threshold

    Returns:
        Series: df[column_name] restricted to the rows that pass the filters.

    Raises:
        AttributeError: if column_name contains no '.xx' suffix or no
            'xx-xx' antimere code (the regex search finds nothing).
    '''
    # Raw strings: '\.' and '\d' are regex escapes, not string escapes.
    measure_type = re.search(r'\.\w+', column_name).group()
    antimere = re.search(r'\d\d-\d\d', column_name).group()

    if aligner_material is not False:
        df = df[df['Aligner Material'] == aligner_material]
    # Only the literal False disables screening; 0 is a valid threshold.
    if threshold is False:
        return df[column_name]

    if not screen_by_cc:
        screen_type = measure_type
    elif '7' in antimere:
        # 2nd molars are screened on the centroid measurement instead of .cc
        screen_type = '.ctd'
    else:
        screen_type = '.cc'
    screen_col = 'pΔ%s%s' % (antimere, screen_type)
    return df.loc[df[screen_col] > threshold, column_name]

def inspectPtAntimere(df, ptId, antimere):
    '''Prints the %acc value of every measure type for one patient and antimere.

    Args:
        df: DataFrame indexed by patient id, containing
            '<antimere><measure type> %acc' columns
        ptId: index label of the patient
        antimere: xx-xx

    Returns:
        None. One header line with the patient id is printed, followed by
        one '<antimere><measure type>: <value>' line per entry of
        MEASURE_TYPES.
    '''
    print('ptId: %s' % ptId)
    for suffix in MEASURE_TYPES:
        acc_col = '%s%s %%acc' % (antimere, suffix)
        value = df.loc[ptId][acc_col]
        print('%s%s: %s' % (antimere, suffix, value))

def getSubDf(df, measure_type, threshold, aligner_material):
    '''Builds a DataFrame of the '<antimere><measure_type> %acc' columns.

    Each column is fetched with getColumn(), so it is already restricted to
    the given aligner_material and to rows above the predicted-expansion
    threshold; the resulting Series are joined side by side.
    For measure_type '.cc', antimeres containing '7' are left out.
    '''
    acc_columns = [
        getColumn(df, '%s%s %%acc' % (code, measure_type), threshold, aligner_material)
        for code in ANTIMERES
        if not (measure_type == '.cc' and '7' in code)
    ]
    return pd.concat(acc_columns, axis=1)

def getAccuraciesOfPt(df, ptId):
    '''Prints every '%acc' value of one patient.

    For each column of df whose name contains '%acc', prints the column
    name and the value at index label ptId, separated by a tab.
    Returns None.
    '''
    for name in df.columns:
        if '%acc' not in name:
            continue
        print("%s\t%s" % (name, df.loc[ptId][name]))

In [6]:
# Data cleaning & processing: cast the width columns to float and add the
# derived '%acc', 'pΔ' and 'aΔ' columns for every antimere / measure type.
for antimere in ANTIMERES:
    for measure_type in MEASURE_TYPES:
        # No '.cc' processing for antimeres containing '7' (2nd molars)
        # NOTE(review): presumably the spreadsheet has no .cc columns for them - confirm
        if measure_type == '.cc' and '7' in antimere:
            continue

        # Ensure all xx-xx i/p/f columns are type float.
        # astype returns a new object, so the result has to be assigned back.
        width_cols = [antimere + stage + measure_type for stage in ('i', 'p', 'f')]
        df[width_cols] = df[width_cols].astype(float)

        full_antimere = antimere + measure_type

        # Create 'xx-xx.xx %acc' column for each antimere/measure type.
        # The extra arguments go through `args` rather than a lambda that
        # closes over the loop variables.
        df['%s %%acc' % full_antimere] = df.apply(percentAccuracy, axis=1, args=(antimere, measure_type))

        # Create 'pΔxx-xx.xx' & 'aΔxx-xx.xx' column for each antimere/measure type
        df['pΔ%s' % full_antimere] = df.apply(predictedExp, axis=1, args=(antimere, measure_type))
        df['aΔ%s' % full_antimere] = df.apply(achievedExp, axis=1, args=(antimere, measure_type))

# Per-material views of the processed frame.
LD30 = df[df['Aligner Material'] == 'LD30']
EX30 = df[df['Aligner Material'] == 'EX30']

In [21]:
# '14-24.ctd %acc' values of LD30 patients whose ClinCheck predicted change
# (pΔ14-24.cc) is > 2, sorted from highest to lowest.
getColumn(df, '14-24.ctd %acc', 2, 'LD30').sort_values(ascending=False)

Patient ID
3970955    151.283987
3120651    132.530227
4678988    114.468521
3932114    109.175601
3721347    107.394266
3229091    103.595359
3281726    100.000000
3616430     94.869182
4519394     92.858397
4266193     92.786825
2952792     92.279465
3835957     89.403387
3247055     89.349713
3307283     88.546541
3220134     85.785017
5384124     85.618518
5549476     85.548322
3170026     83.555409
3729943     82.979180
3182125     82.367065
4393227     82.279937
3280031     82.079122
4165653     80.474111
3519570     80.006257
6200368     79.898479
4233296     79.381023
3328760     78.894339
3897351     78.651032
4599498     78.444387
4401903     77.918875
4151217     76.815105
3976052     73.467162
2963992     73.108022
3452431     73.071125
3791559     72.269034
6240993     70.488695
3538481     70.300381
4533730     70.099000
4070409     69.251720
3223943     67.226615
3165818     64.694774
3916320     64.475761
3481023     62.182386
3558898     61.043009
3855283     60.819964