In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
%matplotlib inline

In [32]:
# Minimum value of the 'Max(4,5,6) Mx Exp' column a case must reach to be
# kept when the case table is filtered.
MAX_EXP_THRESHOLD = 3.0
# Threshold reserved for histogram cells; not referenced by the cells shown here.
HISTOGRAM_THRESHOLD = 2.0

# Antimere pair labels used as column-name prefixes:
# '13-23' ... '17-27' followed by '33-43' ... '37-47'.
ANTIMERES = (
    ['1%d-2%d' % (tooth, tooth) for tooth in range(3, 8)]
    + ['3%d-4%d' % (tooth, tooth) for tooth in range(3, 8)]
)

# Column-name suffixes, one per measurement method, in processing order.
MEASURE_TYPES = ['.cc', '.ctd', '.ct', '.gm']

# Human-readable label for each measurement-method suffix.
measure_type_dict = dict([
    ('.cc', 'ClinCheck'),
    ('.ctd', 'Centroid'),
    ('.ct', 'Cusp Tip'),
    ('.gm', 'Gingival Margin'),
])

In [None]:
# Load the case table: the first line of the file is skipped and the
# 'Patient ID' column becomes the index.
df = pd.read_csv("cases.csv", index_col="Patient ID", skiprows=1)

# Keep only cases that are flagged for inclusion ('Include?' == 'Y') AND whose
# maximum predicted expansion is at least MAX_EXP_THRESHOLD.
df = df.loc[
    df['Include?'].eq('Y') & df['Max(4,5,6) Mx Exp'].ge(MAX_EXP_THRESHOLD)
]

In [3]:
def percentAccuracy(row, antimere, measure_type):
    '''Percentage of the predicted expansion that was actually achieved.

    Three arch widths are read from `row` under the keys
    antimere + 'p' / 'f' / 'i' + measure_type (predicted, final, initial),
    e.g. '13-23p.cc', '13-23f.cc', '13-23i.cc'.

    antimere: pair label of the form '1x-2x' or '3x-4x' (e.g. '13-23').
    measure_type: suffix naming how the width was measured, i.e. one of
        .cc, .ctd, .ct, .gm (ClinCheck archwidth, centroid, cusp tip,
        gingival margin).

    Returns 100 * (final - initial) / (predicted - initial).
    Strictly speaking this is NOT an accuracy: whenever final > predicted
    the result exceeds 100%, so read it as "% of predicted expansion
    achieved".

    Returns None when the predicted expansion is 0, or when any of the
    three widths is itself 0.'''

    predicted, final, initial = (
        row[antimere + stage + measure_type] for stage in 'pfi'
    )

    predicted_gain = predicted - initial
    achieved_gain = final - initial

    # A zero anywhere means the ratio is undefined or a width is unusable.
    if 0 in (predicted_gain, predicted, final, initial):
        return None
    return 100*achieved_gain/predicted_gain

def predictedExp(row, antimere, measure_type):
    '''Predicted expansion for one row / antimere / measure type.

    Reads the predicted width (key antimere + 'p' + measure_type) and the
    initial width (key antimere + 'i' + measure_type) from `row` and
    returns predicted - initial.

    Returns None if either width is 0.'''
    planned = row[antimere+'p'+measure_type]
    start = row[antimere+'i'+measure_type]
    if planned != 0 and start != 0:
        return planned - start
    return None

def achievedExp(row, antimere, measure_type):
    '''Achieved expansion for one row / antimere / measure type.

    Reads the final width (key antimere + 'f' + measure_type) and the
    initial width (key antimere + 'i' + measure_type) from `row` and
    returns final - initial.

    Returns None if either width is 0.'''
    final = row[antimere+'f'+measure_type]
    start = row[antimere+'i'+measure_type]
    if start != 0 and final != 0:
        return final - start
    return None

def getColumn(df, column_name, antimere, threshold, measure_type, aligner_material):
    '''Return the `column_name` Series of `df`, optionally filtered.

    aligner_material: when it does not compare equal to False, only rows
        whose 'Aligner Material' equals it are kept.
    threshold: when it is a number, only rows whose
        'pΔ<antimere><measure_type>' column is strictly greater than it
        are kept.
        NB: to disable thresholding pass threshold = False, NOT
        threshold = 0 (0 is treated as a real threshold).'''

    subset = df
    if aligner_material != False:
        subset = subset[subset['Aligner Material'] == aligner_material]

    # Only the bool False itself disables thresholding.
    if threshold is False:
        return subset[column_name]

    predicted_col = 'pΔ%s%s' % (antimere, measure_type)
    return subset.loc[subset[predicted_col] > threshold, column_name]

def getSubDf(df, measure_type, threshold, aligner_material):
    '''Build a DataFrame of '<antimere><measure_type> %acc' columns.

    Each column is fetched with getColumn, so it is restricted to the given
    aligner_material and to rows above the predicted-expansion threshold
    for that particular antimere. The per-antimere Series are then
    concatenated side by side (axis=1).'''
    # For the '.cc' measure type, antimeres containing a '7' are skipped.
    wanted = [
        antimere for antimere in ANTIMERES
        if not (measure_type == '.cc' and '7' in antimere)
    ]
    acc_columns = [
        getColumn(df, '%s%s %%acc' % (antimere, measure_type),
                  antimere, threshold, measure_type, aligner_material)
        for antimere in wanted
    ]
    return pd.concat(acc_columns, axis=1)

In [4]:
# Data cleaning & processing: for every antimere / measure-type pair, coerce
# the width columns to float and derive the '%acc', 'pΔ' and 'aΔ' columns.
for antimere in ANTIMERES:
    for measure_type in MEASURE_TYPES:
        # The '.cc' measure type is skipped for antimeres containing a '7'.
        if measure_type == '.cc' and '7' in antimere:
            continue

        # Ensure all xx-xxipf columns are type float.
        # Series.astype returns a NEW Series and does not convert in place,
        # so the result has to be assigned back for the cast to take effect.
        df[antimere+'i'+measure_type] = df[antimere+'i'+measure_type].astype(float)
        df[antimere+'p'+measure_type] = df[antimere+'p'+measure_type].astype(float)
        df[antimere+'f'+measure_type] = df[antimere+'f'+measure_type].astype(float)

        # Create 'xx-xx %acc' column for each antimere
        full_antimere = antimere + measure_type
        df['%s %%acc' % full_antimere] = df.apply(lambda row: percentAccuracy(row, antimere, measure_type), axis=1)

        # Create 'pΔxx-xx.xx' & 'aΔxx-xx.xx' column for each antimere/measure type
        df['pΔ%s' % full_antimere] = df.apply(lambda row: predictedExp(row, antimere, measure_type), axis=1)
        df['aΔ%s' % full_antimere] = df.apply(lambda row: achievedExp(row, antimere, measure_type), axis=1)

# Per-material views of the processed table, selected on 'Aligner Material'.
LD30 = df[df['Aligner Material'] == 'LD30']
EX30 = df[df['Aligner Material'] == 'EX30']

In [17]:
# Spot check: '35-45.gm %acc' for LD30 rows whose 'pΔ35-45.gm' is above 2.
getColumn(
    df,
    column_name='35-45.gm %acc',
    antimere='35-45',
    threshold=2,
    measure_type='.gm',
    aligner_material='LD30',
)

Patient ID
2952792    110.182768
2963992     68.064516
2987641    142.857143
3006227    101.893939
3120651    120.474777
3165818     29.583333
3182125     96.466431
3220134     94.140127
3223943    107.037037
3307283    101.587302
3328760     85.642317
3452431     95.165394
3481023     61.464968
3519570    127.572016
3538481    107.331378
3558898     71.521739
3721347    107.077626
3729943     80.677966
3791559     70.545455
3855283     73.411765
3897351    103.017241
3916320     55.290102
3970955    136.633663
3976052     81.385281
4027018    113.718412
4143745     93.871866
4151217     96.694215
4233296     78.750000
4393227     67.838313
4401903     87.500000
4519394     96.296296
Name: 35-45.gm %acc, dtype: float64