# Opioid Addiction Project
## Notebook 07: Testing

This notebook tests the execution code with multiple inputs, looking for areas where the code either technically fails (won't run at all) or practically fails (produces nonsensical outputs).

### W210, Capstone
Summer 2019

Team:  Cameron Kennedy, Aditi Khullar, Rachel Kramer, Sharad Varadarajan

# 0. Load Libraries and Set Global Variables
The cells below import the required libraries and set the global display options and data path used throughout the notebook.

In [5]:
#Import Required Libraries
import random

import pandas as pd

#Plotting
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import FuncFormatter

#Custom data prep function used in both training and prediction 
import OpioidExecution as oe

#Set initial parameter(s)
pd.set_option('display.max_rows', 200)
pd.options.display.max_columns = 150
dataDir = './data/'

#Seed Python's random module so the randomly filled test inputs are the same
#on every Restart & Run All; this makes any failing test case reproducible.
#Change randomSeed to explore a different set of random combinations.
randomSeed = 42
random.seed(randomSeed)

print('Pandas Version', pd.__version__)

Pandas Version 0.24.1


# 1. Generate Multiple Input Scenarios

## 1.1 Toy Example of Dictionary Parsing

Here we build a tiny example that loops over the keys of a dictionary one at a time, stepping through every possible value of the current key while choosing random values for all the other keys.

In [None]:
#Build a small dictionary (three keys, a handful of values each) for the toy example
testDict = dict(A=list(range(1, 6)),
                B=list('abc'),
                C=list(range(6, 11)))
testDict

In [None]:
#Visit every key of the test dict and step through each of its values once;
#for each step, every other key receives a randomly drawn value.
inputDict = dict()

for fixedKey, options in testDict.items():
    for item in options:
        print(fixedKey, item, ' | ', end='')

        #The key being stepped through gets the current item, the rest are random
        inputDict = {otherKey: item if otherKey == fixedKey
                               else random.choice(testDict[otherKey])
                     for otherKey in testDict}

        print(inputDict)

## 1.2 Apply Toy Example to Dictionary of Possible Values

In [None]:
#Create dictionary of possible values
#(each key maps to the list of candidate values that the tests will step through)
allChoicesDict = dict(
    NAME=['Test Scripter'],
    IRSEX=[1,2],
    EDUHIGHCAT=list(range(1, 6)),
    AGE2=list(range(1, 18)),
    IRALCRC=[1,2,3,9],
    IRALCFY=[991,993,1,12,50,100,300],
    CABINGEVR=[991,2,1],
    IRALCAGE=[991,14,17,20,21,100],
    TXYRRECVD2=[0,1],
    TXEVRRCVD2=[0,1],
    IRCIGRC=[9,1,2,3,4],
    #CIGDLYMO=[91,2,1],
    CIGAGE=[991,994,999,10,13,15,17,18,19,20,22,25,30,40,50,99],
    TOBYR=[0,1],
    FUCIG18=[2,1],
    IRMJRC=[9,1,2,3],
    IRMJFY=[991,993,1,2,3,7,10,20,30,40,50,100,200,300,365],
    FUMJ18=[2,1],
)

#These eight keys all take the same candidate values; each gets its own list object
allChoicesDict.update({sharedKey: [9,1,2,3]
                       for sharedKey in ('IRCOCRC', 'IRCRKRC', 'IRHERRC', 'IRHALLUCREC',
                                         'IRLSDRC', 'IRECSTMOREC', 'IRINHALREC', 'IRMETHAMREC')})

allChoicesDict.update(ADDPREV=[1,2,94],
                      ADDSCEV=[1,2,94,99],
                      BOOKED=[1,2,94,97])

allChoicesDict

In [None]:
#Apply the toy scheme to the full dictionary: one scenario per (key, value) pair,
#with all remaining keys filled in at random. Scenarios are only printed here.
inputDict = dict()
numTests = 0

for fixedKey, candidates in allChoicesDict.items():
    for item in candidates:
        print(fixedKey, item, ' | ', end='')

        #The key being stepped through gets the current item, the rest are random
        inputDict = {otherKey: item if otherKey == fixedKey
                               else random.choice(allChoicesDict[otherKey])
                     for otherKey in allChoicesDict}

        numTests += 1
        print(inputDict,'\n')

print('Total Test Cases:', numTests)

# 2. Run Tests

In [None]:
#Run tests!
#For each key in allChoicesDict, every candidate value is tested once while all
#other keys receive randomly chosen values. Each test calls oe.generateReport and
#its inputs and outputs are collected into dfTestResults (one row per test).

#Verbosity: >=1 prints a header per test, >=2 adds the inputs and predictions,
#>=3 adds the feature importances, >=4 prints the results table after every test
verbose = 1

inputDict = dict()
testRecords = []  #One dict per test; turned into a DataFrame in a single step
numTests = 0

try:
    for k in allChoicesDict:
        for item in allChoicesDict[k]:
            if verbose >= 1:
                print('TEST',numTests, ' | ', k, '=', item)

            for k2 in allChoicesDict:
                if k2 == k:  #Intentionally pick the item in the loop
                    inputDict[k2] = item
                else:  #Pick a random item
                    inputDict[k2] = random.choice(allChoicesDict[k2])

            #Run Test!
            predProb, predPercentile, predFI = oe.generateReport(inputDict)  #This one line "does all the work"

            #Rename dictionary keys in output (to prevent duplicates of input names)
            predFI = {k3+'_Shap': shapVal for k3, shapVal in predFI.items()}

            #Convert list items back to nonlist items (it converts to a list in oe.generateReport)
            for k4 in inputDict:
                inputDict[k4] = inputDict[k4][0]

            #Print Output
            if verbose >= 2:
                print('Input:', inputDict)
                print('Predicted Probability: {:.3%}'.format(predProb))
                print('Percentile of Predicted Probability: {:.3%}'.format(predPercentile))
            if verbose >= 3:
                print('Feature Importance (sorted low to high):', predFI)

            #{**dictA, **dictB, **dictC, etc.} merges the dicts into a new one, so the
            #stored record is unaffected when inputDict is reused by the next test
            outDict = {**{'TestCase':numTests,'predProb':predProb, 'predPercentile':predPercentile},
                       **inputDict,
                       **predFI
                      }

            #Add test results to the list of records
            testRecords.append(outDict)

            if verbose >= 4:
                print(pd.DataFrame(testRecords))

            numTests+=1
finally:
    #Build the results table once (appending to a DataFrame row by row is quadratic
    #and DataFrame.append no longer exists in pandas 2.x). Doing this in `finally`
    #keeps the results gathered so far available if a test raises part-way through.
    dfTestResults = pd.DataFrame(testRecords)

print('Total Test Cases:', numTests)

In [None]:
#Cast every input column except the free-text NAME column to int
intColumns = [col for col in inputDict if col != 'NAME']
for col in intColumns:
    dfTestResults[col] = dfTestResults[col].astype(int)

dfTestResults

In [None]:
#Write the test results table to a CSV file inside the data directory
testResultsPath = dataDir + 'testResults.csv'
dfTestResults.to_csv(testResultsPath)

# 3. Explore Test Results

In [None]:
#For every tested input (except NAME), plot the '<input>_Shap' column against the
#value the input took: a box plot with the individual test cases overlaid as a swarm.
for k in inputDict:
    if k != 'NAME':
        print(k)
        fig, ax = plt.subplots() #Creates a new figure with one set of axes
        #Draw both layers on the same, explicitly passed axes
        sns.boxplot(x=k, y=k+'_Shap', data=dfTestResults, ax=ax)
        sns.swarmplot(x=k, y=k+'_Shap', data=dfTestResults, color='gray', ax=ax)
        ax.set_title(k)
        ax.axhline(0, ls='-', color='black') #Horizontal line
        #Format y ticks as percentages with a formatter, so the labels always match
        #the tick positions (fixed label strings go stale if the axis is re-scaled)
        ax.yaxis.set_major_formatter(FuncFormatter(lambda val, pos: '{:,.1%}'.format(val)))
        plt.show()
        plt.close(fig) #Release the figure so many plots do not accumulate in memory

# 4. Other Testing

In [8]:
#Test that we can open the friendly names dictionary
#(smoke test: the call below should complete without raising)
dictFriendlyVarNames = oe.getFriendlyVarNames()

#OLD METHOD (kept for reference only; it eval()'d the contents of a text file)
# with open(dataDir+'dictFriendlyVarNames.txt','r') as f:
#     dictFriendlyVarNames = eval(f.read())
    
#Show the returned dictionary as the cell output for visual inspection
dictFriendlyVarNames

{'IRSEX': 'Gender',
 'EDUHIGHCAT': 'Highest Education',
 'AGE2': 'Age',
 'IRALCRC': 'Alcohol Recency',
 'IRALCFY': 'Alcohol Frequency ',
 'CABINGEVR': 'Binge Drinking',
 'IRALCAGE': 'Age First Used Alcohol',
 'TXYRRECVD2': 'Alc/Drug Tx, Past Yr',
 'TXEVRRCVD2': 'Alc/Drug Tx, Lifetime',
 'IRCIGRC': 'Cigarette Recency',
 'CIGAGE': 'Daily Cigarette Use Age',
 'TOBYR': 'Tobacco Use, Past Year',
 'FUCIG18': 'Cigarettes Under 18',
 'IRMJRC': 'Marijuana Recency',
 'IRMJFY': 'Marijuana Frequency',
 'FUMJ18': 'Marijuana Under 18',
 'IRCOCRC': 'Cocaine Recency',
 'IRCRKRC': 'Crack Recency',
 'IRHERRC': 'Heroine Recency',
 'IRHALLUCREC': 'Hallucinogen Recency',
 'IRLSDRC': 'LSD Recency',
 'IRECSTMOREC': 'Ecstacy Recency',
 'IRINHALREC': 'Inhalant Recency',
 'IRMETHAMREC': 'Meth Recency',
 'ADDPREV': 'Multi-Day Depression',
 'ADDSCEV': 'Multi-Day Discouraged',
 'BOOKED': 'Arrested and Booked'}

# Appendix: Other Code, Function Testing, Etc.

In [None]:
#Testing creating dataframe from dictionary
#a holds only the prediction outputs and NAME; b and c hold a full test record
#(inputs plus their *_Shap values). Only a and b are used below.
a = {'predProb': 0.3271889400921659, 'predPercentile': 0.8793103448275862, 'NAME': 'Test Scripter'}
b = {'predProb': 0.3271889400921659, 'predPercentile': 0.8793103448275862, 'NAME': 'Test Scripter', 'IRSEX': 1, 'EDUHIGHCAT': 2, 'AGE2': 13, 'IRALCRC': 1, 'IRALCFY': 100, 'CABINGEVR': 1, 'IRALCAGE': 991, 'TXYRRECVD2': 1, 'TXEVRRCVD2': 1, 'IRCIGRC': 3, 'CIGDLYMO': 2, 'CIGAGE': 30, 'TOBYR': 1, 'FUCIG18': 2, 'IRMJRC': 9, 'IRMJFY': 1, 'FUMJ18': 2, 'IRCOCRC': 9, 'IRCRKRC': 2, 'IRHERRC': 3, 'IRHALLUCREC': 9, 'IRLSDRC': 1, 'IRECSTMOREC': 1, 'IRINHALREC': 3, 'IRMETHAMREC': 2, 'ADDPREV': 2, 'ADDSCEV': 2, 'BOOKED': 2, 'IRCIGRC_Shap': -0.07299654091300585, 'ADDSCEV_Shap': -0.06865909213385069, 'CIGAGE_Shap': -0.05574356218385848, 'IRCRKRC_Shap': -0.04934079270700638, 'IRLSDRC_Shap': -0.04646291598195351, 'IRMJRC_Shap': -0.0200648067591507, 'TOBYR_Shap': -0.015489018333968665, 'FUCIG18_Shap': -0.01138623521782741, 'IRCOCRC_Shap': -0.010061910431149564, 'FUMJ18_Shap': -0.009741800963022434, 'IRHALLUCREC_Shap': -0.008625047228143813, 'IRMJFY_Shap': -0.004902287641520897, 'CIGDLYMO_Shap': 0.0, 'TXEVRRCVD2_Shap': 0.0, 'BOOKED_Shap': 0.0024776782586973176, 'CABINGEVR_Shap': 0.00685966595368058, 'EDUHIGHCAT_Shap': 0.014366252833132368, 'IRALCRC_Shap': 0.014641349875946778, 'IRSEX_Shap': 0.015423509266007558, 'IRALCAGE_Shap': 0.017224785647449774, 'AGE2_Shap': 0.02299620739872297, 'ADDPREV_Shap': 0.028591442326717936, 'IRALCFY_Shap': 0.036158901226165434, 'IRINHALREC_Shap': 0.04385787814223628, 'IRECSTMOREC_Shap': 0.055529423394040836, 'IRHERRC_Shap': 0.07470339366460665, 'TXYRRECVD2_Shap': 0.07821903891985728, 'IRMETHAMREC_Shap': 0.12716643169159556}
c = {'predProb': 0.3271889400921659, 'predPercentile': 0.8793103448275862, 'NAME': 'Test Scripter', 'IRSEX': 1, 'EDUHIGHCAT': 2, 'AGE2': 13, 'IRALCRC': 1, 'IRALCFY': 100, 'CABINGEVR': 1, 'IRALCAGE': 991, 'TXYRRECVD2': 1, 'TXEVRRCVD2': 1, 'IRCIGRC': 3, 'CIGDLYMO': 2, 'CIGAGE': 30, 'TOBYR': 1, 'FUCIG18': 2, 'IRMJRC': 9, 'IRMJFY': 1, 'FUMJ18': 2, 'IRCOCRC': 9, 'IRCRKRC': 2, 'IRHERRC': 3, 'IRHALLUCREC': 9, 'IRLSDRC': 1, 'IRECSTMOREC': 1, 'IRINHALREC': 3, 'IRMETHAMREC': 2, 'ADDPREV': 2, 'ADDSCEV': 2, 'BOOKED': 2, 'IRCIGRC_Shap': -0.07299654091300585, 'ADDSCEV_Shap': -0.06865909213385069, 'CIGAGE_Shap': -0.05574356218385848, 'IRCRKRC_Shap': -0.04934079270700638, 'IRLSDRC_Shap': -0.04646291598195351, 'IRMJRC_Shap': -0.0200648067591507, 'TOBYR_Shap': -0.015489018333968665, 'FUCIG18_Shap': -0.01138623521782741, 'IRCOCRC_Shap': -0.010061910431149564, 'FUMJ18_Shap': -0.009741800963022434, 'IRHALLUCREC_Shap': -0.008625047228143813, 'IRMJFY_Shap': -0.004902287641520897, 'CIGDLYMO_Shap': 0.0, 'TXEVRRCVD2_Shap': 0.0, 'BOOKED_Shap': 0.0024776782586973176, 'CABINGEVR_Shap': 0.00685966595368058, 'EDUHIGHCAT_Shap': 0.014366252833132368, 'IRALCRC_Shap': 0.014641349875946778, 'IRSEX_Shap': 0.015423509266007558, 'IRALCAGE_Shap': 0.017224785647449774, 'AGE2_Shap': 0.02299620739872297, 'ADDPREV_Shap': 0.028591442326717936, 'IRALCFY_Shap': 0.036158901226165434, 'IRINHALREC_Shap': 0.04385787814223628, 'IRECSTMOREC_Shap': 0.055529423394040836, 'IRHERRC_Shap': 0.07470339366460665, 'TXYRRECVD2_Shap': 0.07821903891985728, 'IRMETHAMREC_Shap': 0.12716643169159556}

# for k in a:
#     a[k] = [k]

#Build the frame from the list of records in one step; keys missing from a
#record (a has only three) become NaN. This replaces repeated DataFrame.append,
#which no longer exists in pandas 2.x.
df = pd.DataFrame([a, b])
df

In [None]:
#This was used for early testing
limChoicesDict = {'NAME':['Test Scripter'],
                  'IRSEX':[1,],
                  'EDUHIGHCAT':[3],
                  'AGE2':[10],
                  'IRALCRC':[1],
                  'IRALCFY':[12],
                  'CABINGEVR':[2],
                  'IRALCAGE':[17],
                  'TXYRRECVD2':[1],
                  'TXEVRRCVD2':[0],
                  'IRCIGRC':[2],
                  'CIGDLYMO':[91],
                  'CIGAGE':[18],
                  'TOBYR':[0],
                  'FUCIG18':[1],
                  'IRMJRC':[3],
                  'IRMJFY':[40],
                  'FUMJ18':[2],
                  'IRCOCRC':[1],
                  'IRCRKRC':[1],
                  'IRHERRC':[1],
                  'IRHALLUCREC':[2],
                  'IRLSDRC':[2],
                  'IRECSTMOREC':[3],
                  'IRINHALREC':[3],
                  'IRMETHAMREC':[2],
                  'ADDPREV':[1],
                  'ADDSCEV':[1],
                  'BOOKED':[1],
                 }

limChoicesDict

In [None]:
#Test plot: box plot of ADDPREV_Shap per ADDPREV value, with the individual
#test cases drawn on top of it on the same axes
ax = sns.boxplot(x="ADDPREV", y="ADDPREV_Shap", data=dfTestResults)
ax.set_title('ADDPREV')
ax = sns.swarmplot(x="ADDPREV", y="ADDPREV_Shap", data=dfTestResults, color=".25", ax=ax)