# Opioid Addiction Project
## Notebook 06: Full Pipeline

This notebook is intended to simulate the full pipeline using .py modules, starting with submitting a user's input, preprocessing that input, then running it through prediction tasks, resulting in the final output of the user's prediction scores.

### W210, Capstone
Summer 2019

Team:  Cameron Kennedy, Aditi Khullar, Rachel Kramer, Sharad Varadarajan

# 0. Load Libraries and Set Global Variables
This analysis is performed in the cells below.

In [1]:
#Import Required Libraries
import pandas as pd
import numpy as np
from sklearn.externals import joblib  #Used to save/load (pickle) models
from collections import defaultdict
import operator
from scipy import stats

#Custom data prep function used in both training and prediction 
import OpioidDataPrep as odp
import OpioidExecution as oe

#Set initial parameter(s)
pd.set_option('display.max_rows', 200)
pd.options.display.max_columns = 150
dataDir = './data/'

print('Pandas Version', pd.__version__)

Pandas Version 0.24.2




# 1. Simulate User Data

This section simulates the user entering various responses to questions. In the actual web tool, this input will come in the form of a dictionary, therefore this data should mimic that format.

In [2]:
#Simulate User Input

inputDict = dict()

'''
[{'NAME': 'sharad', 'AGE2': 27, 'IRSEX': 1, 'IREDUHIGHST2': 11}, {'IRALCAGE': 13, 'IRALCRC': 1, 'IRALCFY': 300, 
'BNGDRKMON': 1, 'HVYDRKMON': 1}, {'TXYRRECVD2': 1, 'TXEVRRCVD2': 1}, {'IRCIGRC': 1, 'CIGDLYMO': 1, 'CIGAGE': 13, 
'PIPEVER': 1, 'IRCGRRC': 1, 'IRSMKLSSREC': 1}, {'IRMJRC': 1, 'MJYRTOT': 300, 'FUMJ18': 1, 'FUMJ21': 1}, 
{'ADDPREV': 1, 'ADDSCEV': 1}, {'BOOKED': 1}]
'''

#DEMOGRAPHICS
inputDict['NAME'] = 'Joe Capstone' #We will delete this column
inputDict['IRSEX'] = 1 #Gender: 'Male' or 'Female'
inputDict['EDUHIGHCAT'] = 1 #Education:
inputDict['AGE2'] = 10 #Age: Remember, don't enter an age, but an age code from the codebook
    
#ALCOHOL
inputDict['IRALCRC'] = 9 #(Alcohol Recency)
inputDict['IRALCFY'] = 30 #(Alcohol Frequency Past Year)
inputDict['CABINGEVR'] = 2 #(Ever binge drank)
inputDict['IRALCAGE'] = 21 #(First time used alcohol)

#DRUGS + ALCOHOL
inputDict['TXYRRECVD2'] = 0 #(Ever alcohol/drug treatment, past yr)
inputDict['TXEVRRCVD2'] = 0 #(Ever alcohol/drug treatment, lifetime)


#TOBACCO
inputDict['IRCIGRC'] = 1 #(Tobacco Recency, incl. Never)
inputDict['CIGDLYMO'] = 1 #(Tobacco 30+ consecutive days)
inputDict['CIGAGE'] = 13 #(Tobacco Use Daily)
inputDict['TOBYR'] = 0 #(Used any tobacco product in past year, cigar, cigarette, etc.)
inputDict['FUCIG18'] = 1 #(Used cigarettes before 18)
    
#WEED
inputDict['IRMJRC'] = 1 #(Weed recency)
inputDict['IRMJFY'] = 2 #(Weed days in past year)
inputDict['FUMJ18'] = 1 #(First used weed prior to age 18)

#HARD DRUGS
inputDict['IRCOCRC'] = 9 #(Cocaine Recency)
inputDict['IRCRKRC'] = 9 #(Crack Recency)
inputDict['IRHERRC'] = 9 #(Heronie Recency)
inputDict['IRHALLUCREC'] = 9 #(Hallucinogen Recency)
inputDict['IRLSDRC'] = 9 #(LSD Recency)
inputDict['IRECSTMOREC'] = 9 #(Ecstacy Recency)
inputDict['IRINHALREC'] = 9 #(Inhalant Recency)
inputDict['IRMETHAMREC'] = 9 #(Meth Recency)

#DEPRESSION
inputDict['ADDPREV'] = 1 #(Several days of depression)
inputDict['ADDSCEV'] = 1 #(Several days of discouraged about life)
    
##OTHER
inputDict['BOOKED'] = 1 #(Ever arrested & booked)

print(inputDict)

#Convert to dataframe

{'NAME': 'Joe Capstone', 'IRSEX': 1, 'EDUHIGHCAT': 1, 'AGE2': 10, 'IRALCRC': 9, 'IRALCFY': 30, 'CABINGEVR': 2, 'IRALCAGE': 21, 'TXYRRECVD2': 0, 'TXEVRRCVD2': 0, 'IRCIGRC': 1, 'CIGDLYMO': 1, 'CIGAGE': 13, 'TOBYR': 0, 'FUCIG18': 1, 'IRMJRC': 1, 'IRMJFY': 2, 'FUMJ18': 1, 'IRCOCRC': 9, 'IRCRKRC': 9, 'IRHERRC': 9, 'IRHALLUCREC': 9, 'IRLSDRC': 9, 'IRECSTMOREC': 9, 'IRINHALREC': 9, 'IRMETHAMREC': 9, 'ADDPREV': 1, 'ADDSCEV': 1, 'BOOKED': 1}


# 1.b Web App Test

In [3]:
#Web App Test
runWebAppTest = True
predFI = None
if runWebAppTest:
    predProb, predPercentile, predFI = oe.generateReport(inputDict)  #This one line "does all the work"
    print('Predicted Probability: {:.3%}'.format(predProb))
    print('Percentile of Predicted Probability: {:.3%}'.format(predPercentile))
    print('Feature Importance (sorted low to high):')
predFI

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

  "l1_reg=\"auto\" is deprecated and in the next version (v0.29) the behavior will change from a " \
  "l1_reg=\"auto\" is deprecated and in the next version (v0.29) the behavior will change from a " \



Predicted Probability: 6.557%
Percentile of Predicted Probability: 26.924%
Feature Importance (sorted low to high):


{'ADDSCEV': -0.06609210091666896,
 'CIGAGE': -0.050404460773610216,
 'ADDPREV': -0.04036511120350936,
 'IRALCAGE': -0.04023273111175212,
 'CABINGEVR': -0.019380027545084208,
 'IRALCRC': -0.014202689546342073,
 'IRCOCRC': -0.008401323957656942,
 'IRMJFY': -0.007103847323714156,
 'IRCIGRC': -0.006596646223890737,
 'IRINHALREC': -0.006410258836447837,
 'IRHALLUCREC': -0.0061671297691400576,
 'BOOKED': -0.0036676635643996813,
 'IRECSTMOREC': -0.0020134739617030106,
 'IRHERRC': -0.001665827403587769,
 'IRMETHAMREC': -0.0012118191414676252,
 'IRCRKRC': 0.0,
 'TXEVRRCVD2': 0.0,
 'TXYRRECVD2': 0.0,
 'IRLSDRC': 0.0022588163441389503,
 'TOBYR': 0.005698609635221924,
 'FUMJ18': 0.0058140181124533355,
 'IRALCFY': 0.008204914271519578,
 'FUCIG18': 0.008330751534654957,
 'IRSEX': 0.008673005308168877,
 'IRMJRC': 0.015587193404258133,
 'AGE2': 0.02881319523126264,
 'EDUHIGHCAT': 0.03143855636371085,
 'CIGDLYMO': 0.04215656044555542}

# 2. Preprocess

In [4]:
#Convert inputs to list (pandas conversion to dataframe requires dict values to be lists)
if not runWebAppTest: 
    '''If we run our web app test, these next two lines already run in that and thus
    can't be run here (they'll double-list the dictionary)
    '''
    for k in inputDict:
        inputDict[k] = [inputDict[k]]
print(inputDict)

#Convert dict to dataframe
df = pd.DataFrame.from_dict(inputDict)

#Run preprocessing on dataframe
df = odp.preprocess(df)
df

{'NAME': ['Joe Capstone'], 'IRSEX': [1], 'EDUHIGHCAT': [1], 'AGE2': [10], 'IRALCRC': [9], 'IRALCFY': [30], 'CABINGEVR': [2], 'IRALCAGE': [21], 'TXYRRECVD2': [0], 'TXEVRRCVD2': [0], 'IRCIGRC': [1], 'CIGDLYMO': [1], 'CIGAGE': [13], 'TOBYR': [0], 'FUCIG18': [1], 'IRMJRC': [1], 'IRMJFY': [2], 'FUMJ18': [1], 'IRCOCRC': [9], 'IRCRKRC': [9], 'IRHERRC': [9], 'IRHALLUCREC': [9], 'IRLSDRC': [9], 'IRECSTMOREC': [9], 'IRINHALREC': [9], 'IRMETHAMREC': [9], 'ADDPREV': [1], 'ADDSCEV': [1], 'BOOKED': [1]}


Unnamed: 0,IRSEX,AGE2,TXYRRECVD2,TXEVRRCVD2,CIGDLYMO,TOBYR,FUCIG18,FUMJ18,IRCIGRC__GT1LTET2,IRCIGRC__GT2LTET3,IRCIGRC__GT3LTET4,IRCIGRC__GT4LTET9,CIGAGE__GT10LTET13,CIGAGE__GT13LTET15,CIGAGE__GT15LTET17,CIGAGE__GT17LTET18,CIGAGE__GT18LTET19,CIGAGE__GT19LTET20,CIGAGE__GT20LTET22,CIGAGE__GT22LTET25,CIGAGE__GT25LTET30,CIGAGE__GT30LTET40,CIGAGE__GT40LTET50,CIGAGE__GT50LTET99,CIGAGE__GT99LTET985,CIGAGE__GT985LTET991,CIGAGE__GT991LTET994,CIGAGE__GT994LTET997,CIGAGE__GT997LTET998,CIGAGE__GT998LTET999,IRMJRC__GT1LTET2,IRMJRC__GT2LTET3,IRMJRC__GT3LTET9,IRMJFY__GT1LTET2,IRMJFY__GT2LTET3,IRMJFY__GT3LTET7,IRMJFY__GT7LTET10,IRMJFY__GT10LTET20,IRMJFY__GT20LTET30,IRMJFY__GT30LTET40,IRMJFY__GT40LTET50,IRMJFY__GT50LTET100,IRMJFY__GT100LTET200,IRMJFY__GT200LTET300,IRMJFY__GT300LTET365,IRMJFY__GT365LTET985,IRMJFY__GT985LTET991,IRMJFY__GT991LTET993,IRCOCRC__GT1LTET2,IRCOCRC__GT2LTET3,IRCOCRC__GT3LTET9,IRCRKRC__GT1LTET2,IRCRKRC__GT2LTET3,IRCRKRC__GT3LTET9,IRHERRC__GT1LTET2,IRHERRC__GT2LTET3,IRHERRC__GT3LTET9,IRHALLUCREC__GT1LTET2,IRHALLUCREC__GT2LTET3,IRHALLUCREC__GT3LTET9,IRLSDRC__GT1LTET2,IRLSDRC__GT2LTET3,IRLSDRC__GT3LTET9,IRECSTMOREC__GT1LTET2,IRECSTMOREC__GT2LTET3,IRECSTMOREC__GT3LTET9,IRINHALREC__GT1LTET2,IRINHALREC__GT2LTET3,IRINHALREC__GT3LTET9,IRMETHAMREC__GT1LTET2,IRMETHAMREC__GT2LTET3,IRMETHAMREC__GT3LTET9,IRALCRC__GT1LTET2,IRALCRC__GT2LTET3,IRALCRC__GT3LTET9,IRALCFY__GT11LTET49,IRALCFY__GT49LTET99,IRALCFY__GT99LTET299,IRALCFY__GT299LTET365,IRALCFY__GT365LTET991,IRALCFY__GT991LTET993,CABINGEVR__GT1LTET2,CABINGEVR__GT2LTET85,CABINGEVR__GT85LTET91,CABINGEVR__GT91LTET94,CABINGEVR__GT94LTET97,CABINGEVR__GT97LTET98,IRALCAGE__GT14LTET17,IRALCAGE__GT17LTET20,IRALCAGE__GT20LTET21,IRALCAGE__GT21LTET100,IRALCAGE__GT100LTET991,ADDPREV__GT1LTET2,ADDPREV__GT2LTET85,ADDPREV__GT85LTET94,ADDPREV__GT94LTET97,ADDPREV__GT97LTET98,ADDSCEV__GT1LTET2,ADDSCEV__GT2LTET94,ADDSCEV__GT94LTET97,ADDSCEV__GT97LTET98,ADDSCEV__GT98LTET99,EDUHIGHCAT__GT1LTET2,EDUHIGHCAT__GT2LTET3,EDUHIGHCAT__GT3LTET4,EDUHIGHCAT__GT4LTET5,BOOKED__GT1LTET2,BOOKED__GT2LTET3,BOOKED__GT3LTET85,BOOKED__GT85LTET94,BOOKED__GT94LTET97,BOOKED__GT97LTET98
0,1,-0.821941,0,0,1,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [5]:
#Resort by column name (necessary to feed the model)
df = odp.sortDFbyColName(df)
df

Unnamed: 0,ADDPREV__GT1LTET2,ADDPREV__GT2LTET85,ADDPREV__GT85LTET94,ADDPREV__GT94LTET97,ADDPREV__GT97LTET98,ADDSCEV__GT1LTET2,ADDSCEV__GT2LTET94,ADDSCEV__GT94LTET97,ADDSCEV__GT97LTET98,ADDSCEV__GT98LTET99,AGE2,BOOKED__GT1LTET2,BOOKED__GT2LTET3,BOOKED__GT3LTET85,BOOKED__GT85LTET94,BOOKED__GT94LTET97,BOOKED__GT97LTET98,CABINGEVR__GT1LTET2,CABINGEVR__GT2LTET85,CABINGEVR__GT85LTET91,CABINGEVR__GT91LTET94,CABINGEVR__GT94LTET97,CABINGEVR__GT97LTET98,CIGAGE__GT10LTET13,CIGAGE__GT13LTET15,CIGAGE__GT15LTET17,CIGAGE__GT17LTET18,CIGAGE__GT18LTET19,CIGAGE__GT19LTET20,CIGAGE__GT20LTET22,CIGAGE__GT22LTET25,CIGAGE__GT25LTET30,CIGAGE__GT30LTET40,CIGAGE__GT40LTET50,CIGAGE__GT50LTET99,CIGAGE__GT985LTET991,CIGAGE__GT991LTET994,CIGAGE__GT994LTET997,CIGAGE__GT997LTET998,CIGAGE__GT998LTET999,CIGAGE__GT99LTET985,CIGDLYMO,EDUHIGHCAT__GT1LTET2,EDUHIGHCAT__GT2LTET3,EDUHIGHCAT__GT3LTET4,EDUHIGHCAT__GT4LTET5,FUCIG18,FUMJ18,IRALCAGE__GT100LTET991,IRALCAGE__GT14LTET17,IRALCAGE__GT17LTET20,IRALCAGE__GT20LTET21,IRALCAGE__GT21LTET100,IRALCFY__GT11LTET49,IRALCFY__GT299LTET365,IRALCFY__GT365LTET991,IRALCFY__GT49LTET99,IRALCFY__GT991LTET993,IRALCFY__GT99LTET299,IRALCRC__GT1LTET2,IRALCRC__GT2LTET3,IRALCRC__GT3LTET9,IRCIGRC__GT1LTET2,IRCIGRC__GT2LTET3,IRCIGRC__GT3LTET4,IRCIGRC__GT4LTET9,IRCOCRC__GT1LTET2,IRCOCRC__GT2LTET3,IRCOCRC__GT3LTET9,IRCRKRC__GT1LTET2,IRCRKRC__GT2LTET3,IRCRKRC__GT3LTET9,IRECSTMOREC__GT1LTET2,IRECSTMOREC__GT2LTET3,IRECSTMOREC__GT3LTET9,IRHALLUCREC__GT1LTET2,IRHALLUCREC__GT2LTET3,IRHALLUCREC__GT3LTET9,IRHERRC__GT1LTET2,IRHERRC__GT2LTET3,IRHERRC__GT3LTET9,IRINHALREC__GT1LTET2,IRINHALREC__GT2LTET3,IRINHALREC__GT3LTET9,IRLSDRC__GT1LTET2,IRLSDRC__GT2LTET3,IRLSDRC__GT3LTET9,IRMETHAMREC__GT1LTET2,IRMETHAMREC__GT2LTET3,IRMETHAMREC__GT3LTET9,IRMJFY__GT100LTET200,IRMJFY__GT10LTET20,IRMJFY__GT1LTET2,IRMJFY__GT200LTET300,IRMJFY__GT20LTET30,IRMJFY__GT2LTET3,IRMJFY__GT300LTET365,IRMJFY__GT30LTET40,IRMJFY__GT365LTET985,IRMJFY__GT3LTET7,IRMJFY__GT40LTET50,IRMJFY__GT50LTET100,IRMJFY__GT7LTET10,IRMJFY__GT985LTET991,IRMJFY__GT991LTET993,IRMJRC__GT1LTET2,IRMJRC__GT2LTET3,IRMJRC__GT3LTET9,IRSEX,TOBYR,TXEVRRCVD2,TXYRRECVD2
0,0,0,0,0,0,0,0,0,0,0,-0.821941,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [6]:
#Convert to numpy
inputArr = df.values
inputArr

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        -0.82194126,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  1. 

# 3. Generate Predicitons

In [7]:
#Load Models
#
model = joblib.load(dataDir+'calibLR.model')
explainer = joblib.load(dataDir+'modelLRCal.explainer')
probs = np.load(dataDir+'modelLRCalPredProbs.npy')

#XGB
# model = joblib.load(dataDir+'modelXGB.model')
# explainer = joblib.load(dataDir+'modelXGB.explainer')
# probs = np.load(dataDir+'modelXGBPredProbs.npy')

#Load feature names (column names)
colNamesList = joblib.load(dataDir+'colNamesList.zip')
colNamesList

In [8]:
#Calculate Prediciton
predM = model.predict_proba(inputArr)[0][1]
print('Predicted Probability: {:.3%}'.format(predM))

#Calculate Percentile
pct = stats.percentileofscore(probs, predM)/100
print('Percentile of Predicted Probability: {:.3%}'.format(pct))
print(inputArr)
#Generate shapley values from this row
shapVal = explainer.shap_values(inputArr)

#Aggregate shapley values for one-hot vectors
shapDict = defaultdict(list) #Handy: creates blank list if key doesn't exist, or appends to it if it does.

#Get everything before the '_' character of each column name
#Then create the column index numbers for those keys 
#These numbers correspond to the locations in the shapley output array
for i, colName in enumerate(colNamesList):
    shapDict[colName.split('_')[0]].append(i)
    
#Make a list of aggregated values shapley
for k in shapDict: #Loop through every key in the dict
    shapSum = 0.0 #Reset to 0
    for index in shapDict[k]: #Loop through every item in the key's value (a list of column indexes)
        shapSum += shapVal[1][0][index] #Add the value for each item
        #print('index',index,' | k', k, ' | shapVal[1][0][index]', shapVal[1][0][index])
    shapDict[k] = shapSum #Replace the list with the aggregated shapley value (the sum of each individual value)
    #print('NEXT k')

sortedShapDict = dict(sorted(shapDict.items(), key=operator.itemgetter(1)))
print('Feature Importance (sorted low to high):')
sortedShapDict

Predicted Probability: 6.557%
Percentile of Predicted Probability: 26.924%
[[ 0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.         -0.82194126  0.
   0.          0.          0.          0.          0.          1.
   0.          0.          0.          0.          0.          1.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          1.
   0.          0.          0.          0.          1.          1.
   0.          0.          0.          1.          0.          1.
   0.          0.          0.          0.          0.          0.
   0.          1.          0.          0.          0.          0.
   0.          0.          1.          0.          0.          1.
   0.          0.          1.          0.          0.          1.
   0.          0.          1.          0.          0.          1.
 

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

  "l1_reg=\"auto\" is deprecated and in the next version (v0.29) the behavior will change from a " \
  "l1_reg=\"auto\" is deprecated and in the next version (v0.29) the behavior will change from a " \



Feature Importance (sorted low to high):


{'ADDSCEV': -0.06613424730495042,
 'CIGAGE': -0.050054942524177935,
 'IRALCAGE': -0.042484038670229714,
 'ADDPREV': -0.04115183232883818,
 'CABINGEVR': -0.018898388798680853,
 'IRALCRC': -0.01443803853243178,
 'IRCOCRC': -0.008995303833057966,
 'IRHALLUCREC': -0.005858888406943902,
 'IRINHALREC': -0.005560425272360635,
 'IRMJFY': -0.005183830289100126,
 'IRCIGRC': -0.005178955890873735,
 'BOOKED': -0.002683875675943838,
 'IRMETHAMREC': -0.002603355241711597,
 'IRECSTMOREC': -0.0018323507280345874,
 'IRHERRC': -0.00180505271324332,
 'TXEVRRCVD2': -0.001099278515363164,
 'TXYRRECVD2': 0.0,
 'IRCRKRC': 0.0012182217909375986,
 'IRLSDRC': 0.003046881283926932,
 'TOBYR': 0.0048875375816410035,
 'FUMJ18': 0.005827514623689557,
 'IRALCFY': 0.00757383811702024,
 'IRSEX': 0.007833253959610598,
 'FUCIG18': 0.008541147207018568,
 'IRMJRC': 0.015838530833250285,
 'AGE2': 0.02876620059208984,
 'EDUHIGHCAT': 0.03120588077205578,
 'CIGDLYMO': 0.04228430733667117}