# Opioid Addiction Project
## Notebook 06: Full Pipeline

This notebook is intended to simulate the full pipeline using .py modules, starting with submitting a user's input, preprocessing that input, then running it through prediction tasks, resulting in the final output of the user's prediction scores.

### W210, Capstone
Summer 2019

Team:  Cameron Kennedy, Aditi Khullar, Rachel Kramer, Sharad Varadarajan

# 0. Load Libraries and Set Global Variables
The cell below imports the required libraries and sets the notebook-wide display options and data directory.

In [1]:
#Import Required Libraries
import pandas as pd
import numpy as np
try:
    import joblib  #Used to save/load (pickle) models
except ImportError:
    #Fallback for older environments that only have the copy bundled with scikit-learn.
    #sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in 0.23,
    #so the standalone package is tried first.
    from sklearn.externals import joblib
from collections import defaultdict
import operator
from scipy import stats

#Custom data prep function used in both training and prediction
import OpioidDataPrep as odp
import OpioidExecution as oe

#Set initial parameter(s)
#Both display limits go through pd.set_option so they are configured the same way
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 150)
dataDir = './data/'  #Folder the saved model artifacts are loaded from in section 3

print('Pandas Version', pd.__version__)

Pandas Version 0.24.2




# 1. Simulate User Data

This section simulates the user entering various responses to questions. In the actual web tool, this input will come in the form of a dictionary, therefore this data should mimic that format.

In [2]:
#Simulate User Input
#
#Sample payload kept for reference (presumably the shape the web tool submits -- TODO confirm):
#[{'NAME': 'sharad', 'AGE2': 27, 'IRSEX': 1, 'IREDUHIGHST2': 11}, {'IRALCAGE': 13, 'IRALCRC': 1, 'IRALCFY': 300,
#'BNGDRKMON': 1, 'HVYDRKMON': 1}, {'TXYRRECVD2': 1, 'TXEVRRCVD2': 1}, {'IRCIGRC': 1, 'CIGDLYMO': 1, 'CIGAGE': 13,
#'PIPEVER': 1, 'IRCGRRC': 1, 'IRSMKLSSREC': 1}, {'IRMJRC': 1, 'MJYRTOT': 300, 'FUMJ18': 1, 'FUMJ21': 1},
#{'ADDPREV': 1, 'ADDSCEV': 1}, {'BOOKED': 1}]

#One flat dictionary of survey-coded answers; key order is kept exactly as entered.
inputDict = {
    #DEMOGRAPHICS
    'NAME': 'Joe Capstone',  #We will delete this column
    'IRSEX': 1,              #Gender (numeric code -- TODO confirm which code is Male/Female)
    'EDUHIGHCAT': 1,         #Education
    'AGE2': 10,              #Age: not the age itself, but an age code from the codebook

    #ALCOHOL
    'IRALCRC': 1,            #(Alcohol Recency)
    'IRALCFY': 300,          #(Alcohol Frequency Past Year)
    'BNGDRKMON': 1,          #(Binge drinking, past 30 days)
    'HVYDRKMON': 1,          #(Heavy drinking, past 30 days)
    'IRALCAGE': 13,          #(First time used alcohol)

    #DRUGS + ALCOHOL
    'TXYRRECVD2': 1,         #(Ever alcohol/drug treatment, past yr)
    'TXEVRRCVD2': 1,         #(Ever alcohol/drug treatment, lifetime)

    #TOBACCO
    'IRCIGRC': 1,            #(Tobacco Recency, incl. Never)
    'CIGDLYMO': 1,           #(Tobacco 30+ consecutive days)
    'CIGAGE': 13,            #(Tobacco Use Daily; presumably an age -- TODO confirm against codebook)
    'TOBYR': 1,              #(Used any tobacco product in past year, cigar, cigarette, etc.)
    'FUCIG18': 1,            #(Used cigarettes before 18)

    #WEED
    'IRMJRC': 1,             #(Weed recency)
    'MJYRTOT': 300,          #(Weed days in past year)
    'FUMJ18': 1,             #(First used weed prior to age 18)

    #HARD DRUGS
    'IRCOCRC': 1,            #(Cocaine Recency)
    'IRCRKRC': 1,            #(Crack Recency)
    'IRHERRC': 1,            #(Heroin Recency)
    'IRHALLUCREC': 1,        #(Hallucinogen Recency)
    'IRLSDRC': 1,            #(LSD Recency)
    'IRECSTMOREC': 1,        #(Ecstasy Recency)
    'IRINHALREC': 1,         #(Inhalant Recency)
    'IRMETHAMREC': 1,        #(Meth Recency)

    #DEPRESSION
    'ADDPREV': 1,            #(Several days of depression)
    'ADDSCEV': 1,            #(Several days of discouraged about life)

    #OTHER
    'BOOKED': 1,             #(Ever arrested & booked)
}

print(inputDict)

#Conversion to a dataframe happens in section 2 (Preprocess)

{'NAME': 'Joe Capstone', 'IRSEX': 1, 'EDUHIGHCAT': 1, 'AGE2': 10, 'IRALCRC': 1, 'IRALCFY': 300, 'BNGDRKMON': 1, 'HVYDRKMON': 1, 'IRALCAGE': 13, 'TXYRRECVD2': 1, 'TXEVRRCVD2': 1, 'IRCIGRC': 1, 'CIGDLYMO': 1, 'CIGAGE': 13, 'TOBYR': 1, 'FUCIG18': 1, 'IRMJRC': 1, 'MJYRTOT': 300, 'FUMJ18': 1, 'IRCOCRC': 1, 'IRCRKRC': 1, 'IRHERRC': 1, 'IRHALLUCREC': 1, 'IRLSDRC': 1, 'IRECSTMOREC': 1, 'IRINHALREC': 1, 'IRMETHAMREC': 1, 'ADDPREV': 1, 'ADDSCEV': 1, 'BOOKED': 1}


# 1.b Web App Test

In [3]:
#Web App Test
#Optionally runs oe.generateReport on the simulated input and shows the three values it returns.
runWebAppTest = True
predFI = None  #Default so the final display line still works when the test is skipped
if runWebAppTest:
    #NOTE(review): generateReport appears to wrap inputDict's values in lists in place
    #(the inputDict printed in section 2 shows list values after this runs) -- confirm in OpioidExecution.
    predProb, predPercentile, predFI = oe.generateReport(inputDict)
    print('Predicted Probability: {:.0%}'.format(predProb))
    print('Percentile of Predicted Probability: {:.0%}'.format(predPercentile))
    print('Feature Importance (sorted low to high):')
#Bare last expression so the notebook displays predFI (None when the test is skipped)
predFI

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

  "l1_reg=\"auto\" is deprecated and in the next version (v0.29) the behavior will change from a " \
  "l1_reg=\"auto\" is deprecated and in the next version (v0.29) the behavior will change from a " \



Predicted Probability: 100%
Percentile of Predicted Probability: 100%
Feature Importance (sorted low to high):


{'IRHERRC': -0.26379721320060784,
 'IRMETHAMREC': -0.08862917640620982,
 'IRCOCRC': -0.08214861378602983,
 'TXYRRECVD2': -0.073244394572053,
 'IRHALLUCREC': -0.06124040212983778,
 'IRMJRC': -0.055176169363102434,
 'IRALCFY': -0.04583042098862061,
 'MJYRTOT': -0.04497131303443824,
 'AGE2': -0.04336483109070938,
 'IRINHALREC': -0.03750366171365804,
 'IRCIGRC': -0.02691198017321189,
 'BNGDRKMON': -0.023559372372127962,
 'IRSEX': -0.012755859039804696,
 'FUMJ18': -0.007093005114793494,
 'TXEVRRCVD2': -0.006535275911450089,
 'HVYDRKMON': -0.004271845668672725,
 'IRALCRC': -0.003628604809423619,
 'BOOKED': 0.0,
 'CIGDLYMO': 0.0,
 'EDUHIGHCAT': 0.0,
 'FUCIG18': 0.0,
 'IRALCAGE': 0.0,
 'IRECSTMOREC': 0.0,
 'TOBYR': 0.0,
 'IRCRKRC': 0.027348352340448856,
 'IRLSDRC': 0.027746858264668994,
 'ADDPREV': 0.04075247096635881,
 'CIGAGE': 0.04711927855967421,
 'ADDSCEV': 0.05116025186075901}

# 2. Preprocess

In [4]:
#Convert inputs to lists (pandas conversion to dataframe requires dict values to be lists)
#The web app test in section 1.b already leaves the values wrapped in lists, so only
#values that are still scalars get wrapped. Checking each value (instead of the
#runWebAppTest flag) keeps this cell safe to re-run and independent of whether
#section 1.b was executed: values are never double-listed.
for k in inputDict:
    if not isinstance(inputDict[k], list):
        inputDict[k] = [inputDict[k]]
print(inputDict)

#Convert dict to dataframe (one row, one column per answer)
df = pd.DataFrame.from_dict(inputDict)

#Run preprocessing on dataframe
df = odp.preprocess(df)
df

{'NAME': ['Joe Capstone'], 'IRSEX': [1], 'EDUHIGHCAT': [1], 'AGE2': [10], 'IRALCRC': [1], 'IRALCFY': [300], 'BNGDRKMON': [1], 'HVYDRKMON': [1], 'IRALCAGE': [13], 'TXYRRECVD2': [1], 'TXEVRRCVD2': [1], 'IRCIGRC': [1], 'CIGDLYMO': [1], 'CIGAGE': [13], 'TOBYR': [1], 'FUCIG18': [1], 'IRMJRC': [1], 'MJYRTOT': [300], 'FUMJ18': [1], 'IRCOCRC': [1], 'IRCRKRC': [1], 'IRHERRC': [1], 'IRHALLUCREC': [1], 'IRLSDRC': [1], 'IRECSTMOREC': [1], 'IRINHALREC': [1], 'IRMETHAMREC': [1], 'ADDPREV': [1], 'ADDSCEV': [1], 'BOOKED': [1]}


Unnamed: 0,IRSEX,AGE2,BNGDRKMON,HVYDRKMON,TXYRRECVD2,TXEVRRCVD2,TOBYR,FUCIG18,FUMJ18,IRCIGRC__GT1LTET2,IRCIGRC__GT2LTET3,IRCIGRC__GT3LTET4,IRCIGRC__GT4LTET9,CIGDLYMO__GT1LTET2,CIGDLYMO__GT2LTET5,CIGDLYMO__GT5LTET91,CIGDLYMO__GT91LTET94,CIGDLYMO__GT94LTET97,CIGAGE__GT10LTET13,CIGAGE__GT13LTET15,CIGAGE__GT15LTET17,CIGAGE__GT17LTET18,CIGAGE__GT18LTET19,CIGAGE__GT19LTET20,CIGAGE__GT20LTET22,CIGAGE__GT22LTET25,CIGAGE__GT25LTET30,CIGAGE__GT30LTET40,CIGAGE__GT40LTET50,CIGAGE__GT50LTET99,CIGAGE__GT99LTET985,CIGAGE__GT985LTET991,CIGAGE__GT991LTET994,CIGAGE__GT994LTET997,CIGAGE__GT997LTET998,CIGAGE__GT998LTET999,IRMJRC__GT1LTET2,IRMJRC__GT2LTET3,IRMJRC__GT3LTET9,MJYRTOT__GT1LTET2,MJYRTOT__GT2LTET3,MJYRTOT__GT3LTET7,MJYRTOT__GT7LTET10,MJYRTOT__GT10LTET20,MJYRTOT__GT20LTET30,MJYRTOT__GT30LTET40,MJYRTOT__GT40LTET50,MJYRTOT__GT50LTET100,MJYRTOT__GT100LTET150,MJYRTOT__GT150LTET200,MJYRTOT__GT200LTET250,MJYRTOT__GT250LTET365,MJYRTOT__GT365LTET985,MJYRTOT__GT985LTET991,MJYRTOT__GT991LTET993,MJYRTOT__GT993LTET994,MJYRTOT__GT994LTET997,MJYRTOT__GT997LTET998,IRCOCRC__GT1LTET2,IRCOCRC__GT2LTET3,IRCOCRC__GT3LTET9,IRCRKRC__GT1LTET2,IRCRKRC__GT2LTET3,IRCRKRC__GT3LTET9,IRHERRC__GT1LTET2,IRHERRC__GT2LTET3,IRHERRC__GT3LTET9,IRHALLUCREC__GT1LTET2,IRHALLUCREC__GT2LTET3,IRHALLUCREC__GT3LTET9,IRLSDRC__GT1LTET2,IRLSDRC__GT2LTET3,IRLSDRC__GT3LTET9,IRECSTMOREC__GT1LTET2,IRECSTMOREC__GT2LTET3,IRECSTMOREC__GT3LTET9,IRINHALREC__GT1LTET2,IRINHALREC__GT2LTET3,IRINHALREC__GT3LTET9,IRMETHAMREC__GT1LTET2,IRMETHAMREC__GT2LTET3,IRMETHAMREC__GT3LTET9,IRALCRC__GT1LTET2,IRALCRC__GT2LTET3,IRALCRC__GT3LTET9,IRALCFY__GT11LTET49,IRALCFY__GT49LTET99,IRALCFY__GT99LTET299,IRALCFY__GT299LTET365,IRALCFY__GT365LTET993,IRALCAGE__GT14LTET17,IRALCAGE__GT17LTET20,IRALCAGE__GT20LTET21,IRALCAGE__GT21LTET100,IRALCAGE__GT100LTET991,ADDPREV__GT1LTET2,ADDPREV__GT2LTET85,ADDPREV__GT85LTET94,ADDPREV__GT94LTET97,ADDPREV__GT97LTET98,ADDPREV__GT98LTET99,ADDSCEV__GT1LTET2,ADDSCEV__GT2LTET94,ADDSCEV__GT94LTET97,ADDSCEV__GT97LTET98,ADDSCEV
__GT98LTET99,EDUHIGHCAT__GT1LTET2,EDUHIGHCAT__GT2LTET3,EDUHIGHCAT__GT3LTET4,EDUHIGHCAT__GT4LTET5,BOOKED__GT1LTET2,BOOKED__GT2LTET3,BOOKED__GT3LTET85,BOOKED__GT85LTET94,BOOKED__GT94LTET97,BOOKED__GT97LTET98
0,1,-0.821941,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [5]:
#Resort by column name (necessary to feed the model)
#NOTE(review): presumably this reproduces the column order used at training time -- confirm in OpioidDataPrep.
df = odp.sortDFbyColName(df)

# 3. Generate Predictions

In [6]:
#Load Models
#All three artifacts are read from dataDir (set in section 0).
#NOTE: joblib.load unpickles the file, and unpickling can execute arbitrary code --
#only load artifacts from a trusted source.
model = joblib.load(dataDir+'modelXGB.model')
explainer = joblib.load(dataDir+'modelXGB.explainer') #TODO: fix this file name
#Saved array of predicted probabilities; section 3 uses it as the reference set for the percentile
probs = np.load(dataDir+'modelXGBPredProbs.npy')

In [7]:
#Calculate Prediction
#First (only) row, second column of predict_proba
predM = model.predict_proba(df)[0][1]
print('Predicted Probability: {:.0%}'.format(predM))

#Calculate Percentile
#percentileofscore returns a value on a 0-100 scale; divide by 100 for the '%' format
pct = stats.percentileofscore(probs, predM)/100
print('Percentile of Predicted Probability: {:.0%}'.format(pct))

#Generate shapley values from this row
shapVal = explainer.shap_values(df)

#Aggregate shapley values for one-hot vectors
#One-hot columns are named '<FEATURE>__<bucket>' (see the preprocess output), so the
#original feature name is everything before the '__' separator. Splitting on the full
#separator (rather than a single '_') keeps features whose own name contains '_' intact.
#Each feature maps to the positions of its columns, which correspond to the
#locations in the shapley output array.
featureCols = defaultdict(list) #Handy: creates blank list if key doesn't exist, or appends to it if it does.
for i, colName in enumerate(df.columns):
    featureCols[colName.split('__')[0]].append(i)

#Sum the shapley values of every column belonging to the same feature.
#A separate plain dict is used so the index lists and the sums are never mixed in one object.
shapDict = {}
for feature, indexes in featureCols.items(): #Loop through every feature and its column indexes
    shapSum = 0.0 #Reset to 0
    for index in indexes:
        shapSum += shapVal[0][index] #Add the value for each column of this feature
    shapDict[feature] = shapSum

#Sort ascending by aggregated value (ties keep column order because sorted() is stable)
sortedShapDict = dict(sorted(shapDict.items(), key=operator.itemgetter(1)))
print('Feature Importance (sorted low to high):')
sortedShapDict

Predicted Probability: 86%
Percentile of Predicted Probability: 100%
Feature Importance (sorted low to high):


{'EDUHIGHCAT': -0.0021935792159650104,
 'CIGAGE': -0.0008303263945222013,
 'IRALCRC': -0.0006611972139493815,
 'CIGDLYMO': -0.00026649995338464896,
 'IRCRKRC': 0.0027281598217542024,
 'ADDSCEV': 0.0033298357517283334,
 'ADDPREV': 0.00343817348622385,
 'IRALCFY': 0.003460106364940471,
 'FUCIG18': 0.0040184089721982735,
 'IRMETHAMREC': 0.004282882109325955,
 'IRALCAGE': 0.004332542444573209,
 'BOOKED': 0.004643238291372727,
 'IRCIGRC': 0.00922987188991701,
 'IRLSDRC': 0.009482614160373132,
 'BNGDRKMON': 0.010092777871852844,
 'HVYDRKMON': 0.013334964570338592,
 'TXYRRECVD2': 0.014156019608149826,
 'TXEVRRCVD2': 0.014678119289093568,
 'IRSEX': 0.015079223614524606,
 'IRECSTMOREC': 0.015634444437217374,
 'MJYRTOT': 0.018681136843292343,
 'TOBYR': 0.023408927596039148,
 'IRINHALREC': 0.029627892156281706,
 'IRMJRC': 0.03042846629809108,
 'IRHALLUCREC': 0.037571602799557645,
 'IRCOCRC': 0.04387296330175467,
 'IRHERRC': 0.05256870886890763,
 'AGE2': 0.05390153166220909,
 'FUMJ18': 0.076456880