# Opioid Addiction Project
## Notebook 06: Full Pipeline

This notebook is intended to simulate the full pipeline using .py modules, starting with submitting a user's input, preprocessing that input, then running it through prediction tasks, resulting in the final output of the user's prediction scores.

### W210, Capstone
Summer 2019

Team:  Cameron Kennedy, Aditi Khullar, Rachel Kramer, Sharad Varadarajan

# 0. Load Libraries and Set Global Variables
The cells below import the required libraries and set the notebook-wide options.

In [1]:
#Import Required Libraries
import pandas as pd
import numpy as np
from sklearn.externals import joblib  #Used to save/load (pickle) models
from collections import defaultdict
import operator
from scipy import stats

#Custom data prep function used in both training and prediction 
import OpioidDataPrep as odp
import OpioidExecution as oe

#Notebook-wide settings: where the data files live and how much of a dataframe pandas displays
dataDir = './data/'
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 150)

#Record the pandas version alongside the outputs
print('Pandas Version', pd.__version__)

Pandas Version 0.24.2


# 1. Simulate User Data

This section simulates a user entering responses to the questionnaire. In the actual web tool, this input will arrive as a dictionary, so the simulated data below mimics that format.

In [2]:
#Simulate User Input
#Built as a single dict literal; key order matches the order the questions are listed in.
#(The conversion to a dataframe happens later, in the Preprocess section.)
inputDict = {
    #DEMOGRAPHICS
    'NAME': 'Joe Capstone',  #We will delete this column
    'IRSEX': 1,              #Gender: 'Male' or 'Female'
    'IREDUHIGHST2': 10,      #Education
    'AGE2': 16,              #Age

    #ALCOHOL
    'IRALCRC': 2,            #(Alcohol Recency)
    'IRALCFY': 12,           #(Alcohol Frequency Past Year)
    'BNGDRKMON': 1,          #(Binge drinking, past 30 days)
    'HVYDRKMON': 1,          #(Heavy drinking, past 30 days)
    'IRALCAGE': 17,          #(First time used alcohol)

    #DRUGS + ALCOHOL
    'TXYRRECVD2': 0,         #(Ever alcohol/drug treatment, past yr)
    'TXEVRRCVD2': 0,         #(Ever alcohol/drug treatment, lifetime)

    #TOBACCO
    'IRCIGRC': 9,            #(Tobacco Recency, incl. Never)
    'CIGDLYMO': 91,          #(Tobacco 30+ consecutive days)
    'CIGAGE': 18,            #(Tobacco Use Daily)
    'PIPEVER': 2,            #(Ever smoked a pipe)
    'IRCGRRC': 4,            #(Cigar recency)
    'IRSMKLSSREC': 4,        #(Smokeless tobacco recency)

    #WEED
    'IRMJRC': 1,             #(Weed recency)
    'MJYRTOT': 250,          #(Weed days in past year)
    'FUMJ18': 2,             #(First used weed prior to age 18)
    'FUMJ21': 2,             #(First used weed prior to age 21)

    #DEPRESSION
    'ADDPREV': 2,            #(Several days of depression)
    'ADDSCEV': 2,            #(Several days of discouraged about life)

    #OTHER
    'BOOKED': 1,             #(Ever arrested & booked)
}

print(inputDict)

{'NAME': 'Joe Capstone', 'IRSEX': 1, 'IREDUHIGHST2': 10, 'AGE2': 16, 'IRALCRC': 2, 'IRALCFY': 12, 'BNGDRKMON': 1, 'HVYDRKMON': 1, 'IRALCAGE': 17, 'TXYRRECVD2': 0, 'TXEVRRCVD2': 0, 'IRCIGRC': 9, 'CIGDLYMO': 91, 'CIGAGE': 18, 'PIPEVER': 2, 'IRCGRRC': 4, 'IRSMKLSSREC': 4, 'IRMJRC': 1, 'MJYRTOT': 250, 'FUMJ18': 2, 'FUMJ21': 2, 'ADDPREV': 2, 'ADDSCEV': 2, 'BOOKED': 1}


# 1.b Web App Test

In [3]:
#Web App Test
#When enabled, runs the packaged report function on the simulated input and prints
#the three results it returns.
#NOTE(review): generateReport appears to wrap each inputDict value in a list in place
#(the dict printed after this call shows list values) -- confirm in OpioidExecution.
runWebAppTest = True
predFI = None  #Stays None, so the cell displays nothing, when the test is switched off
if runWebAppTest:
    predProb, predPercentile, predFI = oe.generateReport(inputDict)
    print('Predicted Probability: {:.0%}'.format(predProb),
          'Percentile of Predicted Probability: {:.0%}'.format(predPercentile),
          'Feature Importance (sorted low to high):',
          sep='\n')
predFI

  from numpy.core.umath_tests import inner1d


Predicted Probability: 19%
Percentile of Predicted Probability: 54%
Feature Importance (sorted low to high):


{'AGE2': -0.07034271329944299,
 'FUMJ18': -0.05475428228451216,
 'IRALCAGE': -0.01582246322497748,
 'IREDUHIGHST2': -0.011477780569389818,
 'FUMJ21': -0.008926288815295105,
 'ADDPREV': -0.007881381349512806,
 'ADDSCEV': -0.005710015969690811,
 'IRCGRRC': -0.0048118618005509,
 'TXYRRECVD2': -0.002784593952038717,
 'TXEVRRCVD2': -0.0024843315189008465,
 'IRALCRC': -0.0019848097551581094,
 'CIGAGE': -0.0009058336820166155,
 'CIGDLYMO': -0.0008530496789806548,
 'IRSMKLSSREC': -0.0007598195700566725,
 'PIPEVER': -0.0005910768090568246,
 'IRCIGRC': 0.005773916708237226,
 'IRALCFY': 0.011330906871532287,
 'BOOKED': 0.0171481296606952,
 'BNGDRKMON': 0.017340029782525884,
 'HVYDRKMON': 0.020525789289245318,
 'IRSEX': 0.02216130375120372,
 'IRMJRC': 0.028511010662871428,
 'MJYRTOT': 0.03430067116484938}

# 2. Preprocess

In [4]:
#Convert inputs to lists (pandas conversion to dataframe requires dict values to be lists)
#Only values that are not already lists get wrapped. The web app test performs this same
#wrapping on inputDict when it runs, so checking each value (rather than a flag) gives the
#same result whether or not that test ran, and re-running this cell never double-lists.
for k, v in inputDict.items():
    if not isinstance(v, list):
        inputDict[k] = [v]  #Replacing the value of an existing key is safe while iterating
print(inputDict)

#Convert dict to dataframe (one row, one column per input key)
df = pd.DataFrame.from_dict(inputDict)

#Run preprocessing on dataframe (same custom data prep module used for training)
df = odp.preprocess(df)
df

{'NAME': ['Joe Capstone'], 'IRSEX': [1], 'IREDUHIGHST2': [10], 'AGE2': [16], 'IRALCRC': [2], 'IRALCFY': [12], 'BNGDRKMON': [1], 'HVYDRKMON': [1], 'IRALCAGE': [17], 'TXYRRECVD2': [0], 'TXEVRRCVD2': [0], 'IRCIGRC': [9], 'CIGDLYMO': [91], 'CIGAGE': [18], 'PIPEVER': [2], 'IRCGRRC': [4], 'IRSMKLSSREC': [4], 'IRMJRC': [1], 'MJYRTOT': [250], 'FUMJ18': [2], 'FUMJ21': [2], 'ADDPREV': [2], 'ADDSCEV': [2], 'BOOKED': [1]}


Unnamed: 0,IRSEX,IREDUHIGHST2,AGE2,BNGDRKMON,HVYDRKMON,TXYRRECVD2,TXEVRRCVD2,FUMJ18,FUMJ21,IRCIGRC__GT1LTET2,IRCIGRC__GT2LTET3,IRCIGRC__GT3LTET4,IRCIGRC__GT4LTET9,CIGDLYMO__GT1LTET2,CIGDLYMO__GT2LTET5,CIGDLYMO__GT5LTET91,CIGDLYMO__GT91LTET94,CIGDLYMO__GT94LTET97,CIGAGE__GT10LTET13,CIGAGE__GT13LTET15,CIGAGE__GT15LTET17,CIGAGE__GT17LTET18,CIGAGE__GT18LTET19,CIGAGE__GT19LTET20,CIGAGE__GT20LTET22,CIGAGE__GT22LTET25,CIGAGE__GT25LTET30,CIGAGE__GT30LTET40,CIGAGE__GT40LTET50,CIGAGE__GT50LTET99,CIGAGE__GT99LTET985,CIGAGE__GT985LTET991,CIGAGE__GT991LTET994,CIGAGE__GT994LTET997,CIGAGE__GT997LTET998,CIGAGE__GT998LTET999,IRSMKLSSREC__GT1LTET2,IRSMKLSSREC__GT2LTET3,IRSMKLSSREC__GT3LTET4,IRSMKLSSREC__GT4LTET9,IRCGRRC__GT1LTET2,IRCGRRC__GT2LTET3,IRCGRRC__GT3LTET4,IRCGRRC__GT4LTET9,PIPEVER__GT1LTET2,PIPEVER__GT2LTET94,PIPEVER__GT94LTET97,IRMJRC__GT1LTET2,IRMJRC__GT2LTET3,IRMJRC__GT3LTET9,MJYRTOT__GT1LTET2,MJYRTOT__GT2LTET3,MJYRTOT__GT3LTET7,MJYRTOT__GT7LTET10,MJYRTOT__GT10LTET20,MJYRTOT__GT20LTET30,MJYRTOT__GT30LTET40,MJYRTOT__GT40LTET50,MJYRTOT__GT50LTET100,MJYRTOT__GT100LTET150,MJYRTOT__GT150LTET200,MJYRTOT__GT200LTET250,MJYRTOT__GT250LTET365,MJYRTOT__GT365LTET985,MJYRTOT__GT985LTET991,MJYRTOT__GT991LTET993,MJYRTOT__GT993LTET994,MJYRTOT__GT994LTET997,MJYRTOT__GT997LTET998,IRALCRC__GT1LTET2,IRALCRC__GT2LTET3,IRALCRC__GT3LTET9,IRALCFY__GT1LTET2,IRALCFY__GT2LTET3,IRALCFY__GT3LTET7,IRALCFY__GT7LTET10,IRALCFY__GT10LTET20,IRALCFY__GT20LTET30,IRALCFY__GT30LTET40,IRALCFY__GT40LTET50,IRALCFY__GT50LTET100,IRALCFY__GT100LTET150,IRALCFY__GT150LTET200,IRALCFY__GT200LTET250,IRALCFY__GT250LTET365,IRALCFY__GT365LTET991,IRALCFY__GT991LTET993,IRALCAGE__GT10LTET13,IRALCAGE__GT13LTET15,IRALCAGE__GT15LTET16,IRALCAGE__GT16LTET17,IRALCAGE__GT17LTET18,IRALCAGE__GT18LTET19,IRALCAGE__GT19LTET20,IRALCAGE__GT20LTET21,IRALCAGE__GT21LTET22,IRALCAGE__GT22LTET23,IRALCAGE__GT23LTET25,IRALCAGE__GT25LTET30,IRALCAGE__GT30LTET40,IRALCAGE__GT40LTET50,IRALCAGE__GT50LTET100,IRALCAGE__GT100LTET991,ADDPREV__GT1LTET2,ADDPR
EV__GT2LTET85,ADDPREV__GT85LTET94,ADDPREV__GT94LTET97,ADDPREV__GT97LTET98,ADDPREV__GT98LTET99,ADDSCEV__GT1LTET2,ADDSCEV__GT2LTET94,ADDSCEV__GT94LTET97,ADDSCEV__GT97LTET98,ADDSCEV__GT98LTET99,BOOKED__GT1LTET2,BOOKED__GT2LTET3,BOOKED__GT3LTET85,BOOKED__GT85LTET94,BOOKED__GT94LTET97,BOOKED__GT97LTET98
0,1,0.838544,1.37925,1,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [5]:
#Reorder the columns alphabetically by name (necessary to feed the model)
df = df[sorted(df.columns)]

# 3. Generate Predictions

In [6]:
#Load Models
#Locations of the saved artifacts inside the data directory
modelFile = dataDir + 'modelXGB.model'
explainerFile = dataDir + 'calibXGB.explainer'  ###NEED TO FIX THIS NAME
probsFile = dataDir + 'modelXGBPredProbs.npy'

#NOTE: joblib.load unpickles the file, which can execute arbitrary code -- only load trusted files
model = joblib.load(modelFile)
explainer = joblib.load(explainerFile)

#Array used later as the reference distribution for the percentile calculation
#(presumably the model's predicted probabilities on a reference dataset -- confirm)
probs = np.load(probsFile)



In [7]:
#Calculate Prediction
#[0][1] takes the first row's second probability column (presumably the positive class -- confirm)
predM = model.predict_proba(df)[0][1]
print('Predicted Probability: {:.0%}'.format(predM))

#Calculate Percentile
#percentileofscore returns a value on a 0-100 scale; divide by 100 so it formats as a percentage
pct = stats.percentileofscore(probs, predM)/100
print('Percentile of Predicted Probability: {:.0%}'.format(pct))

#Generate shapley values from this row
shapVal = explainer.shap_values(df)

#Aggregate shapley values for one-hot vectors
#Map each base feature name to the positions of its columns in df; these positions are
#used below to index into the shapley output array.
#One-hot columns are named '<FEATURE>__<bin>', so split on the double underscore: splitting
#on a single '_' would mis-group any feature whose own name contains an underscore.
featureColIdx = defaultdict(list) #Handy: creates blank list if key doesn't exist, or appends to it if it does.
for i, colName in enumerate(df.columns):
    featureColIdx[colName.split('__', 1)[0]].append(i)

#Sum the first row's shapley values over each feature's columns.
#The 0.0 start value keeps the accumulation in float arithmetic from the first addition.
shapDict = {feature: sum((shapVal[0][i] for i in colIdx), 0.0)
            for feature, colIdx in featureColIdx.items()}

#Order features from most negative to most positive contribution
sortedShapDict = dict(sorted(shapDict.items(), key=operator.itemgetter(1)))
print('Feature Importance (sorted low to high):')
sortedShapDict

Predicted Probability: 19%
Percentile of Predicted Probability: 54%
Feature Importance (sorted low to high):


{'AGE2': -0.07034271329944299,
 'FUMJ18': -0.05475428228451216,
 'IRALCAGE': -0.01582246322497748,
 'IREDUHIGHST2': -0.011477780569389818,
 'FUMJ21': -0.008926288815295105,
 'ADDPREV': -0.007881381349512806,
 'ADDSCEV': -0.005710015969690811,
 'IRCGRRC': -0.0048118618005509,
 'TXYRRECVD2': -0.002784593952038717,
 'TXEVRRCVD2': -0.0024843315189008465,
 'IRALCRC': -0.0019848097551581094,
 'CIGAGE': -0.0009058336820166155,
 'CIGDLYMO': -0.0008530496789806548,
 'IRSMKLSSREC': -0.0007598195700566725,
 'PIPEVER': -0.0005910768090568246,
 'IRCIGRC': 0.005773916708237226,
 'IRALCFY': 0.011330906871532287,
 'BOOKED': 0.0171481296606952,
 'BNGDRKMON': 0.017340029782525884,
 'HVYDRKMON': 0.020525789289245318,
 'IRSEX': 0.02216130375120372,
 'IRMJRC': 0.028511010662871428,
 'MJYRTOT': 0.03430067116484938}