# Opioid Addiction Project
## Notebook 06: Full Pipeline

This notebook is intended to simulate the full pipeline using .py modules, starting with submitting a user's input, preprocessing that input, then running it through prediction tasks, resulting in the final output of the user's prediction scores.

### W210, Capstone
Summer 2019

Team:  Cameron Kennedy, Aditi Khullar, Rachel Kramer, Sharad Varadarajan

# 0. Load Libraries and Set Global Variables
The cell below imports the required libraries and sets the global display options and data directory.

In [1]:
#Import Required Libraries
import pandas as pd
import numpy as np
try:
    from sklearn.externals import joblib  #Used to save/load (pickle) models
except ImportError:
    #sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in 0.23.
    #Fall back to the standalone joblib package so the notebook still runs on a fresh
    #environment; older environments keep using the original import above.
    import joblib
from collections import defaultdict
import operator
from scipy import stats

#Custom data prep function used in both training and prediction 
import OpioidDataPrep as odp
import OpioidExecution as oe

#Set initial parameter(s)
#Display limits are raised so wide/long dataframes are shown without truncation
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 150)
dataDir = './data/'  #Folder holding the saved model artifacts loaded later on

print('Pandas Version', pd.__version__)

Pandas Version 0.24.2


# 1. Simulate User Data

This section simulates the user entering various responses to questions. In the actual web tool, this input will come in the form of a dictionary, therefore this data should mimic that format.

In [2]:
#Simulate User Input
#
#Example of the payload shape produced by the web tool (a list of per-page dicts):
#[{'NAME': 'sharad', 'AGE2': 27, 'IRSEX': 1, 'IREDUHIGHST2': 11}, {'IRALCAGE': 13, 'IRALCRC': 1, 'IRALCFY': 300, 
#'BNGDRKMON': 1, 'HVYDRKMON': 1}, {'TXYRRECVD2': 1, 'TXEVRRCVD2': 1}, {'IRCIGRC': 1, 'CIGDLYMO': 1, 'CIGAGE': 13, 
#'PIPEVER': 1, 'IRCGRRC': 1, 'IRSMKLSSREC': 1}, {'IRMJRC': 1, 'MJYRTOT': 300, 'FUMJ18': 1, 'FUMJ21': 1}, 
#{'ADDPREV': 1, 'ADDSCEV': 1}, {'BOOKED': 1}]
#
#Here the answers are flattened into a single dict, one key per survey variable.
#Key order is kept exactly as the answers are listed below.
inputDict = {
    #DEMOGRAPHICS
    'NAME': 'Joe Capstone',  #We will delete this column
    'IRSEX': 1,              #Gender: 'Male' or 'Female'
    'IREDUHIGHST2': 11,      #Education
    'AGE2': 27,              #Age

    #ALCOHOL
    'IRALCRC': 1,            #(Alcohol Recency)
    'IRALCFY': 300,          #(Alcohol Frequency Past Year)
    'BNGDRKMON': 1,          #(Binge drinking, past 30 days)
    'HVYDRKMON': 1,          #(Heavy drinking, past 30 days)
    'IRALCAGE': 13,          #(First time used alcohol)

    #DRUGS + ALCOHOL
    'TXYRRECVD2': 1,         #(Ever alcohol/drug treatment, past yr)
    'TXEVRRCVD2': 1,         #(Ever alcohol/drug treatment, lifetime)

    #TOBACCO
    'IRCIGRC': 1,            #(Tobacco Recency, incl. Never)
    'CIGDLYMO': 1,           #(Tobacco 30+ consecutive days)
    'CIGAGE': 13,            #(Tobacco Use Daily)
    'PIPEVER': 1,            #(Ever smoked a pipe)
    'IRCGRRC': 1,            #(Cigar recency)
    'IRSMKLSSREC': 1,        #(Smokeless tobacco recency)

    #WEED
    'IRMJRC': 1,             #(Weed recency)
    'MJYRTOT': 300,          #(Weed days in past year)
    'FUMJ18': 1,             #(First used weed prior to age 18)
    'FUMJ21': 1,             #(First used weed prior to age 21)

    #DEPRESSION
    'ADDPREV': 1,            #(Several days of depression)
    'ADDSCEV': 1,            #(Several days of discouraged about life)

    #OTHER
    'BOOKED': 1,             #(Ever arrested & booked)
}

print(inputDict)

#Conversion to a dataframe happens later, in the Preprocess section

{'NAME': 'Joe Capstone', 'IRSEX': 1, 'IREDUHIGHST2': 11, 'AGE2': 27, 'IRALCRC': 1, 'IRALCFY': 300, 'BNGDRKMON': 1, 'HVYDRKMON': 1, 'IRALCAGE': 13, 'TXYRRECVD2': 1, 'TXEVRRCVD2': 1, 'IRCIGRC': 1, 'CIGDLYMO': 1, 'CIGAGE': 13, 'PIPEVER': 1, 'IRCGRRC': 1, 'IRSMKLSSREC': 1, 'IRMJRC': 1, 'MJYRTOT': 300, 'FUMJ18': 1, 'FUMJ21': 1, 'ADDPREV': 1, 'ADDSCEV': 1, 'BOOKED': 1}


# 1.b Web App Test

In [3]:
#Web App Test: run the packaged end-to-end report on the simulated input.
#NOTE(review): per the comment in the Preprocess cell, generateReport appears to
#list-wrap the values of inputDict in place -- confirm against OpioidExecution.
runWebAppTest = True
predFI = None  #Stays None (nothing is displayed) when the web app test is skipped
if runWebAppTest:
    webAppReport = oe.generateReport(inputDict)
    predProb, predPercentile, predFI = webAppReport
    print('Predicted Probability: {:.0%}'.format(predProb))
    print('Percentile of Predicted Probability: {:.0%}'.format(predPercentile))
    print('Feature Importance (sorted low to high):')
predFI

  from numpy.core.umath_tests import inner1d


Predicted Probability: 88%
Percentile of Predicted Probability: 100%
Feature Importance (sorted low to high):


{'IREDUHIGHST2': -0.022064632534958745,
 'IRSEX': -0.0035416959439998324,
 'FUMJ21': -0.0017268124960524575,
 'CIGDLYMO': -0.0015330947424896622,
 'IRALCRC': -0.0014857265828542381,
 'BOOKED': 0.005262318335352478,
 'ADDSCEV': 0.008098832332624248,
 'ADDPREV': 0.01198750346410723,
 'CIGAGE': 0.016979258555984502,
 'IRCIGRC': 0.020177570991787196,
 'IRCGRRC': 0.022628753172388667,
 'HVYDRKMON': 0.025032005624697675,
 'IRALCFY': 0.02573143315830575,
 'PIPEVER': 0.025979095721158616,
 'IRSMKLSSREC': 0.02721074709055188,
 'BNGDRKMON': 0.031940937251929424,
 'IRMJRC': 0.03873099257158695,
 'TXEVRRCVD2': 0.03877888524096162,
 'IRALCAGE': 0.047310605419703784,
 'TXYRRECVD2': 0.05790152850841173,
 'MJYRTOT': 0.0968940547288994,
 'AGE2': 0.10206200675977697,
 'FUMJ18': 0.11324376454000534}

# 2. Preprocess

In [None]:
#Convert inputs to list (pandas conversion to dataframe requires dict values to be lists)
#
#The web app test (oe.generateReport) already list-wraps inputDict, so wrapping again
#would double-list the dictionary. Instead of relying on the runWebAppTest flag to know
#whether that happened, wrap only the values that are not lists yet. This keeps the cell
#correct whether or not the web app test cell ran, and makes it safe to re-run.
for k, v in inputDict.items():
    if not isinstance(v, list):
        inputDict[k] = [v]
print(inputDict)

#Convert dict to dataframe
df = pd.DataFrame.from_dict(inputDict)

#Run preprocessing on dataframe
df = odp.preprocess(df)
df

In [None]:
#Reorder the columns alphabetically by name (necessary to feed the model)
df = df[sorted(df.columns)]

# 3. Generate Predictions

In [None]:
#Load Models
#NOTE: joblib.load unpickles its input, so only load artifacts produced by this project.
modelPath = dataDir+'modelXGB.model'
explainerPath = dataDir+'calibXGB.explainer' ###NEED TO FIX THIS NAME
probsPath = dataDir+'modelXGBPredProbs.npy'

model = joblib.load(modelPath)          #Fitted classifier used for predict_proba
explainer = joblib.load(explainerPath)  #Explainer used to produce shapley values
probs = np.load(probsPath)              #Saved probabilities used as the percentile reference

In [None]:
#Calculate Prediction
#Takes the second probability column of the first (only) row
predM = model.predict_proba(df)[0][1]
print('Predicted Probability: {:.0%}'.format(predM))

#Calculate Percentile
#percentileofscore is on a 0-100 scale; divide by 100 so it formats as a percentage
pct = stats.percentileofscore(probs, predM)/100
print('Percentile of Predicted Probability: {:.0%}'.format(pct))

#Generate shapley values from this row
shapVal = explainer.shap_values(df)
shapRow = shapVal[0]  #Values for the single input row, one per dataframe column

#Aggregate shapley values for one-hot vectors
#Everything before the first '_' character of a column name is used as the key, so all
#columns sharing that prefix are summed into one entry. Column positions correspond to
#the locations in the shapley output array.
shapDict = defaultdict(float) #Handy: starts every new key at 0.0, so values can be added directly
for i, colName in enumerate(df.columns):
    featureKey = colName.split('_')[0]
    shapDict[featureKey] += shapRow[i]

#Sort the aggregated values from lowest to highest
sortedShapDict = dict(sorted(shapDict.items(), key=lambda item: item[1]))
print('Feature Importance (sorted low to high):')
sortedShapDict