# Opioid Addiction Project
## Notebook 03: Feature Selection

This notebook does a few things:

1. Removes most of the features (columns), keeping only those which are used in the model
2. Performs preprocessing on the features as needed, including:
  * Re-coding features
  * Bin values as needed
  * One-hot encodes categorical features
  * Scaling and centering continuous features  

### W210, Capstone
Summer 2019

Team:  Cameron Kennedy, Aditi Khullar, Rachel Kramer, Sharad Varadarajan

# 0. Load Libraries and Set Global Variables
This analysis is performed in the cells below.

In [1]:
#Import Required Libraries
import numpy as np
import pandas as pd
import re

#Custom data prep function used in both training and prediction 
import OpioidDataPrep as odp

#Set initial parameter(s)
pd.set_option('display.max_rows', 200)
pd.options.display.max_columns = 150
dataDir = './data/'

print('Pandas Version', pd.__version__)

Pandas Version 0.24.2


# 1. Load Data

This step loads the data from the file `misuse.pickle.zip`.

In [2]:
#Load Data
df = pd.read_pickle(dataDir+'misuse.pickle.zip')
df

Unnamed: 0,AALTMDE,ABODALC,ABODCOC,ABODHER,ABODMRJ,ABPYILANAL,ABPYILLALC,ABUPOSHAL,ABUPOSINH,ABUPOSMTH,ABUPOSPNR,ABUPOSSED,ABUPOSSTM,ABUPOSTRQ,ABUSEALC,ABUSECOC,ABUSEHER,ABUSEMRJ,ABUSEPYHAL,ABUSEPYIEM,ABUSEPYILL,ABUSEPYINH,ABUSEPYMTH,ABUSEPYPNR,ABUSEPYPSY,ABUSEPYSED,ABUSEPYSTM,ABUSEPYTRQ,ACOUNMDE,ACTD2001,ACTD7590,ACTD9001,ACTDEVER,ACTDPRIV,ACTDVIET,ADCOUNS,ADDPDISC,ADDPLSIN,ADDPPROB,ADDPR2WK,ADDPREV,ADDSCEV,ADDSLSIN,ADFAMDOC,ADHERBAL,ADLOSEV,ADLSI2WK,ADNURSE,ADOCMDE,ADOTHDOC,ADOTHHLP,ADOTHMHP,ADPB2WK,ADPBAGE,ADPBDLYA,ADPBINTF,ADPBNUM,ADPBRMBR,ADPSDAYS,ADPSHMGT,ADPSRELS,ADPSSOC,ADPSWORK,ADPSYCH,ADPSYMD,ADRELIG,ADRX12MO,ADRXHLP,ADRXNOW,ADSEEDOC,ADSMMDEA,ADSOCWRK,ADTMTHLP,ADTMTNOW,ADWRAGE,...,YUIHFEAR,YUIHFITE,YUIHFMLY,YUIHFRND,YUIHOTPP,YUIHSCHL,YUIHSOR,YUIHSUIC,YUIHTPN2,YUIHTPYR,YUJVDTN2,YUJVDTON,YUJVDTYR,YUMHANGR,YUMHBKRU,YUMHCRN2,YUMHCRYR,YUMHDEPR,YUMHEATP,YUMHFEAR,YUMHFITE,YUMHFMLY,YUMHFRND,YUMHOTPP,YUMHSCHL,YUMHSOR,YUMHSUIC,YURSANGR,YURSBKRU,YURSDEPR,YURSEATP,YURSFEAR,YURSFITE,YURSFMLY,YURSFRND,YURSIDN2,YURSIDYR,YURSOTPP,YURSSCHL,YURSSOR,YURSSUIC,YUSCEMYR,YUSCPGYR,YUSWANGR,YUSWBKRU,YUSWDEPR,YUSWEATP,YUSWFEAR,YUSWFITE,YUSWFMLY,YUSWFRND,YUSWOTPP,YUSWSCHL,YUSWSCYR,YUSWSOR,YUSWSUIC,YUTPANGR,YUTPBKRU,YUTPDEPR,YUTPEATP,YUTPFEAR,YUTPFITE,YUTPFMLY,YUTPFRND,YUTPOTPP,YUTPSCHL,YUTPSOR,YUTPSTN2,YUTPSTYR,YUTPSUIC,ZALEPDAPYU,ZOHYANYYR2,ZOLPPDAPYU,ZOLPPDPYMU,MISUSE
19,,0,0,0,0,0,0,91.0,91.0,91.0,91.0,91.0,91.0,91.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,,99,99,99,99,999,99,99,9999,99,999,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,999,...,99,99,99,99,99,99,99,99,999,2,999,2,99,99,99,999,2,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,999,2,99,99,99,99,2,2,99,99,99,99,99,99,99,99,99,99,2,99,99,99,99,99,99,99,99,99,99,99,99,99,999,2,99,0,0,0,0,0
20,0.0,1,0,0,0,0,1,93.0,91.0,91.0,91.0,91.0,91.0,91.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,99,99,99,99,99,99,99,1,1,1,1,1,99,99,99,99,99,99,99,0.0,99,99,99,2,15,2,2,4,2,999,99,99,99,99,99,99,99,2,99,99,2,1,99,99,99,13,...,99,99,99,99,99,99,99,99,999,99,999,99,99,99,99,999,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,999,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,999,99,99,0,0,0,0,0
21,,0,0,0,0,0,0,91.0,93.0,91.0,91.0,91.0,91.0,91.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,99,99,99,99,99,99,99,99,99,99,99,2,2,99,99,99,2,99,99,,99,99,99,99,999,99,99,9999,99,999,99,99,99,99,99,99,99,99,99,99,99,98,99,99,99,999,...,99,99,99,99,99,99,99,99,999,99,999,99,99,99,99,999,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,999,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,999,99,99,0,0,0,0,0
23,,0,0,0,0,0,0,91.0,91.0,91.0,93.0,91.0,91.0,91.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,99,99,99,99,99,99,99,99,99,99,99,2,2,99,99,99,2,99,99,,99,99,99,99,999,99,99,9999,99,999,99,99,99,99,99,99,99,99,99,99,99,98,99,99,99,999,...,99,99,99,99,99,99,99,99,999,99,999,99,99,99,99,999,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,999,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,999,99,99,0,0,0,0,0
33,,0,0,0,0,0,0,91.0,91.0,91.0,91.0,91.0,91.0,91.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,99,99,99,99,99,99,99,99,99,99,99,2,2,99,99,99,2,99,99,,99,99,99,99,999,99,99,9999,99,999,99,99,99,99,99,99,99,99,99,99,99,98,99,99,99,999,...,99,99,99,99,99,99,99,99,999,99,999,99,99,99,99,999,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,999,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,999,99,99,0,0,0,0,0
34,,0,0,0,0,0,0,91.0,91.0,91.0,0.0,91.0,0.0,91.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,99,99,99,99,99,99,99,99,99,99,99,2,2,99,99,99,2,99,99,,99,99,99,99,999,99,99,9999,99,999,99,99,99,99,99,99,99,99,99,99,99,98,99,99,99,999,...,99,99,99,99,99,99,99,99,999,99,999,99,99,99,99,999,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,999,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,999,99,99,0,0,0,0,1
36,,0,0,0,0,0,0,91.0,91.0,91.0,91.0,91.0,91.0,91.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,2,2,2,1,1,2,99,99,99,99,2,2,1,2,99,99,99,99,99,,99,99,99,99,999,99,99,9999,99,999,99,99,99,99,99,99,99,99,99,99,99,98,99,99,99,999,...,99,99,99,99,99,99,99,99,999,99,999,99,99,99,99,999,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,999,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,999,99,99,0,0,0,0,0
46,,0,0,0,0,0,0,0.0,93.0,91.0,0.0,91.0,91.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,99,99,99,99,99,99,99,99,99,99,99,2,2,99,99,99,2,99,99,,99,99,99,99,999,99,99,9999,99,999,99,99,99,99,99,99,99,99,99,99,99,98,99,99,99,999,...,99,99,99,99,99,99,99,99,999,99,999,99,99,99,99,999,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,999,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,999,99,99,0,0,0,0,1
50,,0,0,0,0,0,0,91.0,93.0,91.0,91.0,91.0,91.0,91.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,99,99,99,99,99,99,99,99,99,99,99,2,2,99,99,99,2,99,99,,99,99,99,99,999,99,99,9999,99,999,99,99,99,99,99,99,99,99,99,99,99,98,99,99,99,999,...,99,99,99,99,99,99,99,99,999,99,999,99,99,99,99,999,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,999,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,999,99,99,0,0,0,0,0
52,,0,0,0,0,0,0,91.0,91.0,91.0,91.0,91.0,91.0,91.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,,99,99,99,99,999,99,99,9999,99,999,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,999,...,99,99,99,99,99,99,99,99,999,2,999,2,99,99,99,999,2,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,999,2,99,99,99,99,2,2,99,99,99,99,99,99,99,99,99,99,2,99,99,99,99,99,99,99,99,99,99,99,99,99,999,2,99,0,0,0,0,0


# 2. Feature Selection

Here we'll keep only the features we want in our final model.

In addition to our feature selection, we'll also tag each variable with the types of preprocessing required for each one individually. Tagging them here helps ensure they don't make it to the model without having first been preprocessed.

In [3]:
#Function to subtract 1 from values
def prepSubtractOne(dfIn, colIn, reverse=True):
    '''For variables initially coded as 1=Yes, 2=No, the reverse flag
    re-codes them such that 1=Yes, 0=No.
    '''
    if reverse:
        dfIn[colIn] = 2-dfIn[colIn]
    else:
        dfIn[colIn] = dfIn[colIn]-1

In [4]:
#Function to scale and center data
def prepScaleAndCenter(dfIn, colIn):
    '''Quick function to scale and center (non-binary) numerical data'''
    dfIn[colIn] = dfIn[colIn] - np.mean(dfIn[colIn])
    dfIn[colIn] = dfIn[colIn] / np.std(dfIn[colIn])

In [5]:
#Function to recode a variable
def prepRecode(dfIn, colIn, recodeDict):
    dfIn[colIn] = dfIn[colIn].map(recodeDict)

In [6]:
def prepOneHot(dfIn, colIn, dropOrigCol=True):
    dfIn = pd.concat([dfIn, pd.get_dummies(df[colIn], prefix=colIn, drop_first=True)], axis=1)
    #Set drop_first=True to prevent multicollinearity
    if dropOrigCol:
        dfIn.drop([colIn], axis=1, inplace=True)
    return dfIn

In [7]:
def prepBin(dfIn, colIn, cutPoints, dropOrigCol=True):
    '''Function to split data into bins.
    
    First makes the bin assignments, then uses the prepOneHot to one hot encode the results.
    
    Cutpoints submitted will occur between the values in cutPoints, but not outside of them.
    For example, if cutPoints is [1, 2, 3], it will include all numbers between 1 and 2,
    including 1 and 2, and then all numbers between 2 and 3, including 3, but it will return
    NaN for values < 1 and > 3.
    
    Recall cutpoint notation: (a, b] = all real numbers between a and b, including b.    
    
    Labels are required because the default (a, b] notation is rejected by subsequent ML 
    algorithms. 'GT' refers to 'Greater than or equal to, and LTET' refers to 'Less than 
    or equal to'.
    '''
    #labels = ['LTET'+str(cutPoint) for cutPoint in cutPoints[1:]] #[1:] drops the first label
    labels = []
    for i, cutPoint in enumerate(cutPoints[:-1]):
        labels.append('GT'+str(cutPoint)+'LTET'+str(cutPoints[i+1]))
    #print(labels)
    
    dfIn[colIn+'_'] = pd.cut(dfIn[colIn], bins=cutPoints, include_lowest=True, labels=labels)
    if dropOrigCol:
        dfIn.drop([colIn], axis=1, inplace=True)
    return prepOneHot(dfIn, colIn+'_')

In [8]:
# #Column finder
# '''This snippet is here to quickly look for columns in our data that match a pattern.
# It's not needed for the file to run; rather, it's just a utility for finding column names.
# '''
# def colFinder(pattern):
#     print(list(filter(re.compile('.*'+pattern).match, df.columns)))

odp.colFinder(df, 'DRV')

['BKDRVINF']


In [9]:
#List of variables to keep, along with a quick description and proessing strategy
colsToKeep = [#DEMOGRAPHICS
              'IRSEX', #(Gender):  Subtract 1 from this field to make it 0=Male, 1=Female
              'IREDUHIGHST2', #(Highest Education)
              'AGE2', #(Age): Scale and center
    
              #ALCOHOL
              'IRALCRC', #(Alcohol Recency): Straight One Hot
              'IRALCFY', #(Alcohol Frequency Past Year):  Bin
              'BNGDRKMON', #(Binge drinking, past 30 days):  No action required (already binary, 0 and 1)
              'HVYDRKMON', #(Heavy drinking, past 30 days):  No action required (already binary, 0 and 1)
              'IRALCAGE', #(First time used alcohol): Bin

              #DRUGS + ALCOHOL
              'TXYRRECVD2', #(Ever alcohol/drug treatment, past yr): No action required (already binary, 0 and 1)
              'TXEVRRCVD2', #(Ever alcohol/drug treatment, lifetime): No action required (already binary, 0 and 1)

              #'DRVINALCO2', #Not found in data
              #TOBACCO
              'IRCIGRC', #(Tobacco Recency, incl. Never):  Straight One Hot
              'CIGDLYMO', #(Tobacco 30+ consecutive days):  Straight One Hot
              'CIGAGE', #(Tobacco Use Daily):  Bin
              'PIPEVER', #(Ever smoked a pipe): Straight One Hot
              'IRCGRRC', #(Cigar recency): Straight One Hot
              'IRSMKLSSREC', #(Smokeless tobacco recency): Straight One Hot
    
              #WEED
              'IRMJRC', #(Weed recency): Straight One Hot
              'MJYRTOT', #(Weed days in past year):  Bin
              'FUMJ18', #(First used weed prior to age 18): Subtract 1
              'FUMJ21', #(First used weed prior to age 21): Subtract 1
              #'DRVINMARJ', #Not found in data
    
              #DEPRESSION
              'ADDPREV', #(Several days of depression): One Hot
              'ADDSCEV', #(Several days of discouraged about life): One Hot
    
              ##OTHER
              'BOOKED', #(Ever arrested & booked): Recode + One Hot
                      
              #OUTCOME VARIABLE    
              'MISUSE',
             ]

df = df[colsToKeep] #Use colsToKeep.keys() if using dictionary method
df

Unnamed: 0,IRSEX,IREDUHIGHST2,AGE2,IRALCRC,IRALCFY,BNGDRKMON,HVYDRKMON,IRALCAGE,TXYRRECVD2,TXEVRRCVD2,IRCIGRC,CIGDLYMO,CIGAGE,PIPEVER,IRCGRRC,IRSMKLSSREC,IRMJRC,MJYRTOT,FUMJ18,FUMJ21,ADDPREV,ADDSCEV,BOOKED,MISUSE
19,2,5,4,9,991,0,0,991,0,0,9,91,991,2,9,9,9,991,2,2,99,99,2,0
20,1,7,11,2,60,0,0,16,0,0,1,5,16,2,9,9,1,260,1,1,1,99,2,0
21,2,9,12,2,20,0,0,17,0,0,4,2,999,2,9,9,9,991,2,2,2,2,2,0
23,2,9,8,3,993,0,0,16,0,0,9,91,991,2,9,3,9,991,2,2,2,2,2,0
33,2,8,11,1,192,1,0,18,0,0,9,91,991,2,9,9,3,993,2,1,2,2,1,0
34,2,9,9,1,52,1,1,14,0,0,2,2,999,2,4,9,2,72,1,1,2,2,2,1
36,1,7,17,3,993,0,0,18,0,0,4,1,18,1,4,4,9,991,2,2,2,1,2,0
46,1,8,8,2,12,0,0,16,0,0,9,91,991,2,9,9,1,240,1,1,2,2,2,1
50,2,10,15,1,36,1,0,18,0,0,9,91,991,2,9,9,9,991,2,2,2,2,1,0
52,2,5,4,9,991,0,0,991,0,0,9,91,991,2,9,9,9,991,2,2,99,99,2,0


# 3. Feature Preprocessing

In [10]:
#Gender
odp.prepSubtractOne(df, 'IRSEX')
#IRSEX = {0:'Female', 1:'Male'}

#Smoking
# prepSubtractOne(df, 'CIGEVER')
# CIGEVER = {0:'No', 1:'Yes'}
df = odp.prepOneHot(df, 'IRCIGRC')
df = odp.prepOneHot(df, 'CIGDLYMO')

cutPoints = [0, 10, 13, 15, 17, 18, 19, 20, 22, 25, 30, 40, 50, 99, 985, 991, 994, 997, 998, 999]
df = odp.prepBin(df, 'CIGAGE', cutPoints)  #Age when smoked daily (also catches smoked/never smoked)
'''Codes:
2-99 = Age
985 = Bad Data
991 = Never Used Cigarettes
994 = Don't Know
997 = Refused
998 = Blank (no answer)
999 = Never smoked daily
'''

df = odp.prepOneHot(df, 'IRSMKLSSREC')
df = odp.prepOneHot(df, 'IRCGRRC')
df = odp.prepOneHot(df, 'PIPEVER')

#Weed
df = odp.prepOneHot(df, 'IRMJRC')

cutPoints = [0,1,2,3,7,10,20,30,40,50,100,150,200,250,365,985,991,993,994,997,998]
df = odp.prepBin(df, 'MJYRTOT', cutPoints)  #Days used weed in past year
'''Codes:
0-365 = Days
985 = Bad Data
991 = Never Used Weed
993 = Never used in past year
994 = Don't Know
997 = Refused
998 = Blank (no answer)
'''

odp.prepSubtractOne(df, 'FUMJ18')
odp.prepSubtractOne(df, 'FUMJ21')
#df = prepOneHot(df, 'DRVINMARJ2') #Not found in data

#Drugs (or Drugs + Alcohol)
#'TXYRRECVD2' #No action required, just including here for completeness
#'TXEVRRCVD2' #No action required, just including here for completeness

#Alcohol
df = odp.prepOneHot(df, 'IRALCRC')

cutPoints = [0,1,2,3,7,10,20,30,40,50,100,150,200,250,365,991,993]
df = odp.prepBin(df, 'IRALCFY', cutPoints)  #Days used in past year
'''Codes:
0-365 = Days
991 = Never Used Alc
993 = Never used in past year
'''

#'BNGDRKMON' #No action required, just including here for completeness
#'HVYDRKMON' #No action required, just including here for completeness

cutPoints = [0, 10, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25, 30, 40, 50, 100, 991]
df = odp.prepBin(df, 'IRALCAGE', cutPoints)  #First age used alcohol
'''Codes:
1-78 = Years
991 = Never Used Alc
'''

#Depression
df = odp.prepOneHot(df, 'ADDPREV')
df = odp.prepOneHot(df, 'ADDSCEV')

#Educaiton (one hot)
IREDUHIGHST2 = {1:5.0, 2:6.0, 3:7.0, 4:8.0, 5:9.0, 6:10.0, 7:11.0, 8:12.0, 9:14.0, 10:15.0, 11:16.0}
odp.prepRecode(df, 'IREDUHIGHST2', IREDUHIGHST2)
odp.prepScaleAndCenter(df, 'IREDUHIGHST2')

#Other
odp.prepRecode(df, 'BOOKED', {1:1,2:2,3:1,85:85,94:94,97:97,98:98})
df = odp.prepOneHot(df, 'BOOKED')

#Age
AGE2 = {1:12.0, 2:13.0, 3:14.0, 4:15.0, 5:16.0, 6:17.0, 7:18.0, 8:19.0, 9:20.0, 10:21.0,
        11:np.mean([22,23]), 12:np.mean([24,25]), 13:np.mean([26,29]), 14:np.mean([30,34]),
        15:np.mean([35,49]), 16:np.mean([50,64]), 17:70.0
       }
'''Note, category 17 is age 65+, so the value of 70 is somewhat arbitrary but reasonable.
Moreover, there are relatively few respondents of this age, making the choice minimally 
impactful.'''
odp.prepRecode(df, 'AGE2', AGE2)
odp.prepScaleAndCenter(df, 'AGE2')

#Abandoned dictionary method
# for k in colsToKeep:    
#     for func in colsToKeep[k]:
#         func(df, k)

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  dfIn[colIn] = 2-dfIn[colIn]


Unnamed: 0,IRSEX,IREDUHIGHST2,AGE2,BNGDRKMON,HVYDRKMON,TXYRRECVD2,TXEVRRCVD2,FUMJ18,FUMJ21,MISUSE,IRCIGRC_2,IRCIGRC_3,IRCIGRC_4,IRCIGRC_9,CIGDLYMO_2,CIGDLYMO_5,CIGDLYMO_91,CIGDLYMO_94,CIGDLYMO_97,CIGAGE__GT10LTET13,CIGAGE__GT13LTET15,CIGAGE__GT15LTET17,CIGAGE__GT17LTET18,CIGAGE__GT18LTET19,CIGAGE__GT19LTET20,CIGAGE__GT20LTET22,CIGAGE__GT22LTET25,CIGAGE__GT25LTET30,CIGAGE__GT30LTET40,CIGAGE__GT40LTET50,CIGAGE__GT50LTET99,CIGAGE__GT99LTET985,CIGAGE__GT985LTET991,CIGAGE__GT991LTET994,CIGAGE__GT994LTET997,CIGAGE__GT997LTET998,CIGAGE__GT998LTET999,IRSMKLSSREC_2,IRSMKLSSREC_3,IRSMKLSSREC_4,IRSMKLSSREC_9,IRCGRRC_2,IRCGRRC_3,IRCGRRC_4,IRCGRRC_9,PIPEVER_2,PIPEVER_94,PIPEVER_97,IRMJRC_2,IRMJRC_3,IRMJRC_9,MJYRTOT__GT1LTET2,MJYRTOT__GT2LTET3,MJYRTOT__GT3LTET7,MJYRTOT__GT7LTET10,MJYRTOT__GT10LTET20,MJYRTOT__GT20LTET30,MJYRTOT__GT30LTET40,MJYRTOT__GT40LTET50,MJYRTOT__GT50LTET100,MJYRTOT__GT100LTET150,MJYRTOT__GT150LTET200,MJYRTOT__GT200LTET250,MJYRTOT__GT250LTET365,MJYRTOT__GT365LTET985,MJYRTOT__GT985LTET991,MJYRTOT__GT991LTET993,MJYRTOT__GT993LTET994,MJYRTOT__GT994LTET997,MJYRTOT__GT997LTET998,IRALCRC_2,IRALCRC_3,IRALCRC_9,IRALCFY__GT1LTET2,IRALCFY__GT2LTET3,IRALCFY__GT3LTET7,IRALCFY__GT7LTET10,IRALCFY__GT10LTET20,IRALCFY__GT20LTET30,IRALCFY__GT30LTET40,IRALCFY__GT40LTET50,IRALCFY__GT50LTET100,IRALCFY__GT100LTET150,IRALCFY__GT150LTET200,IRALCFY__GT200LTET250,IRALCFY__GT250LTET365,IRALCFY__GT365LTET991,IRALCFY__GT991LTET993,IRALCAGE__GT10LTET13,IRALCAGE__GT13LTET15,IRALCAGE__GT15LTET16,IRALCAGE__GT16LTET17,IRALCAGE__GT17LTET18,IRALCAGE__GT18LTET19,IRALCAGE__GT19LTET20,IRALCAGE__GT20LTET21,IRALCAGE__GT21LTET22,IRALCAGE__GT22LTET23,IRALCAGE__GT23LTET25,IRALCAGE__GT25LTET30,IRALCAGE__GT30LTET40,IRALCAGE__GT40LTET50,IRALCAGE__GT50LTET100,IRALCAGE__GT100LTET991,ADDPREV_2,ADDPREV_85,ADDPREV_94,ADDPREV_97,ADDPREV_98,ADDPREV_99,ADDSCEV_2,ADDSCEV_94,ADDSCEV_97,ADDSCEV_98,ADDSCEV_99,BOOKED_2,BOOKED_85,BOOKED_94,BOOKED_97,BOOKED_98
19,0,-1.407065,-1.188806,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0
20,1,-0.658529,-0.730225,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0
21,0,0.464275,-0.607937,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
23,0,0.464275,-0.944230,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
33,0,-0.284261,-0.730225,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
34,0,0.464275,-0.883085,1,1,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
36,1,-0.658529,2.174124,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
46,1,-0.284261,-0.944230,0,0,0,0,1,1,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
50,0,0.838544,0.462087,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
52,0,-1.407065,-1.188806,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0


In [11]:
### DO NOT RUN THIS CELL FOR ACTUAL MODEL DEVELOPMENT ###
### MAKE SURE fakeGreatPredictor = False ###
'''This cell createS a close-to-perfect predictor of our outcome variable MISUSE.
It copies MISUSE into a new column, M2, and then randomly changes a few rows.
Making this great predictor allows testing of the subsequent model to ensure
it is training properly.
'''
fakeGreatPredictor = False

if fakeGreatPredictor:
    df['M2'] = df['MISUSE'] #Create PERFECT predictor called M2

    dfLen = len(df)
    
    dfupdate=df.sample(int(dfLen*0.1)) #Randomly change 10% of rows to 0
    dfupdate.M2=0
    df.update(dfupdate)
    
    dfupdate=df.sample(int(dfLen*0.1)) #Randomly change 10% of rows to 1
    dfupdate.M2=1
    df.update(dfupdate)    

df

Unnamed: 0,IRSEX,IREDUHIGHST2,AGE2,BNGDRKMON,HVYDRKMON,TXYRRECVD2,TXEVRRCVD2,FUMJ18,FUMJ21,MISUSE,IRCIGRC_2,IRCIGRC_3,IRCIGRC_4,IRCIGRC_9,CIGDLYMO_2,CIGDLYMO_5,CIGDLYMO_91,CIGDLYMO_94,CIGDLYMO_97,CIGAGE__GT10LTET13,CIGAGE__GT13LTET15,CIGAGE__GT15LTET17,CIGAGE__GT17LTET18,CIGAGE__GT18LTET19,CIGAGE__GT19LTET20,CIGAGE__GT20LTET22,CIGAGE__GT22LTET25,CIGAGE__GT25LTET30,CIGAGE__GT30LTET40,CIGAGE__GT40LTET50,CIGAGE__GT50LTET99,CIGAGE__GT99LTET985,CIGAGE__GT985LTET991,CIGAGE__GT991LTET994,CIGAGE__GT994LTET997,CIGAGE__GT997LTET998,CIGAGE__GT998LTET999,IRSMKLSSREC_2,IRSMKLSSREC_3,IRSMKLSSREC_4,IRSMKLSSREC_9,IRCGRRC_2,IRCGRRC_3,IRCGRRC_4,IRCGRRC_9,PIPEVER_2,PIPEVER_94,PIPEVER_97,IRMJRC_2,IRMJRC_3,IRMJRC_9,MJYRTOT__GT1LTET2,MJYRTOT__GT2LTET3,MJYRTOT__GT3LTET7,MJYRTOT__GT7LTET10,MJYRTOT__GT10LTET20,MJYRTOT__GT20LTET30,MJYRTOT__GT30LTET40,MJYRTOT__GT40LTET50,MJYRTOT__GT50LTET100,MJYRTOT__GT100LTET150,MJYRTOT__GT150LTET200,MJYRTOT__GT200LTET250,MJYRTOT__GT250LTET365,MJYRTOT__GT365LTET985,MJYRTOT__GT985LTET991,MJYRTOT__GT991LTET993,MJYRTOT__GT993LTET994,MJYRTOT__GT994LTET997,MJYRTOT__GT997LTET998,IRALCRC_2,IRALCRC_3,IRALCRC_9,IRALCFY__GT1LTET2,IRALCFY__GT2LTET3,IRALCFY__GT3LTET7,IRALCFY__GT7LTET10,IRALCFY__GT10LTET20,IRALCFY__GT20LTET30,IRALCFY__GT30LTET40,IRALCFY__GT40LTET50,IRALCFY__GT50LTET100,IRALCFY__GT100LTET150,IRALCFY__GT150LTET200,IRALCFY__GT200LTET250,IRALCFY__GT250LTET365,IRALCFY__GT365LTET991,IRALCFY__GT991LTET993,IRALCAGE__GT10LTET13,IRALCAGE__GT13LTET15,IRALCAGE__GT15LTET16,IRALCAGE__GT16LTET17,IRALCAGE__GT17LTET18,IRALCAGE__GT18LTET19,IRALCAGE__GT19LTET20,IRALCAGE__GT20LTET21,IRALCAGE__GT21LTET22,IRALCAGE__GT22LTET23,IRALCAGE__GT23LTET25,IRALCAGE__GT25LTET30,IRALCAGE__GT30LTET40,IRALCAGE__GT40LTET50,IRALCAGE__GT50LTET100,IRALCAGE__GT100LTET991,ADDPREV_2,ADDPREV_85,ADDPREV_94,ADDPREV_97,ADDPREV_98,ADDPREV_99,ADDSCEV_2,ADDSCEV_94,ADDSCEV_97,ADDSCEV_98,ADDSCEV_99,BOOKED_2,BOOKED_85,BOOKED_94,BOOKED_97,BOOKED_98
19,0,-1.407065,-1.188806,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0
20,1,-0.658529,-0.730225,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0
21,0,0.464275,-0.607937,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
23,0,0.464275,-0.944230,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
33,0,-0.284261,-0.730225,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
34,0,0.464275,-0.883085,1,1,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
36,1,-0.658529,2.174124,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
46,1,-0.284261,-0.944230,0,0,0,0,1,1,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
50,0,0.838544,0.462087,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
52,0,-1.407065,-1.188806,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0


# 4. Save to Pickle

In [12]:
#Save to pickle, for loading in the modeling (training) file
df.to_pickle(dataDir+'features.pickle.zip')
'''Note, the .to_pickle command INFERS zip compression based on the '.zip'
extension. Changing the extension will result in a 1GB file instead of a 
compressed file.
''';