# Predicting Days Before Lapse for Marijuana Drug Abusers

In [8]:
import pandas as pd

def load_data(url):
    # read csv from github url
    # return pandas dataframe
    df = pd.read_csv(url)
    return df

url = 'https://raw.githubusercontent.com/prathikr/CS_499_Final_Project/master/BISTRA_GROUP_PROJECT_SMALL.csv'
df = load_data(url)
df.head(5)

Unnamed: 0,ID,female,nonwhite,unemplmt,primsev,B2a_0,State,City,zipcode,noins,...,Any_Cens,Alcohol_Cens,Binge_Cens,Marijuana_Cens,Illicit_Cens,Any_Days,Alcohol_Days,Binge_Days,Marijuana_Days,Illicit_Days
0,9048,0,0,0,4,17,VA,Lynchburg,24501-1114,0,...,2,2,2,2,2,192,192,192,192,192
1,9152,0,0,0,1,18,VA,Lynchburg,24501-1114,0,...,2,2,2,2,2,176,176,176,176,176
2,9057,0,0,1,3,16,VA,Lynchburg,24501-1114,0,...,0,0,2,0,2,21,21,365,81,365
3,9139,0,0,0,3,16,VA,Lynchburg,24501-1114,0,...,0,2,0,0,2,20,365,0,20,365
4,9051,0,1,0,3,17,VA,Lynchburg,24501-1114,0,...,0,0,0,0,1,14,30,30,14,356


In [9]:
df = df[df.Marijuana_Days != -999] # removes all rows with Marijuana_Days = -999
df.hist(column='Marijuana_Days', bins=21) # 21 chosen so that there would be 3 bins between each x-axis number

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x115c5f6a0>]],
      dtype=object)

In [10]:
def trim_predictors(df, predictor_col_name, other_cols_to_drop):
    # extract nx1 vector for predictor and drop all other potential predictors
    # display(df.columns)
    print("Original df:", df.shape)
    Y = df[predictor_col_name]
    print("Predictor column:", Y.shape)
    df.drop(columns=[predictor_col_name], inplace=True)
    df.drop(columns=other_cols_to_drop, inplace=True)
    print("Post-extracting predictor column:", df.shape)
    # return extracted predictor values
    return Y

Y = trim_predictors(df, 'Marijuana_Days', ['State', 'City', 'zipcode', 'agyaddr', 'SFS8p_0', 'SFS8p_3', 'SFS8p_6', 
'SFS8p_12', 'ada_0','ada_3','ada_6','ada_12','S2c1_0','S2c1_3','S2c1_6','S2c1_12','S2b1_0','S2b1_3','S2b1_6',
'S2b1_12','S2z1_3','S2z1_6','S2z1_12','S2z2_3','S2z2_6','S2z2_12','S2z3_3','S2z3_6','S2z3_12','S2z4_3','S2z4_6',
'S2z4_12','S2z5_3','S2z5_6','S2z5_12','Any_Cens','Alcohol_Cens','Binge_Cens','Marijuana_Cens','Illicit_Cens',
'Any_Days','Binge_Days','Alcohol_Days','Illicit_Days'])

Original df: (26490, 110)
Predictor column: (26490,)
Post-extracting predictor column: (26490, 65)


In [11]:
import numpy as np
from IPython.display import display_html

def display_side_by_side(*args):
    html_str=''
    for df in args:
        html_str+=df.to_html()
    display_html(html_str.replace('table','table style="display:inline"'),raw=True)

def drop_NaN_columns(df):
    # replace all -999 with NaN inplace
    df.replace(to_replace = -999, value = np.nan, inplace = True)
    
    # calculate percentage of NaNs in each column
    percent_missing = df.isnull().sum() * 100 / len(df)
    missing_value_df = pd.DataFrame({'column_name': df.columns,'percent_missing': percent_missing})
    display_side_by_side(missing_value_df[0:23], missing_value_df[23:46], missing_value_df[46:69])
    
    # drop columns with > 25% inplace
    cols = []
    for index, row in missing_value_df.iterrows():
        if row['percent_missing'] > 25:
          cols.append(row['column_name'])
    df.drop(columns=cols, inplace=True)
    print("columns dropped:", cols)
    
drop_NaN_columns(df)

Unnamed: 0,column_name,percent_missing
ID,ID,0.0
female,female,0.003775
nonwhite,nonwhite,0.0302
unemplmt,unemplmt,0.177425
primsev,primsev,0.0151
B2a_0,B2a_0,0.0
noins,noins,57.372593
prsatx,prsatx,0.302001
tottxp4,tottxp4,0.0
TRI_0,TRI_0,1.279728

Unnamed: 0,column_name,percent_missing
tsd_0,tsd_0,0.222726
und15,und15,3.106833
CWS_0,CWS_0,0.169875
IPI,IPI,20.630427
L5,L5,0.996602
SPSy_0,SPSy_0,0.03775
RFQ33c,RFQ33c,40.607777
GSSI_0,GSSI_0,52.008305
S9y10,S9y10,0.275576
dldiag,dldiag,21.683654

Unnamed: 0,column_name,percent_missing
ncar,ncar,3.284258
loc,loc,0.049075
engage30,engage30,8.316346
engage42,engage42,4.258211
init,init,0.0
S7e4_0,S7e4_0,1.015478
POPIgrp,POPIgrp,72.676482
FIS4p_0,FIS4p_0,54.877312
HIVrisk,HIVrisk,0.041525
txtypeg,txtypeg,3.050208


columns dropped: ['noins', 'E9e', 'RFQ33c', 'GSSI_0', 'press', 'PSSI_0', 'RERI13p_0', 'POPIgrp', 'FIS4p_0']


In [12]:
def fill_NaN_columns(df):
    # replace NaNs with mean/mode inplace
    final_df = df.fillna(df.mean()).fillna(df.mode().iloc[0])
    percent_missing = final_df.isnull().sum() * 100 / len(df)
    missing_value_df = pd.DataFrame({'column_name': final_df.columns,'percent_missing': percent_missing})
    display_side_by_side(missing_value_df[0:23], missing_value_df[23:46], missing_value_df[46:69])
    return final_df

final_df = fill_NaN_columns(df)

Unnamed: 0,column_name,percent_missing
ID,ID,0.0
female,female,0.0
nonwhite,nonwhite,0.0
unemplmt,unemplmt,0.0
primsev,primsev,0.0
B2a_0,B2a_0,0.0
prsatx,prsatx,0.0
tottxp4,tottxp4,0.0
TRI_0,TRI_0,0.0
GVS,GVS,0.0

Unnamed: 0,column_name,percent_missing
CWS_0,CWS_0,0.0
IPI,IPI,0.0
L5,L5,0.0
SPSy_0,SPSy_0,0.0
S9y10,S9y10,0.0
dldiag,dldiag,0.0
DSS9_0,DSS9_0,0.0
ADHDs_0,ADHDs_0,0.0
CDS_0,CDS_0,0.0
suicprbs_0,suicprbs_0,0.0

Unnamed: 0,column_name,percent_missing
txtypeg,txtypeg,0.0
SDScrY,SDScrY,0.0
totttld,totttld,0.0
POS_0,POS_0,0.0
AFSS_0,AFSS_0,0.0
S2x_0,S2x_0,0.0
SPSm_0,SPSm_0,0.0
E14a_0,E14a_0,0.0
E14b_0,E14b_0,0.0
EPS7p_0,EPS7p_0,0.0


In [13]:
from sklearn.model_selection import train_test_split

Xtr, Xte, Ytr, Yte = train_test_split(final_df, Y, test_size=0.25, random_state=17)
Xte_IDs = Xte[['ID']]
Xtr.drop(columns=['ID'], inplace=True)
Xte.drop(columns=['ID'], inplace=True)

print("Xtr:", Xtr.shape)
print("Ytr:", Ytr.shape)
print("Xte:", Xte.shape)
print("Yte:", Yte.shape)

Xtr: (19867, 55)
Ytr: (19867,)
Xte: (6623, 55)
Yte: (6623,)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [14]:
from sklearn.linear_model import LinearRegression
from sklearn import metrics

lm = LinearRegression()
model = lm.fit(Xtr, Ytr)

coefficients = pd.DataFrame(model.coef_)
columns = pd.DataFrame(Xtr.columns)

display_side_by_side(coefficients[0:18], columns[0:18], coefficients[18:36], columns[18:36], coefficients[36:54], columns[36:54])

predictions = pd.DataFrame({'Marijuana_Days': model.predict(Xte)})
Yte = pd.DataFrame(Yte.dropna())

y_test = pd.concat([Xte_IDs, Yte], axis=1, sort=True).dropna()
y_pred = pd.concat([Xte_IDs, predictions], axis=1, sort=True).dropna()

y_test = y_test[:len(y_pred)]

print(y_pred.shape)
print(y_test.shape)

display_side_by_side(y_pred.head(), y_test.head())

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))

Unnamed: 0,0
0,17.590994
1,1.241519
2,8.703213
3,-7.759909
4,2.686975
5,-6.477101
6,-1.088483
7,-5.034777
8,1.422846
9,1.762555

Unnamed: 0,0
0,female
1,nonwhite
2,unemplmt
3,primsev
4,B2a_0
5,prsatx
6,tottxp4
7,TRI_0
8,GVS
9,E9a

Unnamed: 0,0
18,-3.086522
19,-1.688059
20,2.188176
21,-9.365809
22,0.155887
23,-1.124234
24,-0.094334
25,3.147978
26,0.26939
27,-19.723836

Unnamed: 0,0
18,E9k
19,E9m
20,tsd_0
21,und15
22,CWS_0
23,IPI
24,L5
25,SPSy_0
26,S9y10
27,dldiag

Unnamed: 0,0
36,-3.683775
37,3.892953
38,8.254942
39,-0.208618
40,5.487986
41,-7.587018
42,-3.406181
43,-0.038228
44,-0.527823
45,5.82121

Unnamed: 0,0
36,homeless_0
37,S6
38,ncar
39,loc
40,engage30
41,engage42
42,init
43,S7e4_0
44,HIVrisk
45,txtypeg


(1655, 2)
(1655, 2)


Unnamed: 0,ID,Marijuana_Days
1,9152.0,154.655159
6,12718.0,159.896733
9,12693.0,150.97157
12,12674.0,142.239476
13,12729.0,181.346153

Unnamed: 0,ID,Marijuana_Days
1,9152,176
6,12718,365
9,12693,358
12,12674,357
13,12729,170


Mean Absolute Error: 58.75561114000929
