# Predicting Days Before Lapse for Marijuana Drug Abusers

In [32]:
import pandas as pd

def load_data(url):
    # read csv from github url
    # return pandas dataframe
    df = pd.read_csv(url)
    return df

url = 'https://raw.githubusercontent.com/prathikr/CS_499_Final_Project/master/BISTRA_GROUP_PROJECT_SMALL.csv'
df = load_data(url)
df.head(5)

Unnamed: 0,ID,female,nonwhite,unemplmt,primsev,B2a_0,State,City,zipcode,noins,...,Any_Cens,Alcohol_Cens,Binge_Cens,Marijuana_Cens,Illicit_Cens,Any_Days,Alcohol_Days,Binge_Days,Marijuana_Days,Illicit_Days
0,9048,0,0,0,4,17,VA,Lynchburg,24501-1114,0,...,2,2,2,2,2,192,192,192,192,192
1,9152,0,0,0,1,18,VA,Lynchburg,24501-1114,0,...,2,2,2,2,2,176,176,176,176,176
2,9057,0,0,1,3,16,VA,Lynchburg,24501-1114,0,...,0,0,2,0,2,21,21,365,81,365
3,9139,0,0,0,3,16,VA,Lynchburg,24501-1114,0,...,0,2,0,0,2,20,365,0,20,365
4,9051,0,1,0,3,17,VA,Lynchburg,24501-1114,0,...,0,0,0,0,1,14,30,30,14,356


In [33]:
# ** Describe Data??

In [34]:
def trim_predictors(df, predictor_col_name, cols_to_drop):
    # extract nx1 vector for predictor and drop all other potential predictors
    # display(df.columns)
    print("Original df:", df.shape)
    Y = df[predictor_col_name]
    print("Predictor column:", Y.shape)
    df.drop(columns=cols_to_drop, inplace=True)
    print("Post-extracting predictor column and removing other predictors:", df.shape)
    # return extracted predictor values
    return Y

Y = trim_predictors(df, 'Marijuana_Days', ['State', 'City', 'zipcode', 'agyaddr', 'SFS8p_0', 'SFS8p_3', 'SFS8p_6', 
'SFS8p_12', 'ada_0','ada_3','ada_6','ada_12','S2c1_0','S2c1_3','S2c1_6','S2c1_12','S2b1_0','S2b1_3','S2b1_6',
'S2b1_12','S2z1_3','S2z1_6','S2z1_12','S2z2_3','S2z2_6','S2z2_12','S2z3_3','S2z3_6','S2z3_12','S2z4_3','S2z4_6',
'S2z4_12','S2z5_3','S2z5_6','S2z5_12','Any_Cens','Alcohol_Cens','Binge_Cens','Marijuana_Cens','Illicit_Cens',
'Any_Days','Binge_Days','Alcohol_Days','Illicit_Days', 'Marijuana_Days'])

Original df: (26556, 110)
Predictor column: (26556,)
Post-extracting predictor column and removing other predictors: (26556, 65)


In [35]:
import numpy as np
from IPython.display import display_html

def display_side_by_side(*args):
    html_str=''
    for df in args:
        html_str+=df.to_html()
    display_html(html_str.replace('table','table style="display:inline"'),raw=True)

def drop_NaN_columns(df):
    # replace all -999 with NaN inplace
    df.replace(to_replace = -999, value = np.nan, inplace = True)
    
    # calculate percentage of NaNs in each column
    percent_missing = df.isnull().sum() * 100 / len(df)
    missing_value_df = pd.DataFrame({'column_name': df.columns,'percent_missing': percent_missing})
    third = int(round(len(missing_value_df) / 3))
    display_side_by_side(missing_value_df[0:third], missing_value_df[third:third*2], missing_value_df[third*2:len(missing_value_df)])
    
    # drop columns with > 25% inplace
    cols = []
    for index, row in missing_value_df.iterrows():
        if row['percent_missing'] > 25:
          cols.append(row['column_name'])
    df.drop(columns=cols, inplace=True)
    print("columns dropped:", cols)
    print("new df shape:", df.shape)
    
drop_NaN_columns(df)

Unnamed: 0,column_name,percent_missing
ID,ID,0.0
female,female,0.003766
nonwhite,nonwhite,0.030125
unemplmt,unemplmt,0.176984
primsev,primsev,0.015063
B2a_0,B2a_0,0.0
noins,noins,57.327911
prsatx,prsatx,0.30125
tottxp4,tottxp4,0.0
TRI_0,TRI_0,1.276548

Unnamed: 0,column_name,percent_missing
E9m,E9m,0.62886
tsd_0,tsd_0,0.222172
und15,und15,3.117939
CWS_0,CWS_0,0.173219
IPI,IPI,20.59045
L5,L5,1.001657
SPSy_0,SPSy_0,0.037656
RFQ33c,RFQ33c,40.60476
GSSI_0,GSSI_0,51.973189
S9y10,S9y10,0.274891

Unnamed: 0,column_name,percent_missing
PSSI_0,PSSI_0,51.976954
RERI13p_0,RERI13p_0,56.096551
ncar,ncar,3.279861
loc,loc,0.048953
engage30,engage30,8.352161
engage42,engage42,4.258925
init,init,0.0
S7e4_0,S7e4_0,1.012954
POPIgrp,POPIgrp,72.672842
FIS4p_0,FIS4p_0,54.838831


columns dropped: ['noins', 'E9e', 'RFQ33c', 'GSSI_0', 'press', 'PSSI_0', 'RERI13p_0', 'POPIgrp', 'FIS4p_0']
new df shape: (26556, 56)


In [36]:
def fill_NaN_columns(df):
    # replace NaNs with mean/mode inplace
    final_df = df.fillna(df.mean()).fillna(df.mode().iloc[0])
    percent_missing = final_df.isnull().sum() * 100 / len(df)
    missing_value_df = pd.DataFrame({'column_name': final_df.columns,'percent_missing': percent_missing})
    third = int(round(len(missing_value_df) / 3))
    display_side_by_side(missing_value_df[0:third], missing_value_df[third:third*2], missing_value_df[third*2:len(missing_value_df)])
    print("df shape (should be same as previous cell):", final_df.shape)
    return final_df

final_df = fill_NaN_columns(df)

Unnamed: 0,column_name,percent_missing
ID,ID,0.0
female,female,0.0
nonwhite,nonwhite,0.0
unemplmt,unemplmt,0.0
primsev,primsev,0.0
B2a_0,B2a_0,0.0
prsatx,prsatx,0.0
tottxp4,tottxp4,0.0
TRI_0,TRI_0,0.0
GVS,GVS,0.0

Unnamed: 0,column_name,percent_missing
E9k,E9k,0.0
E9m,E9m,0.0
tsd_0,tsd_0,0.0
und15,und15,0.0
CWS_0,CWS_0,0.0
IPI,IPI,0.0
L5,L5,0.0
SPSy_0,SPSy_0,0.0
S9y10,S9y10,0.0
dldiag,dldiag,0.0

Unnamed: 0,column_name,percent_missing
S6,S6,0.0
ncar,ncar,0.0
loc,loc,0.0
engage30,engage30,0.0
engage42,engage42,0.0
init,init,0.0
S7e4_0,S7e4_0,0.0
HIVrisk,HIVrisk,0.0
txtypeg,txtypeg,0.0
SDScrY,SDScrY,0.0


df shape (should be same as previous cell): (26556, 56)


In [37]:
# split data into train and test
# drop ID column but save it for post-model labelling

from sklearn.model_selection import train_test_split

Xtr, Xte, Ytr, Yte = train_test_split(final_df, Y, test_size=0.25, random_state=17)
Xte_IDs = Xte['ID'].copy()
Xtr.drop(columns=['ID'], inplace=True)
Xte.drop(columns=['ID'], inplace=True)

print("Xtr:", Xtr.shape)
print("Ytr:", Ytr.shape)
print("Xte:", Xte.shape)
print("Yte:", Yte.shape)

Xtr: (19917, 55)
Ytr: (19917,)
Xte: (6639, 55)
Yte: (6639,)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [None]:
# train regression model
# examine coefficients and drop columns with coefficient close to 0
# retrain regression model and print results

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn import metrics
from pandas import DataFrame
import matplotlib.pyplot as plt

poly_reg = PolynomialFeatures(degree=3)
X_poly = poly_reg.fit_transform(Xtr)
pol_reg = LinearRegression()
pol_reg.fit(X_poly, Ytr)

predictions = DataFrame(pol_reg.predict(poly_reg.fit_transform(Xte)))

y_test = pd.concat([Xte_IDs, Yte], axis=1, sort=True).dropna()
y_pred = pd.concat([Xte_IDs, predictions], axis=1, sort=True).dropna()

y_test = y_test[:len(y_pred)]

print(y_pred.shape)
print(y_test.shape)

display_side_by_side(y_pred.head(), y_test.head())

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))

"""
coefficients = pd.DataFrame({'Coefficients': poly.coef_})
columns = pd.DataFrame({'column_name': Xtr.columns})

combined = pd.DataFrame({'Coefficients': poly.coef_, 'column_name': Xtr.columns})
quarter = int(round(len(combined) / 4))
display_side_by_side(combined[0:quarter], combined[quarter:quarter*2], combined[quarter*2:quarter*3], combined[quarter*3:len(combined)])

# drop columns with -1 < x < 1 coefficient inplace
cols = []
for index, row in combined.iterrows():
    if row['Coefficients'] > -1 and row['Coefficients'] < 1:
      cols.append(row['column_name'])
Xtr.drop(columns=cols, inplace=True)
Xte.drop(columns=cols, inplace=True)

poly.fit(Xtr, Ytr)

predictions = pd.DataFrame({'Marijuana_Days': poly.predict(Xte)})
Yte = pd.DataFrame(Yte.dropna())

y_test = pd.concat([Xte_IDs, Yte], axis=1, sort=True).dropna()
y_pred = pd.concat([Xte_IDs, predictions], axis=1, sort=True).dropna()

y_test = y_test[:len(y_pred)]

print(y_pred.shape)
print(y_test.shape)

display_side_by_side(y_pred.head(), y_test.head())

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
"""

In [None]:
print('done!')