# Predicting Days Before Lapse for Marijuana Drug Abusers

In [None]:
import pandas as pd

def load_data(url):
    """Read the CSV located at ``url`` and return it as a pandas DataFrame.

    ``url`` is handed straight to ``pd.read_csv``; in this notebook it is a
    raw GitHub URL.
    """
    return pd.read_csv(url)

# Raw GitHub URL of the project's CSV data set.
url = 'https://raw.githubusercontent.com/prathikr/CS_499_Final_Project/master/BISTRA_GROUP_PROJECT_SMALL.csv'
# df is the working DataFrame that the following cells operate on.
df = load_data(url)
# Preview the first five rows.
df.head(5)

In [None]:
# TODO: describe the data set here.

In [None]:
def trim_predictors(df, predictor_col_name, cols_to_drop):
    """Extract the target column and trim ``df`` IN PLACE.

    The caller's ``df`` is mutated rather than copied: the columns listed in
    ``cols_to_drop`` are removed and every row whose ``primsev`` equals 3 is
    removed. The function returns only the target, so in-place mutation is the
    only way the trimmed frame reaches the caller (this is why dropping
    ``inplace=True`` "breaks" the notebook).

    Parameters:
        df: DataFrame to trim; must contain ``primsev`` and
            ``predictor_col_name``. Assumes unique index labels (the default
            for a frame read with ``pd.read_csv``), because rows are removed
            by label.
        predictor_col_name: name of the column to return as the target.
        cols_to_drop: column names to remove from ``df``.

    Returns:
        The ``predictor_col_name`` values for the rows that remain in ``df``,
        as a Series sharing ``df``'s index.

    Raises:
        KeyError: if ``primsev``, ``predictor_col_name`` or any entry of
            ``cols_to_drop`` is not a column of ``df``.
    """
    # display(df.columns)
    print("Original df:", df.shape)

    # Build the row filter first, while every original column is still present.
    # Per the original author's note, primsev != 3 leaves only marijuana drug abusers.
    keep = (df['primsev'] != 3).to_numpy()

    # Copy so the returned target does not depend on df after it is mutated,
    # and filter it with the same mask so it stays aligned with df's rows.
    Y = df[predictor_col_name][keep].copy()
    print("Predictor column:", Y.shape)

    df.drop(columns=cols_to_drop, inplace=True)
    # The original `df = df[...]` only rebound the local name, so the caller's
    # frame was never filtered; drop the rows in place instead.
    df.drop(index=df.index[~keep], inplace=True)
    print("Post-extracting predictor column and removing other predictors:", df.shape)
    # return extracted predictor values
    return Y

# Pull out the regression target ('Marijuana_Days') and remove the listed columns from df.
# trim_predictors drops these columns from df in place, which is why df itself
# (rather than a return value) is inspected below.
Y = trim_predictors(df, 'Marijuana_Days', ['State', 'City', 'zipcode', 'agyaddr', 'SFS8p_0', 'SFS8p_3', 'SFS8p_6', 
'SFS8p_12', 'ada_0','ada_3','ada_6','ada_12','S2c1_0','S2c1_3','S2c1_6','S2c1_12','S2b1_0','S2b1_3','S2b1_6',
'S2b1_12','S2z1_3','S2z1_6','S2z1_12','S2z2_3','S2z2_6','S2z2_12','S2z3_3','S2z3_6','S2z3_12','S2z4_3','S2z4_6',
'S2z4_12','S2z5_3','S2z5_6','S2z5_12','Any_Cens','Alcohol_Cens','Binge_Cens','Marijuana_Cens','Illicit_Cens',
'Any_Days','Binge_Days','Alcohol_Days','Illicit_Days', 'Marijuana_Days'])
# Show which columns remain after trimming.
print(df.columns)

In [None]:
import numpy as np
from IPython.display import display_html

def display_side_by_side(*args):
    """Render several DataFrames next to each other in the notebook output.

    Parameters:
        *args: objects providing ``to_html()`` (pandas DataFrames in this
            notebook). They are shown left to right in the order given.

    Returns:
        None. The combined HTML is sent to ``display_html`` with ``raw=True``.
    """
    html_str = ''.join(frame.to_html() for frame in args)
    # Add the inline style to opening <table ...> tags only. Replacing the bare
    # word 'table' would also rewrite the closing </table> tags and any cell or
    # column text that happens to contain "table".
    inline_html = html_str.replace('<table', '<table style="display:inline"')
    display_html(inline_html, raw=True)

def drop_NaN_columns(df, sentinel=-999, max_percent_missing=25):
    """Turn sentinel values into NaN and drop columns that are mostly missing.

    Neither ``DataFrame.replace`` nor ``DataFrame.drop`` modifies its input
    here, so the cleaned frame is RETURNED; the caller must keep the result.

    Parameters:
        df: DataFrame to clean. It is not modified.
        sentinel: value that encodes "missing" in the raw data (default -999).
        max_percent_missing: columns with strictly more than this percentage
            of NaNs are dropped (default 25).

    Returns:
        A new DataFrame with ``sentinel`` replaced by NaN and the sparse
        columns removed.
    """
    # replace every sentinel with NaN (returns a new frame)
    cleaned = df.replace(to_replace=sentinel, value=np.nan)

    # calculate percentage of NaNs in each column
    percent_missing = cleaned.isnull().sum() * 100 / len(cleaned)
    missing_value_df = pd.DataFrame({'column_name': cleaned.columns, 'percent_missing': percent_missing})

    # show the per-column report as three tables side by side
    third = int(round(len(missing_value_df) / 3))
    display_side_by_side(missing_value_df[0:third], missing_value_df[third:third*2], missing_value_df[third*2:len(missing_value_df)])

    # drop columns whose missing percentage exceeds the threshold
    too_sparse = missing_value_df['percent_missing'] > max_percent_missing
    cols = missing_value_df.loc[too_sparse, 'column_name'].tolist()
    cleaned = cleaned.drop(columns=cols)
    print("columns dropped:", cols)
    print("new df shape:", cleaned.shape)
    return cleaned

# Keep the cleaned frame: calling the function without assigning its result
# leaves df unchanged.
df = drop_NaN_columns(df)

In [None]:
def fill_NaN_columns(df):
    """Return a copy of ``df`` with NaNs filled by column mean, then column mode.

    Numeric columns are filled with their mean. Any NaNs still left (for
    example in non-numeric columns) are filled with the column's first mode.
    A per-column report of the remaining missing percentage is displayed.

    Parameters:
        df: DataFrame to fill. It is not modified.

    Returns:
        A new DataFrame with the same shape as ``df``.
    """
    # numeric_only=True: without it, pandas >= 2.0 raises TypeError as soon as
    # the frame contains a non-numeric column.
    final_df = df.fillna(df.mean(numeric_only=True))

    # mode() yields an empty table for an empty frame; guard the .iloc[0] lookup.
    modes = df.mode()
    if not modes.empty:
        final_df = final_df.fillna(modes.iloc[0])

    # report what is still missing after filling
    percent_missing = final_df.isnull().sum() * 100 / len(df)
    missing_value_df = pd.DataFrame({'column_name': final_df.columns, 'percent_missing': percent_missing})
    third = int(round(len(missing_value_df) / 3))
    display_side_by_side(missing_value_df[0:third], missing_value_df[third:third*2], missing_value_df[third*2:len(missing_value_df)])
    print("df shape (should be same as previous cell):", final_df.shape)
    return final_df

df = fill_NaN_columns(df)

In [None]:
def data_preprocessing(df):
    """Print summary statistics for ``df`` and return ``df`` unchanged.

    The frame is returned so the caller's ``final_df = data_preprocessing(df)``
    receives a DataFrame; previously the function returned ``None``, which made
    the later train/test split unusable.

    Parameters:
        df: DataFrame to summarise.

    Returns:
        The same ``df`` object that was passed in.
    """
    print(df.describe())
    return df
    
# final_df is the feature table that the next cell passes to train_test_split.
final_df = data_preprocessing(df)

In [None]:
# split data into train and test
# drop ID column but save it for post-model labelling

from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed random_state keeps the split reproducible.
Xtr, Xte, Ytr, Yte = train_test_split(
    final_df,
    Y,
    test_size=0.25,
    random_state=17,
)

# Keep the test-set IDs for labelling predictions later, then remove the ID
# column from both feature tables.
Xte_IDs = Xte['ID'].copy()
Xtr, Xte = (split.drop(columns=['ID']) for split in (Xtr, Xte))

# Report the resulting shapes.
print("Xtr:", Xtr.shape)
print("Ytr:", Ytr.shape)
print("Xte:", Xte.shape)
print("Yte:", Yte.shape)

In [None]:
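# NOTE: this cell is currently disabled -- everything below is wrapped in a
# triple-quoted string, so none of it runs.
# Plan: train a regression model, examine its coefficients and drop columns
# whose coefficient is close to 0, then retrain the regression model and
# print the results.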
"""
from sklearn.linear_model import LinearRegression
from sklearn import metrics

import pandas
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

lm = LinearRegression()
model = lm.fit(Xtr, Ytr)

coefficients = pd.DataFrame({'Coefficients': model.coef_})
columns = pd.DataFrame({'column_name': Xtr.columns})

combined = pd.DataFrame({'Coefficients': model.coef_, 'column_name': Xtr.columns})
print('FEATURES IN ORDER OF HIGHEST IMPACT ON MODEL...')
combined = combined.reindex(combined.Coefficients.abs().sort_values().index).iloc[::-1]
quarter = int(round(len(combined) / 4))
display_side_by_side(combined[0:quarter], combined[quarter:quarter*2], combined[quarter*2:quarter*3], combined[quarter*3:len(combined)])

# drop columns with -1 < x < 1 coefficient inplace
cols = []
for index, row in combined.iterrows():
    if row['Coefficients'] > -1 and row['Coefficients'] < 1:
      cols.append(row['column_name'])
      
for i in cols:
    combined = combined[combined.column_name != i]

Xtr = Xtr.drop(columns=cols)
Xte = Xte.drop(columns=cols)

model = lm.fit(Xtr, Ytr)

predictions = pd.DataFrame({'Marijuana_Days': model.predict(Xte)})
Yte = pd.DataFrame(Yte.dropna())

y_test = pd.concat([Xte_IDs, Yte], axis=1, sort=True).dropna()
y_pred = pd.concat([Xte_IDs, predictions], axis=1, sort=True).dropna()

y_test = y_test[:len(y_pred)]

print(y_pred.shape)
print(y_test.shape)

combined = combined.reindex(combined.Coefficients.abs().sort_values().index).iloc[::-1]
quarter = int(round(len(combined) / 4))
print('FEATURES *** WITH COEFFICIENT -1<X<1 *** IN ORDER OF HIGHEST IMPACT ON MODEL...')
display_side_by_side(combined[0:quarter].copy(), combined[quarter:quarter*2], combined[quarter*2:quarter*3], combined[quarter*3:len(combined)])
print(combined.shape)

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))

# trim to features that actually make significant difference, feed to NN
cols = []
for index, row in combined.iterrows():
    cols.append(row['column_name'])
Xtr.drop(cols, axis=1)
Xte.drop(cols, axis=1)

# define base model
def baseline_model():
    # create model
    model = Sequential()
    print("# of features:", combined.shape[0])
    model.add(Dense(combined.shape[0], input_dim=combined.shape[0], kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mean_absolute_error', optimizer='adam')
    return model

kfold = KFold(n_splits=3)
estimator = KerasRegressor(build_fn=baseline_model, epochs=100, batch_size=5, verbose=0)
results = cross_val_score(estimator, Xtr, Ytr, cv=kfold)
print("Results: %.2f (%.2f) MAE" % (results.mean(), results.std()))
"""