# Predicting Days Before Lapse for Marijuana Drug Abusers

In [None]:
import pandas as pd

def load_data(url):
    """Load a CSV file into a pandas DataFrame.

    Args:
        url: Location of the CSV. It is handed to ``pd.read_csv`` unchanged;
            the notebook passes a raw GitHub URL.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    return pd.read_csv(url)

# Source CSV for the whole notebook.
url = 'https://raw.githubusercontent.com/prathikr/CS_499_Final_Project/master/BISTRA_GROUP_PROJECT_SMALL.csv'
df = load_data(url)
# Preview the first five rows (head() defaults to 5).
df.head()

In [None]:
# TODO: describe the data set.

In [None]:
def trim_predictors(df, predictor_col_name, other_cols_to_drop):
    """Pull one column out of ``df`` and discard the other candidate columns.

    ``df`` is modified in place: ``predictor_col_name`` and every column in
    ``other_cols_to_drop`` are removed from it. The frame's shape is printed
    before and after so the effect is visible in the notebook.

    Args:
        df: Frame to trim (mutated).
        predictor_col_name: Name of the column to extract and return.
        other_cols_to_drop: Names of further columns to remove from ``df``.

    Returns:
        The extracted ``predictor_col_name`` column.
    """
    print("Original df:", df.shape)

    # pop() returns the column and deletes it from df in a single step.
    extracted = df.pop(predictor_col_name)
    print("Predictor column:", extracted.shape)

    df.drop(columns=other_cols_to_drop, inplace=True)
    print("Post-extracting predictor column:", df.shape)

    return extracted

# Extract the Marijuana_Days column into Y and remove the listed columns.
# trim_predictors modifies df in place.
Y = trim_predictors(
    df,
    'Marijuana_Days',
    [
        'State', 'City', 'zipcode', 'agyaddr',
        'SFS8p_0', 'SFS8p_3', 'SFS8p_6', 'SFS8p_12',
        'ada_0', 'ada_3', 'ada_6', 'ada_12',
        'S2c1_0', 'S2c1_3', 'S2c1_6', 'S2c1_12',
        'S2b1_0', 'S2b1_3', 'S2b1_6', 'S2b1_12',
        'S2z1_3', 'S2z1_6', 'S2z1_12',
        'S2z2_3', 'S2z2_6', 'S2z2_12',
        'S2z3_3', 'S2z3_6', 'S2z3_12',
        'S2z4_3', 'S2z4_6', 'S2z4_12',
        'S2z5_3', 'S2z5_6', 'S2z5_12',
        'Any_Cens', 'Alcohol_Cens', 'Binge_Cens', 'Marijuana_Cens', 'Illicit_Cens',
        'Any_Days', 'Binge_Days', 'Alcohol_Days', 'Illicit_Days',
    ],
)

In [None]:
import numpy as np
from IPython.display import display_html

def display_side_by_side(*args):
    """Render several DataFrames next to each other in the notebook output.

    Each frame is converted with ``DataFrame.to_html`` and every opening
    ``<table`` tag is given ``style="display:inline"`` before the combined
    markup is passed to ``display_html``.

    Args:
        *args: DataFrames to show, in left-to-right order.
    """
    html_str = ''.join(frame.to_html() for frame in args)
    # Patch the opening tag only. Replacing the bare word 'table' also rewrote
    # the closing </table> tag and any cell or column text containing "table".
    display_html(
        html_str.replace('<table', '<table style="display:inline"'),
        raw=True,
    )

def drop_NaN_columns(df, missing_marker=-999, max_percent_missing=25):
    """Turn sentinel values into NaN and drop sparsely populated columns.

    ``df`` is modified in place. A per-column report of the percentage of
    missing values is displayed, and the names of the dropped columns are
    printed.

    Args:
        df: Frame to clean (mutated).
        missing_marker: Value that stands for "missing" in the raw data; every
            occurrence is replaced with NaN.
        max_percent_missing: Columns whose NaN percentage is strictly greater
            than this are dropped.

    Returns:
        None.
    """
    rows_per_table = 23

    df.replace(to_replace=missing_marker, value=np.nan, inplace=True)

    # Percentage of NaNs in each column.
    percent_missing = df.isnull().sum() * 100 / len(df)
    missing_value_df = pd.DataFrame({'column_name': df.columns, 'percent_missing': percent_missing})

    # Chunk the report so every column is shown; fixed slices up to row 69
    # silently left out any columns beyond that.
    display_side_by_side(*[
        missing_value_df[start:start + rows_per_table]
        for start in range(0, len(missing_value_df), rows_per_table)
    ])

    # NaN percentages (possible for a frame with no rows) compare False and are kept.
    cols = percent_missing[percent_missing > max_percent_missing].index.tolist()
    df.drop(columns=cols, inplace=True)
    print("columns dropped:", cols)
    
# Called for its side effects on df: -999 becomes NaN, and columns that are
# more than 25% NaN are removed in place.
drop_NaN_columns(df)

In [None]:
def fill_NaN_columns(df):
    """Return a copy of ``df`` with its NaNs imputed.

    Numeric columns are filled with their column mean; whatever is still
    missing afterwards is filled with the column's most frequent value.
    ``df`` itself is not modified. A per-column report of the remaining
    missing percentage is displayed.

    Args:
        df: Frame containing NaNs.

    Returns:
        A new DataFrame with the imputed values.
    """
    rows_per_table = 23

    # numeric_only=True: since pandas 2.0, mean() raises TypeError when it
    # meets a non-numeric column instead of skipping it.
    final_df = df.fillna(df.mean(numeric_only=True))

    modes = df.mode()
    if not modes.empty:  # a frame without rows has no first mode row
        final_df = final_df.fillna(modes.iloc[0])

    percent_missing = final_df.isnull().sum() * 100 / len(df)
    missing_value_df = pd.DataFrame({'column_name': final_df.columns, 'percent_missing': percent_missing})

    # Chunk the report so every column is shown, whatever the column count.
    display_side_by_side(*[
        missing_value_df[start:start + rows_per_table]
        for start in range(0, len(missing_value_df), rows_per_table)
    ])
    return final_df

# fill_NaN_columns returns a new frame; df itself keeps its NaNs.
final_df = fill_NaN_columns(df)

In [None]:
# normalize columns
from sklearn import preprocessing

cols = ['tottxp4', 'CWS_0', 'ADHDs_0', 'CDS_0', 'CJSI_0', 'EPS7p_0', 'LRI7_0', 'SRI7_0',
        'ERS21_0', 'HIVrisk', 'totttld', 'S2x_0', 'SPSm_0']

# Scale final_df, the imputed frame that is split and fitted further down.
# Scaling df at this point never reached the model, because final_df had
# already been derived from it.
final_df.hist('tottxp4')

scaler = preprocessing.StandardScaler()
# StandardScaler standardizes each column independently, so one call over all
# of cols yields the same values as fitting the columns one at a time.
final_df[cols] = scaler.fit_transform(final_df[cols])

final_df.hist('tottxp4')

In [None]:
from sklearn.model_selection import train_test_split

Xtr, Xte, Ytr, Yte = train_test_split(final_df, Y, test_size=0.25, random_state=17)

# Keep the test-set IDs for reporting, then remove ID from the feature frames.
Xte_IDs = Xte[['ID']]
# Rebind rather than drop(inplace=True): the split frames are selections of
# final_df, and in-place edits on them can trigger SettingWithCopyWarning.
Xtr = Xtr.drop(columns=['ID'])
Xte = Xte.drop(columns=['ID'])

print("Xtr:", Xtr.shape)
print("Ytr:", Ytr.shape)
print("Xte:", Xte.shape)
print("Yte:", Yte.shape)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn import metrics

lm = LinearRegression()
model = lm.fit(Xtr, Ytr)

# Fitted coefficients, shown next to the feature column names.
coefficients = pd.DataFrame(model.coef_)
columns = pd.DataFrame(Xtr.columns)

display_side_by_side(coefficients[0:18], columns[0:18], coefficients[18:36], columns[18:36], coefficients[36:54], columns[36:54])

# predict() returns a plain array. Give it Xte's index so each prediction stays
# attached to its own row; with a default 0..n-1 index, concat paired the
# predictions with whichever test IDs happened to carry those labels.
predictions = pd.DataFrame({'Marijuana_Days': model.predict(Xte)}, index=Xte.index)
Yte = pd.DataFrame(Yte.dropna())

y_test = pd.concat([Xte_IDs, Yte], axis=1, sort=True).dropna()
y_pred = pd.concat([Xte_IDs, predictions], axis=1, sort=True).dropna()

# Compare the same rows on both sides, matched by index label rather than by
# truncating one frame to the other's length.
shared_rows = y_test.index.intersection(y_pred.index)
y_test = y_test.loc[shared_rows]
y_pred = y_pred.loc[shared_rows]

print(y_pred.shape)
print(y_test.shape)

display_side_by_side(y_pred.head(), y_test.head())

# Score the target column only; passing the whole frames also averaged the
# ID column into the error.
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test['Marijuana_Days'], y_pred['Marijuana_Days']))