## 0 - Imports

In [1]:
%load_ext autoreload
%autoreload

In [56]:
import scipy.io as spio
import pandas as pd
import numpy as np
from numpy import absolute as nabs
import matplotlib.pyplot as plt
#from pandas.plotting import autocorrelation_plot
from statsmodels.tsa.arima_model import ARIMA
from sklearn.linear_model import LassoCV
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error as mse
#from keras.layers import LSTM
from keras.models import Sequential
from keras.layers import Dense
from numpy.linalg import LinAlgError
from datetime import datetime
import warnings

## 1 - Load, clean, test data

In [4]:
def impmat(fname = 'M_processed.mat', writ = True):
    ''' import matlab crap, and turn it to pickles (or return panda df)'''
    mat = spio.loadmat(fname, squeeze_me=True)
    M = mat['M'] 
    head = ['time','ndc1','ndc2','ndc3',
            'Trade_Partner_Name', 'Distribution_Center_State','NDC','Distribution_Center_ID_(IC)',
    'Distribution_Center_Zip','Eff_Inv_(EU)','Eff_Inv_(PU)','Qty_Ord_(EU)',
            'Qty_Ord_(PU)']
    # get rid of ndc 1,2,3 because they're pieces of NCD
    # also get rid of purchase units, just use eatable
    # also get rid of states and zip code
    needed = [0,4,6,7,9,11]
    head_adj = [head[i] for i in needed] + ["year", "month", "week"]
    data = pd.DataFrame(M, columns=head)
    data["time"] = pd.to_datetime(data.time, format='%Y%m%d', errors='coerce')
    data["year"] = data.time.dt.year
    data["month"] = data.time.dt.month
    data["week"] = data.time.dt.week
    #data.drop("time", axis=1)
    if writ: # h5 allows your variable to be external
        dt = pd.HDFStore("drugdata.h5") # don't need to import/export! warning, though: huge
        dt['dat'] = data[head_adj] #
    return(data)

In [5]:
def test_hd5(p = 0, q = 0):
    """test data and run answers to intro quiz
    p is to print head of dataframe
    q prints quiz answers
    doesn't return anything
    mostly for access examples"""
    dt = pd.HDFStore("drugdata.h5")["dat"]

    header = dt.columns.tolist()
    # thanks @brock
    def q1(df):
        return(df.Trade_Partner_Name.unique())
    
    def q2(df):
        q2 = df.groupby('Trade_Partner_Name')['Distribution_Center_ID_(IC)'].nunique()
        q2max = q2.max()
        return(q2[q2 == q2max])
    
    def q3(df):
        q3df = df.loc[df["time"].dt.year == 2011] # can also use dt.month
        q3TotalSales = q3df.groupby('NDC')['Qty_Ord_(EU)'].sum()
        #print(q3TotalSales)
        q3sorted = q3TotalSales.sort_values(ascending = False).head()
        return(q3sorted)
    
    def q4(df):
        q4 = df['NDC'].value_counts()
        NDCLessThan60 = q4[q4 < 60]
        if (NDCLessThan60.size == 0):
            return(None)
        else:
            return(NDCLessThan60.size)
        
    def q5(df):
        q5 =  df.groupby('NDC')['Qty_Ord_(EU)'].std()
        q5max = q5.max()
        NDCHighestVariance = q5[q5 == q5max]
        return(NDCHighestVariance)
    
    def q6(df):
        q6 = df.groupby('NDC')['Qty_Ord_(EU)'].nunique()
        q6ZeroDemand = q6[q6 == 0]
        if (q6ZeroDemand.size == 0):
            return(None)
        else:
            return(q6ZeroDemand.size)
    
    if p:
        for col in header:
            print(dt[col].head())
    if q:
        answers = [q1(dt), q2(dt), q3(dt), q4(dt), q5(dt), q6(dt)]
        for i, ans in enumerate(answers):
            try:
                print('Question %d'%(i+1),  ans)
            except:
                print('Question %d'%(i+1) + str(ans))

In [5]:
impmat(); # uncomment if never built h5 file
#test_hd5(q=1) # add p=1 or q = 1 to print stuff

In [6]:
def rem_neg_vals():
    ''' if you've just imported from the mat file,
    you need to run this to change the neg vals to 0 '''
    df = pd.HDFStore("drugdata.h5")["dat"]
    # set negative values to 0
    df.loc[df['Eff_Inv_(EU)'] < 0,'Eff_Inv_(EU)'] = 0
    df.loc[df['Qty_Ord_(EU)'] < 0,'Qty_Ord_(EU)'] = 0
    df.loc[df['Eff_Inv_(EU)'].isnull(), 'Eff_Inv_(EU)'] = 0
    df.loc[df['Qty_Ord_(EU)'].isnull(), 'Qty_Ord_(EU)'] = 0
    #print(df.head())
    return(True)

In [7]:
rem_neg_vals();

In [None]:
def weeks():
    ''' gives us a list of the weeks as a datetime Series '''
    df = pd.HDFStore("drugdata.h5")["dat"]

    return(pd.to_datetime(df.time.unique()).sort_values())

In [None]:
#weeks();

## 2 - Utilities

In [8]:
def sales_exist():
    ''' want to check that every week has sales
        returns list of drug ids that have data for every year '''
    df = pd.HDFStore("drugdata.h5")["dat"]
    useless = {}
    years = [i for i in range(2007, 2018)]
    for drug in df.NDC.unique():
        useless[drug] = []
    for year in years:
        sales = df.loc[df.time.dt.year == year].groupby('NDC')['Qty_Ord_(EU)'].sum()
        for drug in df.NDC.unique():
            try:
                if sales[drug] == 0:
                    #print(drug, year) # have 0 sum
                    useless[drug].append(year)
            except:
                #print("broke by", drug, "in", year)
                useless[drug].append(year) # have NA or something?
    not_useless = []
    for did in useless.keys():
        if not useless[did]:
            not_useless.append(did)
    return(not_useless)

In [9]:
#sales_exist();

In [10]:
def top_selling(thr, p = 0):
    ''' in: minimum contributing percentage threshold
        if p, prints number and % of drugs above thr
        out: IDs of drugs above thr'''
    df = pd.HDFStore("drugdata.h5")["dat"]
    ind_total = df.groupby('NDC')['Qty_Ord_(EU)'].sum()
    sortsales = ind_total.sort_values(ascending = False)
    #print(sortsales)
    total = sum(ind_total.values)
    perc_total = 100 * sortsales / total
    clipped_above_total = perc_total[perc_total > thr]
    if p:
        print(len(clipped_above_total), sum(clipped_above_total.values))
    enough = sales_exist()
    final = [i for i in enough if i in clipped_above_total.axes[0]]
    #return(clipped_above_total.axes)
    return(final)

In [11]:
#top_selling(1.5)

In [12]:
def norm_drugs(writ = 0):
    ''' returns the data frame with only top ten drugs
        and has normed column where ordered EU is normalized with (val-mu)/sigma 
        writ = 'normed' -> h5 '''
    dl = top_selling(1.5)
    df = pd.HDFStore("drugdata.h5")["dat"]
    df.set_index("NDC", inplace=True) # use drug as index
    df = df.loc[dl] # only want drugs in top ten
    dfgb = df.groupby('NDC')['Qty_Ord_(EU)']
    sd = dfgb.std() # standard deviation for each drug
    nm = dfgb.mean() # mean for each drug

    normd = pd.DataFrame() # empty DF to hold new one
    # couldn't figure out vector without using all the memory :/
    for drug in dl:
        d_s = df.loc[drug,:] # select only one drug for now
        n_s = np.subtract(d_s["Qty_Ord_(EU)"],nm[drug]) # numerator
        
        n_v = d_s.assign(normed=np.divide(n_s, sd[drug])) # new df for drug
        normd = pd.concat([normd, n_v]) # add to return df
        
    if writ: # should we write this to h5?
        df_n = pd.HDFStore("drugdata.h5")
        df_n["normed"] = normd
    return(normd)

In [13]:
#ndf = norm_drugs(1)
#print(ndf.head)

In [14]:
def make_x(Y, a):
    ''' creates lag df 
        Y is Y vector 
        a is # lags '''
    #cols = ['t-'+str(i) for i in range(1, a+1)]
    X = pd.DataFrame()
    for i in range(1, a+1):                #makes multi-dimensional input
        # each datapoint works off of the past 'a' datapoints 
        X = pd.concat([X, Y.shift(i)], axis=1)
    
    return(X[a:])

In [15]:
def sales(Year):
    """in: range of dates want studied 
    want to return the list of sales per date
    todo: break up by location if we want"""
    df = pd.HDFStore("drugdata.h5")["dat"]
    
    sel_drugs = top_selling(1.5) # list of drug ids
    dates = df.loc[df.year == Year] # choose only given year
    # gives DF of drugs by week; can change to 
    # ['NDC', 'time', DISTRO_id] if we want later
    window = dates.groupby(['NDC', "time"])['Qty_Ord_(EU)'].sum()
    filt_window = window.loc[sel_drugs] # only want top drugs
    return(filt_window)

In [16]:
# s2008 = sales(2008)
# print(s2008)
# s2008[4.][:5]
# s08_4 = s2008[4]
# print(s08_4[:5])

In [17]:
def smape(f, d):
    ''' symmetric mean absolute percentage error
    in: vectors f = y_hat, d = y 
    out: the smape, yo '''
    n = len(f)
    val = np.sum(nabs(f - d)/(nabs(f) + nabs(d)))
    return(val/n)

In [18]:
def frame_gen(year):
    ''' in: the year starting the frame
        out: 2 dfs, 3 years for training and 4th for test'''
    window = []
    for i in range(3): #4 to match AR model
        window.append(sales(year + i))
    # make a table of 3 years
    window = pd.concat(window)
    test_frame = sales(year + 3)
    return(window, test_frame)

In [19]:
# a,b = frame_gen(2008)
# print(a)
# plt.plot(a[4])
# plt.show()

## 3 - The meaty bits!

In [20]:
def plot_drug_sales():
    #df = pd.HDFStore("drugdata.h5")['normed']
    df = pd.HDFStore("drugdata.h5")['dat']

    dl = top_selling(1.5)
    
    for drug in dl:
        sub = df.loc[df.NDC == drug]
        ndcSales = sub.groupby("time")["Qty_Ord_(EU)"].sum()
        fig, ax = plt.subplots()
        ax.plot(ndcSales, 'bo')
        ax.set(xlabel='Year', ylabel='Extended Units',
           title='Total Sales drug ' + str(drug))
        ax.grid()
        plt.show()
        
        autocorrelation_plot(ndcSales)
        plt.show()
        
#plot_drug_sales()

In [21]:
def SVR_stuff(Y_train, Y_test, p=0, a=10):
    ''' input: testing and training data 
        out: support vector regression score(?) '''
    X_train = make_x(Y_train, a)
    X_test = make_x(Y_test, a)
    Y_train = Y_train[a:]
    Y_test = Y_test[a:]

    if 0:
        print("X_train shape: ", X_train.shape)
        print("X_test shape: ", X_test.shape)
        print("Y_train shape: ", Y_train.shape)
        print("Y_test shape: ", Y_test.shape)

    ks = ["rbf", "linear", "poly", "sigmoid"]    
    for k in ks:
        print(k)
        regr = SVR(kernel = k, C=10)               #creates/fits model
        regr.fit(X_train, Y_train)
        yhat_trn = regr.predict(X_train)
        yhat_tst = regr.predict(X_test)

        if p:
            plt.plot(Y_train, color='red')
            ytr = pd.Series(yhat_trn, index=Y_train.index)
            plt.plot(ytr, color='blue')
            plt.show()
            ytt = pd.Series(yhat_tst, index=Y_test.index)
            plt.plot(ytt, color='blue')
            plt.plot(Y_test, color='red')
            plt.show()
            error = smape(ytt, Y_test)              #change to SMAPE?
            print("Error: ", error)
            print(mse(Y_test, ytt))
    return(error) 

In [22]:
def ARIMA_stuff(train, test, p=0):
    
    history = [x for x in train]
    predictions = []
    
    for t in range(len(test)): # only predicting one ahead
        model = ARIMA(history, order=(10,1,0))
        try: model_fit = model.fit(disp=0)
        except (ValueError, LinAlgError): pass
        output = model_fit.forecast()
        yhat = output[0][0] # i swear, who made this output decision? a list of numpy arrays??
        predictions.append(yhat)
        obs = test[t]
        history.append(obs)
        history.pop(0)

    err = smape(test, predictions)

    if p:
    #try:
        plt.plot(test)
        pred_fix = pd.Series(predictions, index=test.index) # problem with the dates not lining up
        plt.plot(pred_fix, color='red')
        plt.xlabel('time (wk)')
        plt.ylabel('Extended Units')
        plt.title('Predicted vs Actual ARIMA')
        plt.grid()
        plt.show()
    #except: return([test, predictions])    
    return(err)

In [23]:
#ARIMA_stuff()

In [24]:
def NN_stuff(Y_train, Y_test, p=0, a = 5):

    X_train = make_x(Y_train, a)
    Y_train = Y_train[a:]

    X_test = make_x(Y_test, a)
    Y_test = Y_test[a:]
    
    [n, m] = X_train.shape
    
    model = Sequential()

    model.add(Dense(5, input_dim = a, kernel_initializer='normal', activation='relu'))
    model.add(Dense(11, kernel_initializer='normal', activation = 'relu'))
    model.add(Dense(11, kernel_initializer='normal', activation = 'relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mse', optimizer='adam')
    
    model.fit(X_train, Y_train, epochs = 1000, verbose = 0)
    
    
    yhat_tst = [item[0] for item in model.predict(X_test)]
    
    if p:
        ytt = pd.Series(yhat_tst, index=Y_test.index)
        plt.plot(ytt, color='orange')
        plt.plot(Y_test, color='blue')
        plt.show()

    error = smape(yhat_tst, Y_test)
    return(error)

In [40]:
def lasso_stuff(Y_train, Y_test, p = 0, a = 10):

    X_train = make_x(Y_train, a)
    X_test = make_x(Y_test, a)
    Y_train = Y_train[a:]
    Y_test = Y_test[a:]

    if p:
        print("X_train shape: ", X_train.shape)
        print("X_test shape: ", X_test.shape)
        print("Y_train shape: ", Y_train.shape)
        print("Y_test shape: ", Y_test.shape)

    regr = LassoCV()               #creates/fits model
    regr.fit(X_train, Y_train)
    yhat_trn = regr.predict(X_train)
    yhat_tst = regr.predict(X_test)
    ytr = pd.Series(yhat_trn, index=Y_train.index)
    ytt = pd.Series(yhat_tst, index=Y_test.index)
    error = smape(ytt, Y_test) 
    if p:
        plt.plot(Y_train, color='red')
        plt.plot(ytr, color='blue')
        plt.show()
        plt.plot(ytt, color='blue')
        plt.plot(Y_test, color='red')
        plt.show()

        print("Error: ", error)
        print(mse(Y_test, ytt))
    return(error)

In [None]:
#lasso_stuff(1,2)

In [None]:
def Auto_regress(drug, X, lag_size, plots=0):
    ''' need model for each drug id; maybe pass in main?
        test data, train data
        lag is number of previous vars
        but, model chooses lag size?? '''
    # sauce: https://machinelearningmastery.com/autoregression-models-time-series-forecasting-python/
    #from sklearn.svm import SVR
    from statsmodels.tsa.ar_model import AR
    
    # at this point all drugs have all data for all years, so we can generalize
    X_trn = X[drug].values[:157] # first 3 years
    X_tst = X[drug].values[157:] # next
    if plots:
        from statsmodels.graphics.tsaplots import plot_acf
        plot_acf(X_trn, lags=lag_size)
        plt.show()
    model = AR(X_trn)
    fits = model.fit()
    print("Thetas:", fits.params)
    print("lag:", fits.k_ar)
    
    pred = fits.predict(start=len(X_trn), end=len(X_trn)+len(X_tst)-1, dynamic=True)
    from sklearn.metrics import mean_squared_error as mse
    print("MSE:", mse(X_tst, pred))
    plt.plot(X_tst)
    plt.plot(pred, color='red')
    plt.show()

## Plots

In [59]:
def main():
    dt = pd.HDFStore("drugdata.h5")["dat"]
    # we are gonna add 3 to this. only want full years too, why not.
    spread = [i for i in range(2008, 2014)]
    dlist = top_selling(1.5)
    with open("best.txt", "w") as f:
        for year in spread: # sliding window for analysis
            [trn, tst] = frame_gen(year)
            for drug in dlist:

                tn = trn[drug]
                tt = tst[drug]
                #seth = svr_stuff(tst, trn)
                collin = NN_stuff(tst, trn)
                jack2 = lasso_stuff(tn, tt)
                
                with warnings.catch_warnings(): #loud arima
                    warnings.filterwarnings("ignore")
                    jack = ARIMA_stuff(tn, tt)
                    
                which = [collin, jack, jack2]
                who = ["collin", "jack", "jack again"]
                best_for_year = who[which.index(min(which))]
                #print("The best for", year, "is", best_for_year, "on drug", drug)
                outstr_all = str(year)+","+str(drug)+",0"+","+str(collin)+"\n"
                outstr_all += str(year)+","+str(drug)+",1"+","+str(jack)+"\n"
                outstr_all += str(year)+","+str(drug)+",2"+","+str(jack2)+"\n"
                #outstr = str(year)+","+str(drug)+","+str(best_for_year)+","+str(min(which))+"\n"
                f.write(outstr_all)
        f.closed

In [None]:
main()

In [25]:
# testing block; this bit is slow-ish
# [trn, tst] = frame_gen(2008)
# tn = trn[4]
# tt = tst[4]

In [66]:
#maybe = lasso_stuff(tn, tt)
#ars = ARIMA_stuff(tn, tt, 1)
#js = SVR_stuff(tn, tt, 1)
#xt = make_x(tn, 10)
#cl = NN_stuff(tn, tt, 1)

In [54]:
def get_some_tables():
    func = {"collin":NN_stuff, "jack":ARIMA_stuff, "jack again":lasso_stuff}
    with open("best.txt","r") as f:
        howmany = 0
        for line in f:
            l = line.split(",")
            err = float(l[-1])
            howmany += (err < .1)
        print(howmany)
    f.closed

In [55]:
get_some_tables()

13
