## 0 - Imports

In [4]:
%load_ext autoreload
%autoreload

In [78]:
import scipy.io as spio
import pandas as pd
import numpy as np
from numpy import absolute as nabs
import matplotlib.pyplot as plt

## 1 - Load, clean, test data

In [1]:
def impmat(fname = 'M_processed.mat', writ = True):
    ''' import matlab crap, and turn it to pickles (or return panda df)'''
    mat = spio.loadmat(fname, squeeze_me=True)
    M = mat['M'] 
    head = ['time','ndc1','ndc2','ndc3',
            'Trade_Partner_Name', 'Distribution_Center_State','NDC','Distribution_Center_ID_(IC)',
    'Distribution_Center_Zip','Eff_Inv_(EU)','Eff_Inv_(PU)','Qty_Ord_(EU)',
            'Qty_Ord_(PU)']
    # get rid of ndc 1,2,3 because they're pieces of NCD
    # also get rid of purchase units, just use eatable
    # also get rid of states and zip code
    needed = [0,4,6,7,9,11]
    head_adj = [head[i] for i in needed]
    data = pd.DataFrame(M, columns=head)
    data["time"] = pd.to_datetime(data["time"], format='%Y%m%d', errors='coerce')

    if writ: # h5 allows your variable to be external
        dt = pd.HDFStore("drugdata.h5") # don't need to import/export! warning, though: huge
        dt['dat'] = data[head_adj] #
    return(data)


In [2]:
def test_hd5(p = 0, q = 0):
    """test data and run answers to intro quiz
    p is to print head of dataframe
    q prints quiz answers
    doesn't return anything
    mostly for access examples"""
    dt = pd.HDFStore("drugdata.h5")["dat"]

    header = dt.columns.tolist()
    # thanks @brock
    def q1(df):
        return(df.Trade_Partner_Name.unique())
    
    def q2(df):
        q2 = df.groupby('Trade_Partner_Name')['Distribution_Center_ID_(IC)'].nunique()
        q2max = q2.max()
        return(q2[q2 == q2max])
    
    def q3(df):
        q3df = df.loc[df["time"].dt.year == 2011] # can also use dt.month
        q3TotalSales = q3df.groupby('NDC')['Qty_Ord_(EU)'].sum()
        #print(q3TotalSales)
        q3sorted = q3TotalSales.sort_values(ascending = False).head()
        return(q3sorted)
    
    def q4(df):
        q4 = df['NDC'].value_counts()
        NDCLessThan60 = q4[q4 < 60]
        if (NDCLessThan60.size == 0):
            return(None)
        else:
            return(NDCLessThan60.size)
        
    def q5(df):
        q5 =  df.groupby('NDC')['Qty_Ord_(EU)'].std()
        q5max = q5.max()
        NDCHighestVariance = q5[q5 == q5max]
        return(NDCHighestVariance)
    
    def q6(df):
        q6 = df.groupby('NDC')['Qty_Ord_(EU)'].nunique()
        q6ZeroDemand = q6[q6 == 0]
        if (q6ZeroDemand.size == 0):
            return(None)
        else:
            return(q6ZeroDemand.size)
    
    if p:
        for col in header:
            print(dt[col].head())
    if q:
        answers = [q1(dt), q2(dt), q3(dt), q4(dt), q5(dt), q6(dt)]
        for i, ans in enumerate(answers):
            try:
                print('Question %d'%(i+1),  ans)
            except:
                print('Question %d'%(i+1) + str(ans))

In [7]:
#impmat(); # uncomment if never built h5 file
#test_hd5(q=1) # add p=1 or q = 1 to print stuff

In [8]:
def rem_neg_vals():
    ''' if you've just imported from the mat file,
    you need to run this to change the neg vals to 0 '''
    df = pd.HDFStore("drugdata.h5")["dat"]
    # set negative values to 0
    df.loc[df['Eff_Inv_(EU)'] < 0,'Eff_Inv_(EU)'] = 0
    df.loc[df['Qty_Ord_(EU)'] < 0,'Qty_Ord_(EU)'] = 0
    df.loc[df['Eff_Inv_(EU)'].isnull(), 'Eff_Inv_(EU)'] = 0
    df.loc[df['Qty_Ord_(EU)'].isnull(), 'Qty_Ord_(EU)'] = 0
    #print(df.head())
    return(True)

In [9]:
rem_neg_vals();

## 2 - Early queries

In [56]:
def weeks():
    ''' gives us a list of the weeks as a datetime Series '''
    df = pd.HDFStore("drugdata.h5")["dat"]

    return(pd.to_datetime(df.time.unique()).sort_values())

In [18]:
weeks();

In [138]:
def sales_exist():
    ''' want to check that every week has sales
        returns list of drug ids that have data for every year '''
    df = pd.HDFStore("drugdata.h5")["dat"]
    useless = {}
    years = [i for i in range(2007, 2018)]
    for drug in df.NDC.unique():
        useless[drug] = []
    for year in years:
        sales = df.loc[df.time.dt.year == year].groupby('NDC')['Qty_Ord_(EU)'].sum()
        for drug in df.NDC.unique():
            try:
                if sales[drug] == 0:
                    #print(drug, year) # have 0 sum
                    useless[drug].append(year)
            except:
                #print("broke by", drug, "in", year)
                useless[drug].append(year) # have NA or something?
    not_useless = []
    for did in useless.keys():
        if not useless[did]:
            not_useless.append(did)
    return(not_useless)

In [142]:
sales_exist();

In [145]:
def top_selling(thr, p = 0):
    ''' in: minimum contributing percentage threshold
        if p, prints number and % of drugs above thr
        out: IDs of drugs above thr'''
    df = pd.HDFStore("drugdata.h5")["dat"]
    ind_total = df.groupby('NDC')['Qty_Ord_(EU)'].sum()
    sortsales = ind_total.sort_values(ascending = False)
    #print(sortsales)
    total = sum(ind_total.values)
    perc_total = 100 * sortsales / total
    clipped_above_total = perc_total[perc_total > thr]
    if p:
        print(len(clipped_above_total), sum(clipped_above_total.values))
    enough = sales_exist()
    final = [i for i in enough if i in clipped_above_total.axes[0]]
    #return(clipped_above_total.axes)
    return(final)

In [146]:
top_selling(1.5)

[4.0, 7.0, 8.0, 32.0, 55.0, 125.0, 141.0, 145.0, 149.0, 154.0]

## 3 - The meaty bits!

In [149]:
def sales(Year):
    """in: range of dates want studied 
    want to return the list of sales per date
    todo: break up by location if we want"""
    df = pd.HDFStore("drugdata.h5")["dat"]
    
    sel_drugs = [i for i in top_selling(1.5)] # list of drug ids
    dates = df.loc[df.time.dt.year == Year] # choose only given year
    # gives DF of drugs by week; can change to 
    # ['NDC', 'time', DISTRO_id] if we want later
    window = dates.groupby(['NDC', 'time'])['Qty_Ord_(EU)'].sum()
    filt_window = window.loc[sel_drugs] # only want top drugs
    return(filt_window)

In [150]:
sales(2008);

In [23]:
def smape(f, d):
    ''' symmetric mean absolute percentage error
    in: vectors f = y_hat, d = y 
    out: the smape, yo '''
    n = len(f)
    num = np.sum(nabs(f - d))
    denom = np.sum(nabs(f) + nabs(d))
    return((1/n) * num/denom)

In [183]:
def frame_gen(year):
    ''' in: the year starting the frame
        out: 2 dfs, 3 years for training and 4th for test'''
    window = []
    for i in range(4): #3? switched to match AR model
        window.append(sales(year + i))
    # make a table of 3 years
    window = pd.concat(window)
    #test_frame = sales(year + 3)
    return(window)#, test_frame)

In [30]:
def svr_try_0(tst, trn):
    ''' input: testing and training data 
        out: support vector regression score(?) '''
    
    from sklearn import preprocessing as pp
    X = df.loc[:, 2:].values
    y = df.loc[:, 1].values
    le = pp.LabelEncoder()
    #lb = pp.LabelBinerizer()
    y = le.fit_transform(y)
    
    from sklearn.cross_validation import train_test_split
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)
    print(len(x_train))
    from sklearn.svm import SVR
    with open("out.txt", "w") as f:
        for i in range(-2, 2):
            svm = SVC(kernel="linear", C=10**i, random_state=0)
            svm.fit(x_train, y_train)
            f.write("c = " + str(10**i) + ", accuracy = " + 
            str(round(svm.score(x_test, y_test), 3)) + "\n")


In [26]:
def ARIMA(tst, trn):
    ''' Alright, Jack, do your thing. 
    I don't wanna touch this one. '''
    from statsmodel.tsa.arima_model import ARIMA
    pass

In [27]:
def NN_stuff(tst, trn):
    ''' All you, Collin '''
    pass

In [28]:
def lasso_stuff(tst, trn):
    ''' if we don't like those, we can try this too '''
    pass

In [163]:
def testing_setup():
    df = pd.HDFStore("drugdata.h5")["dat"]
    [trn, tst] = frame_gen(2008)
    dl = top_selling(1.5)
    for drug in dl:
        print(drug, trn[drug])

## Plots

In [190]:
def Auto_regress(drug, X, lag_size, plots=0):
    ''' need model for each drug id; maybe pass in main?
        test data, train data
        lag is number of previous vars
        but, model chooses lag size?? '''
    # sauce: https://machinelearningmastery.com/autoregression-models-time-series-forecasting-python/
    #from sklearn.svm import SVR
    from statsmodels.tsa.ar_model import AR
    
    # at this point all drugs have all data for all years, so we can generalize
    X_trn = X[drug].values[:157] # first 3 years
    X_tst = X[drug].values[157:] # next
    if plots:
        from statsmodels.graphics.tsaplots import plot_acf
        plot_acf(X_trn, lags=lag_size)
        plt.show()
    model = AR(X_trn)
    fits = model.fit()
    print("Thetas:", fits.params)
    print("lag:", fits.k_ar)
    
    pred = fits.predict(start=len(X_trn), end=len(X_trn)+len(X_tst)-1, dynamic=True)
    from sklearn.metrics import mean_squared_error as mse
    print("MSE:", mse(X_tst, pred))
    plt.plot(X_tst)
    plt.plot(pred, color='red')
    plt.show()

In [191]:
def main(lag_size):
    dt = pd.HDFStore("drugdata.h5")["dat"]
    # we are gonna add 3 to this. only want full years too, why not.
    spread = [i for i in range(2008, 2013)]
    dlist = top_selling(1.5)
    
    thetaset = np.zeros(shape=(len(dlist), lag_size))# what do we want here???
    thetaset = pd.DataFrame(thetaset, index=dlist)
    for year in spread: # sliding window for analysis
        X = frame_gen(year)
        for drug in dlist:
            Auto_regress(drug,X,lag_size)
        return(0)
#         seth = svr_stuff(tst, trn)
#         collin = nn_stuff(tst, trn)
#         jack = ARIMA(tst, trn)
#         maybe = lasso_stuff(tst, trn)
#         which = [seth, collin, jack, maybe]
#         who = ["seth", "collin", "jack", "maybe"]
#         best_for_year = who[which.index(min(which))]
#         print("The best for", year, "is", best_for_year)
        
    

In [189]:
main(10)

Thetas: [ -2.53151887e+05   2.66899306e-01   2.87649176e-01   1.09347682e-01
   7.83628909e-02   2.56090739e-01  -7.19345227e-02  -1.89046394e-01
   2.05497932e-01  -4.30616389e-02   8.47304849e-02  -1.55375569e-01
   2.83222764e-02   1.31231028e-01]
lag: 13
MSE: 3.21873256852e+12
Thetas: [  2.37873932e+05   3.20859404e-01   2.69673384e-01   5.76518303e-02
   1.40373714e-01   1.34357687e-01  -8.22844634e-02  -8.92731745e-02
   6.59051487e-02   4.72977527e-02  -9.38356295e-02   4.40095060e-02
  -6.46077536e-02   2.52229486e-02]
lag: 13
MSE: 151229615977.0
Thetas: [ -3.50372707e+04   1.03009834e-01   2.35838907e-02   2.18393791e-01
   9.28332923e-02   2.27667419e-01   1.12834534e-01  -6.87576875e-02
   1.07012636e-01   3.19997227e-02   6.79399986e-02   2.56175693e-02
   2.64057308e-02   1.46675072e-02]
lag: 13
MSE: 32225542952.5
Thetas: [  5.04807334e+04   5.78984894e-02   4.80352043e-03   1.10902242e-01
   1.36516665e-01   2.03507154e-01   9.56274071e-02  -2.07606367e-02
  -9.14241318e-

0

In [None]:
cors: [5, 6, 0, 6, 5, 0, 5, 6, 6, 8]