In [None]:
# predicting level of patient satisfaction based on signatures of feedback from the past

def elnet2(csv_file, depvar='q9', ignore_nan=True, sig_min_day=735008,
           test=(736621, 736680), folds=5, seed=1, log=False, degs=2,
           sample=None, sigs_with_depvar=False):
    """
    Runs elastic net model with cross-validation and predicts on a held-out window.

    Only the smallest and the largest entry of "test" are used. With
    first = min(test), last = max(test) and window = last - first + 1:
        - training signatures are built from days sig_min_day .. first - window - 1
        - training targets are read for days  first - window .. first - 1
        - test signatures are built from days sig_min_day + window .. first - 1
        - test targets are read for days      first .. last

    inputs:
        - csv_file (str): name of .csv-formatted file containing topic proportions for each message.
                        Passed unchanged to make_paths() and make_depvar().
        - depvar (str): name of dependent variable to be considered when constructing the paths and
                        the targets.
        - ignore_nan (logical): forwarded to make_paths(); whether to exclude reviews which have no
                        value for the dependent variable (excluded by default).
                        NOTE: ignore_nan = False cannot be combined with sigs_with_depvar = True;
                        in that case a warning is printed and None is returned.
        - sig_min_day (int): first day from which reviews are considered for construction of the
                        training signatures. Must be a number, because the test signatures start
                        at sig_min_day + window. Default value is 735008.
        - test (sequence of int): days delimiting the test window; only min(test) and max(test)
                        are read. Default is (736621, 736680).
        - folds (int): number of folds, passed as "cv" to ElasticNetCV. The default is 5
        - seed (int): passed as "random_state" to ElasticNetCV. The default value is 1
        - log (logical): forwarded to make_sigs() as "log" (presumably selects log-signatures;
                        confirm against make_sigs). Defaults to False
        - degs (int): forwarded to make_sigs(); number of degrees used for computation of
                        (log)signatures. The default is 2
        - sample: forwarded to make_sigs() as "sample". Defaults to None
        - sigs_with_depvar (logical): forwarded to make_sigs() as "include_depvar". Defaults to False

    output:
        - list [model, y_train, yhat_train, y_test, yhat_test], where model is the fitted
          ElasticNetCV estimator and the remaining entries are numpy arrays of observed and
          predicted targets for the train and test windows
        - None if the parameter validation fails (a warning is printed in that case)

    NOTE: make_sigs() is called with the names 'sigs_xtrain.csv' and 'sigs_xtest.csv'
    (presumably output files it writes; confirm against make_sigs).
    """

    first_test_day = min(test)
    last_test_day = max(test)
    window = last_test_day - first_test_day + 1

    # checking the validity of model parameters
    if not ignore_nan and sigs_with_depvar:
        print("""WARNING: you can't run this function with parameters ignore_nan = False, and sigs_with_depvar = True. 
                This would mean you want to construct signatures from paths which have missing data.
              """)
        return None
    # NOTE(review): this only rejects a sig_min_day that falls inside
    # [first_test_day - window, last_test_day); a later day still passes.
    # Kept as in the original -- confirm whether ">=" was intended.
    if sigs_with_depvar and sig_min_day in range(first_test_day - window, last_test_day):
        print("""WARNING: Your 'sig_min_day' parameter doesn't allow to compute prediction.
            It needs to be an early enough date to make model training/testing possible.
            """)
        return None

    # last day feeding the training signatures; the "window" days after it
    # (up to first_test_day - 1) provide the training targets
    train_sig_max = first_test_day - window - 1

    # prepare independent variable data for model training and testing
    x_train = make_paths(csv_file, min_day=sig_min_day, max_day=train_sig_max,
                         depvar=depvar, ignore_nan=ignore_nan)
    x_train = make_sigs(x_train, degs, 'sigs_xtrain.csv', log=log, sample=sample,
                        include_depvar=sigs_with_depvar)

    # the test signatures are shifted forward by one window relative to training
    x_test = make_paths(csv_file, min_day=sig_min_day + window, max_day=first_test_day - 1,
                        depvar=depvar, ignore_nan=ignore_nan)
    x_test = make_sigs(x_test, degs, 'sigs_xtest.csv', log=log, sample=sample,
                       include_depvar=sigs_with_depvar)

    # prepare dependent variable data
    y_train = make_depvar(csv_file, depvar, list(range(train_sig_max + 1, first_test_day)))
    y_test = make_depvar(csv_file, depvar, list(range(first_test_day, last_test_day + 1)))

    # the elastic net model with cross-validation
    regr = ElasticNetCV(cv=folds, random_state=seed)

    y_train, x_train = _join_targets_with_sigs(y_train, x_train)
    elnet_ready = regr.fit(x_train, y_train)

    y_test, x_test = _join_targets_with_sigs(y_test, x_test)

    # train and test prediction outcomes
    yhat_train = elnet_ready.predict(x_train)
    yhat_test = elnet_ready.predict(x_test)

    return [elnet_ready, y_train, yhat_train, y_test, yhat_test]


def _join_targets_with_sigs(targets, sigs):
    """Inner-join targets with signatures on their first column; return (y, X) as numpy arrays."""
    targets = pd.DataFrame(targets)
    # column labels are stringified so both frames can be joined on the label '0'
    # (presumably the key identifying the observation; confirm against make_depvar/make_sigs)
    targets.columns = [str(label) for label in targets.columns]
    sigs.columns = [str(label) for label in sigs.columns]

    joined = pd.merge(left=targets, right=sigs, left_on='0', right_on='0', how='inner')

    # after the merge the layout is: '0' (key), '1_x' (second column of targets),
    # then every remaining column of sigs, which together form the feature matrix
    y = np.array(joined['1_x'].values.tolist())
    features = np.array(joined[joined.columns[2:]].values.tolist())
    return y, features


# Fit and evaluate the model on 'r_output.csv'; here the signatures are built
# with the dependent variable included (sigs_with_depvar=True).
results = elnet2(
    'r_output.csv',
    depvar='q9',
    ignore_nan=True,
    sig_min_day=735008,
    test=[736621, 736680],
    folds=5,
    seed=1,
    log=False,
    degs=2,
    sample=None,
    sigs_with_depvar=True,
)

# marker that the run above finished
print('ok')

In [None]:
# results holds [model, y_train, yhat_train, y_test, yhat_test]
_, y_train_obs, y_train_hat, y_test_obs, y_test_hat = results

# train error prediction with signatures
print(mean_squared_error(y_train_obs, y_train_hat))

# test error prediction with signatures
print(mean_squared_error(y_test_obs, y_test_hat))

# baseline test error: predict the mean of the training targets for every test point
dummy = [np.mean(y_train_obs)] * len(y_test_obs)
print(mean_squared_error(y_test_obs, dummy))