In [1]:
import pandas as pd
import numpy as np
import math
from collections import defaultdict
import json
import sys


In [2]:
def load_dataframe(path):
    return pd.read_csv(path)

def load_user_data(usr_df, label, embedding_type=None, time_step='week'):
    """
    args:
        usr_df: DataFrame containing each user's weekly/daily data per row
        label: String for which field to treat as label
        embedding_type: String to toggle between 'w2v' or 'bert'. Default is univariate forecast
        time_step: String to toggle between week or daily data. Using the column name(week or day_id)
    return:
        dictionary of key = user_id and value = sequence of user data over all available time
    """
    usr_dict = defaultdict(list)

    for idx,row in usr_df.iterrows(): # load all users and format into dictionary
        if embedding_type:
            embedding = np.array(json.loads(row[embedding_type])) # embeddings are stored as json


        target = row[label]

        try:
            other_vars = embedding
        except:
            other_vars = []

        # append other variables here if desired (i.e. intensity along with embeddings for Aff pred)
        # if label = 'affect':
        #   other_vars.append(row['intensity'])

        if time_step == 'week':
            key = 'week'
        else:
            key = 'day_id'

        if len(other_vars) > 0:
            usr_data = (row[key], np.append(target, other_vars))
        else:
            usr_data = (row[key], target)

        usr_dict[row['user_id']].append(usr_data)

    return usr_dict

def gen_train_data(user_data, n):
    """
    args:
        user_data: Dictionary containing each user's full history as a sequence
        n: Integer denoting the maximum history to use for the model
    return:
        train_data: numpy array of chunked user history
        train_labels: numpy array of label per user history sequence
    """

    train_data, labels = [], []
    for k,v in user_data.items():
        usr_all_history = user_data[k][:15] # eahc user has maximum 14 time-steps
        usr_train_data = []
        usr_train_labels = []

        for i in range(15-n): # only go back as far as n
            curr_train = []
            curr_label = []
            for j in range(n): # for each time-step
                if j < n - 1:
                    curr_train.append(usr_all_history[j+i][1])
                elif j == n -1:
                    curr_train.append(usr_all_history[j+i][1])

                    # assumes multi-variate, catches univariate case
                    # [0] grabs the target which is always first element of that week
                    try:
                        curr_label.append(usr_all_history[j+i+1][1][0])
                        features = [f for week in curr_train for f in week]
                    except:
                        curr_label.append(usr_all_history[j+i+1][1])
                        features = [f for f in curr_train]

            train_data.append(features)
            labels.append(curr_label[0])

    return np.array(train_data), np.array(labels)

def gen_test_data(user_data, n):
    """
    args:
        user_data: Dictionary containing each user's full history as a sequence
        n: Integer denoting the maximum history to use for the model
    return:
        test_data: numpy array of chunked user history
        test_labels: numpy array of label per user history sequence
    """
    test_data, test_labels = [], []
    for k,v in user_data.items():
        usr_test_history = user_data[k][-5:] # Grab remaining weeks in user's sequence for testing
        usr_test_data = []
        usr_test_labels = []
        for i in range(4): # 4 test weeks
            features = []
            for j in range(1,n):
                features = np.append(user_data[k][:][(-5+i)-j][1], features)

            usr_test_embeds = np.append(features, usr_test_history[i][1])
            test_data.append(usr_test_embeds)

            try:
                test_labels.append(usr_test_history[i+1][1][0])
            except:
                test_labels.append(usr_test_history[i+1][1])

    return test_data, test_labels

In [3]:
import math
import numpy as np
from os import listdir
import pandas as pd
import json
from sklearn.linear_model import Ridge
import argparse
import copy
from scipy.stats import sem
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error
import sys

In [9]:
df = load_dataframe('weekly_all_labels.csv')
usr_seqs = load_user_data(df, 'affect', embedding_type="bert")
train_data, train_labels = gen_train_data(usr_seqs, 10)
test_data, test_labels = gen_test_data(usr_seqs, 10)


In [4]:
train_set, train_gp, running_mae = pd.read_pickle("GP_results_4mix.pkl")

In [5]:
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
imp = IterativeImputer(max_iter=10, random_state=0) #


In [6]:
def fixtensor(x,y):
    x = x.tolist()
    y = y.tolist()
    x = x - np.min(x)


    fixed_x = list(range(60))
    fixed_y = [np.nan]*60
    for ii in range(len(x)):
        fixed_y[x[ii]] = y[ii]

    z = np.array([fixed_x,fixed_y]).transpose()
    imp.fit(z)
    zz = imp.transform(z)

    return zz.transpose()

In [55]:
def build_df(train_set,train_gp):
    true_data = []
    gp_data = []
    true_labels = []
    gp_labels = []
    for ii in range(len(train_set)):
        true_data.append(fixtensor(train_set[ii][0][0][0],train_set[ii][0][1][0])[1])
        gp_data.append(fixtensor(train_set[ii][0][0][0],train_gp[ii][0][0])[1])
        true_labels.append(train_set[ii][0][1][1].tolist()[0])
        gp_labels.append(train_gp[ii][1][0].tolist()[0])

    true_data = np.array(true_data)
    true_labels = np.array(true_labels)
    gp_data = np.array(gp_data)
    gp_labels = np.array(gp_labels)

    test_data = true_data[-100:]
    test_labels = true_labels[-100:]
    test_gp_data = gp_data[-100:]
    test_gp_labels = gp_labels[-100:]

    train_data = true_data[:-100]
    train_labels = true_labels[:-100]
    train_gp_data = gp_data[:-100]
    train_gp_labels = gp_labels[:-100]

    train_df = pd.DataFrame(train_data)
    train_labels_df = pd.DataFrame(train_labels)
    train_labels_df.columns = ['label']
    train_labels_df['label'] = train_labels_df['label'].astype(float)

    train_gp_df = pd.DataFrame(train_gp_data)
    train_gp_labels_df = pd.DataFrame(train_gp_labels)
    train_gp_labels_df.columns = ['label']
    train_gp_labels_df['label'] = train_gp_labels_df['label'].astype(float)

    test_df = pd.DataFrame(test_data)
    test_labels_df = pd.DataFrame(test_labels)
    test_labels_df.columns = ['label']
    test_labels_df['label'] = test_labels_df['label'].astype(float)

    test_gp_data = pd.DataFrame(test_gp_data)
    test_gp_labels = pd.DataFrame(test_gp_labels)
    test_gp_labels.columns = ['label']
    test_gp_labels['label'] = test_gp_labels['label'].astype(float)

    return train_df, train_labels_df, train_gp_df, train_gp_labels_df, test_df, test_labels_df, test_gp_data, test_gp_labels

In [51]:
def smape(A, F):
    return 100/len(A) * np.sum(2 * np.abs(F - A) / (np.abs(A) + np.abs(F)))

In [56]:
true_res = []
gp_res = []

for ii in range(9):
    train_set, train_gp, running_mae = pd.read_pickle("GP_results_" + str(ii+4) + "mix.pkl")
    train_df, train_labels_df, train_gp_df, train_gp_labels_df, test_df, test_labels_df, test_gp_data, test_gp_labels = build_df(train_set,train_gp)
    model = Ridge(alpha=.01)
    model_gp = Ridge(alpha=.01)
    model.fit(train_df, train_labels_df.label)
    model_gp.fit(train_gp_df, train_gp_labels_df.label)

    true_preds = model.predict(test_df)
    true_mse = mean_squared_error(test_labels, true_preds)
    true_corr = np.corrcoef(test_labels, true_preds)[0,1]
    true_smape = smape(test_labels, true_preds)
    true_res.append((true_corr,true_mse,true_smape))

    gp_preds = model_gp.predict(test_df)
    gp_mse = mean_squared_error(test_labels,gp_preds)
    gp_corr = np.corrcoef(test_labels, gp_preds)[0,1]
    gp_smape = smape(test_labels, gp_preds)
    gp_res.append((gp_corr,gp_mse,gp_smape))

In [59]:
true_res[0]

(0.8185047838131291, 0.178172926846121, 16.47476100781459)