In [None]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
from matplotlib import pyplot as plt
import seaborn as sns
import pickle

In [None]:
import torch


In [None]:
from erisk import erde_evaluation, erde_mem

def erde(out_file, o):
    """Evaluate a results file against the depression test golden truth.

    ``out_file`` and ``o`` are forwarded unchanged to ``erde_evaluation``
    together with the fixed golden-truth path. Nothing is returned.
    NOTE(review): ``o`` is presumably the ERDE delay parameter -- confirm
    against ``erisk.erde_evaluation``.
    """
    golden_truth_path = "datasets/task_1_depression/risk-golden-truth-test.txt"
    erde_evaluation(golden_truth_path, out_file, o)

## Dataset

In [None]:
# Golden-truth labels: a headerless, tab-separated file whose rows are turned
# into (key, value) pairs -- first column is the key, second the value.
golden_truth = pd.read_csv(
    "datasets/task_1_depression/risk-golden-truth-test.txt",
    sep="\t",
    header=None,
)
y_true = dict(golden_truth.to_records(index=False))


In [None]:
# Merged posts table (output by read.ipynb).
df = pd.read_csv("datasets/task_1_depression/depression_merged.csv")
# Sorted distinct user ids, their count, and a user id -> position mapping.
subjects = sorted(df["user"].drop_duplicates().tolist())
S = len(subjects)
subject_lookup = {subject: position for position, subject in enumerate(subjects)}

In [None]:
import liwc
# Load the LIWC dictionary: a token parser plus the list of category names.
liwc_parse, categories = liwc.load_token_parser('dic/LIWC2007_English080730.dic')
# Number of categories and a category name -> column index mapping.
K = len(categories)
category_lookup = {category: column for column, category in enumerate(categories)}

In [None]:
def to_liwc_token(text):
    """Lazily yield the lower-cased word tokens of ``text``.

    A token is any maximal run of ``\\w`` characters (letters, digits,
    underscore), so punctuation and apostrophes act as separators.
    """
    word_matches = re.finditer(r'\w+', text, re.UNICODE)
    yield from (found.group(0).lower() for found in word_matches)

We generate two time series for each subject: one of the raw texts and one of the lexical representation of those texts.

In [None]:
# Per-subject containers keyed by user id: lexical (LIWC) series and text series.
x_lex_ts, x_text_ts = {}, {}

In [None]:
# Build, for every user, two aligned series ordered by ``date_time``:
#   x_lex_ts[subj]  -> array with one row of normalised LIWC category
#                      frequencies (length K) per kept post
#   x_text_ts[subj] -> list with the stripped text of each kept post
# Posts without any LIWC hit are dropped from both series.
for subj, dfi in tqdm(df.groupby("user")):
    lex_rows = []
    kept_texts = []

    for text in dfi.sort_values("date_time").text:
        # read_csv turns empty cells into NaN (a float), which would make the
        # regex tokenizer raise; such posts have no tokens, so skip them like
        # any other post without LIWC hits.
        if not isinstance(text, str):
            continue

        categ_freq = np.zeros(K)
        for token in to_liwc_token(text):
            for category in liwc_parse(token):
                categ_freq[category_lookup[category]] += 1

        total_hits = categ_freq.sum()
        if not total_hits:
            continue

        # Normalise counts so every row sums to 1.
        lex_rows.append(categ_freq / total_hits)
        kept_texts.append(text.strip())

    x_lex_ts[subj] = np.array(lex_rows)
    x_text_ts[subj] = kept_texts


## M-LSTM

In [None]:
# Keep only subjects with at least MIN_LENGTH rows in their lexical series,
# derive the element-wise sign series, and restrict the labels to the
# subjects that were kept.
MIN_LENGTH = 20
x_lex_ts = {
    subject: series
    for subject, series in x_lex_ts.items()
    if series.shape[0] >= MIN_LENGTH
}
x_sign_lex_ts = {subject: np.sign(series) for subject, series in x_lex_ts.items()}
y_true = {subject: label for subject, label in y_true.items() if subject in x_lex_ts}


We classify the lexical time series with a multivariate LSTM.

In [None]:
# Zero-padded input tensor of shape (subjects, categories, time steps) and the
# matching label vector, in the iteration order of ``y_true``.
# The category axis uses K because every row of ``x_lex_ts`` has K columns
# (the previous hard-coded 64 only worked when K happened to be 64).
MAX_TS_LEN = 2000  # fixed time-axis length of the padded tensor
X = np.zeros((len(y_true), K, MAX_TS_LEN))
y = []
for i, subj in enumerate(y_true):
    y.append(y_true[subj])
    X_unpad = x_lex_ts[subj]

    # (time, categories) -> (categories, time). Series longer than MAX_TS_LEN
    # are truncated to their first MAX_TS_LEN steps instead of raising a
    # shape-mismatch error on assignment.
    X_fit = X_unpad[:MAX_TS_LEN].T
    ts_len = X_fit.shape[-1]
    X[i, :, :ts_len] = X_fit

y = np.array(y)


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible across runs.
split_options = dict(test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Write the four split arrays as .npy files into the MLSTM_FCN data folder.
aaa_path = r"./MLSTM_FCN/data/aaa/"
for file_name, array in (
    ('X_train.npy', X_train),
    ('y_train.npy', y_train),
    ('X_test.npy', X_test),
    ('y_test.npy', y_test),
):
    np.save(aaa_path + file_name, array)

In [None]:
# !.\venv37\Scripts\python.exe .\MLSTM_FCN\aaa_model.py

- inspect model weights?
- a smaller version?

## ROLLING VOTE

### one shot prediction

In [None]:
from transformers import (
    Trainer,
    TrainingArguments,
    AutoTokenizer,
    DataCollatorWithPadding,
    AutoModelForSequenceClassification,
)
from datasets import load_dataset
import datasets

In [None]:
# Checkpoint names: the tokenizer and the classifier come from two different
# Hugging Face repositories.
TOKENIZER_CHECKPOINT = "ShreyaR/finetuned-roberta-depression"
CLASSIFIER_CHECKPOINT = "ranieri-unimi/test-trainer"

tokenizer_hf = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(CLASSIFIER_CHECKPOINT)

In [None]:
def predict(input_text):
    """Return the predicted class index for a single text.

    The text is encoded with the global ``tokenizer_hf`` and scored by the
    global ``model``; the index of the highest logit is returned as a numpy
    scalar.

    Sequences longer than 512 token ids are shortened to the first 511 ids
    plus the final id (presumably the end-of-sequence token -- confirm for
    this tokenizer).
    """
    input_ids = tokenizer_hf.encode(input_text, return_tensors='pt')
    if input_ids.shape[-1] > 512:
        input_ids = torch.cat([input_ids[:, :511], input_ids[:, -1:]], dim=1)
    # Inference only: disable autograd so no graph is built or kept in memory.
    with torch.no_grad():
        logits = model(input_ids)[0]
    _, pred_label = logits.max(1)
    return pred_label.cpu().detach().numpy()[0]

In [None]:
def batch_predict(input_texts):
    """Return the predicted class index for every text in ``input_texts``.

    Texts are encoded together with padding and truncation by the global
    ``tokenizer_hf`` and scored in one forward pass of the global ``model``.
    The result is a numpy array with one class index per input text.

    The tokenizer's attention mask is forwarded to the model so that padding
    positions are ignored; passing only ``input_ids`` would let the padding
    influence the predictions of the shorter texts in the batch.
    """
    encoded = tokenizer_hf.batch_encode_plus(input_texts, return_tensors='pt', padding=True, truncation=True)
    # Inference only: disable autograd so no graph is built or kept in memory.
    with torch.no_grad():
        output = model(
            input_ids=encoded["input_ids"],
            # .get keeps the old behaviour if the tokenizer returns no mask.
            attention_mask=encoded.get("attention_mask"),
        )[0]
    _, pred_labels = output.max(1)
    return pred_labels.cpu().detach().numpy()

In [None]:
# yt_hat = dict()

# for subj, ts in tqdm(x_text_ts.items()):
#     T = len(ts)
#     for i in range(0, T, 8):
#         predictions = batch_predict(ts)
#         try:
#             yt_hat[subj] = np.concatenate((yt_hat[subj], predictions), axis=None)
#         except:
#             yt_hat[subj] = predictions

In [None]:
# yt_hat = dict()

# for subj, ts in tqdm(x_text_ts.items()):
#     yt_hat[subj] = list()
#     for text in ts:
#         p = predict(text)
#         yt_hat[subj].append(p)

In [None]:
# Load the stashed per-subject predictions (presumably the output of the
# prediction loops commented out above -- confirm).
# NOTE: unpickling can execute arbitrary code; only load files you created.
# The context manager closes the file handle, which the bare open() leaked.
with open("stash/all.prediction.pkl", "rb") as prediction_file:
    yt_hat = pickle.load(prediction_file)

### voting

In [None]:
ROLLING = 13             # size of the rolling voting window
YIELD_THRESHOLD = 0.990  # minimum weighted vote required to flag a subject
weight_window = np.full(ROLLING, 1 / ROLLING)  # uniform weights, 1/ROLLING each

In [None]:
# Rolling vote: slide a window of ROLLING consecutive predictions over each
# subject's series and flag the subject (result = 1) at the first step where
# the weighted vote reaches YIELD_THRESHOLD. Each entry of ``y_hat`` is
# [subject, decision, index of the prediction at which the decision was made].
y_hat = list()
for subj, ts in tqdm(yt_hat.items()):
    T = len(ts)
    result = 0
    # Default decision index: the last available prediction. Without it, a
    # subject with fewer than ROLLING predictions never enters the loop below
    # and ``t`` would be undefined or carried over from the previous subject.
    t = max(T - 1, 0)
    # Circular buffer seeded with the first ROLLING - 1 predictions plus one
    # placeholder slot that the first loop iteration overwrites. list() makes
    # this work whether the stored series is a list or an array.
    pred_window = np.array(list(ts[: ROLLING - 1]) + [0])
    for t in range(ROLLING - 1, T):
        # Overwrite the oldest entry with the newest prediction.
        pred_window[t % ROLLING] = ts[t]
        score = np.dot(pred_window, weight_window)
        if score >= YIELD_THRESHOLD:
            result = 1
            break
    y_hat.append([subj, result, t])


In [None]:
# Re-pack the per-subject results as (decision, true label, decision index)
# triples, transpose them into three parallel sequences and score them.
scored_triples = [
    (decision, y_true[subject], delay) for subject, decision, delay in y_hat
]
erde_columns = list(zip(*scored_triples))
erde_mem(*erde_columns, 50)

- hyperparameter tuning?
- explore refinement methods for binary time series? E.g. HMM: https://pure.unileoben.ac.at/portal/files/1073252/Improving_Time_Series_Classification_Using_Hidden_Markov_Models.pdf

In [None]:
# pd.DataFrame(y_hat).to_csv("rollig_vote_results.csv", index=False, header=None, sep=" ")
# erde("rollig_vote_results.csv", 50)

## HIC SUNT LEONES

In [None]:
# NOTE(review): 0/0 raises ZeroDivisionError. Presumably a deliberate stop so
# that "Run All" halts before the exploratory cells below -- confirm.
0/0

### ARIMA

In [None]:
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
from  statsmodels.tsa.arima.model import ARIMA

In [None]:
# Subject id and LIWC category column used by the ARIMA cells below.
s, k = "subject3414", categories.index("family")

In [None]:
def ma(x, w):
    """Moving average of ``x`` over a window of ``w`` samples.

    Only fully overlapping windows are kept ('valid' mode), so for
    ``len(x) >= w`` the result has ``len(x) - w + 1`` elements.
    """
    window_sums = np.convolve(x, np.ones(w), mode='valid')
    return window_sums / w

In [None]:
# Series under study: first 50 time steps of category column k for subject s.
# First candidate: the sign series, smoothed with a 3-step moving average.
ts = x_sign_lex_ts[s][:50,k]
ts = ma(ts, 3)
# NOTE(review): this assignment replaces the smoothed sign series computed
# above, so only the raw frequency series is left in `ts` -- confirm which of
# the two variants is meant to be analysed.
ts = x_lex_ts[s][:50,k]

In [None]:
# Fit an ARIMA(p=1, d=3, q=2) model to `ts` and overlay its in-sample fitted
# values (red) on the observed series.
# NOTE(review): this rebinds the global `model` that predict() and
# batch_predict() read; re-run the model-loading cell before calling them
# again after this cell.
model = ARIMA(ts, order=(1,3,2))
results = model.fit()
plt.plot(ts)
# fittedvalues[i] is the in-sample prediction for ts[i]. The first value is
# dropped, so the rest is plotted from x = 1 to stay aligned with `ts`
# (plotting it from x = 0 shifts the red curve one step to the left).
fitted_steps = np.arange(1, len(results.fittedvalues))
plt.plot(fitted_steps, results.fittedvalues[1:], color='red')