# Get percentage of fixations for us and for them AUPRC
# Start at a fixed point
# Shorten sequence length

1. Results table 
2. Fine-tuning their model on fixations
3. Regression 
4. Identify if it is the same text or participant
5. Look at gaze papers

## Imports

In [1]:
# Import obf functionality
import sys
from pathlib import Path
sys.path.append(str(Path("../../OBF").resolve()))

from obf.model import ae
from obf.model import creator


In [2]:
from eyemind.dataloading.load_dataset import limit_sequence_len, get_label_mapper, get_filenames_for_dataset, create_filename_col, get_stratified_group_splits
from eyemind.dataloading.gaze_data import GazeDataModule
from eyemind.models.classifier import EncoderClassifierModel
import pandas as pd
import torch
import numpy as np
from torch.utils.data import SubsetRandomSampler, DataLoader
from pytorch_lightning import Trainer


## DataLoading

In [3]:
data_folder = Path("/Users/rickgentry/emotive_lab/eyemind/data/processed/fixation")

In [4]:
def fixation_label_mapper(files):
    """Load the fixation labels stored in each CSV file.

    Args:
        files: Iterable of CSV paths. Every file must contain a
            ``fixation_label`` column.

    Returns:
        A list holding one float NumPy array per file, in input order.
    """
    return [
        pd.read_csv(csv_path)['fixation_label'].to_numpy(float)
        for csv_path in files
    ]


In [5]:
def limit_label_seq(y_data, sequence_length, pad_token=-1.):
    """Force a 1-D label sequence to exactly ``sequence_length`` entries.

    Args:
        y_data: Label sequence (anything supporting ``len`` and slicing).
        sequence_length: Target number of labels.
        pad_token: Value written into positions past the end of ``y_data``.

    Returns:
        The first ``sequence_length`` labels when ``y_data`` is longer than
        the target (a slice of the input); otherwise a new NumPy array with
        the labels followed by ``pad_token``.
    """
    n_labels = len(y_data)
    if n_labels > sequence_length:
        return y_data[:sequence_length]
    # Start from an all-padding buffer, then overwrite the leading entries.
    padded = pad_token * np.ones((sequence_length,))
    padded[:n_labels] = y_data
    return padded



In [6]:
from functools import partial
# Fixed window length shared by the gaze-input and label transforms below.
sequence_len = 500
# Input-side transform: limit_sequence_len with the window length bound and
# random_part disabled (presumably selects a fixed rather than random part of
# the sequence — confirm in eyemind.dataloading.load_dataset).
lim_seq_len = partial(limit_sequence_len, sequence_len=sequence_len, random_part=False)
# Label-side transform: truncate or pad labels to the same length; padded
# positions receive limit_label_seq's default pad_token of -1.
limit_labels = partial(limit_label_seq, sequence_length=sequence_len)

In [7]:
# Input pipeline: fix the sequence length, then convert to a float tensor.
transforms = [lim_seq_len, lambda data: torch.tensor(data).float()]
# The label pipeline mirrors the input one, using the padding-aware limiter.
dm = GazeDataModule(
    data_folder,
    label_mapper=fixation_label_mapper,
    transform_x=transforms,
    transform_y=[limit_labels, lambda data: torch.tensor(data).float()],
)

In [10]:
dm.setup(stage="test")

6279 6279


In [33]:
test_dl = dm.test_dataloader(shuffle=True)

In [19]:
# Tally the labels in every CSV of the data folder: the total number of rows
# and the sum of the fixation_label column (assumes 1 marks a fixation and
# 0 anything else — confirm against the preprocessing step).
fixation_count = 0
total_labels = 0

for filepath in data_folder.glob("*.csv"):
    df = pd.read_csv(filepath)
    label_column = df['fixation_label']
    total_labels += len(label_column)
    fixation_count += label_column.sum()
print(total_labels)
print(fixation_count)

12232686
9716617.0


In [34]:
# Inverse class ratios. "sacc" here means every label that is not counted as
# a fixation, i.e. (total_labels - fixation_count).
# inv_ratio_fix is 1 / (fixation : non-fixation) = non-fixation / fixation.
inv_ratio_fix = 1 / (fixation_count / (total_labels - fixation_count))
# inv_ratio_sacc is 1 / (non-fixation : fixation) = fixation / non-fixation.
inv_ratio_sacc = 1 / ((total_labels - fixation_count)/ fixation_count)
print(inv_ratio_fix, inv_ratio_sacc)

0.25894495995880046 3.8618245366084953


In [31]:
# Normalise the two inverse ratios so they sum to 1; order is [sacc, fix].
# Presumably intended as per-class loss weights — confirm where they are used.
inv_ratios = np.array([inv_ratio_sacc, inv_ratio_fix])
inv_ratios / inv_ratios.sum()

array([0.93716102, 0.06283898])

In [33]:
fixation_count / (total_labels - fixation_count)

3.8618245366084953

In [23]:
# NOTE(review): operator precedence makes this (4.53 / 4.53) + 0.18 = 1.18.
# If a normalised share was intended, it needs parentheses:
# 4.53 / (4.53 + 0.18).
4.53 / 4.53 + 0.18

1.18

In [24]:
# NOTE(review): operator precedence makes this (0.18 / 4.53) + 0.18.
# If a normalised share was intended, it needs parentheses:
# 0.18 / (4.53 + 0.18).
0.18 /  4.53 + 0.18

0.21973509933774832

In [None]:
# Count fixations in the test split. Labels shorter than the fixed sequence
# length are padded with -1 by limit_labels, so padded positions must be left
# out of both the fixation sum and the label total (assumes test_dl yields the
# transformed labels — confirm against GazeDataModule).
total_labels = 0
fixation_count = 0
for _, labels in test_dl:
    valid = labels != -1
    # .item() / int() keep the running totals as plain Python numbers.
    fixation_count += labels[valid].sum().item()
    total_labels += int(valid.sum())
print(total_labels, fixation_count)

## Model

In [34]:
pre_trained_weights_dir = Path("../OBF/pre_weights/sample_weights")

In [35]:
encoder = creator.load_encoder(str(pre_trained_weights_dir.resolve()))

Loading:  /Users/rickgentry/emotive_lab/eyemind/OBF/pre_weights/sample_weights/encoder_1633040995_gru.pt


In [11]:
print(encoder)

Sequential(
  (0): CNNEncoder(
    (cnn): Sequential(
      (0): ConvBlock(
        (conv): Conv1d(2, 14, kernel_size=(7,), stride=(1,), padding=(3,))
        (pool_layer): AvgPool1d(kernel_size=(2,), stride=(2,), padding=(0,))
        (bn): BatchNorm1d(14, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (activation): LeakyReLU(negative_slope=0.01)
      )
      (1): ConvBlock(
        (conv): Conv1d(16, 16, kernel_size=(7,), stride=(1,), padding=(3,))
        (bn): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (activation): LeakyReLU(negative_slope=0.01)
      )
    )
  )
  (1): RNNEncoder(
    (rnn): GRU(32, 128, num_layers=2, batch_first=True, dropout=0.1)
  )
)


In [12]:
fi_decoder = torch.load(str(Path(pre_trained_weights_dir, "fi_1633040995_gru.pt").resolve()),map_location=torch.device('cpu'))


In [13]:
from torchsummary import summary
print(fi_decoder)


RNNDecoder(
  (rnn): GRU(128, 128, num_layers=2, batch_first=True, dropout=0.1)
  (bn): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (out_fc): Linear(in_features=128, out_features=2, bias=True)
)


In [14]:
class FixationIdentifier(torch.nn.Module):
    """Chain an encoder and a decoder into a single module.

    Args:
        encoder: Module applied to the raw input.
        decoder: Module applied to the encoder's output.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, x):
        """Return ``decoder(encoder(x))``."""
        return self.decoder(self.encoder(x))
        

In [15]:
fid_model = FixationIdentifier(encoder, fi_decoder)

In [16]:
print(fid_model)

FixationIdentifier(
  (encoder): Sequential(
    (0): CNNEncoder(
      (cnn): Sequential(
        (0): ConvBlock(
          (conv): Conv1d(2, 14, kernel_size=(7,), stride=(1,), padding=(3,))
          (pool_layer): AvgPool1d(kernel_size=(2,), stride=(2,), padding=(0,))
          (bn): BatchNorm1d(14, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (activation): LeakyReLU(negative_slope=0.01)
        )
        (1): ConvBlock(
          (conv): Conv1d(16, 16, kernel_size=(7,), stride=(1,), padding=(3,))
          (bn): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (activation): LeakyReLU(negative_slope=0.01)
        )
      )
    )
    (1): RNNEncoder(
      (rnn): GRU(32, 128, num_layers=2, batch_first=True, dropout=0.1)
    )
  )
  (decoder): RNNDecoder(
    (rnn): GRU(128, 128, num_layers=2, batch_first=True, dropout=0.1)
    (bn): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
   

In [38]:
batch = next(iter(test_dl))

In [39]:
x,y = batch

In [58]:
logits = fid_model(x)

In [60]:
logits.softmax(dim=2).shape

torch.Size([8, 500, 2])

In [45]:
preds = logits.argmax(dim=2).detach().cpu().numpy().reshape(-1)

In [61]:
probs = logits.softmax(dim=2)[:,:,1].detach().cpu().numpy().reshape(-1)

In [64]:
y = y.cpu().numpy().reshape(-1)

In [66]:
auc = metrics.roc_auc_score(y, probs)

In [67]:
auc

0.6506709492371336

In [68]:
acc = metrics.accuracy_score(y, preds)


In [69]:
acc

0.73525

In [93]:
import sklearn.metrics as metrics
import tqdm

def evaluate(dl, model, pad_token=-1.):
    """Compute accuracy and ROC AUC of a per-timestep two-class model.

    Args:
        dl: Iterable yielding ``(x, y)`` batches; ``y`` holds per-timestep
            labels, with ``pad_token`` marking positions to ignore.
        model: Module mapping ``x`` to logits whose third axis (``dim=2``)
            is the class axis; class index 1 is the positive class.
        pad_token: Label value excluded from both metrics.

    Returns:
        Tuple ``(acc, auc)`` computed over all non-padded positions.
    """
    model.eval()
    batch_preds = []
    batch_labels = []
    batch_probs = []
    # Inference only: no_grad avoids building an autograd graph per batch.
    with torch.no_grad():
        for x, y in tqdm.tqdm(dl):
            logits = model(x)
            batch_preds.append(logits.argmax(dim=2).cpu().numpy().reshape(-1))
            batch_probs.append(
                logits.softmax(dim=2)[:, :, 1].cpu().numpy().reshape(-1)
            )
            batch_labels.append(y.cpu().numpy().reshape(-1))
    # Concatenate once instead of growing Python lists element by element.
    all_preds = np.concatenate(batch_preds)
    all_labels = np.concatenate(batch_labels)
    all_probs = np.concatenate(batch_probs)
    keep = all_labels != pad_token
    acc = metrics.accuracy_score(all_labels[keep], all_preds[keep])
    auc = metrics.roc_auc_score(all_labels[keep], all_probs[keep])
    return acc, auc





In [72]:
# NOTE(review): the evaluate() defined in this file returns (acc, auc), so
# this three-name unpack only works with a version that returns the raw
# predictions, labels and probabilities — presumably an earlier definition;
# confirm before re-running.
preds, labels, probs = evaluate(test_dl, fid_model)

100%|██████████| 784/784 [08:00<00:00,  1.63it/s]


In [91]:
# NOTE(review): indices_masked is not defined in any visible cell; presumably
# it is the padding mask (labels == -1) — confirm before re-running.
preds = np.array(preds)
acc = metrics.accuracy_score(labels[~indices_masked], preds[~indices_masked])


In [92]:
acc

0.7467187887901037

In [81]:
labels = np.array(labels)

In [86]:
probs = np.array(probs)

In [88]:
auc = metrics.roc_auc_score(labels[~indices_masked], probs[~indices_masked])

In [89]:
auc

0.6525304778413941