## Imports

In [1]:
import pandas as pd
import yaml
from pytorch_lightning import Trainer, seed_everything
from eyemind.trainer.loops import KFoldLoop
from eyemind.models.classifier import EncoderClassifierMultiSequenceModel
from eyemind.models.encoder_decoder import VariableSequenceLengthEncoderDecoderModel
from eyemind.dataloading.gaze_data import SequenceToLabelDataModule, SequenceToSequenceDataModule

  from .autonotebook import tqdm as notebook_tqdm


## Fixation Identification

## Comprehension Classification

In [2]:
# Registry mapping each top-level config section name to the class that is
# constructed from that section. instantiate_lightningmodules iterates this
# dict, so the insertion order (trainer, model, data) is also the order of the
# objects it returns.
name_to_cls = {"trainer": Trainer, "model": EncoderClassifierMultiSequenceModel, "data": SequenceToLabelDataModule}
def instantiate_class_from_config(config, cls_name):
    """Construct the class registered under ``cls_name`` from its config section.

    Args:
        config: Mapping of section name -> keyword arguments for that section.
        cls_name: Section name; must be a key of both ``config`` and the
            module-level ``name_to_cls`` registry.

    Returns:
        A new instance of ``name_to_cls[cls_name]`` built with the section's
        entries passed as keyword arguments.

    Raises:
        KeyError: if ``cls_name`` is missing from ``config`` or ``name_to_cls``.
    """
    section_kwargs = config[cls_name]
    # The registry is looked up at call time, so rebinding name_to_cls in a
    # later cell changes which class is built here.
    factory = name_to_cls[cls_name]
    return factory(**section_kwargs)

def instantiate_lightningmodules(config):
    """Build one object per entry of the ``name_to_cls`` registry.

    Args:
        config: Mapping holding one section of keyword arguments for every
            key in ``name_to_cls``.

    Returns:
        A tuple of the constructed objects, in the registry's iteration order.
    """
    built = []
    for section in name_to_cls:
        built.append(instantiate_class_from_config(config, section))
    return tuple(built)

In [3]:
configpath = "./experiment_configs/inf_multiseq_config.yml"

In [4]:
with open(configpath, 'r') as f:
    config = yaml.safe_load(f)

In [5]:
# Seed first so that everything built and split afterwards is repeatable
# (the cell output confirms "Global seed set to 42").
seed_everything(42, workers=True)
# Unpacks in name_to_cls order: trainer, model, data.
trainer, model, data = instantiate_lightningmodules(config)

Global seed set to 42
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


CNN Layers: [16, 32]
16 2
32 16




### Testing reproducibility of folds

In [7]:
data.setup()

In [8]:
data.train_dataset

<torch.utils.data.dataset.Subset at 0x178759790>

In [9]:
data.save_setup("setup_ROTE_X.yml")

In [10]:
loaded_data = instantiate_class_from_config(config, "data")

In [11]:
loaded_data.load_setup_path = "setup_ROTE_X.yml"
loaded_data.setup()

In [12]:
# The data module rebuilt from the saved setup file must select the same
# train/test indices as the original one. Compared as sets, so ordering and
# duplicates are deliberately ignored.
assert (set(loaded_data.train_dataset.indices) == set(data.train_dataset.indices))
assert (set(loaded_data.test_dataset.indices) == set(data.test_dataset.indices))

In [13]:
train_test_split = (data.train_dataset.dataset.get_files_from_indices(data.train_dataset.indices), data.test_dataset.dataset.get_files_from_indices(data.test_dataset.indices))

In [14]:
data.setup_folds(4)

In [15]:
data.save_folds("folds_rote_x.yml")

In [16]:
loaded_data.load_folds("folds_rote_x.yml")

In [17]:
[set(split_data[0]) == set(loaded_split[0]) for split_data, loaded_split in zip(data.splits, loaded_data.splits)]

[True, True, True, True]

In [18]:
file_folds = [(data.train_dataset.dataset.get_files_from_indices(split[0]),data.train_dataset.dataset.get_files_from_indices(split[1]) ) for split in data.splits]

In [19]:
file_folds

[(['EML1_087-Bias8.csv',
   'EML1_049-Validity8.csv',
   'EML1_069-Variables3.csv',
   'EML1_116-Hypotheses8.csv',
   'EML1_070-Variables8.csv',
   'EML1_018-Bias7.csv',
   'EML1_033-Validity1.csv',
   'EML1_069-Hypotheses5.csv',
   'EML1_025-CausalClaims6.csv',
   'EML1_125-CausalClaims6.csv',
   'EML1_020-Hypotheses1.csv',
   'EML1_111-CausalClaims6.csv',
   'EML1_048-Validity6.csv',
   'EML1_021-CausalClaims2.csv',
   'EML1_029-Variables5.csv',
   'EML1_083-Hypotheses8.csv',
   'EML1_015-Validity6.csv',
   'EML1_115-Validity6.csv',
   'EML1_136-Bias6.csv',
   'EML1_005-CausalClaims8.csv',
   'EML1_042-Validity6.csv',
   'EML1_103-CausalClaims8.csv',
   'EML1_106-Bias2.csv',
   'EML1_071-CausalClaims6.csv',
   'EML1_050-Variables8.csv',
   'EML1_072-Bias6.csv',
   'EML1_050-Bias2.csv',
   'EML1_012-Hypotheses8.csv',
   'EML1_030-Hypotheses8.csv',
   'EML1_036-Hypotheses4.csv',
   'EML1_036-CausalClaims6.csv',
   'EML1_014-CausalClaims3.csv',
   'EML1_036-Bias6.csv',
   'EML1_087-Hypo

In [20]:
def write_splits(splits, filepath, types="train_test"):
    """Dump file-name splits to a YAML file and hand the input back.

    Args:
        splits: When ``types == "train_test"``, an indexable whose first two
            items are the train and test collections. Otherwise an iterable of
            folds, each indexable as ``fold[0]`` (train) and ``fold[1]`` (val).
        filepath: Destination path; the file is opened in write mode.
        types: ``"train_test"`` writes ``{"train": ..., "test": ...}``; any
            other value writes ``{"folds": [{"train": ..., "val": ...}, ...]}``.

    Returns:
        ``splits``, unchanged.
    """
    if types != "train_test":
        fold_records = []
        for fold in splits:
            fold_records.append({"train": fold[0], "val": fold[1]})
        payload = {"folds": fold_records}
    else:
        payload = {"train": splits[0], "test": splits[1]}

    # The payload is fully built before the file is opened, so a malformed
    # `splits` fails without creating or truncating the target file.
    with open(filepath, 'w') as out_file:
        yaml.dump(payload, out_file)
    return splits

def load_file_folds(path):
    """Read a fold file and return its folds as ``(train, val)`` tuples.

    Args:
        path: YAML file with a top-level ``"folds"`` list whose entries each
            have ``"train"`` and ``"val"`` keys.

    Returns:
        A list with one ``(train, val)`` tuple per fold, in file order.
    """
    with open(path, 'r') as fold_file:
        saved = yaml.safe_load(fold_file)

    pairs = []
    for entry in saved["folds"]:
        pairs.append((entry["train"], entry["val"]))
    return pairs

def compare_splits(filepath, splits, types="train_test"):
    """Assert that in-memory splits contain the same files as a saved split file.

    Args:
        filepath: Path of the YAML split file to check against.
        splits: For ``types == "train_test"``, a ``(train, test)`` pair of
            file-name collections. Otherwise a sequence of ``(train, val)``
            pairs, one per fold.
        types: ``"train_test"`` selects the train/test layout
            (``{"train": ..., "test": ...}``); any other value selects the
            fold layout.

    Fold files may use either the current layout
    ``{"folds": [{"train": ..., "val": ...}, ...]}`` or the older per-key
    layout ``{"fold0": {...}, "fold1": {...}, ...}``.

    Raises:
        AssertionError: if any partition differs, or the number of folds
            in the file does not match ``splits``.
        KeyError: if the file lacks an expected key.
    """
    with open(filepath, "r") as f:
        file_splits = yaml.safe_load(f)

    if types == "train_test":
        # Check both halves: verifying only "train" lets a wrong test set pass.
        for position, ds_type in enumerate(("train", "test")):
            expected = splits[position]
            saved = file_splits[ds_type]
            assert len(expected) == len(saved), (
                f"{ds_type}: {len(expected)} files in memory vs {len(saved)} in {filepath}"
            )
            assert set(expected) == set(saved), f"{ds_type} files differ from {filepath}"
        return

    splits = list(splits)
    if "folds" in file_splits:
        # Current layout: a list of {"train", "val"} dicts under "folds".
        saved_folds = file_splits["folds"]
    else:
        # Older layout: one top-level "fold{i}" key per fold.
        saved_folds = [file_splits[f"fold{i}"] for i in range(len(splits))]

    # zip() below would silently ignore surplus folds on either side.
    assert len(saved_folds) == len(splits), (
        f"{len(splits)} folds in memory vs {len(saved_folds)} in {filepath}"
    )
    for i, (split, saved_fold) in enumerate(zip(splits, saved_folds)):
        for j, ds_type in enumerate(("train", "val")):
            file_set = set(saved_fold[ds_type])
            split_set = set(split[j])
            assert split_set == file_set, f"fold {i} {ds_type} files differ from {filepath}"



In [21]:
# write_splits(train_test_split, "./data/inference_train_test_rotex_splits.yml", "train_test")
write_splits(file_folds, "./data/inference_train_val_rotex_folds.yml", "folds")

[(['EML1_087-Bias8.csv',
   'EML1_049-Validity8.csv',
   'EML1_069-Variables3.csv',
   'EML1_116-Hypotheses8.csv',
   'EML1_070-Variables8.csv',
   'EML1_018-Bias7.csv',
   'EML1_033-Validity1.csv',
   'EML1_069-Hypotheses5.csv',
   'EML1_025-CausalClaims6.csv',
   'EML1_125-CausalClaims6.csv',
   'EML1_020-Hypotheses1.csv',
   'EML1_111-CausalClaims6.csv',
   'EML1_048-Validity6.csv',
   'EML1_021-CausalClaims2.csv',
   'EML1_029-Variables5.csv',
   'EML1_083-Hypotheses8.csv',
   'EML1_015-Validity6.csv',
   'EML1_115-Validity6.csv',
   'EML1_136-Bias6.csv',
   'EML1_005-CausalClaims8.csv',
   'EML1_042-Validity6.csv',
   'EML1_103-CausalClaims8.csv',
   'EML1_106-Bias2.csv',
   'EML1_071-CausalClaims6.csv',
   'EML1_050-Variables8.csv',
   'EML1_072-Bias6.csv',
   'EML1_050-Bias2.csv',
   'EML1_012-Hypotheses8.csv',
   'EML1_030-Hypotheses8.csv',
   'EML1_036-Hypotheses4.csv',
   'EML1_036-CausalClaims6.csv',
   'EML1_014-CausalClaims3.csv',
   'EML1_036-Bias6.csv',
   'EML1_087-Hypo

In [22]:
load_file_folds("./data/inference_train_val_rotex_folds.yml")

[(['EML1_087-Bias8.csv',
   'EML1_049-Validity8.csv',
   'EML1_069-Variables3.csv',
   'EML1_116-Hypotheses8.csv',
   'EML1_070-Variables8.csv',
   'EML1_018-Bias7.csv',
   'EML1_033-Validity1.csv',
   'EML1_069-Hypotheses5.csv',
   'EML1_025-CausalClaims6.csv',
   'EML1_125-CausalClaims6.csv',
   'EML1_020-Hypotheses1.csv',
   'EML1_111-CausalClaims6.csv',
   'EML1_048-Validity6.csv',
   'EML1_021-CausalClaims2.csv',
   'EML1_029-Variables5.csv',
   'EML1_083-Hypotheses8.csv',
   'EML1_015-Validity6.csv',
   'EML1_115-Validity6.csv',
   'EML1_136-Bias6.csv',
   'EML1_005-CausalClaims8.csv',
   'EML1_042-Validity6.csv',
   'EML1_103-CausalClaims8.csv',
   'EML1_106-Bias2.csv',
   'EML1_071-CausalClaims6.csv',
   'EML1_050-Variables8.csv',
   'EML1_072-Bias6.csv',
   'EML1_050-Bias2.csv',
   'EML1_012-Hypotheses8.csv',
   'EML1_030-Hypotheses8.csv',
   'EML1_036-Hypotheses4.csv',
   'EML1_036-CausalClaims6.csv',
   'EML1_014-CausalClaims3.csv',
   'EML1_036-Bias6.csv',
   'EML1_087-Hypo

In [23]:
# compare_splits("./data/inference_train_val_rotex_folds.yml", file_folds, types="folds")
# compare_splits("./data/inference_train_test_rotex_splits.yml", train_test_split)

### Nested CV Testing

In [6]:
data.setup()

In [7]:
data.setup_cv_folds(4,4)

In [11]:
data.setup_cv_fold_index(0,0)

In [14]:
set(data.train_fold.indices) == set(data.inner_splits[0][0][0])

False

In [16]:
# Scratch check of dict key ordering. The mapping gets its own name: binding
# it to `dict` would shadow the builtin for every later cell in this kernel
# and make any subsequent `dict(...)` call fail.
seq_lengths = {"seq": 2, "l":3}
list(seq_lengths.keys())

['seq', 'l']

In [23]:
# Export the encoder weights of a tuned fixation model so that a classifier
# can be initialised from them.
import torch
# Despite the name, this value is passed to load_from_checkpoint as the
# checkpoint path itself.
ckpt_dir = "./ray_results/fixation_tuning/train_tune_c2ac0_00001_1_hidden_dim=256,sequence_length=250_2022-05-31_16-41-19/checkpoint_epoch=39-step=11160/checkpoint"
# NOTE: this rebinds `model`, which earlier cells bound to the object returned
# by instantiate_lightningmodules; cells below see the encoder-decoder instead.
model = VariableSequenceLengthEncoderDecoderModel.load_from_checkpoint(ckpt_dir)
# Only the encoder sub-module's state_dict is saved, not the whole model.
torch.save(model.encoder.state_dict(), "./pretrained_models/encoder_fixation_tuning_seq=250_hidden_dim=256.pt")

In [26]:
model.encoder.state_dict()

OrderedDict([('0.cnn.0.conv.weight',
              tensor([[[-1.8878e-01,  1.4104e-01,  2.3606e-01, -7.8393e-02, -4.4634e-02,
                         2.4333e-01, -2.4416e-01],
                       [ 7.6704e-02,  2.2949e-02, -1.9121e-01,  1.5222e-01, -1.8657e-01,
                        -1.5317e-01,  3.5314e-01]],
              
                      [[ 2.7781e-01,  1.3024e-02,  4.1417e-02, -2.3221e-01,  4.8440e-02,
                        -2.1176e-01, -1.9660e-01],
                       [ 1.3972e-01,  2.1775e-01, -2.1982e-01,  1.1675e-01,  2.0925e-01,
                         1.2560e-01, -1.9022e-01]],
              
                      [[-1.3565e-01, -5.7570e-02, -4.4677e-02,  1.3821e-01, -2.2621e-01,
                        -2.4091e-01, -6.7084e-02],
                       [-2.5115e-02,  2.6992e-01, -2.5302e-01,  2.1725e-01, -2.4802e-01,
                         2.7169e-01,  8.5340e-02]],
              
                      [[ 2.9709e-01,  2.6135e-01, -2.4029e-01, -1.1042e-01,

In [24]:
EncoderClassifierMultiSequenceModel(encoder_hidden_dim=256, encoder_weights_path="./pretrained_models/encoder_fixation_tuning_seq=250_hidden_dim=256.pt")

Loading:  ./pretrained_models/encoder_fixation_tuning_seq=250_hidden_dim=256.pt


KeyError: -1

### Loading Folds

In [24]:
configpath = "./experiment_configs/fix_multiseq_config.yml"
with open(configpath, 'r') as f:
    config = yaml.safe_load(f)

In [25]:
# Rebind the registry to the sequence-to-sequence (fixation) classes.
# instantiate_class_from_config reads name_to_cls at call time, so the helpers
# defined earlier pick up this new mapping without being redefined.
name_to_cls = {"trainer": Trainer, "model": VariableSequenceLengthEncoderDecoderModel, "data": SequenceToSequenceDataModule}
# Replaces the trainer/model/data objects created from the previous config.
trainer, model, data = instantiate_lightningmodules(config)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


CNN Layers: [16, 32]
16 2
32 16




In [26]:
data.load_setup_path = "setup_ROTE_X.yml"
data.setup()

In [27]:
data.train_dataset

<torch.utils.data.dataset.Subset at 0x178bafac0>

In [28]:
data.load_folds("folds_rote_x.yml")

In [31]:
num_folds = len(data.splits)

In [34]:
# Swap the trainer's fit loop for a KFoldLoop over `num_folds`, handing it the
# original loop through connect() (presumably so it can run that loop once per
# fold; TODO confirm against KFoldLoop).
# NOTE: not idempotent. Re-running this cell captures the already-installed
# KFoldLoop as `default_fit_loop` and wraps it in a second KFoldLoop; rebuild
# the trainer before running it again.
default_fit_loop = trainer.fit_loop
trainer.fit_loop = KFoldLoop(num_folds)
trainer.fit_loop.connect(default_fit_loop)

In [35]:
trainer

<pytorch_lightning.trainer.trainer.Trainer at 0x1790592b0>

In [36]:
trainer.fit(model, datamodule=data)


  | Name            | Type             | Params
-----------------------------------------------------
0 | encoder         | Sequential       | 163 K 
1 | decoder         | RNNDecoder       | 198 K 
2 | criterion       | CrossEntropyLoss | 0     
3 | auroc_metric    | AUROC            | 0     
4 | accuracy_metric | Accuracy         | 0     
-----------------------------------------------------
362 K     Trainable params
0         Non-trainable params
362 K     Total params
1.448     Total estimated model params size (MB)


AttributeError: 'SequenceToSequenceDataModule' object has no attribute 'label_col'

### Test Metrics

In [None]:
data.setup()

In [None]:
res = trainer.test_loop.run()

In [None]:
res

In [None]:
label_df = pd.read_csv("./data/EML1_pageLevel_with_filename.csv", index_col=0)

In [None]:
# Inspect which processed files carry a "Rote_X" label and what that label is.
from eyemind.dataloading.load_dataset import get_filenames_for_dataset, label_files
# NOTE: this rebinds `label_df` (the previous cell loaded it from
# EML1_pageLevel_with_filename.csv). The *_seq.csv file read here is only
# written by a later cell (new_df.to_csv), so on a clean checkout it must
# already exist on disk for this cell to run.
label_df = pd.read_csv("./data/EML1_pageLevel_with_filename_seq.csv")
files1 = sorted(get_filenames_for_dataset("./data/processed/output", label_df, "Rote_X"))
labels = label_files(label_df, "Rote_X", files1)
print(list(zip(files1, labels)))
# Rows whose "filename" equals a selected file name with everything from the
# first "." onwards removed.
label_df[["Rote_X","filename"]].loc[label_df["filename"].isin([file.split(".")[0] for file in files1])]


In [None]:
# NOTE(review): `add_sequence_col` is neither defined nor imported in any cell
# visible in this notebook, so on a fresh kernel this line raises NameError.
# Locate its module and import it before running. The later cells filter on a
# "sequence_length" column of `new_df`, presumably added by this call.
new_df = add_sequence_col(label_df, "./data/processed/output")

In [None]:
new_df.to_csv("./data/EML1_pageLevel_with_filename_seq.csv")

In [None]:
new_df.loc[(~new_df["Rote_X"].isna()) & (new_df["sequence_length"] >500)]

In [None]:
rote_x_label_df = new_df.loc[(~new_df["Rote_X"].isna()) & (new_df["sequence_length"] >500)]

In [None]:
rote_x_label_df['Rote_X'].value_counts()

In [None]:
312/894


In [None]:
import torch


In [None]:
a = torch.arange(40).reshape(4,5,2)


In [None]:
a

In [None]:
X = a.split(2, dim=1)[:-1]
print(X)

In [None]:
torch.stack(X,1)[0].shape

In [None]:
from eyemind.obf.model import creator
import torch

In [None]:
# Per the original author's note, the ../obf directory must be on sys.path for
# torch.load to succeed on the OBF encoder checkpoint loaded in the next cells
# (presumably the checkpoint pickles whole objects whose classes are resolved
# by module path; TODO confirm).
# The path is resolved relative to the current working directory, so this only
# works when the notebook is run from its expected location.
import sys
from pathlib import Path
sys.path.append(str(Path("../obf").resolve()))

In [None]:
enc = creator.load_encoder("./OBF/pre_weights/sample_weights/encoder_1633040995_gru.pt")

In [None]:
enc

In [None]:
torch.save(enc.state_dict(), "./pretrained_models/obf_encoder_conv_gru.pt")

In [None]:
from eyemind.models.classifier import create_encoder

encoder = create_encoder()

In [None]:
encoder.load_state_dict(torch.load("./pretrained_models/obf_encoder_conv_gru.pt"))

In [None]:
encoder.parameters()

In [None]:
# Verify that the encoder rebuilt from the saved state_dict matches the
# original OBF encoder parameter-for-parameter.
# zip() stops at the shorter iterable, so the parameter counts are compared
# first; otherwise a model with missing (or extra) parameters would pass.
reloaded_params = list(encoder.parameters())
reference_params = list(enc.parameters())
assert len(reloaded_params) == len(reference_params), (
    f"parameter count differs: {len(reloaded_params)} vs {len(reference_params)}"
)
for p_1, p_2 in zip(reloaded_params, reference_params):
    assert torch.equal(p_1, p_2)