In [34]:
import torch
from torch import nn
from torch.utils.data import Dataset
from pathlib import Path
from tqdm import tqdm
import numpy as np
import accuracy

from torcheval.metrics.functional import multiclass_f1_score

import warnings
warnings.filterwarnings("error")

In [35]:
device=torch.device("cuda")

In [37]:
class MyModel(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.lstm1 = nn.LSTM(1024, 512)
        self.lstm2 = nn.LSTM(512, 128)
        self.linear = nn.Linear(128, 2)
        self.act1 = nn.ReLU()

    def forward(self, x):
        l1, h1 = self.lstm1(x)
        z0 = self.act1(l1)
        l2, h2 = self.lstm2(z0)
        z0 = self.act1(l2)
        out = self.linear(z0)
        return out
    

model = MyModel()
model.load_state_dict(torch.load("model_0.945.ckpt")['state_dict'])
model.to(device)

MyModel(
  (lstm1): LSTM(1024, 512)
  (lstm2): LSTM(512, 128)
  (linear): Linear(in_features=128, out_features=2, bias=True)
  (act1): ReLU()
)

In [4]:
class CustomDataset(Dataset):
    def __init__(self, filelist, max_pad=30) -> None:
        super().__init__()
        self.filelist = filelist
        self.max_pad = max_pad
    
    def __len__(self):
        return len(self.filelist)

    def __getitem__(self, index):
        emb, labs = self._get_random_data()
        return emb, labs
    
    def _get_random_data(self):
        nfiles=torch.randint(1, 30, (1,)).item()
        embs=[]
        labs=[]
        for file in range(nfiles):
            randid = torch.randint(0, len(self.filelist), (1,)).item()
            d=torch.load(self.filelist[randid])
            embs.append(d['embeddings'].cpu())
            lab = [0]*d['embeddings'].shape[0]
            lab[-1]=1
            labs.extend(lab)
        # for minusfile in range(self.max_pad - nfiles):
        #     embs.append(torch.zeros_like(d['embeddings'].cpu()).cpu())
        embs = torch.cat(embs)
        labs = torch.tensor(labs)
        return embs, labs

In [5]:
filelist = list(Path("preprocessed_embeddings/en").glob("**/*.pt")); print(len(filelist))
dataset = CustomDataset(filelist=filelist)
d = dataset[0]
out = model(d[0].unsqueeze(0).to(device))
d[0].shape, d[1].shape, out.shape

151955


(torch.Size([273, 1024]), torch.Size([273]), torch.Size([1, 273, 2]))

In [7]:
loss_fn = nn.CrossEntropyLoss()
optimizer = optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [8]:
class Accuracies(object):
    def __init__(self):
        self.thresholds = np.arange(0, 1, 0.05)
        self.accuracies = {k: accuracy.Accuracy() for k in self.thresholds}

    def update(self, output_np, targets_np):
        current_idx = 0
        for k, t in enumerate(targets_np):
            document_sentence_count = len(t)
            to_idx = int(current_idx + document_sentence_count)

            for threshold in self.thresholds:
                output = ((output_np[current_idx: to_idx, :])[:, 1] > threshold)
                h = np.append(output, [1])
                tt = np.append(t, [1])

                self.accuracies[threshold].update(h, tt)

            current_idx = to_idx

    def calc_accuracy(self):
        min_pk = np.inf
        min_threshold = None
        min_epoch_windiff = None
        for threshold in self.thresholds:
            epoch_pk, epoch_windiff = self.accuracies[threshold].calc_accuracy()
            if epoch_pk < min_pk:
                min_pk = epoch_pk
                min_threshold = threshold
                min_epoch_windiff = epoch_windiff

        return min_pk, min_epoch_windiff, min_threshold

In [8]:
dl = torch.utils.data.DataLoader(dataset, batch_size=16, num_workers=4)
d = next(iter(dl))
d[0].shape, d[1].shape

RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/operation/miniconda/envs/dev/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/operation/miniconda/envs/dev/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
    return self.collate_fn(data)
  File "/home/operation/miniconda/envs/dev/lib/python3.10/site-packages/torch/utils/data/_utils/collate.py", line 265, in default_collate
    return collate(batch, collate_fn_map=default_collate_fn_map)
  File "/home/operation/miniconda/envs/dev/lib/python3.10/site-packages/torch/utils/data/_utils/collate.py", line 142, in collate
    return [collate(samples, collate_fn_map=collate_fn_map) for samples in transposed]  # Backwards compatibility.
  File "/home/operation/miniconda/envs/dev/lib/python3.10/site-packages/torch/utils/data/_utils/collate.py", line 142, in <listcomp>
    return [collate(samples, collate_fn_map=collate_fn_map) for samples in transposed]  # Backwards compatibility.
  File "/home/operation/miniconda/envs/dev/lib/python3.10/site-packages/torch/utils/data/_utils/collate.py", line 119, in collate
    return collate_fn_map[elem_type](batch, collate_fn_map=collate_fn_map)
  File "/home/operation/miniconda/envs/dev/lib/python3.10/site-packages/torch/utils/data/_utils/collate.py", line 162, in collate_tensor_fn
    return torch.stack(batch, 0, out=out)
RuntimeError: stack expects each tensor to be equal size, but got [284, 1024] at entry 0 and [382, 1024] at entry 1


In [38]:
import segeval

In [65]:
acc = Accuracies()
acc.update(torch.randint(0, 10, (1, 10)).numpy(), torch.randint(0, 10, (1, 10)).numpy())

SegmentationMetricError: Reference and hypothesis segmentations differ in position length (11 is not 2).

In [39]:
epochs=3
running_loss=0.0
score=0
model.to(device)
# loss_fn.to("cpu")

val_files = Path(f"/home/operation/projects/notebooks/validation/").glob("*.pt")
for epoch in range(epochs):
    prog = tqdm(enumerate(dataset), total=len(dataset))
    avg_f1 = []
    losses = []
    for idx, d in prog:
        try:
            emb, lab = d
            emb = emb.unsqueeze(0).to(device)
            lab = lab.to(device)
            optimizer.zero_grad()

            outputs = nn.functional.softmax(model(emb), -1)

            loss = loss_fn(outputs.squeeze(0).cpu(), lab.cpu())
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            losses.append(loss.item())
            f1 = multiclass_f1_score(outputs.squeeze(0).argmax(-1), lab, num_classes=2, average="macro")
            avg_f1.append(f1.item())

            model.to(device)
            if idx%1000==0:
                for vfile in val_files:
                    d = torch.load(vfile)
                    embb = d['embeddings'].to(device)
                    labs = d['labels'].to(device)
                    with torch.no_grad():
                        outs = nn.functional.softmax(model(embb.unsqueeze(0)), -1)
                    score = multiclass_f1_score(outs.squeeze(0).argmax(-1), labs, num_classes=2, average="macro")
            if idx%1000==0:
                torch.save({"state_dict": model.state_dict(), "optimizer": optimizer.state_dict()}, f"./model_{epoch}.ckpt")
            prog.set_postfix({"idx": idx, "loss": torch.mean(torch.tensor(losses[-100:])).item(), "f1": torch.mean(torch.tensor(avg_f1[-100:])).item(), "val_score": score.item()})
            break
        except Exception as e:
            print(e)
            break

  0%|          | 0/151955 [00:00<?, ?it/s]

  0%|          | 0/151955 [00:00<?, ?it/s, idx=0, loss=0.338, f1=0.894, val_score=0.857]
  0%|          | 0/151955 [00:00<?, ?it/s, idx=0, loss=0.313, f1=1, val_score=0.857]
  0%|          | 0/151955 [00:00<?, ?it/s, idx=0, loss=0.34, f1=0.893, val_score=0.857]


In [67]:
labs.shape, outs.argmax(-1)

torch.Size([9932])

In [43]:
print(labs[:100], outs.argmax(-1)[:100])

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0], device='cuda:0') tensor([[0, 0, 0,  ..., 1, 0, 1]], device='cuda:0')


In [46]:
.shape

torch.Size([9932])

tensor([  12,   25,   26,   48,   68,   79,  102,  106,  119,  138,  147,  160,
         163,  175,  197,  209,  230,  269,  272,  275,  295,  311,  317,  332,
         338,  362,  373,  379,  380,  381,  387,  389,  390,  419,  430,  443,
         451,  459,  460,  477,  485,  486,  488,  503,  519,  536,  558,  560,
         582,  595,  607,  627,  652,  673,  692,  707,  723,  724,  735,  739,
         756,  771,  783,  806,  822,  837,  841,  848,  861,  878,  895,  897,
         900,  913,  935,  938,  960,  978, 1003, 1006, 1027, 1051, 1052, 1054,
        1078, 1086, 1095, 1101, 1111, 1117, 1137, 1159, 1178, 1185, 1187, 1189,
        1191, 1200, 1204, 1206, 1207, 1215, 1219, 1226, 1227, 1237, 1256, 1280,
        1290, 1293, 1308, 1320, 1338, 1359, 1375, 1394, 1410, 1424, 1431, 1446,
        1458, 1471, 1494, 1497, 1515, 1535, 1564, 1584, 1610, 1631, 1648, 1651,
        1652, 1667, 1686, 1690, 1700, 1710, 1732, 1745, 1748, 1752, 1753, 1759,
        1770, 1775, 1782, 1792, 1807, 18

In [60]:
a = [2,4,6,8,10]
b = [2,4,8,10]

segeval.pk(a, b)

SegmentationMetricError: Reference and hypothesis segmentations differ in position length (24 is not 30).

In [12]:
segdata = segeval.HEARST_1997_STARGAZER
segdata

Dataset(dict,
        {'stargazer': {'1': (2, 3, 3, 1, 3, 6, 3),
          '2': (2, 8, 2, 4, 2, 3),
          '3': (2, 1, 2, 3, 1, 3, 1, 3, 2, 2, 1),
          '4': (2, 1, 4, 1, 1, 3, 1, 4, 3, 1),
          '5': (3, 2, 4, 3, 5, 4),
          '6': (2, 3, 4, 2, 2, 5, 3),
          '7': (2, 3, 2, 2, 3, 1, 3, 2, 3)}})

In [13]:
segeval.boundary_similarity(segdata['stargazer']['1'], segdata['stargazer']['2'])

Decimal('0.5')

In [31]:
for vfile in val_files:
    d = torch.load(vfile)
    embb = d['embeddings'].to(device)
    labs = d['labels'].to(device)
    with torch.no_grad():
        outs = nn.functional.softmax(model(embb.unsqueeze(0)), -1)
    score = multiclass_f1_score(outs.squeeze(0).argmax(-1), labs, num_classes=2)
    print([(i,j) for i,j in zip(labs.cpu(), outs.squeeze(0).argmax(-1).cpu())])

In [35]:
print([(i.item(),j.item()) for i,j in zip(labs.cpu(), outs.argmax(-1).squeeze(0).cpu())])

[(0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (1, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (1, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (1, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (1, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 1), (1, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0),

In [42]:
multiclass_f1_score(outs.argmax(-1).squeeze(0).cpu(), labs.cpu(), num_classes=2, average="macro")

tensor(0.4862)

In [23]:
for o,a in zip(nn.functional.softmax(outputs, -1).argmax(-1)[0], lab):
    print(o.item(), a.item())

0 0
1 0
0 0
0 0
0 0
0 0
1 0
0 0
0 0
0 0
0 0
1 0
0 0
0 0
0 0
0 0
0 0
1 0
0 0
0 0
1 0
1 0
0 1
0 0
0 0
1 0
0 0
0 0
0 0
0 0
0 0
0 0
1 0
0 0
0 0
1 0
0 0
0 0
0 0
0 0
0 0
0 0
0 1
1 0
1 0
0 0
0 0
1 0
0 0
1 0
0 0
0 0
0 0
0 0
1 0
1 0
0 0
0 0
0 0
1 0
0 0
0 0
0 0
0 0
0 0
0 1
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
1 0
0 0
1 0
1 0
0 0
0 0
0 0
0 0
0 0
1 0
1 0
0 0
0 1
0 0
1 0
0 0
0 0
0 0
1 0
0 0
0 0
1 0
1 0
0 0
1 0
0 0
1 0
0 0
1 0
0 0
1 0
1 0
0 1
0 0
1 0
1 0
1 0
1 0
1 0
1 0
0 0
1 0
0 0
1 0
1 0
1 1
1 0
1 0
0 1
1 0
0 0
0 0
0 0
0 0
1 0
1 0
1 0
1 0
0 0
1 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 1
0 0
0 0
1 0
0 0
0 0
0 0
1 0
0 0
0 0
0 0
0 0
0 0
0 0
1 0
0 0
0 0
0 0
0 0
1 0
1 0
0 0
1 0
0 0
1 0
0 0
0 0
1 0
1 0
1 0
0 0
1 0
0 0
0 0
0 1
1 0
0 0
0 0
1 0
1 0
0 0
0 0
1 0
0 0
0 0
1 0
1 0
0 0
0 0
1 0
0 0
1 0
1 0
1 0
0 0
1 0
1 0
0 0
1 0
1 0
0 0
0 0
1 0
1 1
1 0
1 0
1 0
1 0
1 0
0 0
1 0
0 1
0 0
1 0
1 0
0 0
1 0
1 0
1 0
1 0
1 0
0 1
1 0
1 0
1 0
0 0
0 0
0 0
1 0
1 0
1 0
1 0
1 0
0 0
1 1
1 0
1 0
1 0
1 0
1 0
0 0
1 0


In [155]:
acc = Accuracies()
acc.update(outputs.detach().cpu().numpy(), lab.unsqueeze(0).detach().cpu().numpy())

SegmentationMetricError: Reference and hypothesis segmentations differ in position length (215 is not 3).

In [150]:
nn.functional.softmax(outputs, 2).argmax(2)

tensor([[0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1,
         1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
         0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1,
         0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
         0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
         0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1,
         1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0,
         0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
         0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1,
         1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
         1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0,
         0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
         0, 0, 1, 0, 0, 0, 0