# flan-t5-base 

peft ia3, top-k titles based upon `'\n\n'.join([ ex['ref1'], ex['ref2'] ])`

In [None]:
def peft_t5_baselines(k):
    from MegaT5 import PeftT5Classifier
    from PersonalizedCitation import train_loader, dev_loader
    from ProgressPrinter import ProgressPrinter
    from peft import IA3Config, TaskType, prepare_model_for_kbit_training
    from transformers import T5ForConditionalGeneration
    import torch

    device = 'cuda'
    torch.set_default_device(device)
    torch.manual_seed(2112)

    train = train_loader(batch_size=4)
    dev = dev_loader(batch_size=4)

    def interleave(a, b):
        from math import inf
        
        atot, btot = a.num_examples, b.num_examples
        aiter, biter = a.__iter__(), b.__iter__()
        aelem, belem = next(aiter), next(biter)
        anum, bnum = 1, 1

        while anum != inf and bnum != inf:
            if anum * btot <= bnum * atot:
                yield (True, aelem)
                try:
                    aelem = next(aiter)
                    anum += 1
                except StopIteration:
                    anum = inf
            else:
                yield (False, belem)
                try:
                    belem = next(biter)
                    bnum += 1
                except StopIteration:
                    bnum = inf

    peft_config = IA3Config(task_type=TaskType.SEQ_2_SEQ_LM)
    t5 = T5ForConditionalGeneration.from_pretrained('google/flan-t5-base')
    fewshot = PeftT5Classifier(train.num_labels, peft_config, t5=t5)

    with ProgressPrinter('iter', f'{k} loss', f'{k} acc', f'{k} acc (dev)') as printer:
        for iteration in range(2):
            for istrain, (examples, labels) in interleave(train, dev):
                with torch.no_grad():
                    inputs = []
                    target = torch.Tensor([ int(label == train.choices[1]) for label in labels ]).long().to(device)
    
                    for ex in examples:
                        embeddings = train.embed([ '\n\n'.join([ ex['ref1'], ex['ref2'] ]) ] + 
                                                 [ v['title'] 
                                                   for v in ex['profile']
                                                   if v['title'] != ex['title'] 
                                                 ])
                        index = torch.topk(embeddings[0,:] @ embeddings[1:,:].T, dim=0, k=k).indices.to('cpu')
                        titles = [ f'"{ex["profile"][ind]["title"]}"' for ind in index.tolist() ]
                        concat_titles = ' and '.join(titles)
                        input = train.append_to_title(ex, concat_titles)
                        inputs.append(input)
    
                    fewshotacc = (fewshot.predict(inputs).argmax(dim=1) == target).float().mean().item()

                fewloss = fewshot.learn(inputs, target) if istrain else None
                printer.addobs(iteration, fewloss, fewshotacc if istrain else None, fewshotacc if not istrain else None)

            printer.print()
            printer.autoprint = False

 
from Fork import SubProcess
for k in range(0, 5):
    with SubProcess() as process: process.parent or peft_t5_baselines(k)

n                  iter       since      0 loss       since       0 acc       since 0 acc (dev)       since      dt (s)
1                     0           0       0.697       0.697         0.5         0.5           0           0       0.865
2                     0           0       0.775       0.853       0.375        0.25           0           0        1.15
4                     0           0       0.866        1.05       0.417         0.5        0.25        0.25        1.56
8                     0           0       0.829       0.801         0.5       0.562        0.25           0        2.61
16                    0           0       0.784       0.732       0.481       0.458        0.25        0.25        4.64
32                    0           0       0.741       0.697       0.519       0.558         0.5        0.75        8.67
64                    0           0       0.737       0.732        0.51         0.5       0.462       0.429        16.7
128                   0           0     

peft ia3, top-k titles based upon max similarity with ref1 and ref2

In [None]:
def peft_t5_baselines(k):
    from MegaT5 import PeftT5Classifier
    from PersonalizedCitation import train_loader, dev_loader
    from ProgressPrinter import ProgressPrinter
    from peft import IA3Config, TaskType, prepare_model_for_kbit_training
    from transformers import T5ForConditionalGeneration
    import torch

    device = 'cuda'
    torch.set_default_device(device)
    torch.manual_seed(2112)

    train = train_loader(batch_size=2)
    dev = dev_loader(batch_size=2)

    def interleave(a, b):
        from math import inf
        
        atot, btot = a.num_examples, b.num_examples
        aiter, biter = a.__iter__(), b.__iter__()
        aelem, belem = next(aiter), next(biter)
        anum, bnum = 1, 1

        while anum != inf and bnum != inf:
            if anum * btot <= bnum * atot:
                yield (True, aelem)
                try:
                    aelem = next(aiter)
                    anum += 1
                except StopIteration:
                    anum = inf
            else:
                yield (False, belem)
                try:
                    belem = next(biter)
                    bnum += 1
                except StopIteration:
                    bnum = inf

    peft_config = IA3Config(task_type=TaskType.SEQ_2_SEQ_LM)
    t5 = T5ForConditionalGeneration.from_pretrained('google/flan-t5-base')
    fewshot = PeftT5Classifier(train.num_labels, peft_config, t5=t5)

    with ProgressPrinter('iter', f'{k} loss', f'{k} acc', f'{k} acc (dev)') as printer:
        for iteration in range(2):
            for istrain, (examples, labels) in interleave(train, dev):
                with torch.no_grad():
                    inputs = []
                    target = torch.Tensor([ int(label == train.choices[1]) for label in labels ]).long().to(device)
    
                    for ex in examples:
                        embeddings = train.embed( [ ex['ref1'], ex['ref2'] ] + 
                                                  [ v['title'] 
                                                   for v in ex['profile']
                                                   if v['title'] != ex['title'] 
                                                 ])
                        scores = torch.max(embeddings[[0,1],:] @ embeddings[2:,:].T, dim=0).values
                        index = torch.topk(scores, dim=0, k=k).indices.to('cpu')
                        titles = [ f'"{ex["profile"][ind]["title"]}"' for ind in index.tolist() ]
                        concat_titles = ' and '.join(titles)
                        input = train.append_to_title(ex, concat_titles)
                        inputs.append(input)
    
                    fewshotacc = (fewshot.predict(inputs).argmax(dim=1) == target).float().mean().item()

                fewloss = fewshot.learn(inputs, target) if istrain else None
                printer.addobs(iteration, fewloss, fewshotacc if istrain else None, fewshotacc if not istrain else None)

            printer.print()
            printer.autoprint = False

 
from Fork import SubProcess
for k in range(0, 5):
    with SubProcess() as process: process.parent or peft_t5_baselines(k)

n                  iter       since      0 loss       since       0 acc       since 0 acc (dev)       since      dt (s)
1                     0           0       0.674       0.674           1           1           0           0       0.797
2                     0           0        1.44        2.21         0.5           0           0           0        1.02
4                     0           0        1.23       0.799       0.333           0           0           0        1.34
8                     0           0        0.92        0.69         0.5       0.625           0           0        2.19
16                    0           0       0.881       0.835       0.423       0.333         0.5        0.75        3.68
32                    0           0         0.8       0.718         0.5       0.577       0.583       0.667        6.93
64                    0           0       0.747       0.691       0.539        0.58       0.731       0.857        13.1
128                   0           0     

# flan-t5-xl (8bit)

peft ia3, top-k titles based upon `'\n\n'.join([ ex['ref1'], ex['ref2'] ])`

better than flan-t5-base.  unfortunately flan-t5-xxl doesn't fit on a T4 in 8bit, and 4bit doesn't seem to work.

In [2]:
def peft_t5_baselines(k):
    from MegaT5 import PeftT5Classifier
    from PersonalizedCitation import train_loader, dev_loader
    from ProgressPrinter import ProgressPrinter
    from peft import IA3Config, TaskType, prepare_model_for_kbit_training
    from transformers import T5ForConditionalGeneration
    import torch
    import warnings

    device = 'cuda'
    torch.set_default_device(device)
    torch.manual_seed(2112)

    train = train_loader(batch_size=4)
    dev = dev_loader(batch_size=4)

    def interleave(a, b):
        from math import inf
        
        atot, btot = a.num_examples, b.num_examples
        aiter, biter = a.__iter__(), b.__iter__()
        aelem, belem = next(aiter), next(biter)
        anum, bnum = 1, 1

        while anum != inf and bnum != inf:
            if anum * btot <= bnum * atot:
                yield (True, aelem)
                try:
                    aelem = next(aiter)
                    anum += 1
                except StopIteration:
                    anum = inf
            else:
                yield (False, belem)
                try:
                    belem = next(biter)
                    bnum += 1
                except StopIteration:
                    bnum = inf

    peft_config = IA3Config(task_type=TaskType.SEQ_2_SEQ_LM)
    t5 = prepare_model_for_kbit_training(T5ForConditionalGeneration.from_pretrained('google/flan-t5-xl', load_in_8bit=True))
    fewshot = PeftT5Classifier(train.num_labels, peft_config, t5=t5)

    with ProgressPrinter('iter', f'{k} loss', f'{k} acc', f'{k} acc (dev)') as printer, warnings.catch_warnings():
        warnings.filterwarnings("ignore", message=".*MatMul8bitLt.*")
        
        for iteration in range(2):
            for istrain, (examples, labels) in interleave(train, dev):
                with torch.no_grad():
                    inputs = []
                    target = torch.Tensor([ int(label == train.choices[1]) for label in labels ]).long().to(device)
    
                    for ex in examples:
                        embeddings = train.embed([ '\n\n'.join([ ex['ref1'], ex['ref2'] ]) ] + 
                                                 [ v['title'] 
                                                   for v in ex['profile']
                                                   if v['title'] != ex['title'] 
                                                 ])
                        index = torch.topk(embeddings[0,:] @ embeddings[1:,:].T, dim=0, k=k).indices.to('cpu')
                        titles = [ f'"{ex["profile"][ind]["title"]}"' for ind in index.tolist() ]
                        concat_titles = ' and '.join(titles)
                        input = train.append_to_title(ex, concat_titles)
                        inputs.append(input)
    
                    fewshotacc = (fewshot.predict(inputs).argmax(dim=1) == target).float().mean().item()

                fewloss = fewshot.learn(inputs, target) if istrain else None
                printer.addobs(iteration, fewloss, fewshotacc if istrain else None, fewshotacc if not istrain else None)

            printer.print()
            printer.autoprint = False

 
from Fork import SubProcess
for k in [0,4]:
    with SubProcess() as process: process.parent or peft_t5_baselines(k)

n                  iter       since      0 loss       since       0 acc       since 0 acc (dev)       since      dt (s)
1                     0           0       0.671       0.671         0.5         0.5           0           0        2.58
2                     0           0       0.642       0.613       0.625        0.75           0           0         4.4
4                     0           0        1.02        1.79       0.583         0.5        0.75        0.75        6.62
8                     0           0        1.01           1       0.429       0.312        0.75           0        13.3
16                    0           0       0.903       0.776       0.442       0.458       0.417        0.25        24.7
32                    0           0       0.803       0.702        0.51       0.577       0.583        0.75        48.6
64                    0           0       0.774       0.744         0.5        0.49         0.5       0.429        95.8
128                   0           0     

<class 'torch.cuda.OutOfMemoryError'>
CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 14.76 GiB total capacity; 12.26 GiB already allocated; 8.94 MiB free; 13.98 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
  File "/tmp/ipykernel_3916943/3252240670.py", line 77, in <module>
    with SubProcess() as process: process.parent or peft_t5_baselines(k)
  File "/tmp/ipykernel_3916943/3252240670.py", line 68, in peft_t5_baselines
    fewloss = fewshot.learn(inputs, target) if istrain else None
  File "/home/pmineiro/lampstuff/personalized_citation/MegaT5.py", line 46, in learn
    output = self(x)
  File "/home/pmineiro/miniconda3/envs/lampstuff/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/pmineiro/lampstuff/personalized_citation/MegaT5.py", li

In [3]:
def peft_t5_baselines(k):
    from MegaT5 import PeftT5Classifier
    from PersonalizedCitation import train_loader, dev_loader
    from ProgressPrinter import ProgressPrinter
    from peft import IA3Config, TaskType, prepare_model_for_kbit_training
    from transformers import T5ForConditionalGeneration
    import torch
    import warnings

    device = 'cuda'
    torch.set_default_device(device)
    torch.manual_seed(2112)

    train = train_loader(batch_size=2)
    dev = dev_loader(batch_size=2)

    def interleave(a, b):
        from math import inf
        
        atot, btot = a.num_examples, b.num_examples
        aiter, biter = a.__iter__(), b.__iter__()
        aelem, belem = next(aiter), next(biter)
        anum, bnum = 1, 1

        while anum != inf and bnum != inf:
            if anum * btot <= bnum * atot:
                yield (True, aelem)
                try:
                    aelem = next(aiter)
                    anum += 1
                except StopIteration:
                    anum = inf
            else:
                yield (False, belem)
                try:
                    belem = next(biter)
                    bnum += 1
                except StopIteration:
                    bnum = inf

    peft_config = IA3Config(task_type=TaskType.SEQ_2_SEQ_LM)
    t5 = prepare_model_for_kbit_training(T5ForConditionalGeneration.from_pretrained('google/flan-t5-xl', load_in_8bit=True))
    fewshot = PeftT5Classifier(train.num_labels, peft_config, t5=t5)

    with ProgressPrinter('iter', f'{k} loss', f'{k} acc', f'{k} acc (dev)') as printer, warnings.catch_warnings():
        warnings.filterwarnings("ignore", message=".*MatMul8bitLt.*")
        
        for iteration in range(2):
            for istrain, (examples, labels) in interleave(train, dev):
                with torch.no_grad():
                    inputs = []
                    target = torch.Tensor([ int(label == train.choices[1]) for label in labels ]).long().to(device)
    
                    for ex in examples:
                        embeddings = train.embed([ '\n\n'.join([ ex['ref1'], ex['ref2'] ]) ] + 
                                                 [ v['title'] 
                                                   for v in ex['profile']
                                                   if v['title'] != ex['title'] 
                                                 ])
                        index = torch.topk(embeddings[0,:] @ embeddings[1:,:].T, dim=0, k=k).indices.to('cpu')
                        titles = [ f'"{ex["profile"][ind]["title"]}"' for ind in index.tolist() ]
                        concat_titles = ' and '.join(titles)
                        input = train.append_to_title(ex, concat_titles)
                        inputs.append(input)
    
                    fewshotacc = (fewshot.predict(inputs).argmax(dim=1) == target).float().mean().item()

                fewloss = fewshot.learn(inputs, target) if istrain else None
                printer.addobs(iteration, fewloss, fewshotacc if istrain else None, fewshotacc if not istrain else None)

            printer.print()
            printer.autoprint = False

 
from Fork import SubProcess
for k in [4]:
    with SubProcess() as process: process.parent or peft_t5_baselines(k)

n                  iter       since      4 loss       since       4 acc       since 4 acc (dev)       since      dt (s)
1                     0           0       0.657       0.657         0.5         0.5           0           0        2.14
2                     0           0        2.39        4.13        0.25           0           0           0        3.68
4                     0           0         1.9       0.906       0.167           0           0           0        5.82
8                     0           0        1.23       0.735       0.357         0.5           0           0        11.9
16                    0           0        1.09        0.92       0.346       0.333         0.5        0.75        22.1
32                    0           0       0.907       0.726       0.423         0.5       0.583       0.667        44.1
64                    0           0       0.808       0.704       0.539        0.66       0.731       0.857        86.9
128                   0           0     

KeyboardInterrupt: 

10027             0.393           1       0.517       0.486       0.742        0.76       0.721       0.728    1.36e+04


<class 'KeyboardInterrupt'>

  File "/tmp/ipykernel_3916943/3896096721.py", line 77, in <module>
    with SubProcess() as process: process.parent or peft_t5_baselines(k)
  File "/tmp/ipykernel_3916943/3896096721.py", line 68, in peft_t5_baselines
    fewloss = fewshot.learn(inputs, target) if istrain else None
  File "/home/pmineiro/lampstuff/personalized_citation/MegaT5.py", line 48, in learn
    loss.backward()
  File "/home/pmineiro/miniconda3/envs/lampstuff/lib/python3.10/site-packages/torch/_tensor.py", line 478, in backward
    return handle_torch_function(
  File "/home/pmineiro/miniconda3/envs/lampstuff/lib/python3.10/site-packages/torch/overrides.py", line 1534, in handle_torch_function
    result = mode.__torch_function__(public_api, types, args, kwargs)
  File "/home/pmineiro/miniconda3/envs/lampstuff/lib/python3.10/site-packages/torch/utils/_device.py", line 62, in __torch_function__
    return func(*args, **kwargs)
  File "/home/pmineiro/miniconda3/envs/lampstuff/lib/pytho