In [2]:
# ! pip install -U accelerate
# ! pip install -U transformers

# import os
# os._exit(00)

In [3]:
# # Connect to Google Drive and upload a folder
# from google.colab import drive
# drive.mount('/content/drive')

In [1]:
import pandas as pd
# import numpy as np
import torch
import re # Regular expression
import os
import wandb
from tqdm import tqdm

from torch.utils.data import Dataset
from typing import List, Dict, Union
from transformers import pipeline
from transformers import Trainer, TrainingArguments, AutoTokenizer
# from transformers import DataCollatorForTokenClassification
# from transformers import AutoModelForTokenClassification

# from transformers import DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration

# import gc

  from .autonotebook import tqdm as notebook_tqdm


## Downloading Data

In [5]:
# !git clone https://github.com/s-nlp/semantic-role-labelling.git

In [6]:
path = '/home/anastasiia.demidova/srl/srl_transformers/dataset/train.tsv'
path_dev = '/home/anastasiia.demidova/srl/srl_transformers/dataset/dev.tsv'
# path = '/content/drive/MyDrive/Colab Notebooks/NLP_project/train.tsv'
# path_dev = '/content/drive/MyDrive/Colab Notebooks/NLP_project/dev.tsv'

In [7]:
df = pd.read_csv(path, sep='\t', header= None, names=['data', 'label'],
                 quoting=3, skip_blank_lines=False).fillna('_nan')

df_dev = pd.read_csv(path_dev, sep='\t', header= None, names=['data', 'label'],
                 quoting=3, skip_blank_lines=False).fillna('_nan')

In [8]:
df.head(22)

Unnamed: 0,data,label
0,also,O
1,",",O
2,i,O
3,have,O
4,recently,O
5,discovered,O
6,advil,B-Object
7,liquigels,O
8,work,O
9,much,O


In [9]:
df_dev.head()

Unnamed: 0,data,label
0,meanwhile,O
1,",",O
2,though,O
3,windows,B-Object
4,8,I-Object


In [10]:
df.shape, df_dev.shape

((63408, 2), (8646, 2))

In [11]:
df.data[592]

'"'

## Preprocessing

    - Separating data into sentences with empty lines (NaN).
    - Clean punctuation into single dot.


#### T5

[two, four | money]

In [12]:
'''
[ search results | aspect ] [ bing , google | object ] [ not , superior | predicate ]
'''
# [ search results | aspect ] [ bing | object ] [ not | predicate ] [ superior | predicate ] [ google | object ] not quite well

BIO = ['aspect', 'object', 'predicate']

[two, four | OBJ]

In [13]:
# '''
# ### Instruction: Find all aspects , objects and predicates .

# ### Input: in the content of search results , bing is not consistently superior to google .

# ### Response: [ search results | ASP ] [ bing , google | OBJ ] [ not , superior | PRE ]
# '''

# BIO = ['ASP', 'OBJ', 'PRE']

[two, four | 1]

In [14]:
# '''
# ### Instruction: Find all aspects , objects and predicates .

# ### Input: in the content of search results , bing is not consistently superior to google .

# ### Response: [ search results | 0 ] [ bing , google | 1 ] [ not , superior | 2 ]
# '''

# BIO = ['0', '1', '2']

In [15]:
# '''
# ### Instruction: Find all aspects , objects and predicates .

# ### Input: _input_

# ### Response:
# '''

# pattern = ["###", "Instruction:", "Find", "all", "aspects", ",", "objects", "and", "predicates", ".\n\n###", "Input:"]
# end_pattern = ["\n\n###", "Response:"]


In [16]:
'''
Sentence: in the content of search results , bing is not consistently superior to google . 

aspect , object and predicate :
'''

PATTERN = ["Sentence:"]
END_PATTERN = ["\n\naspect", ",", "object", "and", "predicate", ":"]

In [17]:
# Separating data into sentences with empty lines (NaN)

def separate_text(df):
    a_pattern = ['|', BIO[0],']']
    o_pattern = ['|', BIO[1],']']
    p_pattern = ['|', BIO[2],']']
    sep = ','

    input = [] # for input
    output = []
    sentence = []
    prev_tag = ''
    temp_a = False
    temp_o = False
    temp_p = False
    a = [] # aspects
    o = [] # objects
    p = [] # predicates

    for word, tag in df.values:
        if word == '_nan':
            input.append(PATTERN + sentence + END_PATTERN)
            if len(a) != 0 and a[-1] == sep: del a[-1]
            if len(o) != 0 and o[-1] == sep: del o[-1]
            if len(p) != 0 and p[-1] == sep: del p[-1]
            output.append(['['] + a + a_pattern + ['['] + o + o_pattern + ['['] + p + p_pattern)
            sentence = []
            a = []
            o = []
            p = []
            temp_a = False
            temp_o = False
            temp_p = False
            prev_tag = ''
        else:
            tag = tag.lower()
            word = re.sub(r"[\"\—\#\$\%\&\'\(\)\*\+\,\–\-\/\:\;\<\=\>\?\@\[\\\]\^\?\!\_\`\{\|\}\~\«\»ѣ\№]", ",", word)
            word = re.sub(r"[,]+", ",", word)
            word = re.sub(r"[.]+", ".", word)

            # If prev tag was the last one in a tag set
            if prev_tag.split('-')[-1] != tag.split('-')[-1] or 'B-' in tag:
                if temp_a: #tag.split('-')[-1] == 'aspect':
                        a.append(sep)
                        temp_a = False
                if temp_o: #tag.split('-')[-1] == 'object':
                        o.append(sep)
                        temp_o = False
                if temp_p: #tag.split('-')[-1] == 'predicate':
                        p.append(sep)
                        temp_p = False

            if 'O' not in tag:
                if tag.split('-')[-1] == 'aspect':
                        a.append(word)
                        temp_a = True
                if tag.split('-')[-1] == 'object':
                        o.append(word)
                        temp_o = True
                if tag.split('-')[-1] == 'predicate':
                        p.append(word)
                        temp_p = True

            prev_tag = tag
            sentence.append(word)

    return input, output

In [18]:
input, output = separate_text(df.iloc[:22])

print(input)
print(output)

[['Sentence:', 'also', ',', 'i', 'have', 'recently', 'discovered', 'advil', 'liquigels', 'work', 'much', 'better', 'and', 'faster', 'for', 'a', 'headache', 'than', 'regular', 'ibuprofen', '.', '\n\naspect', ',', 'object', 'and', 'predicate', ':']]
[['[', 'headache', '|', 'aspect', ']', '[', 'advil', ',', 'ibuprofen', '|', 'object', ']', '[', 'better', ',', 'faster', '|', 'predicate', ']']]


In [19]:
# Appling cleaning to df
input, output = separate_text(df)
input_dev, output_dev = separate_text(df_dev)

In [20]:
print(' '.join(input[-1]))
print(' '.join(output[-1]))

Sentence: in the content of search results , bing is not consistently superior to google . 

aspect , object and predicate :
[ search results | aspect ] [ bing , google | object ] [ not , superior | predicate ]


## Training model

In [21]:
### T5 # 6ep_8b

MODEL_NAME = 't5-large'
IS_ENCODER_DECODER = True

# tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME).to('cuda')

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [22]:
# ### FLAN-T5 # 4ep_8b

# MODEL_NAME = 'flan-t5'
# MODEL_HF_NAME = 'google/flan-t5-large'
# IS_ENCODER_DECODER = True

# # tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
# tokenizer = AutoTokenizer.from_pretrained(MODEL_HF_NAME)

# model = T5ForConditionalGeneration.from_pretrained(MODEL_HF_NAME).to('cuda')

# tokenizer.eos_token = '</s>'

In [23]:
class PairsDataset(Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __getitem__(self, idx):
        assert idx <= len(self.x['input_ids']), (idx, len(self.x['input_ids']))
        item = {key: val[idx] for key, val in self.x.items()}

        item['labels'] = self.y['input_ids'][idx]
        if IS_ENCODER_DECODER: item['decoder_attention_mask'] = self.y['attention_mask'][idx]

        return item

    @property
    def n(self):
        return len(self.x['input_ids'])

    def __len__(self):
        return self.n

In [24]:
class DataCollatorWithPadding:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        batch = self.tokenizer.pad(
            features,
            padding=True
        )

        if IS_ENCODER_DECODER:
            ybatch = self.tokenizer.pad(
                {'input_ids': batch['labels'], 'attention_mask': batch['decoder_attention_mask']},
                padding=True
            )
        else:
            ybatch = self.tokenizer.pad(
            {'input_ids': batch['labels']},
            padding=True
            )

        batch['labels'] = ybatch['input_ids']

        if IS_ENCODER_DECODER: batch['decoder_attention_mask'] = ybatch['attention_mask']


        return {k: torch.tensor(v) for k, v in batch.items()}

In [25]:
max = 0
for o in input:
    if max < len(o):
        max = len(o)
max

108

In [26]:
### T5
MAX_LENGTH = 128

train_dataset = PairsDataset(tokenizer(input, padding='max_length', max_length=MAX_LENGTH, is_split_into_words=True),
                             tokenizer(output, padding='max_length', max_length=MAX_LENGTH, is_split_into_words=True))
dev_dataset = PairsDataset(tokenizer(input_dev, padding='max_length', max_length=MAX_LENGTH, is_split_into_words=True),
                           tokenizer(output_dev, padding='max_length', max_length=MAX_LENGTH, is_split_into_words=True))

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

In [27]:
temp_f = train_dataset[0]

print(len(temp_f['attention_mask']), len(temp_f['decoder_attention_mask']))

z=0
for i, d, j, k, c, l in zip(temp_f['attention_mask'],
                            temp_f['decoder_attention_mask'],
                            temp_f['labels'],
                            temp_f['input_ids'],
                            tokenizer.convert_ids_to_tokens(temp_f['input_ids']),
                            tokenizer.convert_ids_to_tokens(temp_f['labels'])):

    z+=1
    if z == 40:
        break
    print(i, d, '\t', j, '\t', k, '\t', c, '\t', l)

128 128
1 1 	 784 	 4892 	 ▁Sen 	 ▁[
1 1 	 12085 	 17 	 t 	 ▁headache
1 1 	 1820 	 1433 	 ence 	 ▁|
1 1 	 2663 	 10 	 : 	 ▁aspect
1 1 	 3 	 92 	 ▁also 	 ▁
1 1 	 908 	 3 	 ▁ 	 ]
1 1 	 784 	 6 	 , 	 ▁[
1 1 	 3 	 3 	 ▁ 	 ▁
1 1 	 9 	 23 	 i 	 a
1 1 	 26 	 43 	 ▁have 	 d
1 1 	 6372 	 1310 	 ▁recently 	 vil
1 1 	 3 	 3883 	 ▁discovered 	 ▁
1 1 	 6 	 3 	 ▁ 	 ,
1 1 	 3 	 9 	 a 	 ▁
1 1 	 23 	 26 	 d 	 i
1 1 	 3007 	 6372 	 vil 	 bu
1 1 	 1409 	 3 	 ▁ 	 pro
1 1 	 89 	 40 	 l 	 f
1 1 	 35 	 23 	 i 	 en
1 1 	 1820 	 1169 	 qui 	 ▁|
1 1 	 3735 	 1803 	 gel 	 ▁object
1 1 	 3 	 7 	 s 	 ▁
1 1 	 908 	 161 	 ▁work 	 ]
1 1 	 784 	 231 	 ▁much 	 ▁[
1 1 	 394 	 394 	 ▁better 	 ▁better
1 1 	 3 	 11 	 ▁and 	 ▁
1 1 	 6 	 3627 	 ▁faster 	 ,
1 1 	 3627 	 21 	 ▁for 	 ▁faster
1 1 	 1820 	 3 	 ▁ 	 ▁|
1 1 	 554 	 9 	 a 	 ▁pre
1 1 	 4370 	 12085 	 ▁headache 	 dic
1 1 	 342 	 145 	 ▁than 	 ate
1 1 	 3 	 1646 	 ▁regular 	 ▁
1 1 	 908 	 3 	 ▁ 	 ]
1 1 	 1 	 23 	 i 	 </s>
1 0 	 0 	 3007 	 bu 	 <pad>
1 0 	 0 	 1409 	 pro 

In [28]:
os.environ["WANDB_PROJECT"] = "<nlp-project>" # name your W&B project

In [29]:
# run = wandb.init(
#     project="nlp-project",
#     notes="", tags=["t5", "project"], resume=True
# )
# os.environ["WANDB_RESUME"] = "allow"

In [30]:
N_EPOCHS = 6
BATCH_SIZE = 8
run_name = f'{MODEL_NAME}_{N_EPOCHS}ep_{BATCH_SIZE}b'

args = TrainingArguments(output_dir="logs/model",
                         num_train_epochs=N_EPOCHS,
                         per_device_train_batch_size=BATCH_SIZE,
                         per_device_eval_batch_size=BATCH_SIZE,
                         save_steps=10000000,
                         logging_steps=200,
                         report_to="wandb",  # enable logging to W&B
                         run_name=run_name,  # name of the W&B run (optional)
                         #  load_best_model_at_end = False,
                         evaluation_strategy = 'epoch',
                         #  optim='adamw_torch',
                         #  weight_decay=0.01,
                         )


trainer = Trainer(
    model = model,
    args = args,
    train_dataset = train_dataset,
    eval_dataset = dev_dataset,
    tokenizer = tokenizer,
    data_collator = data_collator,
    # compute_metrics=compute_metrics
)

In [31]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mprofii[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/1752 [00:00<?, ?it/s]You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
 11%|█▏        | 200/1752 [02:18<18:09,  1.43it/s]

{'loss': 0.3591, 'learning_rate': 4.4292237442922375e-05, 'epoch': 0.68}


                                                  
 17%|█▋        | 292/1752 [03:30<16:12,  1.50it/s]

{'eval_loss': 0.053233131766319275, 'eval_runtime': 7.0586, 'eval_samples_per_second': 40.093, 'eval_steps_per_second': 5.1, 'epoch': 1.0}


 23%|██▎       | 400/1752 [04:47<15:47,  1.43it/s]  

{'loss': 0.0415, 'learning_rate': 3.8584474885844754e-05, 'epoch': 1.37}


                                                  
 33%|███▎      | 584/1752 [07:03<12:54,  1.51it/s]

{'eval_loss': 0.048347581177949905, 'eval_runtime': 6.998, 'eval_samples_per_second': 40.44, 'eval_steps_per_second': 5.144, 'epoch': 2.0}


 34%|███▍      | 600/1752 [07:14<13:31,  1.42it/s]

{'loss': 0.0318, 'learning_rate': 3.287671232876712e-05, 'epoch': 2.05}


 46%|████▌     | 800/1752 [09:35<11:05,  1.43it/s]

{'loss': 0.0241, 'learning_rate': 2.71689497716895e-05, 'epoch': 2.74}


                                                  
 50%|█████     | 876/1752 [10:35<09:48,  1.49it/s]

{'eval_loss': 0.04894203320145607, 'eval_runtime': 6.9525, 'eval_samples_per_second': 40.705, 'eval_steps_per_second': 5.178, 'epoch': 3.0}


 57%|█████▋    | 1000/1752 [12:02<08:42,  1.44it/s]

{'loss': 0.0205, 'learning_rate': 2.1461187214611872e-05, 'epoch': 3.42}


                                                   
 67%|██████▋   | 1168/1752 [14:08<06:22,  1.53it/s]

{'eval_loss': 0.05329310521483421, 'eval_runtime': 6.9791, 'eval_samples_per_second': 40.549, 'eval_steps_per_second': 5.158, 'epoch': 4.0}


 68%|██████▊   | 1200/1752 [14:30<06:28,  1.42it/s]

{'loss': 0.0172, 'learning_rate': 1.5753424657534248e-05, 'epoch': 4.11}


 80%|███████▉  | 1400/1752 [16:51<04:07,  1.42it/s]

{'loss': 0.0148, 'learning_rate': 1.004566210045662e-05, 'epoch': 4.79}


                                                   
 83%|████████▎ | 1460/1752 [17:40<03:12,  1.52it/s]

{'eval_loss': 0.05591963976621628, 'eval_runtime': 6.884, 'eval_samples_per_second': 41.11, 'eval_steps_per_second': 5.23, 'epoch': 5.0}


 91%|█████████▏| 1600/1752 [19:18<01:46,  1.43it/s]

{'loss': 0.0125, 'learning_rate': 4.337899543378996e-06, 'epoch': 5.48}


                                                   
100%|██████████| 1752/1752 [21:11<00:00,  1.38it/s]

{'eval_loss': 0.05626462772488594, 'eval_runtime': 6.9718, 'eval_samples_per_second': 40.592, 'eval_steps_per_second': 5.164, 'epoch': 6.0}
{'train_runtime': 1280.6242, 'train_samples_per_second': 10.935, 'train_steps_per_second': 1.368, 'train_loss': 0.060590083332366595, 'epoch': 6.0}





TrainOutput(global_step=1752, training_loss=0.060590083332366595, metrics={'train_runtime': 1280.6242, 'train_samples_per_second': 10.935, 'train_steps_per_second': 1.368, 'train_loss': 0.060590083332366595, 'epoch': 6.0})

In [32]:
# name = 't5'

saved_name = '_'.join([MODEL_NAME, str(N_EPOCHS)+'ep', str(BATCH_SIZE)+'b'])

dir = MODEL_NAME+'/'+saved_name

trainer.save_model("/home/anastasiia.demidova/srl/srl_transformers/models/"+dir)

In [33]:
wandb.finish()



0,1
eval/loss,▅▁▂▅██
eval/runtime,█▆▄▅▁▅
eval/samples_per_second,▁▃▅▄█▄
eval/steps_per_second,▁▃▅▄█▄
train/epoch,▁▁▂▃▃▄▄▅▅▆▆▇▇██
train/global_step,▁▁▂▃▃▄▄▅▅▆▆▇▇██
train/learning_rate,█▇▆▅▄▃▂▁
train/loss,█▂▁▁▁▁▁▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.05626
eval/runtime,6.9718
eval/samples_per_second,40.592
eval/steps_per_second,5.164
train/epoch,6.0
train/global_step,1752.0
train/learning_rate,0.0
train/loss,0.0125
train/total_flos,7592211947520000.0
train/train_loss,0.06059


## Predicting

In [2]:
path_test = '/home/anastasiia.demidova/srl/srl_transformers/dataset/test_no_answers.tsv'
path_dev = '/home/anastasiia.demidova/srl/srl_transformers/dataset/dev.tsv'

df_test = pd.read_csv(path_test, sep='\t', header= None, names=['data'], quoting=3)

df_testo = pd.read_csv(path_test, sep='\t', header= None, names=['data'],
                      quoting=3, skip_blank_lines=False).fillna('_nan')

df_dev = pd.read_csv(path_dev, sep='\t', header= None, names=['data', 'labels'], quoting=3)

df_devo = pd.read_csv(path_dev, sep='\t', header= None, names=['data', 'labels'],
                      quoting=3, skip_blank_lines=False).fillna('_nan')

In [3]:
df_devo.drop('labels', axis='columns', inplace=True)

In [4]:
df_test.shape, df_testo.shape

((9444, 1), (9804, 1))

In [5]:
df_dev.shape, df_devo.shape

((8363, 2), (8646, 1))

In [6]:
# dir = MODEL_NAME + '/'+ saved_name
# MODEL_NAME, saved_name

In [7]:
saved_model_name = 't5-large_6ep_8b'
dir = 't5-large/'+saved_model_name

# saved_model_name = 'flan-t5_10ep_8b'
# dir = 'flan-t5/'+saved_model_name

model = T5ForConditionalGeneration.from_pretrained("/home/anastasiia.demidova/srl/srl_transformers/models/"+dir).to('cuda')
tokenizer = AutoTokenizer.from_pretrained("/home/anastasiia.demidova/srl/srl_transformers/models/"+dir)

In [8]:
# Separating data into sentences with empty lines (NaN)

def separate_text_end(df):
    # pattern = ["###", "Instruction:", "Find", "all", "aspects", ",", "objects", "and", "predicates", ".\n\n###", "Input:"]
    # end_pattern = ["\n\n###", "Response:"]
    PATTERN = ["Sentence:"]
    END_PATTERN = ["\n\naspect", ",", "object", "and", "predicate", ":"]
    input = []
    sentence = []

    for word in df['data']:
        if word == '_nan':
            input.append(PATTERN + sentence + END_PATTERN)
            sentence = []
        else:
            word = re.sub(r"[\"\—\#\$\%\&\'\(\)\*\+\,\–\-\/\:\;\<\=\>\?\@\[\\\]\^\?\!\_\`\{\|\}\~\«\»ѣ\№]", ",", word)
            word = re.sub(r"[,]+", ",", word)
            word = re.sub(r"[.]+", ".", word)

            sentence.append(word)

    return input

In [9]:
len(separate_text_end(df_devo))

283

In [9]:
# nlp = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device='cuda')

In [10]:
# sents = separate_text_end(df_devo)

# # print(sents[0])
# res = nlp(' '.join(sents[0]))[0]

# print(res['generated_text'])

In [10]:
sents = separate_text_end(df_devo)

for s in sents[:3]:
    input_ids = tokenizer.encode(s, return_tensors="pt", is_split_into_words=True)

    outputs = model.generate(input_ids.to("cuda"), no_repeat_ngram_size=6,
                             max_new_tokens=2048,
                             num_return_sequences=1, early_stopping=True)

    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print(s)
    print(decoded)
    # res = []
    # for d in decoded.split():
    #     if d.isdigit() and int(d) <= 6:
    #         res.append(ids_to_labels[d])
    #     else:
    #         res.append('O')

    # if len(res) != len(s):
    #     print('Nooo')
    #     for i in range(len(res), len(s)):
    #             res.append('O')

    # if len(res) != len(s):
    #     print('Nooo')
    # print(res)
    # print('--------')



['Sentence:', 'meanwhile', ',', 'though', 'windows', '8', 'is', 'significantly', 'at', 'greater', 'risk', ',', '1', '.', '73', 'percent', ',', 'compared', 'to', 'windows', '8', '.', '1', ',', 'according', 'to', 'redmond', ',', 's', 'report', ',', 'it', ',', 's', 'still', 'significantly', 'safer', 'than', 'windows', '7', ',', 'windows', 'xp', ',', 'or', 'windows', 'vista', '.', '\n\naspect', ',', 'object', 'and', 'predicate', ':']
[ risk, safer | aspect ] [ windows 8, windows 8, windows 7, windows xp, windows vista | object ] [ greater, safer | predicate ]
['Sentence:', 'windows', '7', 'is', 'still', 'going', 'strong', 'even', 'though', 'the', 'day', 'was', 'about', 'windows', '8', ',', 'microsoft', 'announced', 'it', ',', 's', 'approaching', '450', 'million', 'copies', 'of', 'windows', '7', 'sold', 'thus', 'far', ',', 'with', 'windows', '7', 'consumer', 'usage', 'coming', 'in', 'greater', 'than', 'windows', 'xp', '.', '\n\naspect', ',', 'object', 'and', 'predicate', ':']
[ consumer usa

In [11]:
# MAX_LENGTH = 180

def evaluate(dfo):
    # indexes_nan = []
    labels_list = []
    sents = separate_text_end(dfo)
    i = 0

    for sent in tqdm(sents):
        input_ids = tokenizer.encode(sent, return_tensors="pt", is_split_into_words=True)

        outputs = model.generate(input_ids.to("cuda"), no_repeat_ngram_size=6,
                                max_new_tokens=2048,
                                num_return_sequences=1, early_stopping=True)

        decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

        if i < 8:
            i += 1
            print(len(decoded))
            # print(len(sent))
            print(decoded)
            print('--------')
            # print(labels_list)

        labels_list.append(decoded)

    return labels_list

In [12]:
model_name = 'dev'
# model_name = 'test'

In [13]:
if model_name == 'dev':
    labels_list = evaluate(df_devo)
else:
    labels_list = evaluate(df_testo)

  0%|          | 1/283 [00:01<05:35,  1.19s/it]

127
[ risk, safer | aspect ] [ windows 8, windows 8, windows 7, windows xp, windows vista | object ] [ greater, safer | predicate ]
--------


  1%|          | 2/283 [00:01<04:04,  1.15it/s]

90
[ consumer usage | aspect ] [ windows, windows, windows | object ] [ greater | predicate ]
--------


  1%|          | 3/283 [00:02<03:32,  1.32it/s]

69
[ | aspect ] [ windows, windows xp | object ] [ lighter | predicate ]
--------


  1%|▏         | 4/283 [00:03<03:36,  1.29it/s]

101
[ resources | aspect ] [ windows 8, windows 7, windows xp | object ] [ lighter, heavier | predicate ]
--------


  2%|▏         | 5/283 [00:04<03:39,  1.26it/s]

83
[ choosing | aspect ] [ windows, windows, xp, xp | object ] [ simpler | predicate ]
--------


  2%|▏         | 6/283 [00:04<03:38,  1.27it/s]

101
[ upgrading | aspect ] [ windows, windows, windows | object ] [ simpler, less expensive | predicate ]
--------


  2%|▏         | 7/283 [00:05<03:34,  1.28it/s]

78
[ | aspect ] [ windows, windows xp, windows 7 | object ] [ safer | predicate ]
--------


  3%|▎         | 8/283 [00:06<03:28,  1.32it/s]

96
[ virus infection | aspect ] [ windows, windows, windows | object ] [ safer, safer | predicate ]
--------


100%|██████████| 283/283 [03:45<00:00,  1.25it/s]


In [14]:
labels_list[1]

'[ consumer usage | aspect ] [ windows, windows, windows | object ] [ greater | predicate ]'

In [15]:
from more_itertools import locate

In [16]:
labels = []
i = 0
sent = []
labeled_sent = [] # for output
tags = []
BIO = ['Aspect', 'Object', 'Predicate']
indexes_nan = [0]

if model_name == 'dev':
    df = df_devo
else:
    df = df_testo

for d in tqdm(df.data):
    if d == '_nan':
        labels = labels_list[i].split('] [')

        # print(sent)
        # print(tags)
        # print(labels)

        for l in range(len(labels)):
            word_list = labels[l].split('|')[0].replace('[', '').strip().split(',')

            # print(l)
            # print(word_list)

            for j in word_list:
                j = j.strip()
                # print(j)
                if ' ' in j:
                    for beg, elem in enumerate(j.split()):
                        if beg == 0:
                            if elem in sent: tags[sent.index(elem)] = 'B-' + BIO[l]
                        else:
                            if elem in sent: tags[sent.index(elem)] = 'I-' + BIO[l]
                elif j in sent:
                    indices = locate(sent, lambda x: x == j)
                    for inde in indices:
                        tags[inde] = 'B-' + BIO[l]

        # print(tags)
        i += 1
        labeled_sent.extend(tags)
        indexes_nan.append(indexes_nan[-1] + len(tags))
        sent = []
        tags = []
    else:
        sent.append(d)
        tags.append('O')

100%|██████████| 8646/8646 [00:00<00:00, 982709.67it/s]


In [17]:
print(indexes_nan[:10])

[0, 47, 90, 104, 127, 170, 206, 223, 249, 284]


In [18]:
len(labeled_sent)

8363

In [19]:
if model_name == 'dev':
    df_dev['labels'] = labeled_sent # dev
else:
    df_test['labels'] = labeled_sent # test

In [20]:
if model_name == 'dev':
    print(df_dev.head(50))
else:
    print(df_test.head(50))

             data       labels
0       meanwhile            O
1               ,            O
2          though            O
3         windows     B-Object
4               8     I-Object
5              is            O
6   significantly            O
7              at            O
8         greater  B-Predicate
9            risk     B-Aspect
10              (            O
11              1            O
12              .            O
13             73            O
14        percent            O
15              )            O
16       compared            O
17             to            O
18        windows            O
19              8            O
20              .            O
21              1            O
22              ,            O
23      according            O
24             to            O
25        redmond            O
26              '            O
27              s            O
28         report            O
29              ,            O
30             it            O
31      

## Saving Result Files

In [21]:
dir, model_name

('t5-large/t5-large_6ep_8b_aspect:', 'dev')

In [22]:
if model_name == 'dev':
    df_dev.to_csv('/home/anastasiia.demidova/srl/srl_transformers/results/'+dir+'_'+model_name+'.tsv',
            header=None, index=False, quoting=3, sep='\t', encoding='utf-8')
else:
    df_test.to_csv('/home/anastasiia.demidova/srl/srl_transformers/results/'+dir+'_'+model_name+'.tsv',
            header=None, index=False, quoting=3, sep='\t', encoding='utf-8')

In [23]:
with open('/home/anastasiia.demidova/srl/srl_transformers/results/'+dir+'_'+model_name+'.tsv') as input:
    lines = [line for line in input if line.strip()]

with open('/home/anastasiia.demidova/srl/srl_transformers/results/'+dir+'_'+model_name+'_post.tsv', 'w') as output:
    i = 0
    for line in lines:
        # if '_nan' in line:
        #     output.write("\n")
        # else:

        output.write(line)
        if i+1 in (indexes_nan):
            # print(line)
            output.write("\n")
        i += 1


print('A miracle happened ^-^/***')

A miracle happened ^-^/***
