In [1]:
# ! pip install -U accelerate
# ! pip install -U transformers

# import os
# os._exit(00)

In [2]:
# Connect to Google Drive and upload a folder
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import pandas as pd
import numpy as np
import torch
import re # Regular expression
from tqdm import tqdm

from torch.utils.data import Dataset
from typing import List, Dict, Union
from transformers import pipeline
from transformers import Trainer, TrainingArguments, AutoTokenizer
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification

from transformers import DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration

import gc

## Downloading Data

In [None]:
# !git clone https://github.com/s-nlp/semantic-role-labelling.git

In [2]:
path = '/content/drive/MyDrive/Colab Notebooks/NLP_project/train.tsv'
path_dev = '/content/drive/MyDrive/Colab Notebooks/NLP_project/dev.tsv'

In [3]:
df = pd.read_csv(path, sep='\t', header= None, names=['data', 'label'],
                 quoting=3, skip_blank_lines=False).fillna('_nan')

df_dev = pd.read_csv(path_dev, sep='\t', header= None, names=['data', 'label'],
                 quoting=3, skip_blank_lines=False).fillna('_nan')

In [4]:
df.head(22)

Unnamed: 0,data,label
0,also,O
1,",",O
2,i,O
3,have,O
4,recently,O
5,discovered,O
6,advil,B-Object
7,liquigels,O
8,work,O
9,much,O


In [5]:
df_dev.head()

Unnamed: 0,data,label
0,meanwhile,O
1,",",O
2,though,O
3,windows,B-Object
4,8,I-Object


In [6]:
df.shape, df_dev.shape

((63408, 2), (8646, 2))

In [7]:
df.data[592]

'"'

## Preprocessing

    - Separating data into sentences with empty lines (NaN).
    - Clean punctuation into single dot.


#### T5

[two, four | money]

In [8]:
'''
### Instruction: Find all aspects , objects and predicates .

### Input: in the content of search results , bing is not consistently superior to google .

### Response: [ search results | aspect ] [ bing , google | object ] [ not , superior | predicate ]
'''
### Response: [ search results | aspect ] [ bing | object ] [ not | predicate ] [ superior | predicate ] [ google | object ] not quite well

# Separating data into sentences with empty lines (NaN)

def separate_text(df):
    pattern = ["###", "Instruction:", "Find", "all", "aspects", ",", "objects", "and", "predicates", ".\n\n###", "Input:"]
    end_pattern = ["\n\n###", "Response:"]
    a_pattern = ['|','aspect',']']
    o_pattern = ['|','object',']']
    p_pattern = ['|','predicate',']']
    sep = ','

    input = [] # for input
    output = []
    sentence = []
    prev_tag = ''
    temp_a = False
    temp_o = False
    temp_p = False
    a = [] # aspects
    o = [] # objects
    p = [] # predicates

    for word, tag in df.values:
        if word == '_nan':
            input.append(pattern + sentence + end_pattern)
            if len(a) != 0 and a[-1] == sep: del a[-1]
            if len(o) != 0 and o[-1] == sep: del o[-1]
            if len(p) != 0 and p[-1] == sep: del p[-1]
            output.append(['['] + a + a_pattern + ['['] + o + o_pattern + ['['] + p + p_pattern)
            sentence = []
            a = []
            o = []
            p = []
            temp_a = False
            temp_o = False
            temp_p = False
            prev_tag = ''
        else:
            tag = tag.lower()
            word = re.sub(r"[\"\—\#\$\%\&\'\(\)\*\+\,\–\-\/\:\;\<\=\>\?\@\[\\\]\^\?\!\_\`\{\|\}\~\«\»ѣ\№]", ",", word)
            word = re.sub(r"[,]+", ",", word)
            word = re.sub(r"[.]+", ".", word)

            # If prev tag was the last one in a tag set
            if prev_tag.split('-')[-1] != tag.split('-')[-1] or 'B-' in tag:
                if temp_a: #tag.split('-')[-1] == 'aspect':
                        a.append(sep)
                        temp_a = False
                if temp_o: #tag.split('-')[-1] == 'object':
                        o.append(sep)
                        temp_o = False
                if temp_p: #tag.split('-')[-1] == 'predicate':
                        p.append(sep)
                        temp_p = False

            if 'O' not in tag:
                if tag.split('-')[-1] == 'aspect':
                        a.append(word)
                        temp_a = True
                if tag.split('-')[-1] == 'object':
                        o.append(word)
                        temp_o = True
                if tag.split('-')[-1] == 'predicate':
                        p.append(word)
                        temp_p = True

            prev_tag = tag
            sentence.append(word)

    return input, output

In [9]:
input, output = separate_text(df.iloc[:22])

print(input)
print(output)

[['###', 'Instruction:', 'Find', 'all', 'aspects', ',', 'objects', 'and', 'predicates', '.\n\n###', 'Input:', 'also', ',', 'i', 'have', 'recently', 'discovered', 'advil', 'liquigels', 'work', 'much', 'better', 'and', 'faster', 'for', 'a', 'headache', 'than', 'regular', 'ibuprofen', '.', '\n\n###', 'Response:']]
[['[', 'headache', '|', 'aspect', ']', '[', 'advil', ',', 'ibuprofen', '|', 'object', ']', '[', 'better', ',', 'faster', '|', 'predicate', ']']]


In [10]:
# Appling cleaning to df
input, output = separate_text(df)
input_dev, output_dev = separate_text(df_dev)

In [11]:
print(' '.join(input[-1]))
print(' '.join(output[-1]))

### Instruction: Find all aspects , objects and predicates .

### Input: in the content of search results , bing is not consistently superior to google . 

### Response:
[ search results | aspect ] [ bing , google | object ] [ not , superior | predicate ]


## Training model

In [None]:
### T5

MODEL_NAME = 't5-small'
IS_ENCODER_DECODER = True

# tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME).to('cuda')

In [None]:
class PairsDataset(Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __getitem__(self, idx):
        assert idx <= len(self.x['input_ids']), (idx, len(self.x['input_ids']))
        item = {key: val[idx] for key, val in self.x.items()}

        item['labels'] = self.y['input_ids'][idx]
        if IS_ENCODER_DECODER: item['decoder_attention_mask'] = self.y['attention_mask'][idx]

        return item

    @property
    def n(self):
        return len(self.x['input_ids'])

    def __len__(self):
        return self.n

In [None]:
class DataCollatorWithPadding:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        batch = self.tokenizer.pad(
            features,
            padding=True
        )

        if IS_ENCODER_DECODER:
            ybatch = self.tokenizer.pad(
                {'input_ids': batch['labels'], 'attention_mask': batch['decoder_attention_mask']},
                padding=True
            )
        else:
            ybatch = self.tokenizer.pad(
            {'input_ids': batch['labels']},
            padding=True
            )

        batch['labels'] = ybatch['input_ids']

        if IS_ENCODER_DECODER: batch['decoder_attention_mask'] = ybatch['attention_mask']


        return {k: torch.tensor(v) for k, v in batch.items()}

In [None]:
max = 0
for o in input:
    if max < len(o):
        max = len(o)
max

114

In [None]:
### T5
MAX_LENGTH = 200

train_dataset = PairsDataset(tokenizer(input, padding='max_length', max_length=MAX_LENGTH, is_split_into_words=True),
                             tokenizer(output, padding='max_length', max_length=MAX_LENGTH, is_split_into_words=True))
dev_dataset = PairsDataset(tokenizer(input_dev, padding='max_length', max_length=MAX_LENGTH, is_split_into_words=True),
                           tokenizer(output_dev, padding='max_length', max_length=MAX_LENGTH, is_split_into_words=True))

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

In [None]:
temp_f = train_dataset[0]

print(len(temp_f['attention_mask']), len(temp_f['decoder_attention_mask']))

z=0
for i, d, j, k, c, l in zip(temp_f['attention_mask'],
                            temp_f['decoder_attention_mask'],
                            temp_f['labels'],
                            temp_f['input_ids'],
                            tokenizer.convert_ids_to_tokens(temp_f['input_ids']),
                            tokenizer.convert_ids_to_tokens(temp_f['labels'])):

    z+=1
    if z == 40:
        break
    print(i, d, '\t', j, '\t', k, '\t', c, '\t', l)

200 200
1 1 	 784 	 1713 	 ▁# 	 ▁[
1 1 	 12085 	 30345 	 ## 	 ▁headache
1 1 	 1820 	 21035 	 ▁Instruction 	 ▁|
1 1 	 2663 	 10 	 : 	 ▁aspect
1 1 	 3 	 2588 	 ▁Find 	 ▁
1 1 	 908 	 66 	 ▁all 	 ]
1 1 	 784 	 3149 	 ▁aspects 	 ▁[
1 1 	 3 	 3 	 ▁ 	 ▁
1 1 	 9 	 6 	 , 	 a
1 1 	 26 	 4820 	 ▁objects 	 d
1 1 	 6372 	 11 	 ▁and 	 vil
1 1 	 3 	 554 	 ▁pre 	 ▁
1 1 	 6 	 11346 	 dica 	 ,
1 1 	 3 	 1422 	 tes 	 ▁
1 1 	 23 	 3 	 ▁ 	 i
1 1 	 3007 	 5 	 . 	 bu
1 1 	 1409 	 1713 	 ▁# 	 pro
1 1 	 89 	 30345 	 ## 	 f
1 1 	 35 	 86 	 ▁In 	 en
1 1 	 1820 	 2562 	 put 	 ▁|
1 1 	 3735 	 10 	 : 	 ▁object
1 1 	 3 	 92 	 ▁also 	 ▁
1 1 	 908 	 3 	 ▁ 	 ]
1 1 	 784 	 6 	 , 	 ▁[
1 1 	 394 	 3 	 ▁ 	 ▁better
1 1 	 3 	 23 	 i 	 ▁
1 1 	 6 	 43 	 ▁have 	 ,
1 1 	 3627 	 1310 	 ▁recently 	 ▁faster
1 1 	 1820 	 3883 	 ▁discovered 	 ▁|
1 1 	 554 	 3 	 ▁ 	 ▁pre
1 1 	 4370 	 9 	 a 	 dic
1 1 	 342 	 26 	 d 	 ate
1 1 	 3 	 6372 	 vil 	 ▁
1 1 	 908 	 3 	 ▁ 	 ]
1 1 	 1 	 40 	 l 	 </s>
1 0 	 0 	 23 	 i 	 <pad>
1 0 	 0 	 1169 	 qui

In [None]:
N_EPOCHS = 10
BATCH_SIZE = 16

args = TrainingArguments(output_dir="logs/model",
                         num_train_epochs=N_EPOCHS,
                         per_device_train_batch_size=BATCH_SIZE,
                         per_device_eval_batch_size=BATCH_SIZE,
                         save_steps=10000000,
                         logging_steps=200,
                        #  load_best_model_at_end = False,
                         evaluation_strategy = 'epoch',
                        #  optim='adamw_torch',
                        #  weight_decay=0.01,
                         )


trainer = Trainer(
    model = model,
    args = args,
    train_dataset = train_dataset,
    eval_dataset = dev_dataset,
    tokenizer = tokenizer,
    data_collator = data_collator,
    # compute_metrics=compute_metrics
)

In [None]:
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,0.070253
2,0.755900,0.0544
3,0.060000,0.049433
4,0.060000,0.047257
5,0.048700,0.04666
6,0.042400,0.045293
7,0.038800,0.044539
8,0.038800,0.044768
9,0.037600,0.044458
10,0.036000,0.044434


TrainOutput(global_step=1460, training_loss=0.1410922488121137, metrics={'train_runtime': 668.1389, 'train_samples_per_second': 34.933, 'train_steps_per_second': 2.185, 'total_flos': 1233936580608000.0, 'train_loss': 0.1410922488121137, 'epoch': 10.0})

In [None]:
name = 't5'

In [None]:
saved_name = '_'.join([name, str(N_EPOCHS)+'ep', str(BATCH_SIZE)+'b'])

dir = name+'/'+saved_name

trainer.save_model("/content/drive/MyDrive/Colab Notebooks/NLP_project/models/"+dir)

## Predicting

In [33]:
path_test = '/content/drive/MyDrive/Colab Notebooks/NLP_project/test_no_answers.tsv'
path_dev = '/content/drive/MyDrive/Colab Notebooks/NLP_project/dev.tsv'

df_test = pd.read_csv(path_test, sep='\t', header= None, names=['data'], quoting=3)

df_testo = pd.read_csv(path_test, sep='\t', header= None, names=['data'],
                      quoting=3, skip_blank_lines=False).fillna('_nan')

df_dev = pd.read_csv(path_dev, sep='\t', header= None, names=['data', 'labels'], quoting=3)

df_devo = pd.read_csv(path_dev, sep='\t', header= None, names=['data', 'labels'],
                      quoting=3, skip_blank_lines=False).fillna('_nan')

In [34]:
df_devo.drop('labels', axis='columns', inplace=True)

In [35]:
df_test.shape, df_testo.shape

((9444, 1), (9804, 1))

In [36]:
df_dev.shape, df_devo.shape

((8363, 2), (8646, 1))

In [14]:
saved_model_name = 't5_10ep_16b'
dir = 't5/'+saved_model_name

model = T5ForConditionalGeneration.from_pretrained("/content/drive/MyDrive/Colab Notebooks/NLP_project/models/"+dir).to('cuda')
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/Colab Notebooks/NLP_project/models/"+dir)

In [37]:
'''
### Instruction: Find all aspects , objects and predicates .

### Input: in the content of search results , bing is not consistently superior to google .

### Response:
'''

# Separating data into sentences with empty lines (NaN)

def separate_text_end(df):
    pattern = ["###", "Instruction:", "Find", "all", "aspects", ",", "objects", "and", "predicates", ".\n\n###", "Input:"]
    end_pattern = ["\n\n###", "Response:"]
    input = []
    sentence = []

    for word in df['data']:
        if word == '_nan':
            input.append(pattern + sentence + end_pattern)
            sentence = []
        else:
            word = re.sub(r"[\"\—\#\$\%\&\'\(\)\*\+\,\–\-\/\:\;\<\=\>\?\@\[\\\]\^\?\!\_\`\{\|\}\~\«\»ѣ\№]", ",", word)
            word = re.sub(r"[,]+", ",", word)
            word = re.sub(r"[.]+", ".", word)

            sentence.append(word)

    return input

In [38]:
nlp = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device='cuda')

In [39]:
sents = separate_text_end(df_devo)

# print(sents[0])
res = nlp(' '.join(sents[0]))[0]

print(res['generated_text'])

[ risk , compared | aspect ] [ windows , windows , windows 




In [None]:
sents = separate_text_end(df_devo)

for s in sents[:3]:
    input_ids = tokenizer.encode(s, return_tensors="pt", is_split_into_words=True)

    outputs = model.generate(input_ids.to("cuda"), no_repeat_ngram_size=2,
                             max_new_tokens=2048,
                             num_return_sequences=1, early_stopping=True)

    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print(s)
    print(decoded)
    # res = []
    # for d in decoded.split():
    #     if d.isdigit() and int(d) <= 6:
    #         res.append(ids_to_labels[d])
    #     else:
    #         res.append('O')

    # if len(res) != len(s):
    #     print('Nooo')
    #     for i in range(len(res), len(s)):
    #             res.append('O')

    # if len(res) != len(s):
    #     print('Nooo')
    # print(res)
    # print('--------')



['###', 'Instruction:', 'Find', 'all', 'aspects', ',', 'objects', 'and', 'predicates', '.\n\n###', 'Input:', 'meanwhile', ',', 'though', 'windows', '8', 'is', 'significantly', 'at', 'greater', 'risk', ',', '1', '.', '73', 'percent', ',', 'compared', 'to', 'windows', '8', '.', '1', ',', 'according', 'to', 'redmond', ',', 's', 'report', ',', 'it', ',', 's', 'still', 'significantly', 'safer', 'than', 'windows', '7', ',', 'windows', 'xp', ',', 'or', 'windows', 'vista', '.', '\n\n###', 'Response:']
[ risk, compared | aspect ] [ windows'8. 1 | object ], safer | predicate ].
['###', 'Instruction:', 'Find', 'all', 'aspects', ',', 'objects', 'and', 'predicates', '.\n\n###', 'Input:', 'windows', '7', 'is', 'still', 'going', 'strong', 'even', 'though', 'the', 'day', 'was', 'about', 'windows', '8', ',', 'microsoft', 'announced', 'it', ',', 's', 'approaching', '450', 'million', 'copies', 'of', 'windows', '7', 'sold', 'thus', 'far', ',', 'with', 'windows', '7', 'consumer', 'usage', 'coming', 'in', '

In [40]:
# MAX_LENGTH = 180

def evaluate(dfo):
    indexes_nan = []
    labels_list = []
    sents = separate_text_end(dfo)
    i = 0

    for sent in tqdm(sents):
        input_ids = tokenizer.encode(sent, return_tensors="pt", is_split_into_words=True)

        outputs = model.generate(input_ids.to("cuda"), no_repeat_ngram_size=6,
                                max_new_tokens=2048,
                                num_return_sequences=1, early_stopping=True)

        decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

        if i < 8:
            i += 1
            print(len(decoded))
            print(len(sent))
            print(decoded)
            print('--------')
            print(labels_list)

        labels_list.append(decoded)

    return labels_list

In [41]:
model_name = 'dev'
# model_name = 'test'

In [42]:
### T5

if model_name == 'dev':
    labels_list = evaluate(df_devo)
else:
    labels_list = evaluate(df_testo)

  0%|          | 1/360 [00:00<03:10,  1.89it/s]

100
36
[ developing | aspect ] [ android, ios, ioos, a laptop replacement | object ] [ faster | predicate ]
--------
[]


  1%|          | 2/360 [00:00<02:32,  2.34it/s]

61
28
[ | aspect ] [ android, ios | object ] [ better | predicate ]
--------
['[ developing | aspect ] [ android, ios, ioos, a laptop replacement | object ] [ faster | predicate ]']


  1%|          | 3/360 [00:01<02:20,  2.53it/s]

72
44
[ supported | aspect ] [ ios, android | object ] [ earlier | predicate ]
--------
['[ developing | aspect ] [ android, ios, ioos, a laptop replacement | object ] [ faster | predicate ]', '[ | aspect ] [ android, ios | object ] [ better | predicate ]']


  1%|          | 4/360 [00:01<02:47,  2.13it/s]

74
37
[ use | aspect ] [ android, ios | object ] [ better, simpler | predicate ]
--------
['[ developing | aspect ] [ android, ios, ioos, a laptop replacement | object ] [ faster | predicate ]', '[ | aspect ] [ android, ios | object ] [ better | predicate ]', '[ supported | aspect ] [ ios, android | object ] [ earlier | predicate ]']


  1%|▏         | 5/360 [00:02<03:43,  1.59it/s]

91
49
[ functions | aspect ] [ bada, android, ios, wp7 | object ] [ simpler, faster | predicate ]
--------
['[ developing | aspect ] [ android, ios, ioos, a laptop replacement | object ] [ faster | predicate ]', '[ | aspect ] [ android, ios | object ] [ better | predicate ]', '[ supported | aspect ] [ ios, android | object ] [ earlier | predicate ]', '[ use | aspect ] [ android, ios | object ] [ better, simpler | predicate ]']


  2%|▏         | 6/360 [00:03<03:41,  1.60it/s]

66
33
[ use | aspect ] [ ios, android | object ] [ simpler | predicate ]
--------
['[ developing | aspect ] [ android, ios, ioos, a laptop replacement | object ] [ faster | predicate ]', '[ | aspect ] [ android, ios | object ] [ better | predicate ]', '[ supported | aspect ] [ ios, android | object ] [ earlier | predicate ]', '[ use | aspect ] [ android, ios | object ] [ better, simpler | predicate ]', '[ functions | aspect ] [ bada, android, ios, wp7 | object ] [ simpler, faster | predicate ]']


  2%|▏         | 7/360 [00:04<03:46,  1.56it/s]

66
34
[ use | aspect ] [ ios, android | object ] [ simpler | predicate ]
--------
['[ developing | aspect ] [ android, ios, ioos, a laptop replacement | object ] [ faster | predicate ]', '[ | aspect ] [ android, ios | object ] [ better | predicate ]', '[ supported | aspect ] [ ios, android | object ] [ earlier | predicate ]', '[ use | aspect ] [ android, ios | object ] [ better, simpler | predicate ]', '[ functions | aspect ] [ bada, android, ios, wp7 | object ] [ simpler, faster | predicate ]', '[ use | aspect ] [ ios, android | object ] [ simpler | predicate ]']


  2%|▏         | 8/360 [00:04<03:52,  1.51it/s]

67
37
[ | aspect ] [ android, ios | object ] [ not inferior | predicate ]
--------
['[ developing | aspect ] [ android, ios, ioos, a laptop replacement | object ] [ faster | predicate ]', '[ | aspect ] [ android, ios | object ] [ better | predicate ]', '[ supported | aspect ] [ ios, android | object ] [ earlier | predicate ]', '[ use | aspect ] [ android, ios | object ] [ better, simpler | predicate ]', '[ functions | aspect ] [ bada, android, ios, wp7 | object ] [ simpler, faster | predicate ]', '[ use | aspect ] [ ios, android | object ] [ simpler | predicate ]', '[ use | aspect ] [ ios, android | object ] [ simpler | predicate ]']


100%|██████████| 360/360 [02:46<00:00,  2.16it/s]


In [43]:
labels_list[1]

'[ | aspect ] [ android, ios | object ] [ better | predicate ]'

In [44]:
from more_itertools import locate

In [45]:
labels = []
i = 0
sent = []
labeled_sent = [] # for output
tags = []
BIO = ['Aspect', 'Object', 'Predicate']
indexes_nan = [0]

# for d in tqdm(df_devo.data): # dev
for d in tqdm(df_testo.data): # test
    if d == '_nan':
        labels = labels_list[i].split('] [')

        # print(sent)
        # print(tags)
        # print(labels)

        for l in range(len(labels)):
            word_list = labels[l].split('|')[0].replace('[', '').strip().split(',')

            # print(l)
            # print(word_list)

            for j in word_list:
                j = j.strip()
                # print(j)
                if ' ' in j:
                    for beg, elem in enumerate(j.split()):
                        if beg == 0:
                            if elem in sent: tags[sent.index(elem)] = 'B-' + BIO[l]
                        else:
                            if elem in sent: tags[sent.index(elem)] = 'I-' + BIO[l]
                elif j in sent:
                    indices = locate(sent, lambda x: x == j)
                    for inde in indices:
                        tags[inde] = 'B-' + BIO[l]

        # print(tags)
        i += 1
        labeled_sent.extend(tags)
        indexes_nan.append(indexes_nan[-1] + len(tags))
        sent = []
        tags = []
    else:
        sent.append(d)
        tags.append('O')

100%|██████████| 9804/9804 [00:00<00:00, 686998.07it/s]


In [46]:
print(indexes_nan[:10])

[0, 23, 38, 69, 93, 129, 149, 170, 194, 207]


In [47]:
len(labeled_sent)

9444

In [48]:
if model_name == 'dev':
    df_dev['labels'] = labeled_sent # dev
else:
    df_test['labels'] = labeled_sent # test

In [49]:
if model_name == 'dev':
    df_dev.head(50)
else:
    df_test.head(50)

Unnamed: 0,data,labels
0,plus,O
1,",",O
2,android,B-Object
3,is,O
4,developing,B-Aspect
5,a,B-Object
6,way,O
7,faster,B-Predicate
8,than,O
9,ios,B-Object


## Saving Result Files

In [50]:
dir, model_name

('t5/t5_10ep_16b', 'test')

In [51]:
if model_name == 'dev':
    df_dev.to_csv('/content/drive/MyDrive/Colab Notebooks/NLP_project/results/'+dir+'_'+model_name+'.tsv',
            header=None, index=False, quoting=3, sep='\t', encoding='utf-8')
else:
    df_test.to_csv('/content/drive/MyDrive/Colab Notebooks/NLP_project/results/'+dir+'_'+model_name+'.tsv',
            header=None, index=False, quoting=3, sep='\t', encoding='utf-8')

In [52]:
with open('/content/drive/MyDrive/Colab Notebooks/NLP_project/results/'+dir+'_'+model_name+'.tsv') as input:
    lines = [line for line in input if line.strip()]

with open('/content/drive/MyDrive/Colab Notebooks/NLP_project/results/'+dir+'_'+model_name+'_post.tsv', 'w') as output:
    i = 0
    for line in lines:
        # if '_nan' in line:
        #     output.write("\n")
        # else:

        output.write(line)
        if i+1 in (indexes_nan):
            # print(line)
            output.write("\n")
        i += 1


print('A miracle happened ^-^/***')

A miracle happened ^-^/***
