In [None]:
# ! pip install -U accelerate
# ! pip install -U transformers

# import os
# os._exit(00)

In [None]:
# Mount Google Drive at /content/drive so the data and model folders used below are reachable
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import torch
# import torch.optim as optim
import re # Regular expression
from tqdm import tqdm

from torch.utils.data import Dataset
from typing import List, Dict, Union
from transformers import pipeline
from transformers import Trainer, TrainingArguments, AutoTokenizer
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification#, BertForTokenClassification

import gc

## Downloading Data

In [None]:
# !git clone https://github.com/s-nlp/semantic-role-labelling.git

In [None]:
# Locations of the train and dev splits (tab-separated files) on the mounted Drive
path = '/content/drive/MyDrive/Colab Notebooks/NLP_project/train.tsv'
path_dev = '/content/drive/MyDrive/Colab Notebooks/NLP_project/dev.tsv'

In [None]:
# Reader options shared by both splits.
# - quoting=3 is csv.QUOTE_NONE, so quote characters are read as ordinary tokens.
# - skip_blank_lines=False keeps the empty lines as NaN rows; fillna then turns
#   them into '_nan' marker rows.
# NOTE(review): pandas' default NA parsing presumably also reads tokens such as
# "null" or "NA" as NaN, which would then become '_nan' as well - confirm.
_tsv_options = dict(sep='\t', header=None, names=['data', 'label'],
                    quoting=3, skip_blank_lines=False)

df = pd.read_csv(path, **_tsv_options).fillna('_nan')

df_dev = pd.read_csv(path_dev, **_tsv_options).fillna('_nan')

In [None]:
df.head(22)

Unnamed: 0,data,label
0,also,O
1,",",O
2,i,O
3,have,O
4,recently,O
5,discovered,O
6,advil,B-Object
7,liquigels,O
8,work,O
9,much,O


In [None]:
df_dev.head()

Unnamed: 0,data,label
0,meanwhile,O
1,",",O
2,though,O
3,windows,B-Object
4,8,I-Object


In [None]:
df.shape, df_dev.shape

((63408, 2), (8646, 2))

In [None]:
df.data[592]

'"'

## Preprocessing

    - Split the data into sentences at the empty lines (read as NaN).
    - Replace punctuation characters with a single dot.
    - Map the labels to the integer ids [0, 1, 2, 3, 4, 5, 6].

In [None]:
# Separating data into sentences with empty lines (NaN)

def separate_text(df):
    """Split a token-per-row frame into sentences and normalise punctuation.

    A row whose word equals '_nan' closes the current sentence. In every other
    row each character matched by the punctuation class is replaced with '.',
    and any run of dots is collapsed into a single '.'.

    Args:
        df: DataFrame whose rows unpack into (word, tag) pairs.

    Returns:
        A pair (sents, tags) of parallel lists: sents[i] holds the cleaned
        words of sentence i and tags[i] the tags of those words.
    """
    # Compiled once per call instead of being looked up for every row.
    punctuation = re.compile(r"[\"\—\#\$\%\&\'\(\)\*\+\,\–\-\/\:\;\<\=\>\?\@\[\\\]\^\?\!\_\`\{\|\}\~\«\»ѣ\№]")
    dot_run = re.compile(r"[.]+")

    sents = []
    tags = []
    sentence = []
    label = []

    for word, tag in df.values:
        if word == '_nan':
            sents.append(sentence)
            tags.append(label)
            sentence = []
            label = []
        else:
            sentence.append(dot_run.sub(".", punctuation.sub(".", word)))
            label.append(tag)

    # Data that does not end with a separator row would otherwise lose its
    # final sentence; flush whatever is still pending.
    if sentence:
        sents.append(sentence)
        tags.append(label)

    return sents, tags

In [None]:
# Tag inventory; the position of a tag in this list is its integer id
_label_names = ['O', 'B-Object', 'I-Object', 'B-Aspect', 'I-Aspect', 'B-Predicate', 'I-Predicate']
labels_to_ids = {name: idx for idx, name in enumerate(_label_names)}
ids_to_labels = dict(enumerate(_label_names))

# Applying sentence splitting and cleaning to the train and dev frames
sents, tags = separate_text(df)
sents_dev, tags_dev = separate_text(df_dev)

In [None]:
labels_to_ids

{'O': 0,
 'B-Object': 1,
 'I-Object': 2,
 'B-Aspect': 3,
 'I-Aspect': 4,
 'B-Predicate': 5,
 'I-Predicate': 6}

In [None]:
sents[0], tags[0]

(['also',
  '.',
  'i',
  'have',
  'recently',
  'discovered',
  'advil',
  'liquigels',
  'work',
  'much',
  'better',
  'and',
  'faster',
  'for',
  'a',
  'headache',
  'than',
  'regular',
  'ibuprofen',
  '.'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-Object',
  'O',
  'O',
  'O',
  'B-Predicate',
  'O',
  'B-Predicate',
  'O',
  'O',
  'B-Aspect',
  'O',
  'O',
  'B-Object',
  'O'])

In [None]:
sents_dev[0], tags_dev[0]

(['meanwhile',
  '.',
  'though',
  'windows',
  '8',
  'is',
  'significantly',
  'at',
  'greater',
  'risk',
  '.',
  '1',
  '.',
  '73',
  'percent',
  '.',
  'compared',
  'to',
  'windows',
  '8',
  '.',
  '1',
  '.',
  'according',
  'to',
  'redmond',
  '.',
  's',
  'report',
  '.',
  'it',
  '.',
  's',
  'still',
  'significantly',
  'safer',
  'than',
  'windows',
  '7',
  '.',
  'windows',
  'xp',
  '.',
  'or',
  'windows',
  'vista',
  '.'],
 ['O',
  'O',
  'O',
  'B-Object',
  'I-Object',
  'O',
  'O',
  'O',
  'B-Predicate',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-Object',
  'I-Object',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-Predicate',
  'O',
  'B-Object',
  'I-Object',
  'O',
  'B-Object',
  'I-Object',
  'O',
  'O',
  'B-Object',
  'I-Object',
  'O'])

In [None]:
# MODEL_NAME = 'bert-base-cased'
# Checkpoint used for both the tokenizer and the token-classification model
MODEL_NAME = 'bert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# num_labels=7 matches the seven tags in labels_to_ids; no tag names are passed to the model here
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, num_labels=7)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def align_label(texts, labels, max_length=150, label_all_tokens=True):
    """Tokenize pre-split sentences and expand word tags to token-level label ids.

    Uses the module-level ``tokenizer`` and ``labels_to_ids``.

    Args:
        texts: list of sentences, each a list of words.
        labels: list of tag lists parallel to ``texts``.
        max_length: length passed to the tokenizer together with
            padding='max_length'.
        label_all_tokens: when true, every token of a word receives the word's
            label id; otherwise only the first token does and the rest get -100.

    Returns:
        The tokenizer output with an added "labels" entry holding one list of
        label ids per sentence. Tokens without a word (word id None) get -100.

    NOTE(review): no truncation argument is passed to the tokenizer, so
    sentences longer than max_length are presumably not shortened - confirm.
    """
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=max_length, is_split_into_words=True)

    aligned = []
    for batch_idx, word_tags in enumerate(labels):
        token_labels = []
        last_word = None
        for word_idx in tokenized_inputs.word_ids(batch_index=batch_idx):
            if word_idx is None:
                tag_id = -100
            elif word_idx == last_word and not label_all_tokens:
                # continuation piece of a word whose first token is already labelled
                tag_id = -100
            else:
                tag_id = labels_to_ids[word_tags[word_idx]]
            token_labels.append(tag_id)
            last_word = word_idx
        aligned.append(token_labels)

    tokenized_inputs["labels"] = aligned
    return tokenized_inputs

In [None]:
align_label(sents[:1], tags[:1])

{'input_ids': [[101, 2036, 1012, 1045, 2031, 3728, 3603, 4748, 14762, 5622, 15549, 12439, 2015, 2147, 2172, 2488, 1998, 5514, 2005, 1037, 14978, 2084, 3180, 21307, 6279, 3217, 18940, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [None]:
class PairsDataset(Dataset):
    """Map-style dataset over tokenized inputs with a parallel list of labels.

    Args:
        x: mapping of feature name to a per-example sequence. It must contain
            'input_ids' and 'labels'; the mapping itself is left unmodified.

    Each item is a dict holding the idx-th entry of every feature plus the
    matching 'labels' entry.
    """

    def __init__(self, x):
        self.y = x['labels']
        # Build a separate mapping rather than deleting 'labels' from the
        # caller's object, so the argument can still be used afterwards.
        self.x = {key: val for key, val in x.items() if key != 'labels'}

    def __getitem__(self, idx):
        size = len(self.x['input_ids'])
        # IndexError (not AssertionError) is what the sequence protocol expects
        # for an out-of-range index; negative indices stay valid as for lists.
        if not -size <= idx < size:
            raise IndexError(f"index {idx} is out of range for dataset of size {size}")
        item = {key: val[idx] for key, val in self.x.items()}
        item['labels'] = self.y[idx]

        return item

    @property
    def n(self):
        """Number of examples, taken from the length of 'input_ids'."""
        return len(self.x['input_ids'])

    def __len__(self):
        return self.n

In [None]:
!pip install seqeval
!pip install datasets

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m648.7 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=491a756423d90f272123f419432dfeeea0ad93cf1906d4c55d88fa5200b7adc5
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2
Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl 

In [None]:
# NOTE(review): the recorded output of this cell echoes the load_metric line, which looks like
# a deprecation warning - confirm that the installed `datasets` version still provides it.
from datasets import load_metric

# seqeval metric object; compute_metrics calls metric.compute on it
metric = load_metric("seqeval")

  metric = load_metric("seqeval")


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [None]:
def compute_metrics(eval_preds):
    """Score token-level predictions with the module-level ``metric``.

    Args:
        eval_preds: pair (logits, label_ids). The class axis of the logits is
            axis 2; positions whose label id is -100 are excluded from scoring.

    Returns:
        Dict with the overall "precision", "recall", "f1" and "accuracy"
        reported by ``metric.compute``.
    """
    logits, gold_ids = eval_preds
    pred_ids = np.argmax(logits, axis=2)

    predictions = []
    true_labels = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # keep only the positions that carry a real label
        kept = [(pred, gold) for pred, gold in zip(pred_row, gold_row) if gold != -100]
        predictions.append([ids_to_labels[pred] for pred, _ in kept])
        true_labels.append([ids_to_labels[gold] for _, gold in kept])

    results = metric.compute(predictions=predictions, references=true_labels)

    summary = {}
    for name in ("precision", "recall", "f1", "accuracy"):
        summary[name] = results["overall_" + name]
    return summary

In [None]:
# Tokenize and label-align each split, then wrap the result for the Trainer
train_encodings = align_label(sents, tags)
dev_encodings = align_label(sents_dev, tags_dev)

train_dataset = PairsDataset(train_encodings)
dev_dataset = PairsDataset(dev_encodings)

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
# Sanity check: print the first 39 tokens of the first training example
# as: attention mask, token, tag (or -100), token id, label id
temp_f = train_dataset[0]
tokens = tokenizer.convert_ids_to_tokens(temp_f['input_ids'])
columns = zip(temp_f['attention_mask'], tokens, temp_f['labels'], temp_f['input_ids'])

for row_no, (mask, token, label_id, token_id) in enumerate(columns, start=1):
    if row_no == 40:
        break
    if label_id == -100:
        print(mask, token, '\t\t -100 \t\t', token_id, label_id)
    else:
        print(mask, token, '\t\t', ids_to_labels[label_id], '\t\t', token_id, label_id)

1 [CLS] 		 -100 		 101 -100
1 also 		 O 		 2036 0
1 . 		 O 		 1012 0
1 i 		 O 		 1045 0
1 have 		 O 		 2031 0
1 recently 		 O 		 3728 0
1 discovered 		 O 		 3603 0
1 ad 		 B-Object 		 4748 1
1 ##vil 		 B-Object 		 14762 1
1 li 		 O 		 5622 0
1 ##qui 		 O 		 15549 0
1 ##gel 		 O 		 12439 0
1 ##s 		 O 		 2015 0
1 work 		 O 		 2147 0
1 much 		 O 		 2172 0
1 better 		 B-Predicate 		 2488 5
1 and 		 O 		 1998 0
1 faster 		 B-Predicate 		 5514 5
1 for 		 O 		 2005 0
1 a 		 O 		 1037 0
1 headache 		 B-Aspect 		 14978 3
1 than 		 O 		 2084 0
1 regular 		 O 		 3180 0
1 ib 		 B-Object 		 21307 1
1 ##up 		 B-Object 		 6279 1
1 ##ro 		 B-Object 		 3217 1
1 ##fen 		 B-Object 		 18940 1
1 . 		 O 		 1012 0
1 [SEP] 		 -100 		 102 -100
0 [PAD] 		 -100 		 0 -100
0 [PAD] 		 -100 		 0 -100
0 [PAD] 		 -100 		 0 -100
0 [PAD] 		 -100 		 0 -100
0 [PAD] 		 -100 		 0 -100
0 [PAD] 		 -100 		 0 -100
0 [PAD] 		 -100 		 0 -100
0 [PAD] 		 -100 		 0 -100
0 [PAD] 		 -100 		 0 -100
0 [PAD] 		 -100 		 0 -100


In [None]:
# Training hyper-parameters
N_EPOCHS = 2
BATCH_SIZE = 4

args = TrainingArguments(output_dir="logs/model",
                         num_train_epochs=N_EPOCHS,
                         per_device_train_batch_size=BATCH_SIZE,
                         per_device_eval_batch_size=BATCH_SIZE,
                         save_steps=10000000,  # far above the 1168 steps of the recorded run, so presumably no intermediate checkpoint is saved
                         logging_steps=200,
                        #  load_best_model_at_end = False,
                         evaluation_strategy = 'epoch',  # evaluate on eval_dataset once per epoch
                        #  optim='adamw_torch',
                        #  weight_decay=0.01,
                         )


# Trainer wiring: the dev split is used for evaluation and compute_metrics supplies the reported scores
trainer = Trainer(
    model = model,
    args = args,
    train_dataset = train_dataset,
    eval_dataset = dev_dataset,
    tokenizer = tokenizer,
    data_collator = data_collator,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1926,0.3375,0.68456,0.734628,0.708711,0.890807
2,0.099,0.374123,0.718837,0.671845,0.694547,0.893208


<transformers.trainer_utils.EvalPrediction object at 0x7eba7f3e7040>
<transformers.trainer_utils.EvalPrediction object at 0x7eba74337f10>


TrainOutput(global_step=1168, training_loss=0.17106517210398633, metrics={'train_runtime': 194.2658, 'train_samples_per_second': 24.029, 'train_steps_per_second': 6.012, 'total_flos': 357359882259600.0, 'train_loss': 0.17106517210398633, 'epoch': 2.0})

In [None]:
# Run tag used to build the save location
saved_name = 'bert_2ep_4b'

# NOTE(review): `dir` shadows the Python builtin; later cells read this name, so it is kept as is
dir = 'bert/'+saved_name

# Write the fine-tuned model under the Drive "models" folder
model.save_pretrained("/content/drive/MyDrive/Colab Notebooks/NLP_project/models/"+dir)

In [None]:
# Save the tokenizer in a sibling folder of the model, named with the "_tok" suffix
tokenizer.save_pretrained("/content/drive/MyDrive/Colab Notebooks/NLP_project/models/"+dir+"_tok")

('/content/drive/MyDrive/Colab Notebooks/NLP_project/bert_2ep_4b_tok/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/NLP_project/bert_2ep_4b_tok/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/NLP_project/bert_2ep_4b_tok/vocab.txt',
 '/content/drive/MyDrive/Colab Notebooks/NLP_project/bert_2ep_4b_tok/added_tokens.json',
 '/content/drive/MyDrive/Colab Notebooks/NLP_project/bert_2ep_4b_tok/tokenizer.json')

### Saving

In [None]:
path_test = '/content/drive/MyDrive/Colab Notebooks/NLP_project/test_no_answers.tsv'
path_dev = '/content/drive/MyDrive/Colab Notebooks/NLP_project/dev.tsv'

# Options common to all four reads (quoting=3 is csv.QUOTE_NONE)
_tsv = dict(sep='\t', header=None, quoting=3)

# Each file is read twice:
#   df_test / df_dev   - default blank-line handling
#   df_testo / df_devo - blank lines kept and filled with the '_nan' marker
# Note that df_dev is rebound here, with the tag column now named 'labels'.
df_test = pd.read_csv(path_test, names=['data'], **_tsv)

df_testo = pd.read_csv(path_test, names=['data'], skip_blank_lines=False, **_tsv).fillna('_nan')

df_dev = pd.read_csv(path_dev, names=['data', 'labels'], **_tsv)

df_devo = pd.read_csv(path_dev, names=['data', 'labels'], skip_blank_lines=False, **_tsv).fillna('_nan')

In [None]:
# Keep only the token column. Rebinding with errors='ignore' (instead of an in-place drop)
# lets this cell be re-run without a KeyError once the column is already gone.
df_devo = df_devo.drop(columns='labels', errors='ignore')

In [None]:
df_test.head()

Unnamed: 0,data
0,plus
1,","
2,android
3,is
4,developing


In [None]:
df_testo.head(25)

Unnamed: 0,data
0,plus
1,","
2,android
3,is
4,developing
5,a
6,way
7,faster
8,than
9,ios


In [None]:
df_dev.head()

Unnamed: 0,data,labels
0,meanwhile,O
1,",",O
2,though,O
3,windows,B-Object
4,8,I-Object


In [None]:
df_devo.head()

Unnamed: 0,data
0,meanwhile
1,","
2,though
3,windows
4,8


In [None]:
df_test.shape, df_testo.shape

((9444, 1), (9804, 1))

In [None]:
df_dev.shape, df_devo.shape

((8363, 2), (8646, 1))

In [None]:
# Tag of the saved run to load for inference
saved_model_name = 'bert_2ep_4b'
# saved_model_name = saved_name

# NOTE(review): `dir` shadows the Python builtin; the result-writing cells below read this name
dir = 'bert/'+saved_model_name

# Reload the model and tokenizer from Drive; this rebinds `model` and `tokenizer`
model = AutoModelForTokenClassification.from_pretrained("/content/drive/MyDrive/Colab Notebooks/NLP_project/models/"+dir)
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/Colab Notebooks/NLP_project/models/"+dir+"_tok")

# Token-classification pipeline; evaluate() reads the 'entity' and 'word' fields of its output
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Separating data into sentences with empty lines (NaN)

def separate_text_end(df):
    """Split the 'data' column of an unlabelled frame into cleaned sentences.

    A value equal to '_nan' closes the current sentence. In every other value
    each character matched by the punctuation class is replaced with '.', and
    any run of dots is collapsed into a single '.'.

    Args:
        df: DataFrame with a 'data' column of words and '_nan' separators.

    Returns:
        List of sentences, each a list of cleaned words.
    """
    # Compiled once per call instead of being looked up for every word.
    punctuation = re.compile(r"[\"\—\#\$\%\&\'\(\)\*\+\,\–\-\/\:\;\<\=\>\?\@\[\\\]\^\?\!\_\`\{\|\}\~\«\»ѣ\№]")
    dot_run = re.compile(r"[.]+")

    sents = []
    sentence = []

    for word in df['data']:
        if word == '_nan':
            sents.append(sentence)
            sentence = []
        else:
            sentence.append(dot_run.sub(".", punctuation.sub(".", word)))

    # Data that does not end with a separator would otherwise lose its final
    # sentence; flush whatever is still pending.
    if sentence:
        sents.append(sentence)

    return sents

In [None]:
def _entity_to_label(entity):
    """Translate a pipeline entity name that ends in a class id into its tag."""
    found = re.search(r"\d+$", entity)
    if found is None:
        raise ValueError(f"entity name {entity!r} does not end in a class id")
    return ids_to_labels[int(found.group())]


def _word_labels(sent, result):
    """Return the tags for ``sent`` derived from one pipeline output.

    When every token carries a character offset ('start'), each word gets the
    tag of its first token, so exactly one tag per word is returned even if the
    tokenizer split a word into several pieces. Words without any token get
    ids_to_labels[0].

    Without offsets the tags of all tokens whose text contains no '#' are
    returned, which is only word-aligned when every word maps to one such token.
    """
    if any(tok.get('start') is None for tok in result):
        return [_entity_to_label(tok['entity']) for tok in result if '#' not in tok['word']]

    # Index of the owning word for every character of ' '.join(sent);
    # -1 marks the joining spaces.
    char_to_word = []
    for word_idx, word in enumerate(sent):
        if word_idx:
            char_to_word.append(-1)
        char_to_word.extend([word_idx] * len(word))

    word_labels = [ids_to_labels[0]] * len(sent)
    labelled = set()
    for tok in result:
        start = tok['start']
        word_idx = char_to_word[start] if 0 <= start < len(char_to_word) else -1
        if word_idx == -1 or word_idx in labelled:
            continue  # not inside a word, or a later piece of an already tagged word
        labelled.add(word_idx)
        word_labels[word_idx] = _entity_to_label(tok['entity'])
    return word_labels


def evaluate(nlp, dfo, df):
    """Tag every sentence of ``dfo`` and store the tags in ``df['labels']``.

    Args:
        nlp: callable applied to each space-joined sentence; it must return a
            list of dicts with at least 'entity' and 'word' (and 'start' when
            character offsets are available).
        dfo: frame with a 'data' column in which '_nan' separates sentences.
        df: frame with one row per word; it is modified in place by adding
            the 'labels' column.

    Returns:
        (df, indexes_nan) where indexes_nan[i] is the cumulative number of
        tags after sentence i, i.e. the row count after which a blank line
        belongs.

    Raises:
        IndexError: if fewer tags than rows of ``df`` were produced.
    """
    import warnings

    indexes_nan = []
    labels_list = []
    sents = separate_text_end(dfo)

    for sent in tqdm(sents):
        if sent:  # an empty sentence has nothing to tag, so skip the model call
            labels_list.extend(_word_labels(sent, nlp(' '.join(sent))))
        indexes_nan.append(len(labels_list))

    print(len(labels_list))

    n_rows = df.shape[0]
    if len(labels_list) < n_rows:
        raise IndexError(
            f"only {len(labels_list)} labels were produced for {n_rows} rows"
        )
    if len(labels_list) > n_rows:
        # Extra labels are still cut off, but no longer silently.
        warnings.warn(
            f"{len(labels_list)} labels were produced for {n_rows} rows; "
            "the extra labels are discarded and rows may be misaligned"
        )
    df['labels'] = labels_list[:n_rows]

    return df, indexes_nan

In [None]:
# model_name = 'dev'
# df, indexes_nan = evaluate(nlp, df_devo, df_dev)

# Despite its name, model_name holds the split tag that ends up in the result file name
model_name = 'test'
# NOTE(review): this rebinds `df`, which held the training frame earlier in the notebook;
# evaluate also adds a 'labels' column to df_test in place.
df, indexes_nan = evaluate(nlp, df_testo, df_test)

100%|██████████| 360/360 [01:12<00:00,  4.97it/s]

9444





In [None]:
print(indexes_nan)

[23, 38, 69, 93, 129, 149, 170, 194, 207, 240, 274, 284, 295, 309, 329, 351, 373, 403, 439, 453, 466, 498, 516, 545, 565, 612, 635, 656, 667, 681, 695, 707, 723, 739, 760, 773, 788, 808, 830, 858, 874, 896, 933, 964, 1008, 1024, 1051, 1057, 1072, 1093, 1115, 1131, 1154, 1188, 1213, 1241, 1260, 1271, 1287, 1309, 1326, 1339, 1369, 1392, 1442, 1473, 1500, 1538, 1555, 1562, 1583, 1592, 1628, 1674, 1696, 1723, 1751, 1761, 1774, 1789, 1814, 1840, 1855, 1892, 1928, 1946, 1990, 2011, 2045, 2067, 2082, 2133, 2160, 2187, 2218, 2267, 2310, 2323, 2341, 2355, 2402, 2435, 2453, 2481, 2492, 2535, 2550, 2572, 2609, 2654, 2694, 2719, 2769, 2794, 2812, 2827, 2849, 2878, 2892, 2911, 2920, 2938, 2966, 2982, 3020, 3039, 3092, 3109, 3127, 3144, 3156, 3182, 3229, 3284, 3295, 3308, 3331, 3347, 3364, 3384, 3437, 3468, 3517, 3554, 3592, 3633, 3678, 3721, 3751, 3797, 3814, 3830, 3855, 3890, 3931, 3977, 4006, 4025, 4055, 4088, 4130, 4147, 4184, 4197, 4235, 4260, 4276, 4296, 4318, 4342, 4365, 4386, 4406, 4439, 448

In [None]:
df.head(50)

Unnamed: 0,data,labels
0,plus,O
1,",",O
2,android,B-Object
3,is,O
4,developing,B-Aspect
5,a,O
6,way,O
7,faster,B-Predicate
8,than,O
9,ios,B-Object


In [None]:
# Write one "word<TAB>label" line per row, without header or index; quoting=3 is csv.QUOTE_NONE
df.to_csv('/content/drive/MyDrive/Colab Notebooks/NLP_project/results/'+dir+'_'+model_name+'.tsv',
          header=None, index=False, quoting=3, sep='\t', encoding='utf-8')

In [None]:
# Re-insert the blank lines between sentences: read the flat predictions file back
# and write a "_post" copy with an empty line after the last row of every sentence.
results_base = '/content/drive/MyDrive/Colab Notebooks/NLP_project/results/'+dir+'_'+model_name

# The file was written as UTF-8, so it is read (and the copy written) with the same
# encoding instead of the platform default.
with open(results_base+'.tsv', encoding='utf-8') as src:
    lines = [line for line in src if line.strip()]

# 1-based row counts after which a sentence ends; a set makes the lookup per line constant-time
sentence_ends = set(indexes_nan)

with open(results_base+'_post.tsv', 'w', encoding='utf-8') as dst:
    for i, line in enumerate(lines, start=1):
        dst.write(line)
        if i in sentence_ends:
            dst.write("\n")


print('A miracle happened ^-^/***')

A miracle happened ^-^/***
