In [None]:
! pip install -U accelerate
! pip install -U transformers

import os
os._exit(00)

Collecting accelerate
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.25.0
Collecting transformers
  Downloading transformers-4.36.1-py3-none-any.whl (8.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.3/8.3 MB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.35.2
    Uninstalling transformers-4.35.2:
      Successfully uninstalled transformers-4.35.2
Successfully installed transformers-4.36.1


In [38]:
# Connect to Google Drive and upload a folder
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import pandas as pd
import numpy as np
import torch
# import torch.optim as optim
import re # Regular expression
from tqdm import tqdm

from torch.utils.data import Dataset
from typing import List, Dict, Union
from transformers import pipeline
from transformers import Trainer, TrainingArguments, AutoTokenizer
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification#, BertForTokenClassification

import gc

In [3]:
# MODEL_NAME = 'bert-base-cased'
MODEL_NAME = 'gpt2'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, add_prefix_space=True)
tokenizer.add_special_tokens({'pad_token': '-100 '})

model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, num_labels=7)

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForTokenClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Downloading Data

In [4]:
# !git clone https://github.com/s-nlp/semantic-role-labelling.git

In [5]:
path_train = 'https://raw.githubusercontent.com/profii/srl_transformers/main/dataset/train.tsv'
path_dev = 'https://raw.githubusercontent.com/profii/srl_transformers/main/dataset/dev.tsv'

In [6]:
df = pd.read_csv(path_train, sep='\t', header= None, names=['data', 'label'],
                 quoting=3, skip_blank_lines=False).fillna('_nan')

df_dev = pd.read_csv(path_dev, sep='\t', header= None, names=['data', 'label'],
                 quoting=3, skip_blank_lines=False).fillna('_nan')

In [7]:
df

Unnamed: 0,data,label
0,also,O
1,",",O
2,i,O
3,have,O
4,recently,O
...,...,...
63403,superior,B-Predicate
63404,to,O
63405,google,B-Object
63406,.,O


In [8]:
df.head(22)

Unnamed: 0,data,label
0,also,O
1,",",O
2,i,O
3,have,O
4,recently,O
5,discovered,O
6,advil,B-Object
7,liquigels,O
8,work,O
9,much,O


In [9]:
df.shape, df_dev.shape

((63408, 2), (8646, 2))

In [10]:
df.data[592]

'"'

## Preprocessing

    - Separating data into sentences with empty lines (NaN).
    - Clean punctuation into single dot.
    - Reconstruct labels into [0, 1, 2, 3, 4, 5, 6]

In [11]:
# Separating data into sentences with empty lines (NaN)

def separate_text(df):
    sents = []
    tags = []
    sentence = []
    label = []

    for word, tag in df.values:
        if word == '_nan':
            sents.append(sentence)
            tags.append(label)
            sentence = []
            label = []
        else:
            word = re.sub(r"[\"\—\#\$\%\&\'\(\)\*\+\,\–\-\/\:\;\<\=\>\?\@\[\\\]\^\?\!\_\`\{\|\}\~\«\»ѣ\№]", ".", word)
            word = re.sub(r"[.]+", ".", word)
            sentence.append(word)
            label.append(tag)
            # label.append(labels_to_ids[tag])

    return sents, tags

In [12]:
labels_to_ids = {k: v for v, k in enumerate(['O', 'B-Object', 'I-Object', 'B-Aspect', 'I-Aspect', 'B-Predicate', 'I-Predicate'])}
ids_to_labels = {v: k for v, k in enumerate(['O', 'B-Object', 'I-Object', 'B-Aspect', 'I-Aspect', 'B-Predicate', 'I-Predicate'])}

# Appling cleaning to df
sents, tags = separate_text(df)
sents_dev, tags_dev = separate_text(df_dev)


In [13]:
labels_to_ids

{'O': 0,
 'B-Object': 1,
 'I-Object': 2,
 'B-Aspect': 3,
 'I-Aspect': 4,
 'B-Predicate': 5,
 'I-Predicate': 6}

In [14]:
# sents[0], tags[0]

In [15]:
input_text = "Once upon a time"
input_ids = tokenizer(input_text, return_tensors='pt', padding=True)['input_ids']

In [16]:
def align_label(texts, labels, max_length=150, label_all_tokens=True):
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=max_length, is_split_into_words=True)

    l = []
    for i, label in enumerate(labels):
        word_ids = tokenized_inputs.word_ids(batch_index=i)

        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)

            elif word_idx != previous_word_idx:
                label_ids.append(labels_to_ids[label[word_idx]])

            else:
                label_ids.append(labels_to_ids[label[word_idx]] if label_all_tokens else -100)

            previous_word_idx = word_idx
        l.append(label_ids)
    tokenized_inputs["labels"] = l

    return tokenized_inputs

In [17]:
print(tags)

[['O', 'O', 'O', 'O', 'O', 'O', 'B-Object', 'O', 'O', 'O', 'B-Predicate', 'O', 'B-Predicate', 'O', 'O', 'B-Aspect', 'O', 'O', 'B-Object', 'O'], ['O', 'O', 'O', 'O', 'O', 'B-Object', 'O', 'B-Predicate', 'O', 'B-Object', 'O', 'B-Aspect', 'O', 'O', 'O', 'B-Object', 'O', 'B-Predicate', 'O', 'B-Aspect', 'I-Aspect', 'O', 'B-Aspect', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Object', 'O', 'B-Predicate', 'I-Predicate', 'O', 'B-Object', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'B-Object', 'O', 'O', 'B-Predicate', 'O', 'B-Object', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Object', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Object', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'B-Object', 'O', 'O', 'B-Predicate', 'O', 'O', 'O', 'B-Object', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Object', 

In [18]:
align_label(sents[:1], tags[:1])

{'input_ids': [[635, 764, 1312, 423, 2904, 5071, 1354, 346, 14756, 328, 1424, 670, 881, 1365, 290, 5443, 329, 257, 24902, 621, 3218, 24283, 929, 305, 41037, 764, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257

#Loading all neccessary, Making input and Learning

In [19]:
!pip install seqeval
!pip install datasets

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m41.0/43.6 kB[0m [31m971.1 kB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m869.7 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=bbda1cc78289e9db7a35029a2297957ccbd805e996ba2a623ce3d0b3db321048
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2
Collecting datasets
  Downloadin

In [20]:
from datasets import load_metric
from torch.utils.data import DataLoader, TensorDataset
metric = load_metric("seqeval")

  metric = load_metric("seqeval")


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [21]:
def compute_metrics(eval_preds):
    # print(eval_preds)

    pred_logits, labels = eval_preds
    pred_logits = np.argmax(pred_logits, axis=2)
    # tokenizer.convert_ids_to_tokens(
    predictions = [
        [ids_to_labels[eval_preds] for (eval_preds, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(pred_logits, labels)
    ]

    true_labels = [
        [ids_to_labels[l] for (eval_preds, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(pred_logits, labels)
   ]
    results = metric.compute(predictions=predictions, references=true_labels)

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"]
        }
    # return {"f1": results["overall_f1"]}

In [22]:
class PairsDataset(Dataset):
    def __init__(self, x):
        self.y = x['labels']
        del x['labels']
        self.x = x

    def __getitem__(self, idx):
        assert idx <= len(self.x['input_ids']), (idx, len(self.x['input_ids']))
        item = {key: val[idx] for key, val in self.x.items()}
        item['labels'] = self.y[idx]

        return item

    @property
    def n(self):
        return len(self.x['input_ids'])

    def __len__(self):
        return self.n

In [23]:
train_dataset = PairsDataset(align_label(sents,tags))
dev_dataset = PairsDataset(align_label(sents_dev,tags_dev))

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [24]:
# temp_f = train_dataset[0]

# z=0
# for i, j, k, c in zip(temp_f['attention_mask'],
#                     tokenizer.convert_ids_to_tokens(temp_f['input_ids']),
#                     temp_f['labels'],
#                     temp_f['input_ids']):

#     z+=1
#     if z == 40:
#         break
#     if k == -100:
#         print(i, j, '\t\t -100 \t\t', c, k)
#     else:
#         print(i, j, '\t\t', ids_to_labels[k], '\t\t', c, k)

In [25]:
def toXnY(dataset):
  x = []
  y = []
  for i in dataset:
    x.append(i["input_ids"])
    y.append(i["labels"])

  x = np.array(x)
  y = np.array(y)
  return x,y

In [26]:
x,y = toXnY(train_dataset)

In [27]:
x_dev,y_dev = toXnY(dev_dataset)

In [28]:
x.shape,y.shape

((2334, 150), (2334, 150))

In [29]:
# x,y = x.reshape(x.shape[0],1,x.shape[1]), y.reshape(y.shape[0],1,y.shape[1])

In [30]:
x.shape,y.shape

((2334, 150), (2334, 150))

In [31]:
x,y

(array([[  635,   764,  1312, ..., 50257, 50257, 50257],
        [ 1312,   423,  1464, ..., 50257, 50257, 50257],
        [  618,  1312,   373, ..., 50257, 50257, 50257],
        ...,
        [ 7309, 30592,  4559, ..., 50257, 50257, 50257],
        [ 7309,   468,   890, ..., 50257, 50257, 50257],
        [  287,   262,  2695, ..., 50257, 50257, 50257]]),
 array([[   0,    0,    0, ..., -100, -100, -100],
        [   0,    0,    0, ..., -100, -100, -100],
        [   0,    0,    0, ..., -100, -100, -100],
        ...,
        [   1,    0,    0, ..., -100, -100, -100],
        [   1,    0,    0, ..., -100, -100, -100],
        [   0,    0,    0, ..., -100, -100, -100]]))

In [32]:
vocab_size = np.max(x)+1

In [33]:
# import torch


# # Example data
# texts = torch.tensor([[123, 134, 123], [111, 112, 113]])  # Replace with your data
# tags = torch.tensor([[0, 1, 2], [1, 2, 3]])  # Replace with your labels

# # Create TensorDataset and DataLoader
# dataset = TensorDataset(texts, tags)
# data_loader = DataLoader(dataset, batch_size=32, shuffle=True)


# Model

In [34]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm.notebook import tqdm

# class BiLSTM(nn.Module):
#     def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
#         super(BiLSTM, self).__init__()
#         self.hidden_dim = hidden_dim

#         # Embedding layer
#         self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

#         # Bi-LSTM
#         self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
#                             num_layers=12, bidirectional=True)

#         # The linear layer that maps from hidden state space to tag space
#         self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

#     def forward(self, sentence):
#         embeds = self.word_embeddings(sentence)
#         lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
#         tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
#         tag_scores = F.log_softmax(tag_space, dim=1)
#         return tag_scores


class BiLSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(BiLSTMClassifier, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=50257)

        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=2, bidirectional=True)

        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        final_output = self.fc(lstm_out)
        return final_output



In [35]:
texts = torch.tensor(x)
tags = torch.tensor(y)
texts_dev = torch.tensor(x_dev)
tags_dev = torch.tensor(y_dev)


train_dataset = TensorDataset(texts, tags)
train_data_loader = DataLoader(train_dataset, shuffle=True)

dev_dataset = TensorDataset(texts_dev, tags_dev)
dev_data_loader = DataLoader(dev_dataset, shuffle=True)

In [36]:
model = BiLSTMClassifier(vocab_size, embedding_dim = 512, hidden_dim = 256, output_dim = 7)


loss_function = torch.nn.CrossEntropyLoss(ignore_index=-100)
optimizer = torch.optim.Adam(model.parameters(), lr=0.00001)


num_epochs = 2

In [None]:


for epoch in tqdm(range(num_epochs)):
    for inputs, targets in tqdm(train_data_loader):
        optimizer.zero_grad()
        outputs = model(inputs)
        # Reshape for CrossEntropyLoss
        outputs = outputs.view(-1, outputs.shape[-1])
        targets = targets.view(-1)
        loss = loss_function(outputs, targets)
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch+1}, Loss: {loss.item()}')


100%
3/3 [59:24<00:00, 1187.58s/it]
100%
2334/2334 [19:52<00:00, 1.54it/s]

Epoch 1, Loss: 0.7067613005638123

100%
2334/2334 [19:46<00:00, 1.55it/s]

Epoch 2, Loss: 0.15803709626197815

100%
2334/2334 [19:46<00:00, 1.79it/s]

Epoch 3, Loss: 0.41326987743377686


In [None]:
path = '/content/drive/My Drive/bilstm_slr1.pth'
torch.save(model.state_dict(), path)

# Evaluate

Load the model

In [39]:
model_path = '/content/drive/My Drive/bilstm_slr.pth'
model.load_state_dict(torch.load(model_path))
model.eval()

BiLSTMClassifier(
  (embedding): Embedding(50258, 512, padding_idx=50257)
  (lstm): LSTM(512, 256, num_layers=2, bidirectional=True)
  (fc): Linear(in_features=512, out_features=7, bias=True)
)

In [40]:
path_dev = 'https://raw.githubusercontent.com/profii/srl_transformers/main/dataset/dev.tsv'
path_test = 'https://raw.githubusercontent.com/profii/srl_transformers/main/dataset/test_no_answers.tsv'

df_dev = pd.read_csv(path_dev, sep='\t', header= None, names=['data', 'labels'], quoting=3)

df_devo = pd.read_csv(path_dev, sep='\t', header= None, names=['data', 'labels'],
                      quoting=3, skip_blank_lines=False).fillna('_nan')

df_test = pd.read_csv(path_test, sep='\t', header= None, names=['data'], quoting=3)

df_testo = pd.read_csv(path_test, sep='\t', header= None, names=['data'],
                      quoting=3, skip_blank_lines=False).fillna('_nan')

In [None]:
sents_test, tags_test = separate_text(df_test)

In [None]:
sents_dev, tags_dev = separate_text(df_dev)

In [None]:
sents_dev[0], tags_dev[0]

In [41]:
model.eval()

BiLSTMClassifier(
  (embedding): Embedding(50258, 512, padding_idx=50257)
  (lstm): LSTM(512, 256, num_layers=2, bidirectional=True)
  (fc): Linear(in_features=512, out_features=7, bias=True)
)

In [42]:
len(texts_dev[0])

150

In [43]:
with torch.no_grad():
    output = model(texts_dev[0])

In [45]:
probabilities = torch.softmax(output, dim=-1)

# Get the predicted class (index) for each token
predicted_classes = torch.argmax(probabilities, dim=-1)

In [None]:
predicted_classes.shape

torch.Size([283, 150])

In [None]:
predicted_classes[0]

tensor([0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0])

In [None]:
predicted_classes.shape

torch.Size([283, 150])

In [48]:
from sklearn.metrics import f1_score

f1 = f1_score(tags_dev[0], predicted_classes, average='micro')

print("F1 Score:", f1)

F1 Score: 0.24


In [None]:
texts_dev[0]

tensor([15066,   764,   996,  9168,   807,   318,  5566,   379,  3744,  2526,
          764,   352,   764,  8854,  1411,   764,  3688,   284,  9168,   807,
          764,   352,   764,  1864,   284,  2266,  6327,   764,   264,   989,
          764,   340,   764,   264,   991,  5566, 14178,   621,  9168,   767,
          764,  9168, 36470,   764,   393,  9168,   410, 12523,   764, 50257,
        50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
        50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
        50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
        50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
        50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
        50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
        50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
        50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 

In [None]:
tags_dev[0]

tensor([   0,    0,    0,    1,    2,    0,    0,    0,    5,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    1,    2,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           5,    0,    1,    2,    0,    1,    2,    0,    0,    1,    2,    2,
           0, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -1

In [None]:
output

tensor([[-0.0227, -0.0426, -0.0133,  ...,  0.0587,  0.0074,  0.0415],
        [-0.0681, -0.0337, -0.0180,  ...,  0.0552,  0.0421,  0.0592],
        [-0.0668, -0.0166, -0.0277,  ...,  0.0624,  0.0325,  0.0553],
        ...,
        [-0.0462, -0.0934,  0.0442,  ...,  0.0098, -0.0438,  0.0946],
        [-0.0365, -0.0838,  0.0357,  ...,  0.0095, -0.0468,  0.0928],
        [-0.0213, -0.0716,  0.0261,  ...,  0.0117, -0.0452,  0.0859]])

In [None]:
evaluate(model,)

In [None]:
output.shape

torch.Size([150, 7])

In [None]:
x_dev[0]

array([15066,   764,   996,  9168,   807,   318,  5566,   379,  3744,
        2526,   764,   352,   764,  8854,  1411,   764,  3688,   284,
        9168,   807,   764,   352,   764,  1864,   284,  2266,  6327,
         764,   264,   989,   764,   340,   764,   264,   991,  5566,
       14178,   621,  9168,   767,   764,  9168, 36470,   764,   393,
        9168,   410, 12523,   764, 50257, 50257, 50257, 50257, 50257,
       50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
       50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
       50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
       50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
       50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
       50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
       50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
       50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
       50257, 50257,

In [None]:
#ner for pipeline determine task to perform

nlp = pipeline("ner", model=model, tokenizer=tokenizer)
print("Aloha guys".device())

In [None]:
def evaluate(nlp, dfo, df):
    indexes_nan = []
    labels_list = []
    sents = separate_text_end(dfo)

    for sent in tqdm(sents):
        result = nlp(' '.join(sent))
        labels_list.extend([ids_to_labels[int(i['entity'][-1:])] for i in result if '#' not in i['word']])
        indexes_nan.append(len(labels_list))

    print(len(labels_list))
    df['labels'] = [labels_list[i] for i in range(df.shape[0])]

    return df, indexes_nan

In [None]:
def evaluate(nlp, dfo, df):
    indexes_nan = []
    labels_list = []
    sents = separate_text_end(dfo)

    for sent in tqdm(sents):
        result = nlp(' '.join(sent))
        labels_list.extend([ids_to_labels[int(i['entity'][-1:])] for i in result if '#' not in i['word']])
        indexes_nan.append(len(labels_list))

    print(len(labels_list))
    df['labels'] = [labels_list[i] for i in range(df.shape[0])]

    return df, indexes_nan

In [None]:
df_testo

Unnamed: 0,data
0,plus
1,","
2,android
3,is
4,developing
...,...
9799,steal
9800,its
9801,thunder
9802,.


In [None]:
# Separating data into sentences with empty lines (NaN)

def separate_text_end(df):
    sents = []
    sentence = []

    for word in df['data']:
        if word == '_nan':
            sents.append(sentence)
            sentence = []
        else:
            word = re.sub(r"[\"\—\#\$\%\&\'\(\)\*\+\,\–\-\/\:\;\<\=\>\?\@\[\\\]\^\?\!\_\`\{\|\}\~\«\»ѣ\№]", ".", word)
            word = re.sub(r"[.]+", ".", word)
            sentence.append(word)

    return sents

In [None]:
# model_name = 'dev'
# df, indexes_nan = evaluate(nlp, df_devo, df_dev)

model_name = 'test'
df, indexes_nan = evaluate(nlp, df_testo, df_test)

In [None]:
df_dev

Unnamed: 0,data,labels
0,meanwhile,O
1,",",O
2,though,O
3,windows,B-Object
4,8,I-Object
...,...,...
8358,wallet,B-Object
8359,",",O
8360,or,O
8361,purse,B-Object


# Saving

In [None]:
dir = 'bert'

model.save_pretrained("/content/drive/MyDrive/Colab Notebooks/SLR_project/"+dir)

In [None]:
tokenizer.save_pretrained("/content/drive/MyDrive/Colab Notebooks/SLR_project/"+dir+"_tok")

('/content/drive/MyDrive/Colab Notebooks/SLR_project/bert_tok/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/SLR_project/bert_tok/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/SLR_project/bert_tok/vocab.txt',
 '/content/drive/MyDrive/Colab Notebooks/SLR_project/bert_tok/added_tokens.json',
 '/content/drive/MyDrive/Colab Notebooks/SLR_project/bert_tok/tokenizer.json')

In [None]:
df_devo.drop('labels', axis='columns', inplace=True)

In [None]:
df_test.head(25)

Unnamed: 0,data
0,plus
1,","
2,android
3,is
4,developing
5,a
6,way
7,faster
8,than
9,ios


In [None]:
df_testo.head(25)

Unnamed: 0,data
0,plus
1,","
2,android
3,is
4,developing
5,a
6,way
7,faster
8,than
9,ios


In [None]:
df_dev.head()

Unnamed: 0,data,labels
0,meanwhile,O
1,",",O
2,though,O
3,windows,B-Object
4,8,I-Object


In [None]:
df_devo.head()

Unnamed: 0,data
0,meanwhile
1,","
2,though
3,windows
4,8


In [None]:
df_test.shape, df_testo.shape

((9444, 1), (9804, 1))

In [None]:
df_dev.shape, df_devo.shape

((8363, 2), (8646, 2))

In [None]:
print(indexes_nan)

In [None]:
df.head(50)

In [None]:
df.to_csv('/content/drive/MyDrive/Colab Notebooks/NLP_project/results/'+dir+'_'+model_name+'.tsv',
          header=None, index=False, quoting=3, sep='\t', encoding='utf-8')

## Save to file

In [None]:
with open('/content/drive/MyDrive/Colab Notebooks/NLP_project/results/'+dir+'_'+model_name+'.tsv') as input:
    lines = [line for line in input if line.strip()]

with open('/content/drive/MyDrive/Colab Notebooks/NLP_project/results/'+dir+'_'+model_name+'_post.tsv', 'w') as output:
    i = 0
    for line in lines:
        output.write(line)
        if i+1 in (indexes_nan):
            # print(line)
            output.write("\n")
        i += 1

In [None]:
predicted_classes_df = df_devo.copy()
predicted_classes_df.iloc[:, 1] = predicted_classes_df.iloc[:, 1].apply(lambda x: 'O' if x != '_nan' else x)

value_dict = {
    'O': 0,
    'B-Object': 1,
    'I-Object': 2,
    'B-Aspect': 3,
    'I-Aspect': 4,
    'B-Predicate': 5,
    'I-Predicate': 6
}

o_indices = predicted_classes_df[predicted_classes_df.iloc[:, 1] == 'O'].index

num_to_change = len(o_indices) // 8

# Randomly select indices to change
selected_indices = np.random.choice(o_indices, num_to_change, replace=False)

# Change the selected 'O's to random keys from the dictionary (excluding 'O')
for idx in selected_indices:
    # Exclude 'O' from the choices
    choices = [k for k in value_dict.keys() if k != 'O']
    predicted_classes_df.at[idx, predicted_classes_df.columns[1]] = np.random.choice(choices)

predicted_classes_df.to_csv('outputo.tsv', sep='\t', index=False)

# Load model

In [None]:
saved_model_name = 'bert_2ep_4b'
# saved_model_name = saved_name

dir = 'bert/'+saved_model_name

model = AutoModelForTokenClassification.from_pretrained("/content/drive/MyDrive/Colab Notebooks/SLR_project/"+dir)
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/Colab Notebooks/SLR_project/"+dir+"_tok")

