In [None]:
!git clone https://github.com/OnlpLab/NEMO-Corpus.git

In [None]:
!pip install transformers

In [None]:
from transformers import BertModel, BertTokenizerFast
from transformers import AutoModelForSequenceClassification

# Hub checkpoint shared by the tokenizer and the classifier.
PRETRAINED_NAME = 'onlplab/alephbert-base'
# Width of the classification head. Presumably the number of distinct tags
# written to labels.txt further down -- TODO confirm it equals len(labels).
NUM_CLASSES = 131

tokenizer = BertTokenizerFast.from_pretrained(PRETRAINED_NAME)

model = AutoModelForSequenceClassification.from_pretrained(PRETRAINED_NAME, num_labels=NUM_CLASSES)
# Save the model as loaded, before any training done in this notebook.
model.save_pretrained("./initial_pretrained")

In [None]:
!ls -latr ./initial_pretrained

In [None]:
# !cat ./NEMO-Corpus/data/spmrl/gold/token-multi_gold_train.bmes | cut -d " " -f 1,3 > train_temp.txt
# !cat ./NEMO-Corpus/data/spmrl/gold/token-multi_gold_dev.bmes | cut -d " " -f 1,3 > dev_temp.txt
# !cat ./NEMO-Corpus/data/spmrl/gold/token-multi_gold_test.bmes | cut -d " " -f 1,3 > test_temp.txt

In [None]:
# Corpus description: display name, the three gold split files, and the list
# of tag classes (left empty here and filled in once labels.txt is read).
dataset = dict(
    name="NEMO Corpus",
    train_path="./NEMO-Corpus/data/spmrl/gold/token-multi_gold_train.bmes",
    dev_path="./NEMO-Corpus/data/spmrl/gold/token-multi_gold_dev.bmes",
    test_path="./NEMO-Corpus/data/spmrl/gold/token-multi_gold_test.bmes",
    classes=[],
)

In [None]:
# Build labels.txt: concatenate the train/dev/test files, keep the second
# space-separated field of every line, drop empty lines, and write the
# sorted, de-duplicated values (one per line).
!cat NEMO-Corpus/data/spmrl/gold/token-multi_gold_train.bmes NEMO-Corpus/data/spmrl/gold/token-multi_gold_dev.bmes NEMO-Corpus/data/spmrl/gold/token-multi_gold_test.bmes | cut -d " " -f 2 | grep -v "^$"| sort | uniq > labels.txt

In [None]:
# Read the tag inventory: one label per line of labels.txt, whitespace stripped.
with open('labels.txt', 'r') as file:
    labels = [raw_line.strip() for raw_line in file]

# Record the inventory on the corpus description as well.
dataset['classes'] = labels

print(labels)
print(len(labels))
# Peek at the label stored at position 61 of the list.
print(labels[61])

In [None]:
import pandas as pd

def read_data(paths=None):
    """Load the train, dev and test splits as DataFrames.

    Each file is parsed as space-separated text with two columns, named
    'word' and 'label'. Quote characters are treated as ordinary text and
    malformed rows are skipped with a warning instead of aborting the load.

    Args:
        paths: optional mapping with 'train_path', 'dev_path' and 'test_path'
            entries. Defaults to the notebook-level `dataset` dict.

    Returns:
        A (train, dev, test) tuple of pandas DataFrames.
    """
    if paths is None:
        paths = dataset

    shared_options = dict(
        sep=' ',
        engine='python',
        quoting=3,  # csv.QUOTE_NONE: keep quote characters as literal text
        encoding='utf-8',
        names=['word', 'label'],
    )

    def load_split(path):
        # `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0.
        # `on_bad_lines='warn'` matches the old error_bad_lines=False with
        # its default warn_bad_lines=True. Fall back for pandas < 1.3.
        try:
            return pd.read_csv(path, on_bad_lines='warn', **shared_options)
        except TypeError:
            return pd.read_csv(path, error_bad_lines=False, **shared_options)

    train = load_split(paths['train_path'])
    dev = load_split(paths['dev_path'])
    test = load_split(paths['test_path'])
    return train, dev, test
# Load the three splits; each is a DataFrame with 'word' and 'label' columns.
train, dev, test = read_data()
# train.to_csv("train_example.csv")

In [None]:
from sklearn import preprocessing

# Map every label string from labels.txt to an integer class id.
label_encoder = preprocessing.LabelEncoder().fit(labels)
label_encoder

In [None]:
# Sanity check: show which label string the encoder maps to class id 61
# (the id that the evaluation cell later excludes).
label_encoder.inverse_transform([61])

In [None]:
# Only the first N rows of each split are used.
N_TRAIN, N_DEV, N_TEST = 5000, 800, 1500

# Every row holds a single word, so each word is tokenized as its own
# sequence and paired with its own encoded label.
train_encodings = tokenizer(train["word"].to_list()[:N_TRAIN], truncation=True, padding=True)
train_labels = label_encoder.transform(train["label"].to_list()[:N_TRAIN])

dev_encodings = tokenizer(dev["word"].to_list()[:N_DEV], truncation=True, padding=True)
dev_labels = label_encoder.transform(dev["label"].to_list()[:N_DEV])

test_encodings = tokenizer(test["word"].to_list()[:N_TEST], truncation=True, padding=True)
test_labels = label_encoder.transform(test["label"].to_list()[:N_TEST])

print(dev_labels)

In [None]:
# Quick look at the prepared data: number of training sequences, then the
# encoded label arrays of each split.
print(len(train_encodings['input_ids']))
for encoded_split in (train_labels, test_labels, dev_labels):
    print(encoded_split)
# print(train["word"].to_list())

In [None]:
import torch

class HebrewNERDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with per-example labels.

    Args:
        encodings: mapping from field name (e.g. 'input_ids') to a sequence
            holding one entry per example, as returned by the tokenizer.
        labels: sequence of class ids, one per example. Optional: pass None
            (the default) for inference-only data, in which case items carry
            no 'labels' entry and the length is taken from the encodings.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors (plus 'labels' if known)."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples: len(labels) when given, else the encodings' size."""
        if self.labels is not None:
            return len(self.labels)
        return len(self.encodings['input_ids'])

# Wrap each split's encodings and encoded labels in a torch dataset.
train_dataset, dev_dataset, test_dataset = (
    HebrewNERDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (dev_encodings, dev_labels),
        (test_encodings, test_labels),
    )
)

In [None]:
# Inspect one training example (dict of tensors, including its label).
train_dataset[2]

In [None]:
!pip install wandb

In [None]:
import os

from transformers import Trainer,TrainingArguments

# Debugging aid: run CUDA kernels synchronously so device-side errors surface
# at the failing call. This has to be an environment variable -- a plain
# Python assignment never reaches the CUDA runtime. It slows training down,
# and it must be set before CUDA is initialised to have any effect.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=dev_dataset             # evaluation dataset
)

# Fine-tune, then write the resulting model to ./alephbert_ner.
trainer.train()
trainer.save_model("./alephbert_ner")

In [None]:
!ls -latr ./alephbert_ner/

In [None]:
import numpy as np

In [None]:
# trainer.predict returns a 3-tuple; its first element holds the raw
# per-class scores for every test example.
raw_pred = trainer.predict(test_dataset)[0]

# Reduce the scores to one predicted class id per example.
y_pred = np.argmax(raw_pred, axis=1)

In [None]:
# Display the predicted class ids.
y_pred

In [None]:
# Display the gold class ids the predictions are compared against.
test_dataset.labels

In [None]:
from sklearn.metrics import f1_score, recall_score, precision_score
# Accuracy: share of examples whose predicted class id equals the gold one.
count_equals = sum(1 for gold_id, predicted_id in zip(test_dataset.labels, y_pred) if gold_id == predicted_id)
print(f"accuracy={count_equals/len(y_pred)}")

def evaluate(y_test, predicted):
    """Print recall, precision and F1 under macro and micro averaging,
    followed by the weighted-average F1 (printed under the plain "F1" heading).

    Args:
        y_test: gold class ids.
        predicted: predicted class ids, aligned with `y_test`.
    """
    report = (
        ("Recall Macro: ", recall_score, 'macro'),
        ("Precision Macro: ", precision_score, 'macro'),
        ("F1 Macro: ", f1_score, 'macro'),
        ("Recall Micro: ", recall_score, 'micro'),
        ("Precision Micro: ", precision_score, 'micro'),
        ("F1 Micro: ", f1_score, 'micro'),
        ("F1: ", f1_score, 'weighted'),
    )
    for heading, scorer, averaging in report:
        print(heading + str(scorer(y_test, predicted, average=averaging)))
    
evaluate(test_dataset.labels, y_pred)

# Micro F1 with one class left out of the score.
# Class id 61 is presumably the encoded 'O' (non-entity) tag -- see the
# label_encoder.inverse_transform([61]) check earlier; TODO confirm.
# Dropping rows by their *predicted* class would discard every entity the
# model missed, so the class is excluded through `labels=` instead. That
# keeps false negatives in the score and cannot fail on an empty selection.
excluded_class_id = 61
observed_class_ids = np.union1d(test_dataset.labels, y_pred)
scored_class_ids = observed_class_ids[observed_class_ids != excluded_class_id]
if scored_class_ids.size:
    print("F1: " + str(f1_score(test_dataset.labels, y_pred, labels=scored_class_ids, average='micro')))
else:
    # Only the excluded class occurs in gold and predictions: nothing to score.
    print("F1: undefined (no class other than the excluded one is present)")

In [None]:
# Tag an ad-hoc sentence: every space-separated word is classified on its own.
test_sent_text = "אלברט איינשטיין נולד ב גרמניה ו גר ב שווייץ"
test_sent_words = test_sent_text.split(" ")
test_sent_encodings = tokenizer(test_sent_words, truncation=True, padding=True)
# The dataset takes its length from the labels, so supply exactly one dummy
# label per word; a shorter list silently drops the trailing words.
test_sent = HebrewNERDataset(test_sent_encodings, [0] * len(test_sent_words))
test_sent_pred, _, _ = trainer.predict(test_sent)
test_sent_pred = np.argmax(test_sent_pred, axis=1)
# Turn the predicted class ids back into label strings.
label_encoder.inverse_transform(test_sent_pred)

In [None]:
# Tag an ad-hoc sentence: every space-separated word is classified on its own.
# Alternative sentence that used to be assigned here and was overwritten on
# the very next line, so it was never tagged; kept for reference:
# test_s = "אלברט איינשטיין נולד בגרמניה בחודש מרץ בשנת 1879 והיה מדען מפורסם שזכה בפרס נובל"
test_s = "אלברט איינשטיין נולד ב גרמניה ו גר ב שווייץ"
test_s = test_s.split(" ")
test_s_tokenized = tokenizer(test_s, truncation=True, padding=True)
# One dummy label per word; the dataset takes its length from this list.
test_s_dataset = HebrewNERDataset(test_s_tokenized, [0] * len(test_s))
test_s_pred, _, _ = trainer.predict(test_s_dataset)
test_s_pred = np.argmax(test_s_pred, axis=1)
# Turn the predicted class ids back into label strings.
label_encoder.inverse_transform(test_s_pred)

In [None]:
# Tag an ad-hoc sentence: every space-separated word is classified on its own.
test_s = "אלברט איינשטיין זכה בפרס נובל"
test_s = test_s.split(" ")
test_s_tokenized = tokenizer(test_s, truncation=True, padding=True)
# One dummy label per word; the dataset takes its length from this list.
placeholder_labels = [0] * len(test_s)
test_s_dataset = HebrewNERDataset(test_s_tokenized, placeholder_labels)
# First element of the prediction tuple = raw per-class scores.
test_s_pred = trainer.predict(test_s_dataset)[0]
test_s_pred = np.argmax(test_s_pred, axis=1)
# Turn the predicted class ids back into label strings.
label_encoder.inverse_transform(test_s_pred)

In [None]:
# Tag an ad-hoc sentence: every space-separated word is classified on its own.
test_s = 'אברהם נדל יסד את חברת אגד ועבד כנהג אוטובוס בישראל'
test_s = test_s.split(" ")
test_s_tokenized = tokenizer(test_s, truncation=True, padding=True)
# One dummy label per word; the dataset takes its length from this list.
placeholder_labels = [0] * len(test_s)
test_s_dataset = HebrewNERDataset(test_s_tokenized, placeholder_labels)
# First element of the prediction tuple = raw per-class scores.
test_s_pred = trainer.predict(test_s_dataset)[0]
test_s_pred = np.argmax(test_s_pred, axis=1)
# Turn the predicted class ids back into label strings.
label_encoder.inverse_transform(test_s_pred)

In [None]:
# import shutil
# shutil.make_archive('kaggle_dir', 'zip', '/kaggle/working')

In [None]:
# from IPython.display import FileLink
# FileLink('./kaggle_dir.zip')