<a href="https://colab.research.google.com/github/ryderwishart/nlp-model-experiments/blob/main/Greek_BERT_one_word_ln.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Imports

In [None]:
# uncomment if you want to quickly delete unwanted files from previous model training
import shutil
#shutil.rmtree('/content/MyDrive/MyDrive/output')

Follow the general approach in this POS-tagging notebook: https://github.com/Kyubyong/nlp_made_easy/blob/master/Pos-tagging%20with%20Bert%20Fine-tuning.ipynb

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import unicodedata
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from transformers import TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import numpy as np
from collections import Counter
import json
import os

In [3]:
if 'macula-greek.tsv' not in [path for path in os.listdir()]:
    !wget -q 'https://raw.githubusercontent.com/Clear-Bible/macula-greek/main/Nestle1904/TSV/macula-greek.tsv'
if 'ln_int_dict.txt' not in [path for path in os.listdir()]:
    !wget -q 'https://raw.githubusercontent.com/ryderwishart/nlp-model-experiments/main/data/ln_int_dict.txt'
if 'lemma_ln_dict.txt' not in [path for path in os.listdir()]:
    !wget -q 'https://raw.githubusercontent.com/ryderwishart/nlp-model-experiments/main/data/lemma_ln_dict.txt'
if 'ln_pos_dict.txt' not in [path for path in os.listdir()]:
    !wget -q 'https://raw.githubusercontent.com/ryderwishart/nlp-model-experiments/main/data/ln_pos_dict.txt'

#Load in data and dictionaries

In [4]:
data ='macula-greek.tsv'
ln_int_dict = 'ln_int_dict.txt'
lemma_ln_dict = 'lemma_ln_dict.txt'
ln_pos_dict = 'ln_pos_dict.txt'

In [5]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [6]:
with open(ln_int_dict) as f:
    ln_int_dict = json.load(f)

with open(lemma_ln_dict, encoding="utf8") as f:
    lemma_ln_dict = json.load(f)

with open(ln_pos_dict) as f:
    ln_pos_dict = json.load(f)

In [7]:
# previous input factors: text, lemma
input = 'text'
# previous output factors: ln, type, role, pos
output = 'ln'

In [8]:
data = pd.read_csv(data, sep='\t', keep_default_na=False, encoding='utf-8')
data = data[[input,output]]
data.head()

Unnamed: 0,text,ln
0,Βίβλος,33.38
1,γενέσεως,10.24 33.19
2,Ἰησοῦ,93.169a
3,Χριστοῦ,93.387
4,υἱοῦ,10.30


In [9]:
lns = data['ln'].tolist()

for i in range(len(lns)):
  lns[i] = ln_int_dict[lns[i]]

data = data.drop('ln', axis=1)
data['ln'] = lns
data.head()

Unnamed: 0,text,ln
0,Βίβλος,0
1,γενέσεως,1
2,Ἰησοῦ,2
3,Χριστοῦ,3
4,υἱοῦ,4


# Quick stats

In [None]:
n_words = data['text'].nunique()
print(f'{n_words} unique lemma forms are present')

19477 unique lemma forms are present


In [None]:
sem_doms = data['ln'].nunique()
print(f'{sem_doms} unique semantic domains present')

7815 unique semantic domains present


In [None]:
len(data['text'])

137779

# BERT MODEL

In [None]:
''' 
other models tested include: 
nlpaueb/bert-base-greek-uncased-v1, 
pranaydeeps/Ancient-Greek-BERT, 
bert-base-multilingual-cased, 
bert-base-multilingual-uncased
'''
pretrained = "pranaydeeps/Ancient-Greek-BERT"

In [None]:
tokenizer = BertTokenizer.from_pretrained(pretrained)
model = BertForSequenceClassification.from_pretrained(pretrained,
                                                      num_labels = sem_doms)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at pranaydeeps/Ancient-Greek-BERT and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model = model.to('cuda')

In [None]:
#def strip_accents_and_lowercase(s):
#   return ''.join(c for c in unicodedata.normalize('NFD', s)
#                  if unicodedata.category(c) != 'Mn').lower()

In [None]:
# test a sample sentence to see if everything is working as it should
#sample_data = ['ἀρχὴ τοῦ εὐαγγελίου Ἰησοῦ Χριστοῦ υἱοῦ θεοῦ.']
#for sentence in range(len(sample_data)):
#  clean = strip_accents_and_lowercase(sample_data[sentence])
#  sample_data[sentence] = clean
#tokenizer(sample_data, padding = True, truncation = True, max_length = 512)

{'input_ids': [[101, 664, 346, 23115, 4326, 2371, 17006, 1210, 121, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [None]:
# testing what various tokens are
#tokenizer.convert_ids_to_tokens(664)

'αρχη'

In [None]:
X = list(data['text'])
y = list(data['ln'])
# 70/20/10 split for train, val, and test
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2) # 80/20
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.125) # 70/10
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)

In [None]:
print(f'{len(X_train)} items in the training set.')
print(f'{len(X_val)} items in the validation set.')
print(f'{len(X_test)} items in the test set.')

96445 items in the training set.
27556 items in the validation set.
13778 items in the test set.


In [None]:
 # Create dataset
 class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])


In [None]:
train_dataset = Dataset(X_train_tokenized, y_train)
val_dataset = Dataset(X_val_tokenized, y_val)

In [None]:
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, average='weighted')
    precision = precision_score(y_true=labels, y_pred=pred, average='weighted')
    f1 = f1_score(y_true=labels, y_pred=pred, average='weighted')

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [None]:
args = TrainingArguments(
    # Change if you want the checkpoints to go somewhere else
    output_dir="/content/MyDrive/MyDrive/output",
    evaluation_strategy="steps",
    eval_steps=500,
    per_device_train_batch_size=2048,
    per_device_eval_batch_size=2048,
    num_train_epochs=10,
    seed=0,
    load_best_model_at_end=True,
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)


In [None]:
trainer.train()



Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,No log,4.283328,0.372587,0.20953,0.372587,0.257747
200,No log,3.235488,0.495065,0.345954,0.495065,0.390494
300,No log,2.709043,0.558535,0.420499,0.558535,0.460937
400,No log,2.390668,0.588692,0.464412,0.588692,0.49751
500,3.466700,2.176357,0.612026,0.497292,0.612026,0.525919
600,3.466700,2.026451,0.627123,0.520664,0.627123,0.544055
700,3.466700,1.917531,0.632167,0.523542,0.632167,0.557657
800,3.466700,1.834535,0.638482,0.531619,0.638482,0.566418
900,3.466700,1.770055,0.64799,0.551958,0.64799,0.573305
1000,1.741200,1.717357,0.649223,0.550897,0.649223,0.580232


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

TrainOutput(global_step=1920, training_loss=2.004433250427246, metrics={'train_runtime': 1554.5678, 'train_samples_per_second': 2481.59, 'train_steps_per_second': 1.235, 'total_flos': 2.1215506698036e+16, 'train_loss': 2.004433250427246, 'epoch': 40.0})

#TEST

In [None]:
X_test[:5]

In [None]:
# true values
y_test[:5]

In [None]:
X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=512)
test_dataset = Dataset(X_test_tokenized)

In [None]:
# CHANGE MODEL PATH TO BEST PERFORMING MODEL
model_path = "/content/output/checkpoint-500"
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=sem_doms, ignore_mismatched_sizes=True)

In [None]:
test_trainer = Trainer(model)

In [None]:
raw_pred, _, _ = test_trainer.predict(test_dataset)

In [None]:
y_pred = np.argmax(raw_pred, axis=1)

In [None]:
predictions = y_pred.tolist()

In [None]:
predictions[:5]

#Split Up Data into Groups

Analyze performance between content words (common nouns, proper nouns, adjectives, verbs, adverbs) and function words (i.e. conjunctions and prepositions)


In [None]:
content = []
function = []


for i in range(len(y_test)):
  a = int_ln_dict[str(y_test[i])]
  b = int_ln_dict[str(predictions[i])]
  if ln_pos_dict[a] in ('common noun', 'proper noun', 'adjective', 'verb'):
    content.append(tuple((a, b)))
  else:
    function.append(tuple((a, b)))

Analyze performance between words seen in the training dataset and those only seen in the test dataset

In [None]:
in_train = []
out_train = []

for i in range(len(y_test)):
  a = int_ln_dict[str(y_test[i])]
  b = int_ln_dict[str(predictions[i])]
  if (y_test[i] in y_train) or (y_test[i] in y_val):
    in_train.append(tuple((a, b)))
  else:
    out_train.append(tuple((a, b)))

In [None]:
# adverbs make my brain hurt, so they're just going to be "function" words
pos_list = ['common noun', 'proper noun', 'adjective', 'verb', 'adverb', 
            'determiner', 'conjunction', 'preposition',
              'pronoun', 'particle', 'number', 'interjection']

Analyze performance between ambiguous (lemma occurs with multiple ln domains) and unambiguous (lemma occurs with single ln domain) words

In [None]:
ambiguous_ln = []
ambiguous_words = []
unambiguous_ln = []

for i in range(len(X_test)):
  a = int_ln_dict[str(y_test[i])]
  b = int_ln_dict[str(predictions[i])]
  domains = lemma_ln_dict[X_test[i]]
  if len(domains) > 1:
    ambiguous_ln.append(tuple((a, b)))
    ambiguous_words.append(X_test[i])
  else:
    unambiguous_ln.append(tuple((a, b)))

# Functions

In [None]:
def generate_incorrect_lists(set_list: list) -> list:
  incorrect_list = []

  for i in set_list:
    if i[0] != i[1]:
      incorrect_list.append(i)

  return incorrect_list

In [None]:
def generate_incorrect_ambiguity_list(set_list: list, ambiguous_words: list) -> list:
  incorrect_list = []
  corresponding_domains = []

  for i in range(len(set_list)):
    if set_list[i][0] != set_list[i][1]:
      incorrect_list.append(set_list[i])
      corresponding_domains.append(lemma_ln_dict[ambiguous_words[i]])

  return incorrect_list, corresponding_domains

In [None]:
def count_all_wrong(wrong_list: list, total: list):
  wrong_count = len(wrong_list)
  all_count = len(total)
  accuracy = wrong_count/all_count

  print(f'{wrong_count} domains incorrectly identified out of {all_count}.')
  print(f'{round(accuracy, 4)*100}% of domains incorrectly identified')

In [None]:
def count_wrong_ln_by_pos(pos_list: list, wrong_list: list) -> dict:
  key_list = pos_list
  wrong_pos_counts = {k:0 for k in key_list}
  
  for i in wrong_list:
    pos = ln_pos_dict[i[0][0]]
    wrong_pos_counts[pos] += i[1]
  
  return wrong_pos_counts

In [None]:
def count_all_pos(pos_list: list, set_list: list) -> dict:
  key_list = pos_list
  all_pos_counts = {k:0 for k in key_list}

  try:
    for i in set_list:
      all_pos_counts[ln_pos_dict[i[0]]] += 1
  except:
    for i in set_list:
      all_pos_counts[ln_pos_dict[int_ln_dict[str(i)]]] += 1
    
  return all_pos_counts

In [None]:
def create_pos_stats(wrong_pos_counts: dict, all_pos_counts: dict) -> list:   
  stats = []
  for i in wrong_pos_counts:
    if all_pos_counts[i] == 0:
          stats.append('{} out of {} {}s incorrect (0.0)'
              .format(wrong_pos_counts[i], all_pos_counts[i], i))
    else:
        stats.append('{} out of {} {}s incorrect ({})'
              .format(wrong_pos_counts[i], all_pos_counts[i], i, 
                      round((wrong_pos_counts[i]/all_pos_counts[i]), 4)))

  return stats

In [None]:
def return_relevant_stats(pos_type: str, stats: list):
  pos_type = pos_type.lower()
  # if pos_list changes, adjust where list is split
  pos_dict = {
      'content': stats[:4],
      'function': stats[4:],
      'both': stats
  }

  if pos_type in pos_dict:
    for i in pos_dict[pos_type]:
      print(i)
  else:
    print('Please enter "content", "function", or "both" for pos_type.')

#Content word analysis

In [None]:
incorrect_content = generate_incorrect_lists(content)
incorrect_content[:5]

In [None]:
count_all_wrong(incorrect_content, content)

In [None]:
cont_counts = Counter(incorrect_content).most_common()
cont_counts[:5]

In [None]:
cont_wrong = count_wrong_ln_by_pos(pos_list, cont_counts)
cont_all = count_all_pos(pos_list, y_test)
cont_stats = create_pos_stats(cont_wrong, cont_all)
return_relevant_stats('content', cont_stats)

#Function Word Analysis

In [None]:
incorrect_function = generate_incorrect_lists(function)
incorrect_function[:5]

In [None]:
count_all_wrong(incorrect_function, function)

In [None]:
func_counts = Counter(incorrect_function).most_common()
func_counts[:5]

In [None]:
func_wrong = count_wrong_ln_by_pos(pos_list, func_counts)
func_all = count_all_pos(pos_list, y_test)
func_stats = create_pos_stats(func_wrong, func_all)
return_relevant_stats('function', func_stats)

# In Training Analysis

In [None]:
in_train[:5]

In [None]:
incorrect_in = generate_incorrect_lists(in_train)
count_all_wrong(incorrect_in, in_train)

In [None]:
in_counts = Counter(incorrect_in).most_common()
in_counts[:5]

In [None]:
in_wrong = count_wrong_ln_by_pos(pos_list, in_counts)
in_all = count_all_pos(pos_list, in_train)
in_stats = create_pos_stats(in_wrong, in_all)
return_relevant_stats('both', in_stats)

# Out Of Training Analysis

In [None]:
out_train[:5]

In [None]:
incorrect_out = generate_incorrect_lists(out_train)
count_all_wrong(incorrect_out, out_train)

In [None]:
out_counts = Counter(incorrect_out).most_common()
out_counts[:5]

In [None]:
out_wrong = count_wrong_ln_by_pos(pos_list, out_counts)
out_all = count_all_pos(pos_list, out_train)
out_stats = create_pos_stats(out_wrong, out_all)
return_relevant_stats('both', out_stats)

#Ambiguous Words

In [None]:
ambiguous_ln[:5]

In [None]:
incorrect_amb, amb_ln = generate_incorrect_ambiguity_list(ambiguous_ln, ambiguous_words)
count_all_wrong(incorrect_amb, ambiguous_ln)

In [None]:
amb_counts = Counter(incorrect_amb).most_common()
amb_counts[:5]

In [None]:
amb_wrong = count_wrong_ln_by_pos(pos_list, amb_counts)
amb_all = count_all_pos(pos_list, ambiguous_ln)
amb_stats = create_pos_stats(amb_wrong, amb_all)
return_relevant_stats('both', amb_stats)

Find the instances of these ln domains and see if the model is simply guessing the most common domain for these ambigous words.

In [None]:
common_counts = 0
not_common_counts = 0
for i in amb_counts:
  j = incorrect_amb.index(i[0])
  current_dict = amb_ln[j]
  most_common = max(current_dict, key = current_dict.get)
  if most_common == i[0][1]:
    common_counts += i[1]
  else:
    not_common_counts += i[1]

print(f'model incorrectly guessed the most common domain {common_counts} times')
print(f'model incorrectly guessed something besides the most common domain {not_common_counts} times')

#Unambiguous Words

In [None]:
unambiguous_ln[:5]

In [None]:
incorrect_unamb = generate_incorrect_lists(unambiguous_ln)
count_all_wrong(incorrect_unamb, unambiguous_ln)

In [None]:
unamb_counts = Counter(incorrect_unamb).most_common()
unamb_counts[:5]

In [None]:
unamb_wrong = count_wrong_ln_by_pos(pos_list, unamb_counts)
unamb_all = count_all_pos(pos_list, unambiguous_ln)
unamb_stats = create_pos_stats(unamb_wrong, unamb_all)
return_relevant_stats('both', unamb_stats)

#Unfiltered Test Analsis

In [None]:
mismatches = []

for i in range(len(y_test)):
  if y_test[i] != predictions[i]:
    a = int_ln_dict[str(y_test[i])]
    b = int_ln_dict[str(predictions[i])]
    mismatches.append(tuple((a, b)))

mismatches[:5]

In [None]:
all_wrong = generate_incorrect_lists(mismatches)
count_all_wrong(all_wrong, y_test)

In [None]:
all_counts = Counter(mismatches).most_common()
all_counts[:5]

In [None]:
wrong_counts = count_wrong_ln_by_pos(pos_list, all_counts)
all_counts = count_all_pos(pos_list, y_test)
all_stats = create_pos_stats(wrong_counts, all_counts)
return_relevant_stats('both', all_stats)