<a href="https://colab.research.google.com/github/ryderwishart/biblical-machine-learning/blob/main/ancient_greek_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Adapted from: https://huggingface.co/docs/transformers/tasks/sequence_classification

In [1]:
# uncomment if you want to quickly delete unwanted files from previous model training
# import shutil
# shutil.rmtree('ag_text_classification')

In [2]:
pip install transformers evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
from huggingface_hub import notebook_login
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoTokenizer, pipeline
from datasets import load_dataset

In [4]:
data = load_dataset('ryderwishart/nt-move-features')
data



  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 7943
    })
    eval: Dataset({
        features: ['text', 'label'],
        num_rows: 993
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 993
    })
})

In [5]:
data['eval'][0]

{'text': 'καρδίᾳ γὰρ πιστεύεται εἰς δικαιοσύνην ',
 'label': ['speech_act',
  '#predicating',
  'circumstance',
  'subjective_participation_type_tbd',
  '$process',
  'circumstance',
  'alertness',
  'objectifying',
  '#externalization',
  'centrality_tbd',
  'development_marker',
  'supporting']}

In [6]:
# notebook_login()

In [7]:
pretrained = 'pranaydeeps/Ancient-Greek-BERT'
tokenizer = AutoTokenizer.from_pretrained(pretrained)

In [8]:
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True)

In [9]:
tokenized_greek = data.map(preprocess_function, batched=True)



Map:   0%|          | 0/993 [00:00<?, ? examples/s]



In [10]:
tokenized_greek['train'][0]

{'text': 'Βίβλος γενέσεως Ἰησοῦ χριστοῦ υἱοῦ Δαυὶδ υἱοῦ Ἀβραάμ ',
 'label': ['speech_act',
  '#predicating',
  'synchronic',
  'categorized',
  '$category',
  'elaborated',
  'direction_tbd',
  'interactant_indication_tbd'],
 'input_ids': [101,
  14904,
  278,
  2411,
  2781,
  2914,
  4326,
  2371,
  17006,
  27298,
  17006,
  19297,
  102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [11]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [12]:
accuracy = evaluate.load('accuracy')

In [13]:
all_ls = [*data['test']['label'], *data['train']['label'], *data['eval']['label']]
all_labels_flat = []
for l in all_ls:
    for i in l:
        all_labels_flat.append(i)
len(set(all_labels_flat))
set(all_labels_flat)
all_labels = all_labels_flat

In [14]:

len(all_labels)

84653

In [15]:
set(all_labels)

{'#articulation',
 '#confirmation_request',
 '#externalization',
 '#locution',
 '#mannerism',
 '#morphological_relative',
 '#participation',
 '#positional_association',
 '#predicating',
 '#proposition',
 '#prospective_participation',
 '#quotation',
 '#reception',
 '#relative_differentiation',
 '#resemblance',
 '$category',
 '$complementizer',
 '$extrinsic_quality',
 '$inference',
 '$interjection',
 '$intrinsic_quality',
 '$name',
 '$position',
 '$process',
 '$total_quantifier',
 'accusative_association',
 'actual_comparator',
 'actualization',
 'addressing',
 'advancing',
 'affected',
 'affirmative',
 'alertness',
 'augmented',
 'backtracking',
 'categorized',
 'centrality_tbd',
 'circumstance',
 'clarified',
 'compared',
 'complementary_property',
 'complete',
 'contemplation_tbd',
 'contemplative',
 'contextualized',
 'corrective',
 'dative_association',
 'described',
 'development_marker',
 'differentiated',
 'differentiation_request',
 'direction_tbd',
 'directive',
 'disjunctive',

In [16]:
assert 'inclusive' in all_labels

In [17]:
label2id = {}

In [18]:
label = 0.0

for i in set(all_labels):
  label2id[i] = label
  label += 1

In [19]:
label2id

{'$complementizer': 0.0,
 'extrinsic': 1.0,
 '#participation': 2.0,
 'sequentiality_tbd': 3.0,
 'actualization': 4.0,
 'presumed': 5.0,
 '$process': 6.0,
 'orientation_tbd': 7.0,
 'distance': 8.0,
 'addressing': 9.0,
 'actual_comparator': 10.0,
 'focused': 11.0,
 'differentiation_request': 12.0,
 'affected': 13.0,
 '#articulation': 14.0,
 'compared': 15.0,
 '#morphological_relative': 16.0,
 'superlative': 17.0,
 'factual': 18.0,
 'accusative_association': 19.0,
 'interactant_indication_tbd': 20.0,
 '#locution': 21.0,
 '$category': 22.0,
 'extra_participant': 23.0,
 'complementary_property': 24.0,
 'complete': 25.0,
 'backtracking': 26.0,
 'interjecting': 27.0,
 'genitive_association': 28.0,
 '$extrinsic_quality': 29.0,
 '$interjection': 30.0,
 'inclusive': 31.0,
 '$inference': 32.0,
 'expected': 33.0,
 'direction_tbd': 34.0,
 'proximity': 35.0,
 'indicated': 36.0,
 '$name': 37.0,
 'frequency_tbd': 38.0,
 '#resemblance': 39.0,
 'centrality_tbd': 40.0,
 '#predicating': 41.0,
 'introducin

In [20]:
id2label = {v: k for k, v in label2id.items()}

In [21]:
id2label

{0.0: '$complementizer',
 1.0: 'extrinsic',
 2.0: '#participation',
 3.0: 'sequentiality_tbd',
 4.0: 'actualization',
 5.0: 'presumed',
 6.0: '$process',
 7.0: 'orientation_tbd',
 8.0: 'distance',
 9.0: 'addressing',
 10.0: 'actual_comparator',
 11.0: 'focused',
 12.0: 'differentiation_request',
 13.0: 'affected',
 14.0: '#articulation',
 15.0: 'compared',
 16.0: '#morphological_relative',
 17.0: 'superlative',
 18.0: 'factual',
 19.0: 'accusative_association',
 20.0: 'interactant_indication_tbd',
 21.0: '#locution',
 22.0: '$category',
 23.0: 'extra_participant',
 24.0: 'complementary_property',
 25.0: 'complete',
 26.0: 'backtracking',
 27.0: 'interjecting',
 28.0: 'genitive_association',
 29.0: '$extrinsic_quality',
 30.0: '$interjection',
 31.0: 'inclusive',
 32.0: '$inference',
 33.0: 'expected',
 34.0: 'direction_tbd',
 35.0: 'proximity',
 36.0: 'indicated',
 37.0: '$name',
 38.0: 'frequency_tbd',
 39.0: '#resemblance',
 40.0: 'centrality_tbd',
 41.0: '#predicating',
 42.0: 'intr

In [22]:
def id_mapping_function(examples):
    for i, labelset in enumerate(examples['label']):
        # print(i, examples['text'][i], labelset)
        sample_labels = []
        for label in set(all_labels):
            if label in labelset:
                sample_labels.append(1)
            else:
                sample_labels.append(0)
        # id_for_label = [label2id[label] for label in labelset]
        sample_labels = np.array(sample_labels)
        examples['label'][i] = sample_labels.astype(np.float32).tolist()
    return examples

tokenized_and_id_greek = tokenized_greek.map(id_mapping_function, batched=True)



Map:   0%|          | 0/993 [00:00<?, ? examples/s]



In [23]:
from sklearn.metrics import accuracy_score

In [49]:
from sklearn.metrics import f1_score, precision_score, recall_score, hamming_loss

def compute_metrics(eval_pred, threshold=0.1):
    predictions, labels = eval_pred
    # print('predictions shape', predictions.shape, 'first prediction shape', predictions[0].shape, 'labels shape', labels.shape)

    # Convert probability predictions into binary predictions using the given threshold
    binary_predictions = (predictions > threshold).astype(int)
    # print('after threshold binary_predictions shape', binary_predictions.shape, 'first prediction shape', binary_predictions[0].shape)

    # Calculate metrics
    f1 = f1_score(labels, binary_predictions, average='samples')
    precision = precision_score(labels, binary_predictions, average='samples')
    recall = recall_score(labels, binary_predictions, average='samples')
    hamming = hamming_loss(labels, binary_predictions)

    return {'f1_score': f1, 'precision': precision, 'recall': recall, 'hamming_loss': hamming}


In [50]:
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained, num_labels=len(id2label), id2label=id2label, label2id=label2id,
    problem_type = 'multi_label_classification'
)

model.to('cuda')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at pranaydeeps/Ancient-Greek-BERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(35000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [54]:
output = 'output'

training_args = TrainingArguments(
    output_dir=output,
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    # evaluation_strategy='steps',
    # eval_steps=50,
    # save_strategy='epoch',
    # load_best_model_at_end=True,
    # push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_and_id_greek['train'],
    eval_dataset=tokenized_and_id_greek['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics, # Ryder uncommented
)

In [55]:
# !pip install numba

# from numba import cuda 
# device = cuda.get_current_device()
# device.reset()

In [56]:
trainer.train()



Epoch,Training Loss,Validation Loss,Loss,F1 Score,Precision,Recall,Runtime,Samples Per Second,Steps Per Second
1,0.0865,0.03289,0.09805,0.780856,0.903563,0.712482,4.7994,206.903,51.882


TrainOutput(global_step=1986, training_loss=0.09647883773450285, metrics={'train_runtime': 198.5671, 'train_samples_per_second': 40.002, 'train_steps_per_second': 10.002, 'total_flos': 208385236702800.0, 'train_loss': 0.09647883773450285, 'epoch': 1.0})

In [117]:
# !huggingface-cli login

In [118]:
# model.push_to_hub('ryderwishart/greek_move_features')

In [81]:
eval_data = data['eval']

In [98]:
print(eval_model.device)
model.to('cpu')
print(model.device)

cuda:0
cpu


In [110]:
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

In [111]:
classifier.device

device(type='cpu')

In [112]:
predictions = []

for i, sentence in enumerate(eval_data):
    print(f'{i} of {len(eval_data)}')
    sentence_text = sentence['text']
    predictions.append(classifier(sentence_text))
    print(predictions[-1])
    break

0 of 993
[{'label': 'dative_association', 'score': 0.9962292313575745}]


In [109]:
print(eval_data[0]['text'])
print(eval_data[0]['label'])
print(predictions[0])

καρδίᾳ γὰρ πιστεύεται εἰς δικαιοσύνην 
['speech_act', '#predicating', 'circumstance', 'subjective_participation_type_tbd', '$process', 'circumstance', 'alertness', 'objectifying', '#externalization', 'centrality_tbd', 'development_marker', 'supporting']
[{'label': 'dative_association', 'score': 0.9962292313575745}]


In [104]:
len(eval_data)

993

In [None]:
"""
TODO: 
- rather than using the pipeline class, let's just directly submit out inputs to the model
- model(eval_sentence)
- we will need to figure out what format the evaluation sentence needs to be in
- does it need to be tokenized? Should we use tokenized_and_id_greek['eval'] for the sentences?
- do we need to conver the input sentence into a torch or tensorflow tensor? 
- how can we conver the model output back into the labels we want?

"""