In [1]:
# One-time setup: uncomment to install the third-party packages imported below.
# !pip install transformers
# !pip install scikit-learn
# !pip install datasets

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from tensorflow.keras.optimizers.legacy import Adam # as tf.keras.optimizers.Adam runs slowly on M1/M2 Macs, I am using legacy Keras optimizer instead

from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, DataCollatorWithPadding
from datasets import Dataset, DatasetDict

In [4]:
# Build the training DataFrame with the project-local loader; seed=22 is
# forwarded to it (presumably to make any sampling repeatable — confirm in
# data_prepration).
from data_prepration import get_training_set_data
df = get_training_set_data(seed=22)
# Optional: uncomment to persist the loaded frame to CSV.
# df.to_csv('data/training_set.csv')

In [5]:
# Name the index 'idx' and drop the row/ID bookkeeping columns so that only
# the question, answer and label columns remain.
# A method chain that rebinds `df` is used instead of assigning to df.index
# and dropping with inplace=True, so the frame object returned by the loader
# is never mutated behind the reader's back.
df = (
    df
    .rename_axis("idx")
    .drop(columns=["question_row_id", "answer_row_id", "question_id", "answer_id"])
)

In [6]:
# Hold out 20% of the rows for validation; random_state is fixed at 22 so the
# split is the same on every run.
train, validation = train_test_split(
    df,
    test_size=0.2,
    random_state=22,
)

In [7]:
# Wrap each pandas split in a Hugging Face Dataset, keyed by its split name.
raw_datasets = DatasetDict(
    {
        split_name: Dataset.from_pandas(split_frame)
        for split_name, split_frame in (("train", train), ("validation", validation))
    }
)

In [8]:
# Display the DatasetDict summary: features and row count of each split.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'label', 'idx'],
        num_rows: 3457
    })
    validation: Dataset({
        features: ['question', 'answer', 'label', 'idx'],
        num_rows: 865
    })
})

In [9]:
# Pretrained checkpoint name; the tokenizer is loaded from it here.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(record):
    """Tokenize question/answer pairs with the module-level ``tokenizer``.

    Args:
        record: A mapping with "question" and "answer" entries. The function
            is mapped with ``batched=True`` below, so each entry holds a batch
            of values rather than a single string.

    Returns:
        The tokenizer's encoding of the (question, answer) pairs, produced
        with ``truncation=True``.
    """
    questions = record["question"]
    answers = record["answer"]
    return tokenizer(questions, answers, truncation=True)


# Apply the tokenizer to every split, one batch of rows per call.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets

Map:   0%|          | 0/3457 [00:00<?, ? examples/s]

Map:   0%|          | 0/865 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3457
    })
    validation: Dataset({
        features: ['question', 'answer', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 865
    })
})

In [10]:
# Collator used when batching below; return_tensors="tf" requests TensorFlow
# tensors from it.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    return_tensors="tf",
)

In [11]:
def _build_tf_dataset(split_name, shuffle):
    """Convert one tokenized split into a batched tf.data.Dataset.

    Args:
        split_name: Key of the split in ``tokenized_datasets``.
        shuffle: Whether the resulting dataset should be shuffled.

    Returns:
        The result of ``to_tf_dataset`` for that split, batched in groups of 8
        and collated with ``data_collator``.
    """
    # NOTE(review): label_cols names "labels" while the dataset column is
    # called "label" — presumably the collator/to_tf_dataset handles the
    # rename; verify against the installed datasets version.
    return tokenized_datasets[split_name].to_tf_dataset(
        columns=["attention_mask", "input_ids", "token_type_ids"],
        label_cols=["labels"],
        shuffle=shuffle,
        collate_fn=data_collator,
        batch_size=8,
    )


# Shuffle only the training data; keep validation order fixed.
tf_train_dataset = _build_tf_dataset("train", shuffle=True)
tf_validation_dataset = _build_tf_dataset("validation", shuffle=False)

Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [12]:
# Load the pretrained checkpoint as a TensorFlow sequence classifier with a
# 2-label classification head.
# No device_map is passed: it is an option of the PyTorch loading path and is
# not accepted by the TensorFlow model classes (the unknown keyword would be
# forwarded to the Keras model constructor and rejected). TensorFlow handles
# device placement itself, and this notebook targets M1/M2 Macs, which have
# no CUDA device anyway.
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Training hyperparameters.
# NOTE(review): batch_size is not referenced in this cell; the tf datasets
# were built earlier with a literal batch_size=8.
batch_size = 8
num_epochs = 2

# tf_train_dataset is an already-batched tf.data.Dataset (not the original
# Hugging Face Dataset), so len() counts batches — i.e. steps per epoch —
# rather than individual samples. Multiplying by the number of epochs gives
# the total number of training steps.
steps_per_epoch = len(tf_train_dataset)
num_train_steps = steps_per_epoch * num_epochs

# Decay the learning rate from 5e-5 down to 0 over the whole training run.
lr_scheduler = PolynomialDecay(
    initial_learning_rate=5e-5,
    end_learning_rate=0.0,
    decay_steps=num_train_steps,
)
opt = Adam(learning_rate=lr_scheduler)

# from_logits=True tells the loss to expect raw logits rather than probabilities.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(optimizer=opt, loss=loss, metrics=["accuracy"])



In [14]:
# Fine-tune for num_epochs epochs, evaluating on the validation dataset as
# part of the same fit call.
model.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    epochs=num_epochs,
)

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x2849429b0>

In [15]:
# Persist the fine-tuned model with save_pretrained.
# NOTE(review): save_pretrained takes a directory path, so "model.h5" will
# presumably be created as a folder rather than a single HDF5 file — confirm
# that downstream loaders expect a directory with this name.
model.save_pretrained("model.h5")