In [1]:
import transformers
from transformers import TFAutoModelForSequenceClassification
import tensorflow as tf
from transformers import create_optimizer
from transformers import pipeline
from evaluate import load
from transformers import AutoTokenizer
import json 

GLUE_TASKS = [
    "cola",
    "mnli",
    "mnli-mm",
    "mrpc",
    "qnli",
    "qqp",
    "rte",
    "sst2",
    "stsb",
    "wnli",
]

task_to_keys = {
    "cola": ("sentence", None),
    "mnli": ("premise", "hypothesis"),
    "mnli-mm": ("premise", "hypothesis"),
    "mrpc": ("sentence1", "sentence2"),
    "qnli": ("question", "sentence"),
    "qqp": ("question1", "question2"),
    "rte": ("sentence1", "sentence2"),
    "sst2": ("sentence", None),
    "stsb": ("sentence1", "sentence2"),
    "wnli": ("sentence1", "sentence2"),
}
task = "cola"
model_checkpoint = "distilbert-base-uncased"
batch_size = 16
training_file = "./cuad-data/test_classification_data.json"

def prep_data():
    with open(training_file) as json_file:
        data = json.load(json_file)
    dataset = data
    return dataset

def prepare_data(dataset): 
    actual_task = "mnli" if task == "mnli-mm" else task
    #dataset = load_dataset("glue", actual_task)
    #metric = load("glue", actual_task)
    print ("Dataset >> ", dataset["train"][0])

    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    def preprocess_function(examples):
        sentence1_key, sentence2_key = task_to_keys[task]
        if sentence2_key is None:
            print(f"Sentence: {dataset['train'][0][sentence1_key]}")
        else:
            print(f"Sentence 1: {dataset['train'][0][sentence1_key]}")
            print(f"Sentence 2: {dataset['train'][0][sentence2_key]}")
        if sentence2_key is None:
            return tokenizer(examples[sentence1_key], truncation=True)
        return tokenizer(examples[sentence1_key], examples[sentence2_key], truncation=True)

    pre_tokenizer_columns = set(dataset["train"].features)
    encoded_dataset = dataset.map(preprocess_function, batched=True)
    #tokenizer_columns = list(set(encoded_dataset["train"].features) - pre_tokenizer_columns)
    #print("Columns added by tokenizer:", tokenizer_columns)
    return encoded_dataset, tokenizer


dataset = prep_data()
prepare_data(dataset)

  from .autonotebook import tqdm as notebook_tqdm
2022-10-17 15:29:22.260161: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Dataset >>  {'sentence': 'Broadcasting, Licensing and Wireless referred to ', 'label': 1, 'idx': 0}


AttributeError: 'list' object has no attribute 'features'

In [None]:
dataset["train"].features

In [None]:
def training(encoded_dataset, tokenizer): 
    num_labels = 3 if task.startswith("mnli") else 1 if task == "stsb" else 2
    if task == "stsb":
        num_labels = 1
    elif task.startswith("mnli"):
        num_labels = 3
    else:
        num_labels = 2
        
    id2label = {0: "Invalid", 1: "Valid"}
    label2id = {val: key for key, val in id2label.items()}

    model = TFAutoModelForSequenceClassification.from_pretrained(
        model_checkpoint, num_labels=num_labels, id2label=id2label, label2id=label2id
    )

    validation_key = (
        "validation_mismatched"
        if task == "mnli-mm"
        else "validation_matched"
        if task == "mnli"
        else "validation"
    )

    tf_train_dataset = model.prepare_tf_dataset(
        encoded_dataset["train"],
        shuffle=True,
        batch_size=16,
        tokenizer=tokenizer
    )

    tf_validation_dataset = model.prepare_tf_dataset(
        encoded_dataset[validation_key],
        shuffle=False,
        batch_size=16,
        tokenizer=tokenizer,
    )

    num_epochs = 1 #3
    batches_per_epoch = len(encoded_dataset["train"]) // batch_size
    total_train_steps = int(batches_per_epoch * num_epochs)

    optimizer, schedule = create_optimizer(
        init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps
    )
    model.compile(optimizer=optimizer)

    model.fit(
        tf_train_dataset,
        validation_data=tf_validation_dataset,
        epochs=num_epochs
    )
    return model



In [None]:
encoded_dataset, tokenizer = prepare_date()

print (">>>>> encoded_dataset", encoded_dataset["train"].features["label"])

model = training(encoded_dataset, tokenizer)

model.push_to_hub("my-finetuned-classification_mode")



In [None]:
def predict(sentences, model): 
    classifier = pipeline("text-classification", model, framework="tf")
    classifier(sentences)

sentences = [
    "The judge told the jurors to think carefully.",
    "The judge told that the jurors to think carefully."
]
predict(sentences, "my-finetuned-classification_mode")

In [None]:
pt_save_directory = "./pt_save_pretrained"
tokenizer.save_pretrained(pt_save_directory)
model.save_pretrained(pt_save_directory)

model = TFAutoModelForSequenceClassification.from_pretrained("./tf_save_pretrained")
