In [6]:
from transformers import TFAutoModelForSequenceClassification
import json 
from transformers import AutoTokenizer

model_checkpoint = "distilbert-base-uncased"
training_file = "./cuad-data/test_classification_data.json"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

model = TFAutoModelForSequenceClassification.from_pretrained(model_checkpoint)

loading configuration file config.json from cache at /Users/saurabhkaushik/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/043235d6088ecd3dd5fb5ca3592b6913fd516027/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.22.2",
  "vocab_size": 30522
}

loading file vocab.txt from cache at /Users/saurabhkaushik/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/043235d6088ecd3dd5fb5ca3592b6913fd516027/vocab.txt
loading file tokenizer.json from cache at /Users/saurabhkaushik/.cache/huggin

In [7]:
from sklearn.model_selection import train_test_split
import pandas as pd
import pyarrow as pa
from datasets import Dataset

def load_data():
    with open(training_file) as json_file:
        data = json.load(json_file)
    dataset = data["train"]
    return dataset

def process_data(row):
    text = row['sentence']
    text = str(text)
    text = ' '.join(text.split())

    encodings = tokenizer(text, padding="max_length", truncation=True, max_length=128)

    #label = row['label']
    label = 0
    if row['label'] == 'positive':
        label += 1

    encodings['label'] = label
    encodings['text'] = text

    return encodings

def prepare_train_valid_df(): 
    processed_data = []
    train_data = load_data()
    for i in range(len(train_data[:1000])):
        processed_data.append(process_data(train_data[i]))

    print (processed_data)
    new_df = pd.DataFrame(processed_data)

    train_df, valid_df = train_test_split(
        new_df,
        test_size=0.2,
        random_state=2022
    )

    train_hg = Dataset(pa.Table.from_pandas(train_df))
    valid_hg = Dataset(pa.Table.from_pandas(valid_df))
    return train_hg, valid_hg

In [8]:
from transformers import TrainingArguments, Trainer
from transformers import create_optimizer

def training(train_hg, valid_hg):
    batch_size = 16

    num_epochs = 1 #3
    batches_per_epoch = len(train_hg) // batch_size
    total_train_steps = int(batches_per_epoch * num_epochs)

    optimizer, schedule = create_optimizer(
        init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps
    )
    model.compile(optimizer=optimizer)

    model.fit(
        train_hg,
        validation_data=valid_hg,
        epochs=num_epochs
    )

    model.save_pretrained('./model/')
    return model

In [9]:
from transformers import pipeline

def predict(sentences): 
    model = TFAutoModelForSequenceClassification.from_pretrained('./model/')
    classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
    results = classifier(sentences)
    return results 

In [10]:
print(process_data({
    'sentence': 'this is a sample review of a movie.',
    'label': 1
}))

train_hg, valid_hg = prepare_train_valid_df()
model = training(train_hg, valid_hg) 


No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


{'input_ids': [101, 2023, 2003, 1037, 7099, 3319, 1997, 1037, 3185, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'label': 0, 'text': 'this is a sample review of a movie.'}
[{'input_ids': [101, 5062, 1010, 13202, 1998, 9949, 3615, 2000, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

ValueError: Failed to find data adapter that can handle input: <class 'datasets.arrow_dataset.Dataset'>, <class 'NoneType'>

In [None]:
sentences = [
    {"sentence" : "Licensing and Wireless are referred to "},
    {"sentence" : "The judge told that the jurors to think carefully."}
]

results = predict(sentences[0]["sentence"])

print (results)