In [None]:
!pip install transformers evaluate datasets

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from transformers import DistilBertTokenizerFast, TFDistilBertForSequenceClassification
from sklearn.model_selection import train_test_split
import tensorflow as tf
from datasets import Dataset

In [None]:
# Read the raw CSV; its first column is used as the DataFrame index.
filepath = 'website_classification.csv'
data = pd.read_csv(filepath, index_col=0)

In [None]:
# Preview the first few rows of the loaded DataFrame.
data.head()

In [None]:
# Remove the URL column. Rebinding (instead of inplace=True) together with
# errors="ignore" keeps this cell safe to re-run once the column is already gone.
data = data.drop(columns="website_url", errors="ignore")

In [None]:
# Rename the columns positionally to the names used by the rest of the notebook.
# NOTE(review): assumes exactly two columns remain, text first and label second — confirm against the CSV.
data.columns = ['text', 'label']

In [None]:
# Show the frame with its new column names.
data

In [None]:
# Sorted distinct label values and the resulting number of classes.
unique_labels = np.unique(data['label'])
labels_count = len(unique_labels)
ids = range(labels_count)

# Two-way lookup between integer class ids and label values.
id2label = dict(zip(ids, unique_labels))
label2id = dict(zip(unique_labels, ids))

In [None]:
# Name of the pretrained checkpoint to load.
model_name = 'distilbert-base-uncased'

# Wrap the DataFrame in a Hugging Face Dataset.
dataset = Dataset.from_pandas(df=data)

In [None]:
# NOTE(review): a LabelEncoder step was sketched here but is disabled, so the
# "label" column is not re-encoded in this cell.

# Load the fast DistilBERT tokenizer for the checkpoint named in `model_name`.
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

# Batch tokenization helper used with Dataset.map
def preprocess(examples):
    """Tokenize the ``"text"`` entry of a batch.

    Args:
        examples: Mapping with a ``"text"`` entry holding the texts to encode.

    Returns:
        The tokenizer output for those texts, truncated and padded to 512 tokens.
    """
    encoding_options = dict(truncation=True, padding='max_length', max_length=512)
    return tokenizer(examples["text"], **encoding_options)

# Tokenize all rows, processing them in batches.
encoded_dataset = dataset.map(preprocess, batched=True)

# Hold out 20% of the rows for evaluation. The fixed seed keeps the split
# identical across kernel restarts, so results can be reproduced.
dataset = encoded_dataset.train_test_split(test_size=0.2, seed=42)

In [None]:
# Turn a tokenized split into a tf.data.Dataset with integer (int32) class labels
def format_dataset(dataset, label_to_id=None):
    """Build a ``tf.data.Dataset`` of ``(inputs, label)`` pairs from a tokenized split.

    Args:
        dataset: Tokenized Hugging Face dataset split exposing the
            ``input_ids``, ``attention_mask`` and ``label`` columns.
        label_to_id: Optional mapping from label value to integer class id.
            When given, every label is translated through it; when omitted,
            the labels must already be integer-like.

    Returns:
        An unbatched ``tf.data.Dataset`` whose elements are
        ``({'input_ids': ..., 'attention_mask': ...}, label)`` with int32 values.

    Raises:
        KeyError: If a label is missing from ``label_to_id``.
        ValueError: If no mapping is given and the labels cannot be cast to int32.
    """
    # with_format returns a new view, so the caller's split is left unmodified.
    columns = dataset.with_format(type='numpy', columns=['input_ids', 'attention_mask', 'label'])
    inputs = {
        'input_ids': np.asarray(columns['input_ids'], dtype=np.int32),
        'attention_mask': np.asarray(columns['attention_mask'], dtype=np.int32),
    }
    raw_labels = columns['label']
    if label_to_id is not None:
        # The sparse cross-entropy loss needs integer class ids, not label strings.
        raw_labels = [label_to_id[label] for label in raw_labels]
    labels = np.asarray(raw_labels, dtype=np.int32)
    return tf.data.Dataset.from_tensor_slices((inputs, labels))

train_dataset = format_dataset(dataset['train'], label_to_id=label2id)
test_dataset = format_dataset(dataset['test'], label_to_id=label2id)

# The model itself is defined in the next cell.


In [None]:
# Load the pretrained checkpoint with a classification head sized to the label set,
# and store both label mappings in the model config.
model = TFDistilBertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=labels_count,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
import evaluate
# Load the "accuracy" metric object that compute_metrics calls.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score model outputs with the loaded accuracy metric.

    Args:
        eval_pred: Pair ``(predictions, labels)``; ``predictions`` holds one row
            of per-class scores per example.

    Returns:
        The result of ``accuracy.compute`` for the arg-max class ids.
    """
    class_scores, references = eval_pred
    predicted_ids = np.argmax(class_scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [None]:
from transformers.keras_callbacks import KerasMetricCallback

# Adam with a small learning rate; the loss works on raw logits and integer class ids.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

# KerasMetricCallback uses a tf.data.Dataset as-is and treats every element as one
# batch, so the evaluation data has to be batched before it is handed over.
metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=test_dataset.batch(32))
callbacks = [metric_callback]

In [None]:
# Train for three epochs, validating on the held-out split after each one.
model.fit(
    train_dataset.batch(16),
    validation_data=test_dataset.batch(32),
    epochs=3,
    callbacks=callbacks,
)


Inference

In [None]:
# Example inputs for the inference demo below.
sentences = [
    "The judge told the jurors to think carefully.",
    "The judge told that the jurors to think carefully."
]

In [None]:
# Truncate over-long inputs as in training, and pad only up to the longest
# sentence in this batch.
tokenized = tokenizer(sentences, return_tensors="np", padding="longest", truncation=True)

outputs = model(tokenized).logits

# Highest-scoring class id for each sentence.
classifications = np.argmax(outputs, axis=1)
print(classifications)

In [None]:
# Derive the label names from `outputs` rather than from `classifications`, so
# re-running this cell never looks up already-translated label strings.
classifications = [model.config.id2label[int(class_id)] for class_id in np.argmax(outputs, axis=1)]
print(classifications)