In [37]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
import tensorflow as tf

In [38]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertModel.from_pretrained("bert-base-uncased")

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [92]:
from datasets import Dataset, DatasetDict
from sklearn.preprocessing import LabelEncoder

# Sample dictionary
data = {
    "AC/DC": "Rock",
    "Metallica": "Rock",
    "Opeth": "Rock",
    "Black Sabbath": "Rock",
    "Death": "Rock",
    "Eminem": "Hiphop",
    "Dr Dre": "Hiphop",
    "Drake": "Hiphop",
    "Kendrick": "Hiphop",
    "Playboi Cardi": "Hiphop"
}

# Convert the dictionary to a format suitable for Dataset
formatted_data = {
    'artist': list(data.keys()),
    'genre': list(data.values())
}

# Create a Dataset from the formatted data
dataset = Dataset.from_dict(formatted_data)

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the genres
encoded_genres = label_encoder.fit_transform(formatted_data['genre'])

# Add the encoded genres as a new column to the existing dataset
dataset = dataset.add_column('label', encoded_genres)

# Create a DatasetDict (even if there's only one split)
dataset_dict = DatasetDict({
    'all_data': dataset
})

dataset_dict


DatasetDict({
    all_data: Dataset({
        features: ['artist', 'genre', 'label'],
        num_rows: 10
    })
})

In [63]:
import pandas as pd

data = {
    "AC/DC": "Rock",
    "Metallica": "Rock",
    "Opeth": "Rock",
    "Black Sabbath": "Rock",
    "Death": "Rock",
    "Eminem": "Hiphop",
    "Dr Dre": "Hiphop",
    "Drake": "Hiphop",
    "Kendrick": "Hiphop",
    "Playboi Cardi": "Hiphop"
}

# Convert the dictionary to a DataFrame
dataset = pd.DataFrame(list(data.items()), columns=['Artist', 'Genre'])


dataset


Unnamed: 0,Artist,Genre
0,AC/DC,Rock
1,Metallica,Rock
2,Opeth,Rock
3,Black Sabbath,Rock
4,Death,Rock
5,Eminem,Hiphop
6,Dr Dre,Hiphop
7,Drake,Hiphop
8,Kendrick,Hiphop
9,Playboi Cardi,Hiphop


In [68]:
def tokenize(batch):
    return tokenizer(batch['artist'], padding=True, truncation=True)

In [69]:
encoded_music = dataset_dict.map(tokenize, batched=True, batch_size=None)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [70]:
encoded_music

DatasetDict({
    all_data: Dataset({
        features: ['artist', 'genre', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10
    })
})

In [71]:
import tensorflow as tf

BATCH_SIZE = 64

def order(features, labels):
    '''
    This function will group all the inputs of BERT
    into a single dictionary and then output it with
    labels.
    '''
    return {
        'input_ids': features['input_ids'],
        'attention_mask': features['attention_mask'],
        'token_type_ids': features['token_type_ids']
    }, labels

def prepare_dataset(data):
    features = {key: [d[key] for d in data] for key in data[0].keys() if key != 'label'}
    labels = [d['label'] for d in data]
    
    features = {key: tf.constant(value) for key, value in features.items()}
    labels = tf.constant(labels)
    
    return tf.data.Dataset.from_tensor_slices((features, labels))

train_data = encoded_music['all_data']

train_dataset = prepare_dataset(train_data)
train_dataset = train_dataset.shuffle(1000).batch(BATCH_SIZE)
train_dataset = train_dataset.map(order, num_parallel_calls=tf.data.AUTOTUNE)


In [72]:
inp, out = next(iter(train_dataset))
print(inp, '\n\n', out)

{'input_ids': <tf.Tensor: shape=(10, 7), dtype=int32, numpy=
array([[  101, 12392,  2050,   102,     0,     0,     0],
       [  101,  2377,  5092,  2072,  4003,  2072,   102],
       [  101,  7867,   102,     0,     0,     0,     0],
       [  101, 25341,   102,     0,     0,     0,     0],
       [  101,  2304, 19546,   102,     0,     0,     0],
       [  101, 12495, 25832,   102,     0,     0,     0],
       [  101,  2331,   102,     0,     0,     0,     0],
       [  101,  2852,  2852,  2063,   102,     0,     0],
       [  101,  6728, 11031,   102,     0,     0,     0],
       [  101,  9353,  1013,  5887,   102,     0,     0]])>, 'attention_mask': <tf.Tensor: shape=(10, 7), dtype=int32, numpy=
array([[1, 1, 1, 1, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 0, 0, 0, 0],
       [1, 1, 1, 0, 0, 0, 0],
       [1, 1, 1, 1, 0, 0, 0],
       [1, 1, 1, 1, 0, 0, 0],
       [1, 1, 1, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 0, 0],
       [1, 1, 1, 1, 0, 0, 0],
       [1, 1, 1, 1, 1

In [73]:
class BERTForClassification(tf.keras.Model):

    def __init__(self, bert_model, num_classes):
        super().__init__()
        self.bert = bert_model
        self.fc = tf.keras.layers.Dense(num_classes,activation='softmax')

    def call(self, inputs):
        x=self.bert(inputs)[1]
        return self.fc(x)

In [74]:
classifier = BERTForClassification(model, num_classes=6)

classifier.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy']
)

In [75]:
history = classifier.fit(
    train_dataset,
    epochs=3
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [95]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.preprocessing import LabelEncoder

# Initialize the tokenizer and model (ensure the model is for binary classification)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
classifier = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)  # Ensure num_labels=2 for binary classification

# Example input string
input_string = "AC/DC"

# Tokenize the input string
inputs = tokenizer(input_string, return_tensors="tf", padding=True, truncation=True, max_length=512)

# Ensure the inputs are compatible with the model's expected input format
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']

# Make predictions using the classifier model
outputs = classifier(input_ids, attention_mask=attention_mask)

# Extract logits from the output tensor
logits = outputs.logits  # Ensure to use the correct attribute

# Get the predicted label
predictions = tf.argmax(logits, axis=-1).numpy()

# Decode the predicted label using LabelEncoder
predicted_label = label_encoder.inverse_transform(predictions)[0]

print(f"Predicted label for '{input_string}': {predicted_label}")


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Predicted label for 'AC/DC': Rock


In [96]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.preprocessing import LabelEncoder

# Initialize the tokenizer and model (ensure the model is for binary classification)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
classifier = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)  # Ensure num_labels=2 for binary classification

# Example input string
input_string = "AC/DC"

# Tokenize the input string
inputs = tokenizer(input_string, return_tensors="tf", padding=True, truncation=True, max_length=512)

# Ensure the inputs are compatible with the model's expected input format
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']

# Make predictions using the classifier model
outputs = classifier(input_ids, attention_mask=attention_mask)

# Extract logits from the output tensor
logits = outputs.logits  # Ensure to use the correct attribute

# Get the predicted label
predictions = tf.argmax(logits, axis=-1).numpy()

# Decode the predicted label using LabelEncoder
predicted_label = label_encoder.inverse_transform(predictions)[0]

print(f"Predicted label for '{input_string}': {predicted_label}")


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Predicted label for 'AC/DC': Hiphop


In [32]:
# Initialize lists to store tokenized values
input_ids = []
token_type_ids = []
attention_mask = []

# Tokenize each artist name and store the results
for artist in dataset['Artist']:
    tokens = tokenizer(artist, padding=True, truncation=True)
    input_ids.append(tokens['input_ids'])
    token_type_ids.append(tokens['token_type_ids'])
    attention_mask.append(tokens['attention_mask'])

# Create a new DataFrame with the tokenized values
dataset_encoded = pd.DataFrame({
    'input_ids': input_ids,
    'token_type_ids': token_type_ids,
    'attention_mask': attention_mask
})

dataset_encoded

Unnamed: 0,input_ids,token_type_ids,attention_mask
0,"[101, 9353, 1013, 5887, 102]","[0, 0, 0, 0, 0]","[1, 1, 1, 1, 1]"
1,"[101, 12392, 2050, 102]","[0, 0, 0, 0]","[1, 1, 1, 1]"
2,"[101, 6728, 11031, 102]","[0, 0, 0, 0]","[1, 1, 1, 1]"
3,"[101, 2304, 19546, 102]","[0, 0, 0, 0]","[1, 1, 1, 1]"
4,"[101, 2331, 102]","[0, 0, 0]","[1, 1, 1]"
5,"[101, 12495, 25832, 102]","[0, 0, 0, 0]","[1, 1, 1, 1]"
6,"[101, 2852, 2852, 2063, 102]","[0, 0, 0, 0, 0]","[1, 1, 1, 1, 1]"
7,"[101, 7867, 102]","[0, 0, 0]","[1, 1, 1]"
8,"[101, 25341, 102]","[0, 0, 0]","[1, 1, 1]"
9,"[101, 2377, 5092, 2072, 4003, 2072, 102]","[0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1]"


In [36]:
import tensorflow as tf

BATCH_SIZE = 64

def order(features, labels):
    '''
    This function will group all the inputs
    into a single dictionary and then output it with labels.
    '''
    # Initialize lists to store tokenized values
    input_ids = []
    token_type_ids = []
    attention_mask = []
    
    # Tokenize each artist name and store the results
    for artist in dataset['Artist']:
        tokens = tokenizer(artist, padding=True, truncation=True)
        input_ids.append(tokens['input_ids'])
        token_type_ids.append(tokens['token_type_ids'])
        attention_mask.append(tokens['attention_mask'])
    
    # Create a new DataFrame with the tokenized values
    dataset_encoded = pd.DataFrame({
        'input_ids': input_ids,
        'token_type_ids': token_type_ids,
        'attention_mask': attention_mask
    })
    
    features=dataset_encoded
    return features, labels

def prepare_dataset(data):
    # Separate features and labels
    features = list(data.keys())
    labels = list(data.values())
    
    # Convert features and labels to tensor
    features = tf.constant(features)
    labels = tf.constant(labels)
    
    return tf.data.Dataset.from_tensor_slices((features, labels))

data = {
    "AC/DC": "Rock",
    "Metallica": "Rock",
    "Opeth": "Rock",
    "Black Sabbath": "Rock",
    "Death": "Rock",
    "Eminem": "Hiphop",
    "Dr Dre": "Hiphop",
    "Drake": "Hiphop",
    "Kendrick": "Hiphop",
    "Playboi Cardi": "Hiphop"
}

# Prepare the dataset
dataset = prepare_dataset(data)

# Shuffle and batch the dataset
dataset = dataset.shuffle(1000).batch(BATCH_SIZE)

# Map the order function (though it's not necessary for this simpler data structure)
dataset = dataset.map(order, num_parallel_calls=tf.data.AUTOTUNE)

# Iterate through the dataset and print the results
for features, labels in dataset:
    print("Features:", features.numpy())
    print("Labels:", labels.numpy())


TypeError: in user code:

    File "C:\Users\Admin\AppData\Local\Temp\ipykernel_5368\3752943773.py", line 16, in order  *
        for artist in dataset['Artist']:

    TypeError: '_BatchDataset' object is not subscriptable


In [None]:
inp,out = next(iter(dataset))
print(inp, '\n\n', out)