In [3]:
%run -i "../util/lang_utils.ipynb"

In [4]:
from datasets import load_dataset, Dataset, Features, Value, ClassLabel, Sequence, DatasetDict
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
import numpy as np
from sklearn.model_selection import train_test_split
from evaluate import load

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
music_ner_df = pd.read_csv('../data/music_ner.csv')
def change_label(input_label):
    input_label = input_label.replace("_deduced", "")
    return input_label
music_ner_df["label"] = music_ner_df["label"].apply(change_label)
music_ner_df["text"] = music_ner_df["text"].apply(lambda x: x.replace("|", ","))
print(music_ner_df)

        id                                               text  start_offset  \
0    13434  i love radioheads kid a something similar , ki...             7   
1    13434  i love radioheads kid a something similar , ki...            61   
2    13435                anything similar to i fight dragons            20   
3    13436                music similar to ccrs travelin band            17   
4    13437                 songs similar to blackout by boris            17   
..     ...                                                ...           ...   
422  14028  songs like good news by mac miller , preferrab...            11   
423  14028  songs like good news by mac miller , preferrab...            24   
424  14030  something along the lines of either the chain ...            49   
425  14030  something along the lines of either the chain ...            29   
426  14032       heavy bass x gothic rap like oxygen by bones            29   

     end_offset          label  
0            17   

In [6]:
ids = list(set(music_ner_df["id"].values))
docs = {}
for id in ids:
    entity_rows = music_ner_df.loc[music_ner_df['id'] == id]
    text = entity_rows.head(1)["text"].values[0]
    doc = small_model(text)
    ents = []
    for index, row in entity_rows.iterrows():
        label = row["label"]
        start = row["start_offset"]
        end = row["end_offset"]
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        ents.append(span)
    doc.ents = ents
    docs[doc.text] = doc

In [7]:
data_file = "../data/music_ner_bio.bio"
tag_mapping = {"O": 0, "B-Artist": 1, "I-Artist": 2, "B-WoA": 3, "I-WoA": 4}
with open(data_file) as f:
    data = f.read()
tokens = []
ner_tags = []
spans = []
sentences = data.split("\n\n")
for sentence in sentences:
    words = []
    tags = []
    this_sentence_spans = []
    word_tag_pairs = sentence.split("\n")
    for pair in word_tag_pairs:
        (word, tag) = pair.split("\t")
        words.append(word)
        tags.append(tag_mapping[tag])
    sentence_text = " ".join(words)
    try:
        doc = docs[sentence_text]
    except:
        pass
    ent_dict = {}
    for ent in doc.ents:
        this_sentence_spans.append(f"{ent.label_}: {ent.text}")
    tokens.append(words)
    ner_tags.append(tags)
    spans.append(this_sentence_spans)


In [8]:
indices = range(0, len(spans))
train, test = train_test_split(indices, test_size=0.1)
train_tokens = []
test_tokens = []
train_ner_tags = []
test_ner_tags = []
train_spans = []
test_spans = []
for i, (token, ner_tag, span) in enumerate(zip(tokens, ner_tags, spans)):
    if i in train:
        train_tokens.append(token)
        train_ner_tags.append(ner_tag)
        train_spans.append(span)
    else:
        test_tokens.append(token)
        test_ner_tags.append(ner_tag)
        test_spans.append(span)        
        
print(len(train_spans))
print(len(test_spans))

539
60


In [9]:
training_df = pd.DataFrame({"tokens":train_tokens, "ner_tags": train_ner_tags, "spans": train_spans})
test_df = pd.DataFrame({"tokens": test_tokens, "ner_tags": test_ner_tags, "spans": test_spans})
training_df["text"] = training_df["tokens"].apply(lambda x: " ".join(x))
test_df["text"] = test_df["tokens"].apply(lambda x: " ".join(x))
training_df.dropna()
test_df.dropna()
print(test_df)

                                               tokens  \
0          [music, similar, to, ccrs, travelin, band]   
1   [aything, similar, to, radioheads, everything,...   
2                  [i, am, looking, for, sexy, music]   
3             [desert, aesthetic, intrumental, music]   
4   [im, looking, for, some, new, music, artists, ...   
5                      [songs, for, my, grandparents]   
6         [looking, for, calm, ghost, related, songs]   
7   [rock, bands, with, african, themes, instruments]   
8   [i, need, albums, to, listen, to, on, a, snowy...   
9                  [suggest, me, new, exiting, music]   
10  [what, are, some, other, artists, like, bishop...   
11  [seeking, music, with, wierd, &, experiemental...   
12  [any, music, thats, more, of, an, emotional, o...   
13  [i, m, looking, for, similar, songs, to, this,...   
14  [im, looking, for, a, blend, between, metal, a...   
15  [dont, know, what, to, call, this, genre, of, ...   
16          [clean, weight, tra

In [10]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
#model = AutoModel.from_pretrained("bert-base-cased")
features = Features({'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 
            'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-Artist', 'I-Artist', 'B-WoA', 'I-WoA'], id=None), length=-1, id=None), 
            'spans': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
            'text': Value(dtype='string', id=None)
                    })
training_dataset = Dataset.from_pandas(training_df, features=features)
test_dataset = Dataset.from_pandas(test_df, features=features)
dataset = DatasetDict({"train":training_dataset, "test":test_dataset}) 
print(dataset["train"].features)
label_names = dataset["train"].features["ner_tags"].feature.names
print(dataset)



{'tokens': List(Value('string')), 'ner_tags': List(ClassLabel(names=['O', 'B-Artist', 'I-Artist', 'B-WoA', 'I-WoA'])), 'spans': List(Value('string')), 'text': Value('string')}
DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'spans', 'text'],
        num_rows: 539
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'spans', 'text'],
        num_rows: 60
    })
})


In [11]:
def tokenize_adjust_labels(all_samples_per_split):
    tokenized_samples = tokenizer.batch_encode_plus(all_samples_per_split["text"])
    total_adjusted_labels = []
    for k in range(0, len(tokenized_samples["input_ids"])):
        prev_wid = -1
        word_ids_list = tokenized_samples.word_ids(batch_index=k)
        existing_label_ids = all_samples_per_split["ner_tags"][k]
        i = -1
        adjusted_label_ids = []
        for wid in word_ids_list:
            if (wid is None):
                adjusted_label_ids.append(-100)
            elif (wid != prev_wid):
                i = i + 1
                adjusted_label_ids.append(existing_label_ids[i])
                prev_wid = wid
            else:
                label_name = label_names[existing_label_ids[i]]
                adjusted_label_ids.append(existing_label_ids[i])
        total_adjusted_labels.append(adjusted_label_ids)
    tokenized_samples["labels"] = total_adjusted_labels
    return tokenized_samples

In [12]:
tokenized_dataset = dataset.map(tokenize_adjust_labels, batched=True)

Map: 100%|██████████| 539/539 [00:00<00:00, 34912.05 examples/s]
Map: 100%|██████████| 60/60 [00:00<00:00, 4905.52 examples/s]


In [13]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [14]:
metric = load("seqeval")
def compute_metrics(data):
    predictions, labels = data
    predictions = np.argmax(predictions, axis=2)

    data = zip(predictions, labels)
    data = [[(p, l) for (p, l) in zip(prediction, label) if l != -100] for prediction, label in data]

    true_predictions = [[label_names[p] for (p, l) in data_point] for data_point in data]
    true_labels = [[label_names[l] for (p, l) in data_point] for data_point in data]
    
    results = metric.compute(predictions=true_predictions, references=true_labels)
    flat_results = {
        "overall_precision": results["overall_precision"],
        "overall_recall": results["overall_recall"],
        "overall_f1": results["overall_f1"],
        "overall_accuracy": results["overall_accuracy"],
    }
    for k in results.keys():
      if (k not in flat_results.keys()):
        flat_results[k + "_f1"] = results[k]["f1"]

    return flat_results

In [20]:
import transformers
print(transformers.__version__)


4.57.6


In [23]:
import accelerate
print(accelerate.__version__)

1.12.0


In [24]:
import transformers, accelerate, torch

print("transformers:", transformers.__version__)
print("accelerate:", accelerate.__version__)
print("torch:", torch.__version__)


transformers: 4.57.6
accelerate: 1.12.0
torch: 2.9.1+cpu


In [15]:
from transformers import (
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer
)

In [16]:
# Train model
model = AutoModelForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(label_names))
training_args = TrainingArguments(
    output_dir="./fine_tune_bert_output",
    evaluation_strategy="steps",         # ✅
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=7,
    weight_decay=0.01,
    logging_steps=1000,
    run_name="ep_10_tokenized_11",
    save_strategy="no"

)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
trainer.train()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 238/238 [03:21<00:00,  1.18it/s]

{'train_runtime': 201.7727, 'train_samples_per_second': 18.699, 'train_steps_per_second': 1.18, 'train_loss': 0.26647687960071725, 'epoch': 7.0}





TrainOutput(global_step=238, training_loss=0.26647687960071725, metrics={'train_runtime': 201.7727, 'train_samples_per_second': 18.699, 'train_steps_per_second': 1.18, 'train_loss': 0.26647687960071725, 'epoch': 7.0})

In [1]:
import numpy as np
import torch

a = torch.tensor([1.0, 2.0])
print(a.numpy())   # This MUST work


[1. 2.]


In [17]:
import numpy
# Evaluate model
trainer.evaluate()

100%|██████████| 4/4 [00:00<00:00,  8.29it/s]


{'eval_loss': 0.22301819920539856,
 'eval_overall_precision': 0.6885245901639344,
 'eval_overall_recall': 0.7368421052631579,
 'eval_overall_f1': 0.7118644067796609,
 'eval_overall_accuracy': 0.9341500765696784,
 'eval_Artist_f1': 0.7466666666666667,
 'eval_WoA_f1': 0.6511627906976744,
 'eval_runtime': 0.6879,
 'eval_samples_per_second': 87.224,
 'eval_steps_per_second': 5.815,
 'epoch': 7.0}

In [18]:
# Save model
trainer.save_model("../models/bert_fine_tuned")

In [19]:
# Use model
model = AutoModelForTokenClassification.from_pretrained("../models/bert_fine_tuned")
tokenizer = AutoTokenizer.from_pretrained("../models/bert_fine_tuned")

In [20]:
text = "music similar to morphine robocobra quartet | featuring elements like saxophone prominent bass"
from transformers import pipeline
pipe = pipeline(task="token-classification", model=model.to("cpu"), tokenizer=tokenizer, aggregation_strategy="simple")
pipe(text)
# tag_mapping = {"O": 0, "B-Artist": 1, "I-Artist": 2, "B-WoA": 3, "I-WoA": 4}

[{'entity_group': 'LABEL_0',
  'score': 0.9992402,
  'word': 'music similar to',
  'start': 0,
  'end': 16},
 {'entity_group': 'LABEL_1',
  'score': 0.95913875,
  'word': 'morphine robocobra',
  'start': 17,
  'end': 35},
 {'entity_group': 'LABEL_2',
  'score': 0.5578944,
  'word': 'quartet',
  'start': 36,
  'end': 43},
 {'entity_group': 'LABEL_0',
  'score': 0.99895257,
  'word': '| featuring elements like saxophone prominent bass',
  'start': 44,
  'end': 94}]