In [1]:
%run -i "../util/lang_utils.ipynb"

In [2]:
import pandas as pd
from spacy.cli.train import train
from spacy.cli.evaluate import evaluate
from spacy.tokens import DocBin
from sklearn.model_selection import train_test_split

In [53]:
# Music NER
# https://github.com/deezer/music-ner-eacl2023

In [3]:
music_ner_df = pd.read_csv('../data/music_ner.csv')
print(music_ner_df)

        id                                               text  start_offset  \
0    13434  i love radioheads kid a something similar | ki...             7   
1    13434  i love radioheads kid a something similar | ki...            61   
2    13435                anything similar to i fight dragons            20   
3    13436                music similar to ccrs travelin band            17   
4    13437                 songs similar to blackout by boris            17   
..     ...                                                ...           ...   
422  14028  songs like good news by mac miller | preferrab...            11   
423  14028  songs like good news by mac miller | preferrab...            24   
424  14030  something along the lines of either the chain ...            49   
425  14030  something along the lines of either the chain ...            29   
426  14032       heavy bass x gothic rap like oxygen by bones            29   

     end_offset                  label  
0         

In [4]:
# Change labels to Artist, Artist_or_WoA or WoA
def change_label(input_label):
    input_label = input_label.replace("_deduced", "")
    return input_label

music_ner_df["label"] = music_ner_df["label"].apply(change_label)
print(music_ner_df)

        id                                               text  start_offset  \
0    13434  i love radioheads kid a something similar | ki...             7   
1    13434  i love radioheads kid a something similar | ki...            61   
2    13435                anything similar to i fight dragons            20   
3    13436                music similar to ccrs travelin band            17   
4    13437                 songs similar to blackout by boris            17   
..     ...                                                ...           ...   
422  14028  songs like good news by mac miller | preferrab...            11   
423  14028  songs like good news by mac miller | preferrab...            24   
424  14030  something along the lines of either the chain ...            49   
425  14030  something along the lines of either the chain ...            29   
426  14032       heavy bass x gothic rap like oxygen by bones            29   

     end_offset          label  
0            17   

In [5]:
train_db = DocBin()
test_db = DocBin()

In [6]:
# Get a unique list of unique ids
ids = list(set(music_ner_df["id"].values))
print(len(ids))
# Split ids into training and test
train_ids, test_ids = train_test_split(ids)
print(len(train_ids))
print(len(test_ids))

226
169
57


In [7]:
# Go through the list of ids and get all the rows associated with each id
for id in ids:
    entity_rows = music_ner_df.loc[music_ner_df['id'] == id]
    text = entity_rows.head(1)["text"].values[0]
    doc = small_model(text)
    ents = []
    for index, row in entity_rows.iterrows():
        label = row["label"]
        start = row["start_offset"]
        end = row["end_offset"]
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        ents.append(span)
    doc.ents = ents
    if id in train_ids:
        train_db.add(doc)
    else:
        test_db.add(doc)
train_db.to_disk('../data/music_ner_train.spacy')
test_db.to_disk('../data/music_ner_test.spacy')

In [8]:
# Train the model
train("../data/spacy_config_ner.cfg", output_path="../models/spacy_music_ner")

[38;5;2m✔ Created output directory: ..\models\spacy_music_ner[0m
[38;5;4mℹ Saving to output directory: ..\models\spacy_music_ner[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'tagger', 'parser', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS TAGGER  LOSS PARSER  LOSS NER  TAG_ACC  POS_ACC  DEP_UAS  DEP_LAS  SENTS_F  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  -----------  -----------  --------  -------  -------  -------  -------  -------  ------  ------  ------  ------
  0       0          0.00        80.84       255.69     59.77    37.37     0.00    10.32    10.32     0.00    0.00    0.00    0.00    0.16
 11     200        639.54      2773.81      9576.97   4370.71    74.73     0.00    69.93    58.72    77.78   31.41   35.29   28.30    0.57
 24     400        632.67       228.33      1971.90    678.64    77.22     0.00    71.00    58.01    64.12   32.46   36.47   29.25  

In [9]:
# Use the trained model for prediction
nlp = spacy.load("../models/spacy_music_ner/model-last")
first_test_id = test_ids[0]
test_rows = music_ner_df.loc[music_ner_df['id'] == first_test_id]
input_text = entity_rows.head(1)["text"].values[0]
print(input_text)
print("Gold entities:")
for index, row in entity_rows.iterrows():
    label = row["label"]
    start = row["start_offset"]
    end = row["end_offset"]
    span = doc.char_span(start, end, label=label, alignment_mode="contract")
    print(span)
doc = nlp(input_text)
print("Predicted entities: ")
for entity in doc.ents:
    print(entity)

songs with themes of being unable to settle | ex hoziers someone new elle kings exes and ohs
Gold entities:
hoziers
someone new
elle kings
exes and ohs
Predicted entities: 
hoziers
someone new
elle kings
exes and ohs


In [10]:
# Evaluate the model
evaluate('../models/spacy_music_ner/model-last', '../data/music_ner_test.spacy')

{'token_acc': 1.0,
 'token_p': 1.0,
 'token_r': 1.0,
 'token_f': 1.0,
 'tag_acc': 0.7864768683274022,
 'sents_p': 0.6857142857142857,
 'sents_r': 0.8275862068965517,
 'sents_f': 0.75,
 'dep_uas': 0.7081850533807831,
 'dep_las': 0.599644128113879,
 'dep_las_per_type': {'compound': {'p': 0.46153846153846156,
   'r': 0.6666666666666666,
   'f': 0.5454545454545455},
  'nsubj': {'p': 0.6470588235294118, 'r': 0.44, 'f': 0.5238095238095238},
  'advmod': {'p': 0.75, 'r': 0.3157894736842105, 'f': 0.44444444444444436},
  'root': {'p': 0.6285714285714286, 'r': 0.7586206896551724, 'f': 0.6875},
  'dobj': {'p': 0.2608695652173913,
   'r': 0.2608695652173913,
   'f': 0.2608695652173913},
  'advcl': {'p': 0.0, 'r': 0.0, 'f': 0.0},
  'dep': {'p': 0.0, 'r': 0.0, 'f': 0.0},
  'det': {'p': 0.7352941176470589, 'r': 0.78125, 'f': 0.7575757575757576},
  'amod': {'p': 0.6888888888888889,
   'r': 0.5166666666666667,
   'f': 0.5904761904761905},
  'prep': {'p': 0.8333333333333334,
   'r': 0.8235294117647058,
 