# DaTacos

In [1]:
import warnings
import pandas as pd
from pandas.errors import SettingWithCopyWarning

# Suppress pandas' SettingWithCopyWarning for the rest of the session.
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

# Tag names (presumably the "begin" tags of the IOB scheme -- confirm against
# the preprocessing code).
B_ARTIST = "B-Artist"
B_WOA = "B-WoA"

# Values of the `part` column that are compared against below.
Artist_nan = "Artist_nan"
WoA_nan = "WoA_nan"
both_nan = "both_nan"

# Load the DaTacos IOB dataset; the `ver_id` column is dropped right away.
data = pd.read_parquet("../data/intermediate/datacos_IOB.parquet").drop(columns="ver_id")

# Row counts per `part` value, reported as a share of all rows.
n_videos = len(data)
n_artist_missing = len(data[data.part == Artist_nan])
n_woa_missing = len(data[data.part == WoA_nan])
n_both_missing = len(data[data.part == both_nan])

print(f"Videos without matched Artist entity: {round(n_artist_missing / n_videos, 2)}")
print(f"Videos without matched Title entity: {round(n_woa_missing / n_videos, 2)}")
print(f"Videos without any entity: {round(n_both_missing / n_videos, 2)}")


Videos without matched Artist entity: 0.04
Videos without matched Title entity: 0.03
Videos without any entity: 0.0


In [2]:
# Step 1: number of rows per (part, split, set_id); the count lands in column 'i'.
grouped = (
    data
    .groupby(['part', 'split', 'set_id'])
    .size()
    .rename('i')
    .reset_index()
)

# Step 2: for every (part, split), how many set_ids have exactly 'i' rows.
sizes = (
    grouped
    .groupby(['part', 'split', 'i'])
    .size()
    .rename('num_set_ids')
    .reset_index()
)

# Step 3: wide layout -- one row per (part, split), one column per group size,
# with 0 where a size does not occur.
pivoted = sizes.pivot_table(
    index=['part', 'split'],
    columns='i',
    values='num_set_ids',
    fill_value=0,
).reset_index()
pivoted


i,part,split,1,2,3,4,5,6,7,8,9,10,11,12,13
0,Artist_nan,TEST,343.0,74.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,WoA_nan,TEST,188.0,37.0,16.0,4.0,2.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,both_100,TEST,1532.0,1.0,4.0,8.0,11.0,25.0,40.0,69.0,129.0,194.0,239.0,192.0,86.0
3,medium,TEST,410.0,123.0,43.0,24.0,9.0,7.0,3.0,3.0,1.0,0.0,0.0,0.0,0.0


### Multiple artists, usually separated by "feat", "featuring", "vs"
- omitting feat. artist
    - eg. "dub spencer & trance hill feat the catch" vs. "dub spencer & trance hill"
- artist and feat. artist split by song title
    - eg. "theo croker SONG_TITLE feat dee dee bridgewater" vs. "theo croker featuring dee dee bridgewater"
- both artists mentioned separately, e.g. for "sugababes vs girls aloud"
### Unfuzzy-Problem
Are these problems solvable by an edit-distance-like approach?
- String + digit concatenated vs. not concatenated
    - eg. "equipe84" vs. "equipe 84"
    - eg. "06diphtheria" instead of "diphtheria"
- genitive in artist name
    - eg. "janice whales" instead of "janice whaley"
- absence of article "the"
    - eg. "cats" instead of "the cats"
- which pattern is this actually? encoding? wrong preprocessing?:
    - "robin\nfi?" in SHS metadata?
    - "all saints\ngb" ..
    - "sabrina\nph" ...
- wrong spacing
    - "di do" instead of "dido"
    - "vaughn deleath"	instead of "vaughn de leath"	
- missing single chars 
    - between tokens
        - eg "george wallace jr" instead of "george c wallace jr"
    - in tokens
        - eg "ariane moffat" instead of "ariane moffatt"
- typo
    - "rosanna eckert" instead of "rosana eckert"
- alternative name 
    - "european ensemble string quartett" instead of "european ensemble strings"
- "&" vs. "and" (eg. "jim and jesse" vs "jim & jesse")


In [3]:
# Draw 10 rows whose `part` equals Artist_nan and show the processed YouTube
# text next to the processed performer metadata.
# A fixed random_state makes the inspected examples reproducible across re-runs.
data[data.part == Artist_nan].sample(10, random_state=42)[["yt_processed", "performer_perf_processed"]]


Unnamed: 0,yt_processed,performer_perf_processed
475,the string quartet - no surprises (radiohead c...,"[the section vitamin string quartet, section v..."
8632,adam baldych & helge lien - teardrop. pepital ...,"[adam badych, helge lien trio]"
8406,louisiana fairy tale by the casini club orche...,[casani club band directed by charlie kunz]
10191,royal philharmonic orchestra-take a look at me...,[the royal philharmonic orchestra conducted by...
10924,"yesterday once more. lovelyluz. ""d tc4ever""",[carpenters]
11725,trolly song. wb5oxq. the trolly song,"[marty gold, orchestra]"
7316,bennie krueger i cried for you roaring 20's.mp...,"[benny krueger, orchestra]"
655,over the rainbow- erroll garner trio. hotshiaw...,[errol garner]
4977,mighty flea - ode to billy joe. phill most.,"[the mighty flea featured, johnny otis show, m..."
6814,two sleepy people. jørgen ingmann - topic. pro...,[jrgen ingmann]


In [4]:
# Draw 10 rows whose `part` equals WoA_nan and show the processed YouTube
# text next to the processed title metadata.
# A fixed random_state makes the inspected examples reproducible across re-runs.
data[data.part == WoA_nan].sample(10, random_state=42)[["yt_processed", "title_perf_processed"]]


Unnamed: 0,yt_processed,title_perf_processed
10806,"run c & w - signed, sealed, delivered (i'm yo...","[signed, sealed, delivered i'm yours]"
4551,nat shilkret and the victor orchestra “am i bl...,[am i blue?]
4847,musica libre the spectrum ob la di ob la da 19...,"[ob-la-di, ob-la-da]"
13398,liberace dances - the blue danube - the libera...,[the blue danube waltz]
6273,"helen reddy - hit the road, jack. soultube. gr...",[hit the road jack]
10792,gene mcdaniels - feel like making love. colin ...,[feel like makin' love]
10138,the jam - i feel good - the legendary soul ses...,[i got you]
6852,"savannah churchill (& grp.) - don't cry, darli...",[don't take your love from me]
7250,jackie trent - reach out (i'll be there). fran...,[reach out i'll be there]
9366,slim whitman - no other arms. david l. rogers....,"[no other arms, no other lips]"


# SHS100K2

In [5]:
# Load the SHS100K2 dataset; from here on `data` refers to this frame.
data = pd.read_parquet("../data/intermediate/shs100k2_biotag.parquet")

# Row counts per `part` value, reported as a share of all rows.
n_videos = len(data)
n_artist_missing = len(data[data.part == Artist_nan])
n_woa_missing = len(data[data.part == WoA_nan])
n_both_missing = len(data[data.part == both_nan])

print(f"Videos without matched Artist entity: {round(n_artist_missing / n_videos, 2)}")
print(f"Videos without matched Title entity: {round(n_woa_missing / n_videos, 2)}")
print(f"Videos without any entity: {round(n_both_missing / n_videos, 2)}")


# Number of rows per (part, split, set_id); the count lands in column 'i'.
grouped = (
    data
    .groupby(['part', 'split', 'set_id'])
    .size()
    .rename('i')
    .reset_index()
)

# For every (part, split), how many set_ids have exactly 'i' rows.
sizes = (
    grouped
    .groupby(['part', 'split', 'i'])
    .size()
    .rename('num_set_ids')
    .reset_index()
)

# Wide layout: one row per (part, split), one column per group size, 0 where absent.
pivoted = sizes.pivot_table(
    index=['part', 'split'],
    columns='i',
    values='num_set_ids',
    fill_value=0,
).reset_index()
pivoted



Videos without matched Artist entity: 0.05
Videos without matched Title entity: 0.05
Videos without any entity: 0.0


i,part,split,1,2,3,4,5,6,7,8,...,109,111,122,124,127,138,148,171,184,256
0,Artist_nan,TEST,289.0,41.0,9.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Artist_nan,TRAIN,1456.0,486.0,150.0,64.0,20.0,9.0,3.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Artist_nan,VAL,336.0,47.0,6.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,WoA_nan,TEST,220.0,36.0,13.0,3.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,WoA_nan,TRAIN,1076.0,392.0,148.0,73.0,44.0,30.0,12.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,WoA_nan,VAL,284.0,48.0,10.0,3.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,both_100,TEST,200.0,358.0,333.0,262.0,171.0,113.0,55.0,33.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,both_100,TRAIN,133.0,300.0,370.0,357.0,505.0,425.0,416.0,359.0,...,1.0,1.0,0.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0
8,both_100,VAL,183.0,304.0,343.0,309.0,230.0,167.0,108.0,61.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,medium,TEST,339.0,93.0,31.0,13.0,4.0,4.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Draw 10 rows whose `part` equals Artist_nan and show the processed YouTube
# text next to the processed performer metadata.
# A fixed random_state makes the inspected examples reproducible across re-runs.
data[data.part == Artist_nan].sample(10, random_state=42)[["yt_processed", "performer_processed"]]


Unnamed: 0,yt_processed,performer_processed
15474,donald bryant - i like it like that (1964). jo...,[don bryant]
83434,bomull i øra vazelina bilopphøggers. skytebas....,[vazelina bilopphggers]
9755,two more bottles of wine. mrbigsvideos. i want...,[emmylou harris]
75949,rarezas rock 11. lacopla2008.,"[rocky sharpe, the replays, replays]"
59643,the survivors - peace in the valley. cashfan.,"[johnny cash, jerry lee lewis, carl perkins]"
107678,"morse, portnoy, george - i saw the light. пан...",[morse portnoy george]
5198,thee headcoatees - the witch. thegaragebitch.,[thee headcoat sect]
9770,пелагея (pelagea) - home (depeche mode cover)....,[]
1637,siniestro total - siniestro total (playback tv...,[siniestro total]
79412,hair - the flesh failures (let the sunshine in...,"[james rado, lynn kellogg, melba moore, company]"


In [7]:
# Draw 10 rows whose `part` equals WoA_nan and show the processed YouTube
# text next to the processed title metadata.
# A fixed random_state makes the inspected examples reproducible across re-runs.
data[data.part == WoA_nan].sample(10, random_state=42)[["yt_processed", "title_processed"]]


Unnamed: 0,yt_processed,title_processed
53781,kingston trio - sloop john b. marcelo san migu...,"[wreck of the ""john b""]"
7246,beseech - highwayman ( official video ). despo...,[the highwayman]
92282,"the ""chirping"" crickets /// 10. send me some l...",[send me some lovin']
79618,tethered moon - 8. the bilbao song. bruce brow...,[bilbao-song]
66107,jasmine bonnin -ndr-musikausstudiob vom 28.4.7...,[straen unserer stadt]
59801,roger whittaker - the lion sleeps tonight (liv...,[wimoweh]
21079,rick derringer - pride & joy. slavko polic.,[pride and joy]
9992,aiden - die die my darling. astaeria. lyrics: ...,"[die, die my darling]"
47188,"fénix jazz band - ""bill bailey vuelve a casa p...","[won't you please come home, bill bailey?]"
75163,paolo nutini - wanna be like you - slideshow. ...,[i want to be like you]


### Check why an entity is not found
Debugging and understanding the partial matching for dataset creation

In [9]:
import importlib.util
import sys

from rapidfuzz.fuzz import partial_ratio_alignment

# Make the preprocessing directory importable before pulling in `Utils`
# (presumably Utils.py lives there -- confirm).
sys.path.append("../preprocessing")
from Utils import simplify_string

# The script's file name starts with a digit, so it cannot be loaded with a
# plain `import` statement; build the module from its file path instead.
module_path = '../preprocessing/2_make_IOB_dataset.py'
module_name = '2_make_IOB_dataset'
spec = importlib.util.spec_from_file_location(module_name, module_path)
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)


# Row under inspection, selected by index label from the currently loaded `data`.
item = data.loc[47653]
text = simplify_string(item.yt_processed)
title = simplify_string(item.title[0].lower())
performer = simplify_string(item.performer[0].lower())

# Only the last expression of a cell is rendered, so the alignment result has
# to be printed explicitly; otherwise it is computed and silently discarded.
print(partial_ratio_alignment(text, title))

# Result of the project's own partial matcher for the same text/title pair.
module.find_word_partial(text, title)


((-1, -1), None)

# Annotation Dataset Creation
Stratified sampling:
- Split: Test, Train, Val
- Missing attribute: WoA, Artist
