# DaTacos

In [41]:
import warnings
import pandas as pd
from pandas.errors import SettingWithCopyWarning

warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

# Tag labels searched for in the NER_TAGS column.
B_ARTIST = "B-Artist"
B_WOA = "B-WoA"  # mixed case on purpose: this is the spelling the title filter matches on

data = pd.read_parquet("../data/datacos_biotag.parquet").drop("ver_id", axis=1)


def _rows_missing_tag(frame, tag):
    """Return the rows of ``frame`` whose NER_TAGS do not contain ``tag``.

    The result is a new frame built with ``assign`` (not a slice of ``frame``),
    with newlines and tabs in ``yt_processed`` replaced by spaces.
    """
    has_tag = frame.NER_TAGS.apply(lambda tags: tag in tags)
    return frame.loc[~has_tag, :].assign(
        yt_processed=lambda df: df.yt_processed.str.replace("\n", " ").str.replace("\t", " ")
    )


# data without artist and title tags each
data_noartist = _rows_missing_tag(data, B_ARTIST)
data_noartist.drop(["NER_TAGS", "TEXT"], axis=1).to_csv("../data/datacos_noartist.csv", sep=";")

data_notitle = _rows_missing_tag(data, B_WOA)
data_notitle.drop(["NER_TAGS", "TEXT"], axis=1).to_csv("../data/datacos_notitle.csv", sep=";")

# Share of all videos in which no Artist / Title tag was found.
print(f"Videos without matched Artist entity: {round(len(data_noartist)/len(data), 2)}")
print(f"Videos without matched Title entity: {round(len(data_notitle)/len(data), 2)}")


Videos without matched Artist entity: 0.22
Videos without matched Title entity: 0.07


### Multiple artists, usually separated by "feat", "featuring", "vs"
- omitting feat. artist
    - eg. "dub spencer & trance hill feat the catch" vs. "dub spencer & trance hill"
- artist and feat. artist split by song title
    - eg. "theo croker SONG_TITLE feat dee dee bridgewater" vs. "theo croker featuring dee dee bridgewater"
- both artists mentioned separately for "sugababes vs girls aloud"
### Unfuzzy-Problem
Are these problems solvable by an edit-distance-like approach?
- String + digit concatenated vs. not concatenated
    - eg. "equipe84" vs. "equipe 84"
    - eg. "06diphtheria" instead of "diphtheria"
- genitive in artist name
    - eg. "janice whales" instead of "janice whaley"
- absence of article "the"
    - eg. "cats" instead of "the cats"
- which pattern is this actually? encoding? wrong preprocessing?:
    - "robin\nfi?" in SHS metadata?
    - "all saints\ngb" ..
    - "sabrina\nph" ...
- wrong spacing
    - "di do" instead of "dido"
    - "vaughn deleath" instead of "vaughn de leath"
- missing single chars 
    - between tokens
        - eg "george wallace jr" instead of "george c wallace jr"
    - in tokens
        - eg "ariane moffat" instead of "ariane moffatt"
- typo
    - "rosanna eckert" instead of "rosana eckert"
- alternative name 
    - "european ensemble string quartett" instead of "european ensemble strings"
- "&" vs. "and" (eg. "jim and jesse" vs "jim & jesse")


In [33]:
# Ten random videos without an Artist tag: YouTube text next to the reference performer.
data_noartist.loc[:, ["yt_processed", "performer_perf_processed"]].sample(n=10)


Unnamed: 0,yt_processed,performer_perf_processed
4593,sarah vaughan mean to me okmusix sheet music &...,sarah vaughan with dizzy gillespie and his orc...
6296,sergio brasil 66 going out of my head jukejunk...,sergio mendes & brasil 66
5759,detour ahead terence blanchard w jeanie bryson...,terence blanchard with jeanie bryson
7432,mantovani when i grow too old to dream hossam ...,mantovani and his orchestra
4392,george shearing september in the rain boo cox,the george shearing quintette
9229,17 super heroes original roxy cast musicofrhps...,abigale haness b miller & company
1246,billie holiday you go to my head jazzinsomnia ...,billie holiday & her orchestra
7823,paulinho garcia & grazyna auguscik fragile tha...,grażyna auguścik paulinho garcia
12366,andre previn russ freeman shelly manne take me...,andré previn & russ freeman
1176,dizzy gillespies quintet they cant take that a...,dizzy gillespie and his orchestra


In [34]:
# Ten random videos without a Title tag: YouTube text next to the reference title.
data_notitle.loc[:, ["yt_processed", "title_perf_processed"]].sample(n=10)


Unnamed: 0,yt_processed,title_perf_processed
7620,the dells wives & lovers theoriginaldells wive...,wives and lovers
13027,give me something to remember you nat brandwyn...,give me something to remember you by
11340,ella fitzgerald hard hearted hannah vladimirsm...,hard hearted hannah\nthe vamp of savannah
5908,hugo winterhalter and his orchestra with a slo...,on a slow boat to china
5660,louis armstrong i cant believe that you are in...,i cant believe that youre in love with me
5896,too marvellous for words lita roza topic provi...,too marvelous for words
8482,2016 samba em preludio rita payés joan chamorr...,samba em prelúdio
2979,pete rugolo & his jazz band everithing happens...,everything happens to me
4931,june christy baby all the time http wwwchaylzc...,when sunny gets blue
12230,ill drown in my own tears the spencer davis gr...,ill drown in my tears


# SHS100K2

In [43]:
data = pd.read_parquet("../data/shs100k2_biotag.parquet")

# data without artist and title tags each
# `assign` builds a new frame, so no column is ever set on a slice of `data`.
data_noartist = data.loc[~data.NER_TAGS.apply(lambda tags: "B-Artist" in tags), :].assign(
    yt_processed=lambda df: df.yt_processed.str.replace("\n", " ").str.replace("\t", " ")
)
# Exported under an SHS100K2-specific file name so the DaTacos CSVs are not overwritten.
data_noartist.drop(["NER_TAGS", "TEXT"], axis=1).to_csv("../data/shs100k2_noartist.csv", sep=";")

data_notitle = data.loc[~data.NER_TAGS.apply(lambda tags: "B-WoA" in tags), :].assign(
    yt_processed=lambda df: df.yt_processed.str.replace("\n", " ").str.replace("\t", " ")
)
data_notitle.drop(["NER_TAGS", "TEXT"], axis=1).to_csv("../data/shs100k2_notitle.csv", sep=";")

# Share of all videos in which no Artist / Title tag was found.
print(f"Videos without matched Artist entity: {round(len(data_noartist)/len(data), 2)}")
print(f"Videos without matched Title entity: {round(len(data_notitle)/len(data), 2)}")


Videos without matched Artist entity: 0.22
Videos without matched Title entity: 0.11


In [45]:
# Ten random videos without an Artist tag: YouTube text next to the reference performer.
data_noartist.loc[:, ["yt_processed", "performer_processed"]].sample(n=10)


Unnamed: 0,yt_processed,performer_processed
35064,moodys mood for love by van morrisonwmv mrmusi...,van morrison
42821,doris day well be together again 1945 overjazz...,les brown and his orchestra vocal chorus by do...
107099,duke ellington comme çi comme ça 1962 anthony ...,duke ellington and his orchestra
26866,kevin eubanks & stanley jordan nature boy guen...,kevin eubanks stanley jordan
80492,louis jordan & his tympani 5 im alabama bound ...,louis jordan and his tympany five
86429,midnight cowboy ronnie aldrich roberto costa d...,ronnie aldrich and his two pianos
41958,count basie april in paris jazzbreaktv count b...,count basie and his orchestra
30878,quincy jones mack the knife alex mason an awes...,quincy jones and his orchestra
57658,its the talk of the town 1945 bing crosby croo...,bing crosby and jimmy dorsey and his orchestra
46388,anthony perkins long ago and far away tonyperk...,tony perkins


In [46]:
# Ten random videos without a Title tag: YouTube text next to the reference title.
data_notitle.loc[:, ["yt_processed", "title_processed"]].sample(n=10)


Unnamed: 0,yt_processed,title_processed
82610,javier corcobado le poninçonneur des lilaswmv ...,le poinçonneur des lilas
36382,sonny james i cant stop loving you richard cus...,i cant stop lovin you
68532,mint juleps your love keeps liftin me acapella...,higher and higher
53619,aslan alone again by gilbert osullivan cover i...,alone again\nnaturally
37123,aint necessarily so walter murphy 1979 moraito...,it aint necessarily so
4547,gregorian carpet crawlers genesis gregorianmas...,the carpet crawlers
71026,cher how can u mend a broken heart 2 moviewmv ...,how can you mend a broken heart
97216,ed ames thing called love with the lennon sist...,a thing called love
65928,john hammond i m leavin you cody jarett cheste...,im a man
52374,holly cole que sera sera funnytoo tootoofunny ...,que sera sera\nwhatever will be will be
