# DaTacos

In [1]:
import warnings
import pandas as pd
from pandas.errors import SettingWithCopyWarning

# The subsets below are built with `.assign`, which does not trigger this warning;
# the filter is kept for any other cell that still assigns on a filtered slice.
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

# Tags looked up in the NER_TAGS column. The casing must match the tags stored in
# the data exactly ("B-WoA", not "B-WOA"), otherwise no row would ever match.
B_ARTIST = "B-Artist"
B_WOA = "B-WoA"

data = pd.read_parquet("../data/datacos_biotag.parquet").drop("ver_id", axis=1)

# Rows whose NER_TAGS contain no artist tag. Newlines and tabs in yt_processed are
# replaced by spaces (presumably to keep the ";"-separated export easy to read).
data_noartist = data.loc[~data.NER_TAGS.apply(lambda tags: B_ARTIST in tags), :].assign(
    yt_processed=lambda df: df.yt_processed
    .str.replace("\n", " ", regex=False)
    .str.replace("\t", " ", regex=False)
)
data_noartist.drop(["NER_TAGS", "TEXT"], axis=1).to_csv("../data/datacos_noartist.csv", sep=";")

# Rows whose NER_TAGS contain no title (work-of-art) tag, cleaned the same way.
data_notitle = data.loc[~data.NER_TAGS.apply(lambda tags: B_WOA in tags), :].assign(
    yt_processed=lambda df: df.yt_processed
    .str.replace("\n", " ", regex=False)
    .str.replace("\t", " ", regex=False)
)
data_notitle.drop(["NER_TAGS", "TEXT"], axis=1).to_csv("../data/datacos_notitle.csv", sep=";")

# Share of all rows that ended up in each subset, rounded to two decimals.
print(f"Videos without matched Artist entity: {round(len(data_noartist)/len(data), 2)}")
print(f"Videos without matched Title entity: {round(len(data_notitle)/len(data), 2)}")


Videos without matched Artist entity: 0.1
Videos without matched Title entity: 0.07


### Multiple artists, usually separated by "feat", "featuring", "vs"
- omitting feat. artist
    - eg. "dub spencer & trance hill feat the catch" vs. "dub spencer & trance hill"
- artist and feat. artist split by song title
    - e.g. "theo croker SONG_TITLE feat dee dee bridgewater" vs. "theo croker featuring dee dee bridgewater"
- both artists separately mentioned for "sugababes vs girls aloud"
### Unfuzzy-Problem
Are these problems solvable by an edit-distance-like approach?
- String + digit concatenated vs. not concatenated
    - eg. "equipe84" vs. "equipe 84"
    - eg. "06diphtheria" instead of "diphtheria"
- genitive in artist name
    - eg. "janice whales" instead of "janice whaley"
- absence of article "the"
    - eg. "cats" instead of "the cats"
- unclear pattern (encoding issue? wrong preprocessing?):
    - "robin\nfi?" in SHS metadata?
    - "all saints\ngb" ..
    - "sabrina\nph" ...
- wrong spacing
    - "di do" instead of "dido"
    - "vaughn deleath" instead of "vaughn de leath"
- missing single chars 
    - between tokens
        - eg "george wallace jr" instead of "george c wallace jr"
    - in tokens
        - eg "ariane moffat" instead of "ariane moffatt"
- typo
    - "rosanna eckert" instead of "rosana eckert"
- alternative name 
    - "european ensemble string quartett" instead of "european ensemble strings"
- "&" vs. "and" (eg. "jim and jesse" vs "jim & jesse")


In [2]:
# Fixed seed so the same ten examples are shown on every run.
data_noartist[["yt_processed", "performer_perf_processed"]].sample(10, random_state=42)


Unnamed: 0,yt_processed,performer_perf_processed
5680,barbra streisand & louis armstrong hello dolly...,[barbra streisand with louis armstrong]
11637,kashmir instrumental bluegrass version led zep...,[iron horse]
4611,art farmer blame it on my youth 1988 chris hua...,[the art farmer quartet]
13623,3sl touch me tease me top of the pops luis paz...,"[, estelle]"
2947,george lewis just a closer walk konrad klingel...,[george lewis ragtime band]
2052,i didnt know what time it was bea wain 1939 mr...,[bea wain with orchestra under the direction o...
4561,goodbye chris connor topic provided to youtube...,[chris connor accompanied by vinnie burke quar...
5767,bill bailey wont you please come home by pearl...,[pearl bailey with orchestra conducted by don ...
11332,john coltrane chim chim cheree quintupla from ...,[the john coltrane quartet]
6051,you took advantage of me lee wiley topic provi...,[miss lee wiley with joe bushkins orchestra]


In [3]:
# Fixed seed so the same ten examples are shown on every run.
data_notitle[["yt_processed", "title_perf_processed"]].sample(10, random_state=42)


Unnamed: 0,yt_processed,title_perf_processed
4117,meditacao nara leao+letra basiadanny quem acre...,meditação
2791,fool on the hill sergio mendes & brasil 66 har...,the fool on the hill
10132,james brown i feel good you tube james brown i...,i got you\ni feel good
8384,flyin easy debbie duncan debbieduncanjazz vide...,flying easy
1985,the joe mooney quartet nakajaiz the joe mooney...,tea for two
6039,margie joseph lets stay togetherwmv superxavie...,lets stay together
8515,brownie mcghee pawn shop blues edith de ronde ...,pawnshop blues
11974,me & mrsjones 1994 daryl hall konekoxox glasgow,me and mrs jones
11319,jane russell joshua fit de battle of jericho m...,joshua fit de battle of jerico
7926,bobby timmons do you know the way to san josew...,do you know the way to san jose


# SHS100K2

In [4]:
data = pd.read_parquet("../data/shs100k2_biotag.parquet")

# Rows whose NER_TAGS contain no "B-Artist" tag. Newlines and tabs in yt_processed
# are replaced by spaces (presumably to keep the ";"-separated export easy to read).
# `.assign` builds a new frame instead of assigning on a filtered slice.
data_noartist = data.loc[~data.NER_TAGS.apply(lambda tags: "B-Artist" in tags), :].assign(
    yt_processed=lambda df: df.yt_processed
    .str.replace("\n", " ", regex=False)
    .str.replace("\t", " ", regex=False)
)
# Written to an SHS100K2-specific file: the previous "datacos_*" target silently
# overwrote the DaTacos export of the same name.
data_noartist.drop(["NER_TAGS", "TEXT"], axis=1).to_csv("../data/shs100k2_noartist.csv", sep=";")

# Rows whose NER_TAGS contain no "B-WoA" tag, cleaned the same way.
data_notitle = data.loc[~data.NER_TAGS.apply(lambda tags: "B-WoA" in tags), :].assign(
    yt_processed=lambda df: df.yt_processed
    .str.replace("\n", " ", regex=False)
    .str.replace("\t", " ", regex=False)
)
data_notitle.drop(["NER_TAGS", "TEXT"], axis=1).to_csv("../data/shs100k2_notitle.csv", sep=";")

# Share of all rows that ended up in each subset, rounded to two decimals.
print(f"Videos without matched Artist entity: {round(len(data_noartist)/len(data), 2)}")
print(f"Videos without matched Title entity: {round(len(data_notitle)/len(data), 2)}")


Videos without matched Artist entity: 0.22
Videos without matched Title entity: 0.11


In [5]:
# Fixed seed so the same ten examples are shown on every run.
data_noartist[["yt_processed", "performer_processed"]].sample(10, random_state=42)


Unnamed: 0,yt_processed,performer_processed
12613,if i should fall behind springsteen cover lind...,robin & linda williams
55989,12th street rag by sol hoopiis trio 1927 cdbpd...,sol hoopiis novelty trio
67532,anita kerr singers everything must change 1979...,anita kerr harry van hoof pieter van vollenhoven
103389,elmers tune horst jankowski 1966 maynardcat be...,horst jankowski his piano and orchestra
83564,sammy davis jr with buddy rich come back to me...,sammy davis jr & buddy rich
18469,get a job corvairs norwiner a pittsburgh favorite,the corvairs
92244,mose allison i love the life i live music lege...,mose allison trio
58528,the melachrino orchestra and george melachrino...,the melachrino strings
18657,big 3 cass elliot winken blinken nod mamas and...,the big three
26310,nicoletta nossouvenirs quatre chansons il est ...,nicoletta orchestre sous la direction de jean ...


In [6]:
# Fixed seed so the same ten examples are shown on every run.
data_notitle[["yt_processed", "title_processed"]].sample(10, random_state=42)


Unnamed: 0,yt_processed,title_processed
11444,lolita seeman deine heimat ist das meer swedto...,seemann\ndeine heimat ist das meer
59235,anne murray a fool such as i asiong,a fool such as i\nnow and then
39100,黎明 leon cant take my eyes off umpg onyi1213,cant take my eyes off u
25586,caetano veloso eleanor rigby nega maluca billi...,nega maluca billy jean eleanor rigby
72338,cab calloway & his orchestra minnie the mooche...,minnie the moocher\nthe ho de ho song
74758,john keating solitairewmv mrpopilit http bboyb...,solitaire
105144,hank crawford dont cry babyavi sigmundgroid fr...,dont cry baby
54600,swanie river hop fats domino mitcheureka783 sw...,swanee river hop
20214,rory block preachin blues nicolas fournier ror...,preaching blues
66274,phil trigwell it doesn t matter anymore yeaaas...,it doesnt matter anymore
