# DaTacos

In [48]:
import warnings
import pandas as pd
from pandas.errors import SettingWithCopyWarning

# Kept so that other cells which still assign into filtered frames stay quiet.
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

# BIO tags marking the beginning of an artist / work-of-art (title) entity.
B_ARTIST = "B-Artist"
# Was "B-WOA"; the title filter in this cell has always matched the literal
# "B-WoA", so the constant is aligned with that spelling and used below.
B_WOA = "B-WoA"

data = pd.read_parquet("../data/datacos_biotag.parquet").drop("ver_id", axis=1)

# Videos whose tag sequence contains no artist entity.
# `.assign` builds a new frame instead of writing into a filtered slice, and
# newlines/tabs are flattened so each record stays on one line of the CSV.
has_artist = data.NER_TAGS.apply(lambda tags: B_ARTIST in tags)
data_noartist = data.loc[~has_artist, :].assign(
    yt_processed=lambda df: df.yt_processed.str.replace("\n", " ").str.replace("\t", " ")
)
data_noartist.drop(["NER_TAGS", "TEXT"], axis=1).to_csv("../data/datacos_noartist.csv", sep=";")

# Videos whose tag sequence contains no title (work-of-art) entity.
has_title = data.NER_TAGS.apply(lambda tags: B_WOA in tags)
data_notitle = data.loc[~has_title, :].assign(
    yt_processed=lambda df: df.yt_processed.str.replace("\n", " ").str.replace("\t", " ")
)
data_notitle.drop(["NER_TAGS", "TEXT"], axis=1).to_csv("../data/datacos_notitle.csv", sep=";")

# Share of all videos (rounded to two decimals) that lack each entity type.
print(f"Videos without matched Artist entity: {round(len(data_noartist)/len(data), 2)}")
print(f"Videos without matched Title entity: {round(len(data_notitle)/len(data), 2)}")


Videos without matched Artist entity: 0.07
Videos without matched Title entity: 0.07


### Multiple artists, usually separated by "feat", "featuring", "vs"
- omitting feat. artist
    - eg. "dub spencer & trance hill feat the catch" vs. "dub spencer & trance hill"
- artist and feat. artist split by song title
    - eg. "theo croker SONG_TITLE feat dee dee bridgewater" vs. "theo croker featuring dee dee bridgewater"
- both artists mentioned separately, e.g. for "sugababes vs girls aloud"
### Unfuzzy-Problem
Are these problems solvable by edit-distance-like approach?
- String + digit concatenated vs. not concatenated
    - eg. "equipe84" vs. "equipe 84"
    - eg. "06diphtheria" instead of "diphtheria"
- genitive in artist name
    - eg. "janice whales" instead of "janice whaley"
- absence of article "the"
    - eg. "cats" instead of "the cats"
- unclear pattern (encoding issue? wrong preprocessing?):
    - "robin\nfi?" in SHS metadata?
    - "all saints\ngb" ..
    - "sabrina\nph" ...
- wrong spacing
    - "di do" instead of "dido"
    - "vaughn deleath"	instead of "vaughn de leath"	
- missing single chars 
    - between tokens
        - eg "george wallace jr" instead of "george c wallace jr"
    - in tokens
        - eg "ariane moffat" instead of "ariane moffatt"
- typo
    - "rosanna eckert" instead of "rosana eckert"
- alternative name 
    - "european ensemble string quartett" instead of "european ensemble strings"
- "&" vs. "and" (eg. "jim and jesse" vs "jim & jesse")


In [49]:
# Spot-check ten random videos without an artist tag: processed YouTube text
# next to the performer metadata column.
data_noartist[["yt_processed", "performer_perf_processed"]].sample(10)


Unnamed: 0,yt_processed,performer_perf_processed
6363,johnny williams orchestra - let's do it. ifcop...,[johnny williams his orchestra]
3964,frankie and johnnie-frank crumit. geofbrit59. ...,[frank crumit]
11156,i wish i could shimmy like my sister kate. liz...,[lizzie miles]
6682,show boat - 1936. peter marshhmallow. look at ...,"[helen morgan, hattie mcdaniel, paul robeson, ..."
2417,i'm in the mood for love by nat king cole w/ l...,[king cole trio]
1054,jan howard-bridge over troubled waters. myjust...,[jan howard]
3838,larry adler blues in the night. vfouli60. pho...,[larry adler quartet]
2929,gone with the wind. dave brubeck - topic. prov...,"[the dave brubeck quartet, paul desmond, dave ..."
13508,monomen - over the edge. teajay69. winners or ...,[mono men]
7899,coleman hawkins - it's the talk of the town (m...,[coleman hawkins' orchestra]


In [50]:
# Spot-check ten random videos without a title tag: processed YouTube text
# next to the title metadata column.
data_notitle[["yt_processed", "title_perf_processed"]].sample(10)


Unnamed: 0,yt_processed,title_perf_processed
4483,"benny goodman:- ""sometimes when i'm happy"" (19...",[sometimes i'm happy]
13477,"willie nelson - you left a long, long time ago...","[you left me a long, long time ago]"
2957,chad everett - ain t no sunshine. soul strut.,[ain't no sunshine]
5188,mae west - daytripper. matthew. from her 1966 ...,[day tripper]
10247,dickey lee - song sang blue. dvdman49. dickey ...,[song sung blue]
11568,frankie yankovic & his yanks-hava nagila. edif...,[hava nagila]
3615,the oh hellos fca mvmt iv - every bell on eart...,"[mvmt iv, ""every bell on earth will ring""]"
2303,helen forrest with benny goodman-how high the ...,[how high the moon]
4884,shirley bassey....who can i turn to?. cindb48....,[who can i turn to?]
7047,souls le ciel de paris - george melachrino. ne...,[sous le ciel de paris]


# SHS100K2

In [51]:
data = pd.read_parquet("../data/shs100k2_biotag.parquet")

# Videos whose tag sequence contains no artist entity.
# `.assign` builds a new frame instead of writing into a filtered slice, and
# newlines/tabs are flattened so each record stays on one line of the CSV.
has_artist = data.NER_TAGS.apply(lambda tags: "B-Artist" in tags)
data_noartist = data.loc[~has_artist, :].assign(
    yt_processed=lambda df: df.yt_processed.str.replace("\n", " ").str.replace("\t", " ")
)
# Written to an SHS100K2-specific file: this cell previously wrote to
# ../data/datacos_noartist.csv, overwriting the DaTacos export.
data_noartist.drop(["NER_TAGS", "TEXT"], axis=1).to_csv("../data/shs100k2_noartist.csv", sep=";")

# Videos whose tag sequence contains no title (work-of-art) entity.
has_title = data.NER_TAGS.apply(lambda tags: "B-WoA" in tags)
data_notitle = data.loc[~has_title, :].assign(
    yt_processed=lambda df: df.yt_processed.str.replace("\n", " ").str.replace("\t", " ")
)
# Same fix as above: previously wrote to ../data/datacos_notitle.csv.
data_notitle.drop(["NER_TAGS", "TEXT"], axis=1).to_csv("../data/shs100k2_notitle.csv", sep=";")

# Share of all videos (rounded to two decimals) that lack each entity type.
print(f"Videos without matched Artist entity: {round(len(data_noartist)/len(data), 2)}")
print(f"Videos without matched Title entity: {round(len(data_notitle)/len(data), 2)}")


Videos without matched Artist entity: 0.09
Videos without matched Title entity: 0.12


In [63]:
# Spot-check ten random videos without a title tag: processed YouTube text
# next to the title metadata column.
data_notitle[["yt_processed", "title_processed"]].sample(10)


Unnamed: 0,yt_processed,title_processed
46537,riot rockers be bop a lula. rockinsylvain.,[be-bop-a-lula]
106311,pooh-quello che non sai.flv. king47no. registr...,[quello che non sai]
104404,hazy shade of winter. john connors. anita kerr...,[a hazy shade of winter]
79170,the higsons - music to watch boys by. rubentxo...,[music to watch girls by]
45233,eddie higgins trio - you mast believe in sprin...,[you must believe in spring]
36805,polkadots & moonbeams - cassandra wilson. roos...,[polka dots and moon beams]
5498,one tin soldler. lararicki jack. skeeter davis,[one tin soldier]
61824,[ hifi ] will the circle be unbroken vol.2／ni...,[life's railway to heaven]
107937,heidi hauge - crystal candeliers. msanne5. hei...,[crystal chandeliers]
97871,status quo old time rock`n roll.. maius95. sta...,[old time rock and roll]
