# DaTacos

In [12]:
import warnings
import pandas as pd
from pandas.errors import SettingWithCopyWarning

# Silence pandas' SettingWithCopyWarning notebook-wide. This cell does not
# trigger it (columns are replaced via .assign on the filtered frame), but the
# filter is kept so the rest of the notebook behaves exactly as before.
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

# Entity-start tags looked up in the NER_TAGS column. B-WoA is reported as the
# "Title" entity below (presumably "work of art" -- confirm against the tagger).
B_ARTIST = "B-Artist"
B_WOA = "B-WoA"

data = pd.read_parquet("../data/intermediate/datacos_biotag.parquet").drop("ver_id", axis=1)

# Rows whose NER_TAGS contain no B-Artist tag. Newlines and tabs in the
# processed YouTube text are flattened to spaces before export.
has_artist = data.NER_TAGS.apply(lambda tags: B_ARTIST in tags)
data_noartist = data.loc[~has_artist, :].assign(
    yt_processed=lambda df: df.yt_processed.str.replace(r"[\n\t]", " ", regex=True)
)
data_noartist.drop(["NER_TAGS", "TEXT"], axis=1).to_parquet("../data/analysis/datacos_noartist.parquet")

# Same procedure for rows whose NER_TAGS contain no B-WoA (title) tag.
has_title = data.NER_TAGS.apply(lambda tags: B_WOA in tags)
data_notitle = data.loc[~has_title, :].assign(
    yt_processed=lambda df: df.yt_processed.str.replace(r"[\n\t]", " ", regex=True)
)
data_notitle.drop(["NER_TAGS", "TEXT"], axis=1).to_parquet("../data/analysis/datacos_notitle.parquet")

# Share of all rows (as a fraction, rounded to two decimals) lacking each entity.
print(f"Videos without matched Artist entity: {round(len(data_noartist)/len(data), 2)}")
print(f"Videos without matched Title entity: {round(len(data_notitle)/len(data), 2)}")


Videos without matched Artist entity: 0.07
Videos without matched Title entity: 0.07


### Multiple artists, usually separated by "feat", "featuring", "vs"
- omitting feat. artist
    - eg. "dub spencer & trance hill feat the catch" vs. "dub spencer & trance hill"
- artist and feat. artist split by song title
    - eg. "theo croker SONG_TITLE feat dee dee bridgewater" vs. "theo croker featuring dee dee bridgewater"
- both artists separately mentioned for "sugababes vs girls aloud"
### Unfuzzy-Problem
Are these problems solvable by an edit-distance-like approach?
- String + digit concatenated vs. not concatenated
    - eg. "equipe84" vs. "equipe 84"
    - eg. "06diphtheria" instead of "diphtheria"
- genitive in artist name
    - eg. "janice whales" instead of "janice whaley"
- absence of article "the"
    - eg. "cats" instead of "the cats"
- unclear pattern (encoding issue? wrong preprocessing?):
    - "robin\nfi?" in SHS metadata?
    - "all saints\ngb" ..
    - "sabrina\nph" ...
- wrong spacing
    - "di do" instead of "dido"
    - "vaughn deleath"	instead of "vaughn de leath"	
- missing single chars 
    - between tokens
        - eg "george wallace jr" instead of "george c wallace jr"
    - in tokens
        - eg "ariane moffat" instead of "ariane moffatt"
- typo
    - "rosanna eckert" instead of "rosana eckert"
- alternative name 
    - "european ensemble string quartett" instead of "european ensemble strings"
- "&" vs. "and" (eg. "jim and jesse" vs "jim & jesse")


In [13]:
# Fixed seed so the displayed sample stays the same across Restart & Run All.
data_noartist[["yt_processed", "performer_perf_processed"]].sample(10, random_state=42)


Unnamed: 0,yt_processed,performer_perf_processed
1288,brian wilson-we wish you a merry christmas. ja...,[brian wilson]
4871,hank william sr - your cheatin heart lyrics. t...,"[hank williams, drifting cowboys]"
3394,hadda brooks-basin street blues modern records...,[hadda brooks trio]
12672,"the string quartet ""comfortably numb"" (pink fl...",[vitamin string quartet]
1468,the moonglows-secret love. carlos rasool. the ...,"[the moonglows, the red holloway orchestra, mo..."
1171,5roy hamilton -- speak low. vintfy.,[roy hamilton]
9146,hell's belles-rock 'n' roll damnation. she rox...,[hell's bells]
11597,petula clark - san francisco. brewerfan714. ex...,[pet clark]
9171,quentin - dark side (top 8). aayla secura. que...,[quentin alexander]
8765,the string quartet tribute to metallica - one....,"[the angry string orchestra, angry string orch..."


In [14]:
# Fixed seed so the displayed sample stays the same across Restart & Run All.
data_notitle[["yt_processed", "title_perf_processed"]].sample(10, random_state=42)


Unnamed: 0,yt_processed,title_perf_processed
8953,"""lord build me just a cabin in the corner of g...","[lord, build me a cabin in glory]"
4344,green green grass of home--charley pride. elym...,"[green, green grass of home]"
6716,alone again....naturally by vonda shepard. ter...,"[alone again (naturally), alone again]"
13496,the cars-shake it up. ryder276.,[shake it up]
6210,"bing crosby - ""folks on the hill"" (vintage par...",[the folks who live on the hill]
2401,dave pell octet ft. lucy ann polk - polka dots...,[polka dots and moon beams]
5356,boots randolph-gentle on my mind. jim fox. cou...,[gentle on my mind]
5083,lee wiley - ghost of a chance. overjazz. lee w...,[a ghost of a chance]
10265,jackson 5-i'll be there. m m. i really love th...,[i'll be there]
9908,"james morrison with emma pask ""wouldn't it be ...",[wouldn't it be loverly]


# SHS100K2

In [15]:
data = pd.read_parquet("../data/intermediate/shs100k2_biotag.parquet")

# Rows whose NER_TAGS contain no B-Artist tag. The column is replaced via
# .assign on the filtered frame (no assignment on a slice), with newlines and
# tabs in the processed YouTube text flattened to spaces before export.
has_artist = data.NER_TAGS.apply(lambda tags: B_ARTIST in tags)
data_noartist = data.loc[~has_artist, :].assign(
    yt_processed=lambda df: df.yt_processed.str.replace(r"[\n\t]", " ", regex=True)
)
data_noartist.drop(["NER_TAGS", "TEXT"], axis=1).to_parquet("../data/analysis/shs100k2_noartist.parquet")

# Same procedure for rows whose NER_TAGS contain no B-WoA (title) tag.
has_title = data.NER_TAGS.apply(lambda tags: B_WOA in tags)
data_notitle = data.loc[~has_title, :].assign(
    yt_processed=lambda df: df.yt_processed.str.replace(r"[\n\t]", " ", regex=True)
)
data_notitle.drop(["NER_TAGS", "TEXT"], axis=1).to_parquet("../data/analysis/shs100k2_notitle.parquet")

# Share of all rows (as a fraction, rounded to two decimals) lacking each entity.
print(f"Videos without matched Artist entity: {round(len(data_noartist)/len(data), 2)}")
print(f"Videos without matched Title entity: {round(len(data_notitle)/len(data), 2)}")


Videos without matched Artist entity: 0.09
Videos without matched Title entity: 0.12


In [16]:
# Fixed seed so the displayed sample stays the same across Restart & Run All.
data_notitle[["yt_processed", "title_processed"]].sample(10, random_state=42)


Unnamed: 0,yt_processed,title_processed
100595,shakin stevens - sweet little rock'n roll. mur...,[sweet little rock and roller]
92084,blaggards- drunken sailor-feat spongebob squar...,[drunken sailor]
50859,disco gramola-tommy edwards-you win again. osv...,[you win again]
24410,ella fitzgerald - round midnight. tigi white. ...,['round midnite]
96244,"in the cool, cool of the evening - ray conniff...","[in the cool, cool, cool of the evening]"
47617,shani wallis...don't take your love from me. m...,[don't take your love from me]
29137,tammy wynette-you'll never walk alone. myjusti...,[you'll never walk alone]
60369,jazz pilots harry reser tom stacks - sleepy-ti...,[sleepy time gal]
108193,black eyed dog/free to run. gomez - topic. pro...,[black eyed dog / free to run]
55205,vieille canaille_reprise eddy mitchell/gainsbo...,[vieille fille]
