# DaTacos

In [1]:
import warnings
import pandas as pd
from pandas.errors import SettingWithCopyWarning

warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

B_ARTIST = "B-Artist"
B_WOA = "B-WoA"

data = pd.read_parquet("../data/intermediate/datacos_biotag.parquet").drop("ver_id", axis=1)

# data without artist and title tags each
data_noartist = data.loc[~data.NER_TAGS.apply(lambda x: B_ARTIST in x), :]
data_noartist.yt_processed = data_noartist.yt_processed.str.replace("\n", " ").str.replace("\t", " ")
data_noartist.drop(["NER_TAGS", "TEXT"], axis=1).to_parquet("../data/analysis/datacos_noartist.parquet")

data_notitle = data.loc[~data.NER_TAGS.apply(lambda x: B_WOA in x), :]
data_notitle.yt_processed = data_notitle.yt_processed.str.replace("\n", " ").str.replace("\t", " ")
data_notitle.drop(["NER_TAGS", "TEXT"], axis=1).to_parquet("../data/analysis/datacos_notitle.parquet")

print(f"Videos without matched Artist entity: {round(len(data_noartist)/len(data), 2)}")
print(f"Videos without matched Title entity: {round(len(data_notitle)/len(data), 2)}")


AttributeError: 'DataFrame' object has no attribute 'NER_TAGS'

### Multiple artists, usually separated by "feat", "featuring", "vs"
- omitting feat. artist
    - eg. "dub spencer & trance hill feat the catch" vs. "dub spencer & trance hill"
- artist and feat. artist split by song title
    - eg. "theo croker SONG_TITLE feat dee dee bridgewater" Vs. "theo croker featuring dee dee bridgewater"
- both artists spearatly mentioned for "sugababes vs girls aloud"	
### Unfuzzy-Problem
Are these problems solvable by edit-distance-like approach?
- String + digit concatenated vs. not concatenated
    - eg. "equipe84" vs. "equipe 84"
    - eg. "06diphtheria" instead of "diphtheria"
- genetive in artist name
    - eg. "janice whales" instead of "janice whaley"
- absence of article "the"
    - eg. "cats" instead of "the cats"
- which pattern is this actually? encoding? wrong preprocessing?:
    - "robin\nfi?" in SHS metadata?
    - "all saints\ngb" ..
    - "sabrina\nph" ...
- wrong spacing
    - "di do" instead of "dido"
    - "vaughn deleath"	instead of "vaughn de leath"	
- missing single chars 
    - between tokens
        - eg "george wallace jr" instead of "george c wallace jr"
    - in tokens
        - eg "ariane moffat" instead of "ariane moffatt"
- typo
    - "rosanna eckert" instead of "rosana eckert"
- alternative name 
    - "european ensemble string quartett" instead of "european ensemble strings"
- "&" vs. "and" (eg. "jim and jesse" vs "jim & jesse")


In [2]:
data_noartist.sample(10)[["yt_processed", "performer_perf_processed"]]


Unnamed: 0,yt_processed,performer_perf_processed
6533,the old rugged cross - ray price. classiccount...,[ray price]
973,in a sentimental mood - billy eckstine (10/06/...,"[billy eckstine, his orch.]"
6469,dancing in the dark - jane morgan. yoichiro ta...,[jane morgan]
9469,livin' it up - ricky peterson. teimuraz67. 1990,[ricky peterson]
4595,mean to me big maybelle. max.,[big maybelle]
1256,smoke gets in your eyes (1933). thedidier568. ...,[gertrude niesen]
8886,валерия - дай бог. valeriyaofficial. валерия -...,[]
4133,you don't know me. finfer song. bobby goldsbor...,[bobby goldsboro]
4495,musica italia per l etiopia volare. radio cos...,[musicaitalia per l'etiopia]
1941,i only have eyes for you - swallows. jack stro...,"[the swallows, swallows]"


In [3]:
data_notitle.sample(10)[["yt_processed", "title_perf_processed"]]


Unnamed: 0,yt_processed,title_perf_processed
11745,michael jackson - thriller music video. theque...,[thriller]
6419,nukey pikes - dancing queen. supersummer7.,[dancing queen]
13430,mexican- helloween. francesco gissi. chico fer...,[the mexican]
53,running wild - firebreather. kuternoga.,[firebreather]
13139,gary lewis & the playboys - windy. hermansherm...,[windy]
8374,new york dolls - subway train. latacabre1. song,[subway train]
10046,osborne brothers - faded love. countryarg.,[faded love]
11568,frankie yankovic & his yanks-hava nagila. edif...,[hava nagila]
11249,joo kraus - tales in tones - africa. livekonze...,[africa]
6091,the deirdre wilson tabac get back (lennon - ...,[get back]


# SHS100K2

In [4]:
data = pd.read_parquet("../data/intermediate/shs100k2_biotag.parquet")

# data without artist and title tags each
data_noartist = data.loc[~data.NER_TAGS.apply(lambda x: B_ARTIST in x), :]
data_noartist.yt_processed = data_noartist.yt_processed.str.replace("\n", " ").str.replace("\t", " ")
data_noartist.drop(["NER_TAGS", "TEXT"], axis=1).to_parquet("../data/analysis/shs100k2_noartist.parquet")

data_notitle = data.loc[~data.NER_TAGS.apply(lambda x: B_WOA in x), :]
data_notitle.yt_processed = data_notitle.yt_processed.str.replace("\n", " ").str.replace("\t", " ")
data_notitle.drop(["NER_TAGS", "TEXT"], axis=1).to_parquet("../data/analysis/shs100k2_notitle.parquet")

print(f"Videos without matched Artist entity: {round(len(data_noartist)/len(data), 2)}")
print(f"Videos without matched Title entity: {round(len(data_notitle)/len(data), 2)}")


Videos without matched Artist entity: 0.05
Videos without matched Title entity: 0.03


In [5]:
data_notitle.sample(10)[["yt_processed", "title_processed"]]


Unnamed: 0,yt_processed,title_processed
44166,the beach boys - california dreaming. remy wen...,[california dreaming]
101575,spirit of memphis quartet - every time i feel ...,[every time i feel the spirit]
79128,bob wills and his texas playboys - blue prelud...,[blue prelude]
55680,yves montand - c'est si bon. fio re.,[c'est si bon]
4028,richard cheese-closer (nine inch nails cover)....,[closer]
93322,james cotton - the blues keep falling. elale g...,[the blues keep falling]
69409,ginny and the gallions - hava nagila. dj rodne...,[hava nagila]
50073,nina hagen - 1979 herrman's door (live). b dp.,[herman's door]
34121,megan mullally - st james infirmary. naum groz...,[st. james infirmary]
93435,vigon. dizzy miss lizzy. jean-charles. face b...,[dizzy miss lizzy]


In [6]:
from rapidfuzz.fuzz import partial_ratio_alignment

s1="the beach boys - california dreaming. remy wen..."
s2="california dreaming"

partial_ratio_alignment(s1, s2)

ScoreAlignment(score=100.0, src_start=17, src_end=36, dest_start=0, dest_end=19)

In [20]:
data_notitle.loc[44166, ["TEXT", "NER_TAGS"]]


TEXT        [the, beach, boys, -, california, dreaming., r...
NER_TAGS    [B-Artist, I-Artist, I-Artist, O, B-Artist, I-...
Name: 44166, dtype: object

In [13]:
data_notitle.NER_TAGS

63        [B-Artist, I-Artist, I-Artist, B-Artist, I-Art...
95           [B-Artist, I-Artist, O, B-Artist, I-Artist, O]
116       [B-Artist, I-Artist, O, B-Artist, I-Artist, I-...
132                                 [B-Artist, I-Artist, O]
139                [B-Artist, I-Artist, B-Artist, I-Artist]
                                ...                        
108379    [B-Artist, B-Artist, I-Artist, I-Artist, O, B-...
108381    [B-Artist, I-Artist, O, B-Artist, I-Artist, I-...
108451    [B-Artist, I-Artist, B-Artist, I-Artist, B-Art...
108463          [B-Artist, I-Artist, O, B-Artist, I-Artist]
108464    [B-Artist, I-Artist, O, B-Artist, I-Artist, I-...
Name: NER_TAGS, Length: 2572, dtype: object