In [1]:
from rapidfuzz import process, fuzz, utils
import pandas as pd
from tqdm import tqdm

shs100k = pd.read_parquet("/data/csi_datasets/shs100k2_yt.parquet")
datacos = pd.read_parquet("/data/csi_datasets/datacos_yt.parquet")
shs_yt = pd.read_parquet("/data/csi_datasets/shs_yt_yt.parquet")
shs100k_val = pd.read_csv("/data/csi_datasets/shs100k2_val.csv", sep=";")
shs100k_val.query("has_file & has_cqt_ch & has_cqt_20 & has_crema").sample(500).to_csv(
    "/data/csi_datasets/shs100k2_val500.csv", sep=";")


# Rapid Fuzz Random Example

In [13]:
random_video = shs100k.sample(n=1)
random_video.head()

song_title = random_video.title.item().lower()
video_title = random_video.video_title.item().lower()
video_description = random_video.description.item().lower()

#song_title = "dawn of time"
#video_title = "performed live: dawn of times in memphis"
#video_description = "the band ledije performs dawn of time in memphis in 2020"

# song title in video title
st_vt = fuzz.token_ratio(song_title, video_title)
vt_st = fuzz.token_ratio(video_title, song_title)
# song title in description
st_vd = fuzz.token_ratio(song_title, video_description)
vd_st = fuzz.token_ratio(video_description, song_title)

print(f"song title: {song_title}")
print(f"video title: {video_title}")
print(f"video description: {video_description}")

print(f"st-vt {st_vt}")
print(f"vt-st {vt_st}")
print(f"vd-st {st_vd}")
print(f"st-vd {vd_st}")


song title: georgy porgy
video title: (je suis funky je suis frenchy) dwight druik - georgy porgy (1980)
video description: http://www.discogs.com/various-je-suis-funky-je-suis-frenchy/release/701938
cover de toto : "georgy porgy".
st-vt 100.0
vt-st 100.0
vd-st 20.168067226890756
st-vd 20.168067226890756


'dawn of tim'

In [6]:
pd.read_json("results/shs100k2_val/funcs/results.json").sort_values(
    by="nDCG_bin", ascending=False
)


Unnamed: 0,scorer,eval,time,HR10,MR1,MRR,P@10,Queries,Relevant Items,mAP,nDCG_bin,nDCG_ord,rP
2,rapidfuzz.fuzz_cpp_avx2_token_ratio,title,4.216771,0.918059,111.346077,0.774036,0.358961,8819,44366,0.605646,0.749151,,0.599789
6,rapidfuzz.fuzz_cpp_avx2_token_set_ratio,title,3.10162,0.917945,111.470131,0.77385,0.358675,8819,44366,0.605208,0.748809,,0.599392
12,rapidfuzz.fuzz_cpp_avx2_partial_token_sort_ratio,title,4.703931,0.888991,114.613068,0.725146,0.318437,8819,44366,0.530141,0.700759,,0.502555
0,rapidfuzz.fuzz_cpp_avx2_ratio,title,0.040834,0.864042,126.407646,0.700352,0.261376,8819,44366,0.444718,0.639096,,0.433476
20,rapidfuzz.fuzz_cpp_avx2_QRatio,title,0.086903,0.864042,126.407646,0.700352,0.261376,8819,44366,0.444718,0.639096,,0.433476
1,rapidfuzz.fuzz_cpp_avx2_ratio,performer+title,0.144848,0.814145,141.477905,0.602377,0.254944,8819,44366,0.405877,0.595442,,0.407115
21,rapidfuzz.fuzz_cpp_avx2_QRatio,performer+title,0.176853,0.814145,141.477905,0.602377,0.254944,8819,44366,0.405877,0.595442,,0.407115
3,rapidfuzz.fuzz_cpp_avx2_token_ratio,performer+title,4.368339,0.863699,127.898605,0.467139,0.307359,8819,44366,0.419721,0.594974,,0.458668
10,rapidfuzz.fuzz_cpp_avx2_token_sort_ratio,title,0.16422,0.836347,135.515228,0.662889,0.230453,8819,44366,0.385018,0.594403,,0.379098
7,rapidfuzz.fuzz_cpp_avx2_token_set_ratio,performer+title,3.479983,0.861753,128.361298,0.466407,0.306649,8819,44366,0.418796,0.59413,,0.457552


# ChatGPT tries...gen NER dataset

In [40]:
import spacy

def create_ner_dataset(longer_sequence, shorter_sequence):
    # Load spaCy English model for tokenization
    nlp = spacy.load("en_core_web_sm")

    # Tokenize the longer and shorter sequences
    longer_tokens = [token.text for token in nlp(longer_sequence)]
    shorter_tokens = [token.text for token in nlp(shorter_sequence)]

    # Initialize the output dictionary with all tokens set to 0
    output_dict = {token: 0 for token in longer_tokens}

    # Find the start index of the overlapping region
    start_index = longer_tokens.index(shorter_tokens[0])

    # Mark the tokens in the overlapping region as 1
    for i in range(start_index, start_index + len(shorter_tokens)):
        output_dict[longer_tokens[i]] = 1

    return output_dict

# Example usage
longer_sequence = "This is a longer sequence with more tokens."
shorter_sequence = "longer sequence"
result = create_ner_dataset(longer_sequence, shorter_sequence)
print(result)


{'This': 0, 'is': 0, 'a': 0, 'longer': 1, 'sequence': 1, 'with': 0, 'more': 0, 'tokens': 0, '.': 0}


In [46]:
import pandas as pd
import spacy

def create_ner_dataset(row):
    # Tokenize the longer and shorter sequences
    longer_tokens = [(token.text, 'UNKN') for token in nlp(row['video_title'])]
    title_tokens = [(token.text, 'TITLE') for token in nlp(row['title'])]
    performer_tokens = [(token.text, 'PERFORMER') for token in nlp(row['performer'])]

    # Create sets of tokens for 'title' and 'performer'
    title_set = set(token for token, _ in title_tokens)
    performer_set = set(token for token, _ in performer_tokens)

    # Initialize the output list of tuples
    output_list = []

    # Populate the output list with tuples from 'title' and 'performer'
    for i, (token, _) in enumerate(longer_tokens):
        if token in title_set:
            output_list.append((i, token, 'TITLE'))
        elif token in performer_set:
            output_list.append((i, token, 'PERFORMER'))
        else:
            output_list.append((i, token, 'UNKN'))

    return output_list

# Example DataFrame
data = {'video_title': ["This is a longer sequence with more tokens.", "Another video title"],
        'title': ["longer sequence", "Another title"],
        'performer': ["more tokens", "Another performer"]}

df = pd.DataFrame(data)

# Load spaCy English model for tokenization
nlp = spacy.load("en_core_web_sm")

# Apply the create_ner_dataset function to each row of the DataFrame
result_list = df.apply(create_ner_dataset, axis=1).tolist()

# Print the list of lists of tuples
print(result_list)


[[(0, 'This', 'UNKN'), (1, 'is', 'UNKN'), (2, 'a', 'UNKN'), (3, 'longer', 'TITLE'), (4, 'sequence', 'TITLE'), (5, 'with', 'UNKN'), (6, 'more', 'PERFORMER'), (7, 'tokens', 'PERFORMER'), (8, '.', 'UNKN')], [(0, 'Another', 'TITLE'), (1, 'video', 'UNKN'), (2, 'title', 'TITLE')]]


In [1]:
def ner_row(row, col="title"):
    
    def encode(sub, sent):
        subwords, sentwords = sub.lower().split(), sent.lower().split()
        print(subwords)
        res = ["UNK" for _ in sentwords]    
        for i, word in enumerate(sentwords[:-len(subwords) + 1]):
            if all(x == y for x, y in zip(subwords, sentwords[i:i + len(subwords)])):
                for j in range(len(subwords)):
                    res[i + j] = col.upper()
        return res
    
    doc = row.video_title
    qry = row[col]
    
    seq_qry = encode(qry, doc)

    return seq_qry


df = shs100k.head()
df[["video_title", "title", "performer"]]


NameError: name 'shs100k' is not defined

In [80]:
df.apply(lambda x: ner_row(x, 'title'), axis=1)


['yesterday']
['yesterday']
['yesterday']
['yesterday']
['yesterday']


0    [UNK, UNK, UNK, UNK, UNK, UNK]
1              [UNK, UNK, UNK, UNK]
4              [UNK, UNK, UNK, UNK]
5              [UNK, UNK, UNK, UNK]
6              [UNK, UNK, UNK, UNK]
dtype: object

In [79]:
def encode(sub, sent):
    subwords, sentwords = sub.split(), sent.split()
    res = [0 for _ in sentwords]    
    for i, word in enumerate(sentwords[:-len(subwords) + 1]):
        if all(x == y for x, y in zip(subwords, sentwords[i:i + len(subwords)])):
            for j in range(len(subwords)):
                res[i + j] = 1
    return res

def encode2(sub, sent):
    subword, sentwords = sub, sent.split()
    res = [0] * len(sentwords)

    for i, word in enumerate(sentwords):
        if word == subword:
            res[i] = 1

    return res

encode2("yesterday i", "marianne faithful yesterday i")

[0, 0, 0, 0]

In [33]:
q="You're All I Need to Get By"
d="DIANA ROSS  you're all i need to get by"

fuzz.QRatio(q.lower(), d.lower())

81.81818181818181

In [44]:
import spacy

NER = spacy.load("en_core_web_sm")

doc = NER(d)
entities = [(e.label_,e.text) for e in doc.ents]



In [45]:
entities

[('PERSON', 'DIANA ROSS')]

In [6]:
# query (title, performer)
# yt attr (video title, channel name, description)
match_dict = {}

for query_column in ["title", "performer"]:
    
    print(f"qry col: {query_column}")
    match_dict[query_column] = {}
    
    for document_column in tqdm(["video_title", "description", "channel_name"]):
        
        print(f"doc col: {document_column}")

        match_dict[query_column][document_column] = {}
        
                        
        results = process.cdist(queries=shs100k[query_column].to_list(), 
                          choices=shs100k[document_column].to_list(), )
                        
        match_dict[query_column][document_column]["token_set_ratio"] = results
            


qry col: title
doc col: video_title


  0%|          | 0/11 [00:22<?, ?it/s]


KeyboardInterrupt: 

Unnamed: 0,title,video_title
30125,Manha de Carnaval,Mason Williams


In [None]:
shs100k2