# Preprocessing to one SHS100k2 parquet file

In [27]:
import pandas as pd

# Read each SHS100K2 split file and record which split every row came from.
shs100k2_test = pd.read_csv("../data/shs100k2_test.csv", sep=";").assign(split="TEST")
shs100k2_train = pd.read_csv("../data/shs100k2_train.csv", sep=";").assign(split="TRAIN")
shs100k2_val = pd.read_csv("../data/shs100k2_val.csv", sep=";").assign(split="VAL")

# Stack the splits in the order test, val, train under a fresh 0..n-1 index.
split_frames = [shs100k2_test, shs100k2_val, shs100k2_train]
shs100k2 = pd.concat(split_frames, axis=0, ignore_index=True)

# YouTube metadata; its "title" column is exposed as "video_title".
yt_metadata = pd.read_parquet("../data/yt_metadata.parquet").rename(
    columns={"title": "video_title"}
)

# Left-join the metadata onto the split rows via "yt_id".
# NOTE(review): reset_index() presumably turns a "yt_id" index into a column - confirm.
shs100k2_yt = pd.merge(shs100k2, yt_metadata.reset_index(), how="left", on="yt_id")

# Keep only rows that received a video title, drop the "link" column, and persist.
shs100k2_yt = shs100k2_yt.dropna(subset=["video_title"]).drop(columns=["link"])
shs100k2_yt.to_parquet("/home/hachmeier/data/shs100k2_yt.parquet")




# Preprocessing to Abt-Buy-like format

### 1. Table file

In [32]:
# Reload the merged SHS100K2 / YouTube table from disk.
shs100k2 = pd.read_parquet("/home/hachmeier/data/shs100k2_yt.parquet")

# Columns that make up the table file.
rel_cols = ["yt_id", "video_title", "channel_name", "description"]

# table
table_path = "/home/hachmeier/contrastive-product-matching/data/raw/shs100k2_yt/table.parquet"
shs100k2.loc[:, rel_cols].to_parquet(table_path)


Unnamed: 0,yt_id,video_title,channel_name,description
0,gORyrU1xQpg,Marianne Faithfull - Yesterday (with lyrics),FlierWithoutWings,"""Yesterday"" as beautifully sung by Marianne Fa..."
1,jQhC3bMMLmw,MATT MONRO - YESTERDAY,NANCYFLORESSANTOS,"""...all my troubles seemed\nso far away...""."
4,eI3-EVEU51s,Hollyridge Strings - Yesterday,boyjohn,Help support this channel by becoming a patron...
5,ew1y-gvO_NM,Cilla Black ::::: Yesterday.,SixtiesOnly,"""Yesterday"" is a song originally recorded by T..."
6,tjYQuEyxRp0,Sarah Vaughan - Yesterday,David Speed,"From 1981's ""Songs of the Beatles,"" (which sat..."
...,...,...,...,...
108515,2S3cauISpNw,The Sachal Ensemble - Give Me Love (Give Me Pe...,SachalEnsembleVEVO,Music video by The Sachal Ensemble performing ...
108516,Q6ix7lWPDAQ,Gary Stewart - Ain't Living Long Like This,steve fizzle,Gary Stewart - Ain't Living Long Like This
108518,0JP_ZipG5Z0,Rodney Crowell - I ain't living long like this,baalhabeit,"the original version,\n\nout of Rodney's Crowe..."
108520,j7MC3ckUaDA,Dream Syndicate - Ain't living Long Like This,Jay Dog,


### 2. Pair generation

In [79]:
def gen_pairs(data, n_pos=1500, n_neg=8500, n_for_cross=5000, random_state=None):
    """Build a shuffled, labelled set of video pairs.

    A random subset of ``data`` is cross-joined with itself. From the resulting
    ordered pairs, ``n_pos`` pairs whose two members share a ``set_id`` and
    ``n_neg`` pairs whose members have different ``set_id`` values are drawn.
    Pairs whose two members have the same ``yt_id`` are never used.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns ``yt_id`` and ``set_id``.
    n_pos : int, default 1500
        Number of positive pairs (``label == 1``) to draw.
    n_neg : int, default 8500
        Number of negative pairs (``label == 0``) to draw.
    n_for_cross : int, default 5000
        Number of rows sampled from ``data`` before the cross join. The cross
        join has the square of that many rows, so keep it moderate. Capped at
        ``len(data)`` so that small inputs do not fail.
    random_state : optional
        Forwarded unchanged to every ``DataFrame.sample`` call. With the
        default ``None`` each call is non-deterministic; pass an int for a
        reproducible result.

    Returns
    -------
    pandas.DataFrame
        Columns ``yt_id_a``, ``set_id_a``, ``yt_id_b``, ``set_id_b`` and
        ``label``; ``n_pos + n_neg`` rows in random order.

    Raises
    ------
    ValueError
        If the sampled subset yields fewer positive or negative candidate
        pairs than requested.
    """
    # Never ask pandas for more rows than exist; that raises an opaque error.
    n_rows = min(n_for_cross, len(data))
    subset = data[["yt_id", "set_id"]].sample(n_rows, random_state=random_state)
    cross = pd.merge(subset, subset, how="cross", suffixes=("_a", "_b"))

    # Drop pairs of a video with itself once, then split by set membership.
    distinct = cross[cross["yt_id_a"] != cross["yt_id_b"]]
    same_set = distinct["set_id_a"] == distinct["set_id_b"]
    pos_candidates = distinct[same_set]
    neg_candidates = distinct[~same_set]

    # Fail early with a message that names the pool that is too small.
    for kind, wanted, pool in (
        ("positive", n_pos, pos_candidates),
        ("negative", n_neg, neg_candidates),
    ):
        if wanted > len(pool):
            raise ValueError(
                f"Requested {wanted} {kind} pairs but only {len(pool)} candidates "
                f"are available from {n_rows} sampled rows; lower the request "
                f"or raise n_for_cross."
            )

    # assign() returns new frames, so the sampled candidates are not mutated.
    pos_pairs = pos_candidates.sample(n_pos, random_state=random_state).assign(label=1)
    neg_pairs = neg_candidates.sample(n_neg, random_state=random_state).assign(label=0)

    # Renumber the rows, then shuffle so positives and negatives are interleaved.
    dataset = pd.concat([pos_pairs, neg_pairs], ignore_index=True).sample(
        frac=1, random_state=random_state
    )

    return dataset

# Generate the labelled pair set from the loaded table, using gen_pairs' default sizes.
dataset = gen_pairs(shs100k2)



In [80]:
# Show the generated pairs as the cell output.
dataset

Unnamed: 0,yt_id_a,set_id_a,yt_id_b,set_id_b,label
4341,hfzOtbdrr10,1146,RJoO-GkeWDQ,940,0
2775,K-jvTipPL8A,3684,fbatSnN2Cn8,5067,0
941,ZC4qowOedfk,59,Mv9_OEgeG8Q,59,1
494,4gVClrDpyzw,214,kYnZsG7yq-U,214,1
9257,sfJNIeod4QU,70,PwfOdB8Dbgk,453,0
...,...,...,...,...,...
3144,7T3Shir-X50,8548,FxV_6Kp7PSw,412,0
2350,7K-WWlpawos,5327,5OAnWNWLM_Y,3320,0
8129,bl-8nuYSYzs,5033,3szo4Z2AZlw,2388,0
7248,Ha5lgrAUFrQ,49,j2WIUhibkVg,7507,0


Unnamed: 0,yt_id_a,set_id_a,yt_id_b,set_id_b
7822,MGxjIBEZvx0,2842,xcgCogM0_hg,2842
8696,MGxjIBEZvx0,2842,5oaG0jTC-R4,2842
11469,dCYzfkhE8_8,571,xFFPxvF00J8,571
11656,dCYzfkhE8_8,571,VdRvUn482Go,571
15400,QpW1pQ1-tMg,159,zKkrfsdGb2k,159
...,...,...,...,...
24984224,ZnekFd6_ABc,38,lIj8i54oMLA,38
24984939,ZnekFd6_ABc,38,satEWgJLGbQ,38
24992879,8WqwhEy1cK8,499,RP7QUY3cFjI,499
24993968,8WqwhEy1cK8,499,ol4O61bxzoA,499
