## Calculate Content Similarity (Aug 14, 2024)

In this file, I incorporate sponsor description and video description, labels, and transcripts.
The similarity is calculated directly using the embedding vectors from google text embedding model with no topic model.

In [2]:
import pandas as pd
import numpy as np
import os
import pickle
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
PATH = "/home/doosti@chapman.edu/projects/Facebook/top2vec/data/"
# import data
data = pd.read_csv(os.path.join(PATH, "vca_combined_2024-08-13.csv"))
print(f"Combined data size: {data.shape[0]}")
# sponsored videos
sponsored = pd.read_csv(os.path.join(PATH, "sponsored_videos_data.csv"))
print(f"Sponsored videos size: {sponsored.shape[0]}")
# sponsors
sponsors = pd.read_csv(os.path.join(PATH, "sponsor_description.csv"))
print(f"Sponsors size: {sponsors.shape[0]}")
with open(os.path.join(PATH, "embeddings.pkl"), "rb") as f:
    embeddings = pickle.load(f)
print(f"Embeddings size: {len(embeddings)}")

Combined data size: 38077
Sponsored videos size: 34028
Sponsors size: 4049
Embeddings size: 38077


In [17]:
sponsors['embeddings'] = embeddings[:sponsors.shape[0]]
sponsored['embeddings'] = embeddings[sponsors.shape[0]:]

In [36]:
# find the sponsor embedding for each sponsored video
sponsored['sponsor_embeddings'] = sponsored[['sponsor_name']].merge(sponsors, on='sponsor_name', how='left')['embeddings']
print(sponsored.isnull().sum())

video_id                  0
new_id                    0
creator_id                0
creator_name              0
sponsor_id             4414
sponsor_name             11
title_description         0
topics                20567
labels                24509
labels2               24509
transcript            27387
text                      0
embeddings                0
sponsor_embeddings     3824
dtype: int64


In [82]:
# function to find similar sponsors
def find_similar_sponsors(sponsor_name, n=5):
    sponsor_embedding = sponsors[sponsors['sponsor_name']==sponsor_name]['embeddings'].values[0]
    cosine_sim = cosine_similarity([sponsor_embedding], sponsors['embeddings'].tolist())
    similar_sponsors = sponsors.iloc[np.argsort(cosine_sim[0])[-n-1:-1]]['sponsor_name'].values
    return list(zip(similar_sponsors[::-1], np.sort(cosine_sim[0])[-n-1:-1][::-1]))

In [124]:
sponsor_sample = sponsors.sample(1)['sponsor_name'].values[0]
print(f"Sample sponsor: {sponsor_sample}")
for (sim_sponsor, score) in find_similar_sponsors(sponsor_sample, n=10):
    # print the score with 2 decimal points
    print(f"{sim_sponsor:30s} | {score:.2f}")


Sample sponsor: BLACK LABEL Bacon
Hormel Pepperoni               | 0.62
President's Choice             | 0.62
Johnsonville                   | 0.62
Melissa's Produce              | 0.61
Netflix                        | 0.61
Knorr                          | 0.61
Hillshire Farm                 | 0.61
Hormel Chili                   | 0.61
Heinz Ketchup                  | 0.61
Incredible Egg                 | 0.61


In [140]:
# function to find similar sponsors
def find_similar_videos(sponsor_name, n=5):
    sponsor_embedding = sponsors[sponsors['sponsor_name']==sponsor_name]['embeddings'].values[0]
    cosine_sim = cosine_similarity([sponsor_embedding], sponsored['embeddings'].tolist())
    similar_videos = sponsored.iloc[np.argsort(cosine_sim[0])[-n-1:-1]][['title_description','creator_name','sponsor_name']].values
    return list(zip(similar_videos[::-1], np.sort(cosine_sim[0])[-n-1:-1][::-1]))

sponsor_sample = sponsors.sample(1)['sponsor_name'].values[0]

print(f"Sample sponsor: {sponsor_sample}")
print("Similar videos:")
for (sim_vid, score) in find_similar_videos(sponsor_sample, n=5):
    print(f"{sim_vid[0][:30]:30s}..., {sim_vid[1]:30s}, {sim_vid[2][:30]:30} | {score:.2f}")

Sample sponsor: 9Health Fair
Similar videos:
Health Happens                ..., 9NEWS (KUSA)                  , 9Health Fair                   | 0.68
Vote for a Trainer I joined Tr..., Dr. Mehmet Oz                 , USANA Health Sciences Inc.     | 0.67
How One Trainer Uses The World..., PopSugar Fitness              , GMC                            | 0.67
Tips, tricks, and talking to t..., KSL 5 TV                      , Intermountain Live Well        | 0.66
250k Transformation Challenge ..., Bodybuilding.com              , Optimum Nutrition              | 0.65


In [150]:
# find sponsor for a sample video
def find_similar_sponsor(video_id, n=5):
    embedding = sponsored[sponsored['video_id']==video_id]['embeddings'].values[0]
    cosine_sim = cosine_similarity([embedding], sponsors['embeddings'].tolist())
    similar_sponsors = sponsors.iloc[np.argsort(cosine_sim[0])[-n-1:-1]][['sponsor_name']].values
    return list(zip(similar_sponsors[::-1], np.sort(cosine_sim[0])[-n-1:-1][::-1]))

video_sample = sponsored.sample(1)['video_id'].values[0]
creator_name = sponsored[sponsored['video_id']==video_sample]['creator_name'].values[0]
sponsor_name = sponsored[sponsored['video_id']==video_sample]['sponsor_name'].values[0]

print(f"Sample video: {video_sample}, {creator_name}, {sponsor_name}")
print("Similar sponsors:")
for (sim, score) in find_similar_sponsor(video_sample, n=5):
    print(f"{sim[0][:30]:30s}... | {score:.2f}")

Sample video: 10157488778824616, New York Knicks, Budweiser
Similar sponsors:
YES Network                   ... | 0.58
The Players' Tribune          ... | 0.57
U.S. Soccer                   ... | 0.57
Sparkling Ice                 ... | 0.57
Champs Sports                 ... | 0.57


In [118]:
# calculate cosine similarity between a document and a creator
def get_sponsorship_similarity(document_vector, creator_embedding):
    if type(creator_embedding) != np.ndarray:
        return np.nan
    return cosine_similarity([document_vector], [creator_embedding])[0][0]

In [119]:
from tqdm.auto import tqdm
tqdm.pandas()

sponsored['sponsorship_similarity'] = sponsored.progress_apply(lambda x: get_sponsorship_similarity(x['embeddings'], x['sponsor_embeddings']), axis=1).values

print(sponsored.sponsorship_similarity.describe())
print(sponsored.sponsorship_similarity.isnull().sum())

100%|██████████| 34028/34028 [00:19<00:00, 1738.68it/s]

count    30204.000000
mean         0.514502
std          0.096106
min          0.273755
25%          0.439255
50%          0.506337
75%          0.579599
max          0.941470
Name: sponsorship_similarity, dtype: float64
3824





In [121]:
sponsored.columns

Index(['video_id', 'new_id', 'creator_id', 'creator_name', 'sponsor_id',
       'sponsor_name', 'title_description', 'topics', 'labels', 'labels2',
       'transcript', 'text', 'embeddings', 'sponsor_embeddings',
       'sponsorship_similarity'],
      dtype='object')

In [122]:
#save the data
sponsored[['video_id','new_id','creator_id','creator_name','sponsor_id','sponsor_name','sponsorship_similarity']].to_csv(os.path.join(PATH, "sponsored_vca_aug2024.csv"), index=False)