## Calculate Content Similarity (Aug 16, 2024)

In this file, I incorporate sponsor description and video labels.
The similarity is calculated directly using the embedding vectors from google text embedding model with no topic model.

In [1]:
import pandas as pd
import numpy as np
import os
import pickle
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
PATH = "/home/doosti@chapman.edu/projects/Facebook/top2vec/data/"
# import data
data = pd.read_csv(os.path.join(PATH, "vca_combined_2024-08-13.csv"))
print(f"Combined data size: {data.shape[0]}")
# sponsored videos
sponsored = pd.read_csv(os.path.join(PATH, "sponsored_videos_data.csv"))
print(f"Sponsored videos size: {sponsored.shape[0]}")
# sponsors
sponsors = pd.read_csv(os.path.join(PATH, "sponsor_description.csv"))
print(f"Sponsors size: {sponsors.shape[0]}")
sponsored_vca = sponsored[sponsored.labels.notnull()].copy()
print(f"Sponsored videos with labels size: {sponsored_vca.shape[0]}")
with open(os.path.join(PATH, "embeddings.pkl"), "rb") as f:
    embeddings = pickle.load(f)
print(f"Embeddings size: {len(embeddings)}")
with open(os.path.join(PATH, "embeddings_vca.pkl"), "rb") as f:
    embeddings_vca = pickle.load(f)
print(f"Embeddings size (vca): {len(embeddings_vca)}")

Combined data size: 38077
Sponsored videos size: 34028
Sponsors size: 4049
Sponsored videos with labels size: 9519
Embeddings size: 38077
Embeddings size (vca): 9519


In [12]:
sponsors['embeddings'] = embeddings[:sponsors.shape[0]]
sponsored_vca['embeddings'] = embeddings_vca

In [16]:
sponsors.isnull().sum()

sponsor_id              216
sponsor_name              0
sponsor_description       0
sponsor_themes         2222
text                      0
embeddings                0
dtype: int64

In [21]:
sponsored_vca[['sponsor_name']].merge(sponsors, on='sponsor_name', how='inner').shape

(9457, 6)

In [22]:
# find the sponsor embedding for each sponsored video
sponsored_vca['sponsor_embeddings'] = sponsored_vca[['sponsor_name']].merge(sponsors, on='sponsor_name', how='left')['embeddings'].values
print(sponsored_vca.isnull().sum())

video_id                 0
new_id                   0
creator_id               0
creator_name             0
sponsor_id               4
sponsor_name             0
title_description        0
topics                5765
labels                   0
labels2                  0
transcript            5035
text                     0
embeddings               0
sponsor_embeddings      62
dtype: int64


In [23]:
# function to find similar sponsors
def find_similar_sponsors(sponsor_name, n=5):
    sponsor_embedding = sponsors[sponsors['sponsor_name']==sponsor_name]['embeddings'].values[0]
    cosine_sim = cosine_similarity([sponsor_embedding], sponsors['embeddings'].tolist())
    similar_sponsors = sponsors.iloc[np.argsort(cosine_sim[0])[-n-1:-1]]['sponsor_name'].values
    return list(zip(similar_sponsors[::-1], np.sort(cosine_sim[0])[-n-1:-1][::-1]))

In [24]:
sponsor_sample = sponsors.sample(1)['sponsor_name'].values[0]
print(f"Sample sponsor: {sponsor_sample}")
for (sim_sponsor, score) in find_similar_sponsors(sponsor_sample, n=10):
    # print the score with 2 decimal points
    print(f"{sim_sponsor:30s} | {score:.2f}")


Sample sponsor: Gap
Gap Inc.                       | 0.80
Tory Burch                     | 0.71
Old Navy                       | 0.70
JCPenney                       | 0.69
ELLE Magazine (US)             | 0.67
Electro Threads                | 0.66
Cotton                         | 0.66
Victoria's Secret PINK         | 0.66
Chico's                        | 0.65
American Eagle                 | 0.65


In [27]:
# function to find similar sponsors
def find_similar_videos(sponsor_name, n=5):
    sponsor_embedding = sponsors[sponsors['sponsor_name']==sponsor_name]['embeddings'].values[0]
    cosine_sim = cosine_similarity([sponsor_embedding], sponsored_vca['embeddings'].tolist())
    similar_videos = sponsored_vca.iloc[np.argsort(cosine_sim[0])[-n-1:-1]][['title_description','creator_name','sponsor_name']].values
    return list(zip(similar_videos[::-1], np.sort(cosine_sim[0])[-n-1:-1][::-1]))

sponsor_sample = sponsors.sample(1)['sponsor_name'].values[0]

print(f"Sample sponsor: {sponsor_sample}")
print("Similar videos:")
for (sim_vid, score) in find_similar_videos(sponsor_sample, n=5):
    print(f"{sim_vid[0][:30]:30s}..., {sim_vid[1]:30s}, {sim_vid[2][:30]:30} | {score:.2f}")

Sample sponsor: CheapOair
Similar videos:
What gets you outdoors?! Nearl..., Drury Outdoors                , Cabela's                       | 0.61
We're going going, back back, ..., Carolina Panthers             , PRIMESPORT                     | 0.61
The Runner Debrief: Mat Pat im..., We Are The Mighty             , The Runner go90                | 0.61
Katy Perry Bursts Our Bubbles ..., Billboard                     , American Airlines              | 0.60
Top 5 Reasons to Share the Rin..., Crypt TV                      , Rings Movie                    | 0.60


In [30]:
# find sponsor for a sample video
def find_similar_sponsor(video_id, n=5):
    embedding = sponsored_vca[sponsored_vca['video_id']==video_id]['embeddings'].values[0]
    cosine_sim = cosine_similarity([embedding], sponsors['embeddings'].tolist())
    similar_sponsors = sponsors.iloc[np.argsort(cosine_sim[0])[-n-1:-1]][['sponsor_name']].values
    return list(zip(similar_sponsors[::-1], np.sort(cosine_sim[0])[-n-1:-1][::-1]))

video_sample = sponsored_vca.sample(1)['video_id'].values[0]
creator_name = sponsored_vca[sponsored_vca['video_id']==video_sample]['creator_name'].values[0]
sponsor_name = sponsored_vca[sponsored_vca['video_id']==video_sample]['sponsor_name'].values[0]

print(f"Sample video: {video_sample}, {creator_name}, {sponsor_name}")
print("Similar sponsors:")
for (sim, score) in find_similar_sponsor(video_sample, n=5):
    print(f"{sim[0][:30]:30s}... | {score:.2f}")

Sample video: 10154133578596464, Southern Living, IKEA
Similar sponsors:
Lone Star Percussion          ... | 0.65
Texas Pete Hot Sauce          ... | 0.64
DoubleTree by Hilton          ... | 0.63
Woodwind & Brasswind          ... | 0.63
BET                           ... | 0.62


In [31]:
# calculate cosine similarity between a document and a creator
def get_sponsorship_similarity(document_vector, creator_embedding):
    if type(creator_embedding) != np.ndarray:
        return np.nan
    return cosine_similarity([document_vector], [creator_embedding])[0][0]

In [32]:
from tqdm.auto import tqdm
tqdm.pandas()

sponsored_vca['sponsorship_similarity'] = sponsored_vca.progress_apply(lambda x: get_sponsorship_similarity(x['embeddings'], x['sponsor_embeddings']), axis=1).values

print(sponsored_vca.sponsorship_similarity.describe())
print(sponsored_vca.sponsorship_similarity.isnull().sum())

  from .autonotebook import tqdm as notebook_tqdm
  1%|▏         | 131/9519 [00:00<00:07, 1307.73it/s]

100%|██████████| 9519/9519 [00:06<00:00, 1478.13it/s]

count    9457.000000
mean        0.513269
std         0.083727
min         0.325400
25%         0.445398
50%         0.503910
75%         0.577432
max         0.789858
Name: sponsorship_similarity, dtype: float64
62





In [35]:
#save the data
sponsored_vca[['video_id','new_id','creator_id','creator_name','sponsor_id','sponsor_name','sponsorship_similarity']].to_csv(os.path.join(PATH, "sponsored_vca_aug2024_v3.csv"), index=False)

In [176]:
sponsored['include_vca'] = sponsored.labels.notnull().astype(int)
sponsored['include_topics'] = sponsored.topics.notnull().astype(int)
sponsored['include_transcript'] = sponsored.transcript.notnull().astype(int)
#sponsored[['video_id','new_id','creator_id','creator_name','sponsor_id','sponsor_name','sponsorship_similarity','include_vca','include_topics','include_transcript']].to_csv(os.path.join(PATH, "sponsored_vca_aug2024_v2.csv"), index=False)