# Calculate Content Similarity Metric (Partnership Level)

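The similarity score is aggregated at the creator level instead of the video level: each creator (and each sponsor) is represented by a single embedding, and the similarity is computed between those two embeddings.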

In [1]:
import pandas as pd
import numpy as np
import os
from top2vec import Top2Vec
import pickle
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm
2024-08-06 12:29:01.623692: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-08-06 12:29:01.707104: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-08-06 12:29:01.708294: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Project root and data folder.
# NOTE(review): PATH is an absolute, machine-specific location — adjust when running elsewhere.
PATH = "/home/doosti@chapman.edu/projects/Facebook/top2vec/"
DATA_PATH = os.path.join(PATH,'data')

# File names: the saved Top2Vec model, the text table, and the pooled data set.
model_name = "top2vec_deeplearn_distiluse_notoken_2024-06-27.model"
textdata_file = "data_all_text.csv"
data_file = "pooled_us_aug2020.dta"

# Full paths, each resolved inside DATA_PATH.
model_path, textdata_path, data_path = (
    os.path.join(DATA_PATH, file_name)
    for file_name in (model_name, textdata_file, data_file)
)

In [3]:
# Load the saved Top2Vec model and report the number of topics it contains.
model = Top2Vec.load(model_path)
topic_sizes, topic_nums = model.get_topic_sizes()
num_topics = len(topic_sizes)
print(f"Number of topics: {num_topics}")

Number of topics: 3936


In [4]:
# Read both inputs: the text table (CSV) and the pooled data set (Stata file).
textdata = pd.read_csv(textdata_path, low_memory=False)
data = pd.read_stata(data_path)

# Report the dimensions of each table.
print(f"Text Data Size: {textdata.shape}")
print(f"Data Size: {data.shape}")

Text Data Size: (820099, 8)
Data Size: (220033, 151)


In [5]:
# sanity checks: how well do the ids in `data` line up with `textdata`?
n_rows = data.shape[0]

# control for video ids
# (cast with np.int64, exactly as done for `new_id` below, so the integer
#  width does not depend on the platform's default `int`)
has_video = data.video_id.astype(np.int64).isin(textdata.video_id.values)
print(f"Out of {n_rows} rows, {has_video.sum()} rows have video_id in textdata")

# control for creator ids
has_creator = data.creator_id.isin(textdata.creator_id.values)
print(f"Out of {n_rows} rows, {has_creator.sum()} rows have creator_id in textdata")

# control for sponsor ids: among sponsored rows, count sponsors that appear
# as a creator in the text data
is_sponsored = data.sponsored == 1
has_sponsor = data.sponsor_id.isin(textdata.creator_id)
print(f"Out of {is_sponsored.sum()} rows, {(is_sponsored & has_sponsor).sum()} rows have sponsor_id in textdata")

# control for current similarity metrics
print(f"Number of rows with non-missing similarity: {data.similarity.notnull().sum()}")

Out of 220033 rows, 138397 rows have video_id in textdata
Out of 220033 rows, 220033 rows have creator_id in textdata
Out of 34028 rows, 17337 rows have sponsor_id in textdata
Number of rows with non-missing similarity: 17273


In [6]:
# Rows of `data` whose integer video id does not occur in the text data.
# np.int64 is used explicitly (as for `new_id` below) so the cast does not
# depend on the platform's default integer width.
video_unmatched = ~data.video_id.astype(np.int64).isin(textdata.video_id.values)
# print ids as plain decimals instead of scientific notation
with pd.option_context('display.float_format', '{:0.1f}'.format):
    print(data.loc[video_unmatched, ['video_id','creator_id']])

                  video_id  creator_id
72773  10100462134286216.0  FKAjsn3tbe
72774  10100970241156152.0  yE4rDdIZMX
72775  10100985156021640.0  yE4rDdIZMX
72776  10100985156111460.0  yE4rDdIZMX
72791  10106352251543832.0  kOZJsafwog
...                    ...         ...
220023 10214037392314108.0  AM7pEDQGGW
220025 10214058075868220.0  HaF9jQYWeA
220027 10214156295289160.0  C2OyuzQa8y
220028 10214255689048236.0  hdbSjOZX99
220030 10214907346980188.0  eT9ecx4odm

[81636 rows x 2 columns]


In [7]:
# lookup table (dict): video id -> creator id, built from the text data;
# if a video id occurs more than once, the last occurrence wins
text_creator = {
    video: video_creator
    for video, video_creator in zip(textdata.video_id, textdata.creator_id)
}

In [8]:
# integer (int64) copy of the video id; this is the column matched against
# the text data and corrected in the following cells
data['new_id'] = data['video_id'].astype('int64')

  data['new_id'] = data.video_id.astype(np.int64)


In [9]:
# rows whose integer video id is found in the text data
mask = data.new_id.isin(textdata.video_id.values)
print(mask.sum())

# for every matched row the creator recorded in the text data must agree with
# the creator in `data`; any disagreement is printed
matched = data.loc[mask, ['video_id', 'creator_id']]
for video_id, creator in zip(matched['video_id'], matched['creator_id']):
    expected_creator = text_creator[video_id]
    if expected_creator != creator:
        print(f"video_id: {video_id}, creator_id: {creator}, text_creator: {expected_creator}")

138397


In [10]:
# one up or one down
# Rows outside `mask` have no exact id match in the text data. For each of them
# try id+1 and then id-1, and accept the candidate only when the text data
# attributes that video to the same creator; otherwise keep the id unchanged.
# NOTE(review): presumably the ids are off by one because they were stored as
# floats upstream — confirm.
unmatched = data.loc[~mask, ['new_id', 'creator_id']]
new_ids = []
for video_id, creator in zip(unmatched['new_id'], unmatched['creator_id']):
    if text_creator.get(video_id) == creator:
        # these ids are not keys of text_creator, so an exact hit is unexpected;
        # raise a real exception (a bare string cannot be raised in Python 3)
        raise RuntimeError("Something wrong happened!")
    elif text_creator.get(video_id + 1) == creator:
        new_ids.append(video_id + 1)
    elif text_creator.get(video_id - 1) == creator:
        new_ids.append(video_id - 1)
    else:
        new_ids.append(video_id)
# `new_ids` follows the row order of data[~mask], so positional assignment is aligned
data.loc[~mask,'new_id'] = new_ids

In [11]:
# check: number of rows whose (corrected) id now exists in the text data
data['new_id'].isin(textdata['video_id'].values).sum()

220033

### Calculate Creator and Sponsor Vectors

In [12]:
# calculate creator embeddings
document_vectors = model.document_vectors
creators = textdata['creator_id'].values

# Collect the document vectors belonging to each creator.
# NOTE(review): the pairing is positional — it presumes row k of textdata
# corresponds to document vector k of the model, and zip() silently stops at
# the shorter of the two; confirm both have the same length and order.
creator_to_vectors = defaultdict(list)
for creator_id, doc_vector in zip(creators, document_vectors):
    creator_to_vectors[creator_id].append(doc_vector)

# A creator's embedding is the centroid (mean) of its document vectors,
# rescaled to unit L2 norm.
creator_embeddings = {}
for creator_id, doc_vectors in creator_to_vectors.items():
    centroid = np.mean(doc_vectors, axis=0)
    creator_embeddings[creator_id] = centroid / np.linalg.norm(centroid)

print(len(creator_embeddings))

4596


In [104]:
# persist the creator embeddings dictionary to disk as a pickle file
creator_embeddings_path = os.path.join(DATA_PATH, 'creator_embeddings.pkl')
with open(creator_embeddings_path, mode='wb') as out_file:
    pickle.dump(creator_embeddings, out_file)

In [46]:
# read the saved creator embeddings back from disk (replaces the in-memory dict)
# NOTE: unpickling can execute arbitrary code — only load files written by this notebook.
creator_embeddings_path = os.path.join(DATA_PATH, 'creator_embeddings.pkl')
with open(creator_embeddings_path, mode='rb') as in_file:
    creator_embeddings = pickle.load(in_file)
print(len(creator_embeddings))

4596


In [13]:
# creator id -> creator name, taken from the text data
creators_id2name = {
    creator_id: creator_name
    for creator_id, creator_name in zip(textdata['creator_id'], textdata['creator_name'])
}

# the same embeddings as `creator_embeddings`, keyed by creator name instead of id
# NOTE(review): two ids sharing one name would overwrite each other here — confirm names are unique.
creator_embeddings_names = {
    creators_id2name[creator_id]: embedding
    for creator_id, embedding in creator_embeddings.items()
}


In [36]:
# calculate cosine similarity for creators
def get_similar_creators(creator, creator_embeddings, top_n=5):
    """Return the `top_n` entries of `creator_embeddings` most similar to `creator`.

    Parameters
    ----------
    creator : hashable
        Key of the reference creator; must be a key of `creator_embeddings`.
    creator_embeddings : dict
        Maps a creator key to its embedding vector (all vectors of equal length).
    top_n : int, default 5
        Number of results to return.

    Returns
    -------
    list of (key, similarity) tuples
        Sorted by decreasing cosine similarity. The reference creator itself
        takes part in the ranking.

    Raises
    ------
    KeyError
        If `creator` is not a key of `creator_embeddings`.
    """
    creator_embedding = creator_embeddings[creator]
    keys = list(creator_embeddings.keys())
    # a single call against all embeddings at once, instead of one
    # cosine_similarity call per creator; row 0 holds the scores for `creator`
    scores = cosine_similarity([creator_embedding], list(creator_embeddings.values()))[0]
    # sorted() is stable, so ties keep the dictionary's insertion order
    similar_creators = sorted(zip(keys, scores), key=lambda x: x[1], reverse=True)
    return similar_creators[:top_n]

def print_similar_creators(creator, top_n=5):
    """Print a ranked list of the creators most similar to `creator`.

    `creator` is looked up in the notebook-level `creator_embeddings_names`
    dictionary, i.e. it is a creator name. After a header line, one line is
    printed per result: rank, cosine similarity (two decimals), and name.
    """
    ranking = get_similar_creators(creator, creator_embeddings_names, top_n)
    print(f"Creators similar to {creator}:")
    for rank, (name, score) in enumerate(ranking, start=1):
        print(f"{rank:2d}. ({score:4.2f}) {name}")

In [40]:
# example: list the creators whose embeddings are closest to the creator named 'NFL'
print_similar_creators('NFL')

Creators similar to NFL:
 1. (1.00) NFL
 2. (0.91) Sunday Night Football on NBC
 3. (0.91) ESPN
 4. (0.90) NBC Sports
 5. (0.89) CBS Sports


In [20]:
# calculate cosine similarity between two creators
def get_sponsorship_similarity(creator1_vector, creator2_embedding):
    """Return the cosine similarity between two embedding vectors as a scalar.

    Each vector is wrapped in a one-element list so that `cosine_similarity`
    receives two single-row 2-D inputs; the only entry of the resulting
    1x1 matrix is returned.
    """
    pairwise = cosine_similarity([creator1_vector], [creator2_embedding])
    return pairwise[0, 0]

In [47]:
# calculate the similarity between each creator and its sponsor (one value per partnership)
new_metric = {} # key: (creator_id, sponsor_id), value: cosine similarity (NaN when unavailable)
partnerships = data.drop_duplicates(['creator_id','sponsor_id'])
for creator_id, sponsor_id, sponsor_name in zip(
    partnerships['creator_id'], partnerships['sponsor_id'], partnerships['sponsor_name']
):
    creator_vector = creator_embeddings.get(creator_id)
    # the sponsor is looked up by id first and, failing that, by name
    sponsor_vector = creator_embeddings.get(sponsor_id)
    if sponsor_vector is None:
        sponsor_vector = creator_embeddings_names.get(sponsor_name)
    # `is None` rather than truthiness: the embeddings are arrays.
    # A creator without an embedding now yields NaN instead of a KeyError.
    if creator_vector is None or sponsor_vector is None:
        new_metric[(creator_id, sponsor_id)] = np.nan
    else:
        new_metric[(creator_id, sponsor_id)] = get_sponsorship_similarity(creator_vector, sponsor_vector)

# add new metric to the data: look every row's (creator, sponsor) pair up in
# new_metric, reading only the two key columns instead of applying a function
# to every full row
data['similarity_agg'] = [
    new_metric.get(pair, np.nan)
    for pair in zip(data['creator_id'], data['sponsor_id'])
]
            

  data['similarity_agg'] = data.apply(lambda x: new_metric.get((x.creator_id, x.sponsor_id), np.nan), axis=1)


In [48]:
# explore the new metric: summary statistics, then the number of missing values
similarity_agg = data['similarity_agg']
print(similarity_agg.describe())
print(similarity_agg.isna().sum())

count    17355.000000
mean         0.540738
std          0.163068
min         -0.004425
25%          0.434317
50%          0.541313
75%          0.652583
max          0.974303
Name: similarity_agg, dtype: float64
202678


In [49]:
# compare the existing `similarity` column with the new `similarity_agg`:
# non-missing counts, pairwise correlation, and summary statistics
both_metrics = data[['similarity', 'similarity_agg']]
print(both_metrics.notna().sum())
print(both_metrics.corr())
print(both_metrics.describe())

similarity        17273
similarity_agg    17355
dtype: int64
                similarity  similarity_agg
similarity        1.000000        0.602025
similarity_agg    0.602025        1.000000
         similarity  similarity_agg
count  17273.000000    17355.000000
mean       0.683697        0.540738
std        0.075628        0.163068
min        0.504807       -0.004425
25%        0.628146        0.434317
50%        0.675194        0.541313
75%        0.732961        0.652583
max        0.954277        0.974303


In [66]:
# add the column to the July 2024 version of the data
data_jul = pd.read_csv(os.path.join(DATA_PATH, 'pooled_us_jul2024.csv'))

# The assignment below aligns the two frames on their row index, so it is only
# valid when both frames have identical indices. Fail loudly otherwise instead
# of silently attaching values to the wrong rows or dropping them.
# NOTE(review): equal indices do not prove equal row order — if data_jul may be
# sorted differently, use a key-based lookup on (creator_id, sponsor_id) via
# new_metric instead.
if not data_jul.index.equals(data.index):
    raise ValueError(
        f"data_jul ({len(data_jul)} rows) and data ({len(data)} rows) do not share the same index; "
        "cannot copy similarity_agg by position"
    )
data_jul['similarity_agg'] = data['similarity_agg']

# number of rows where both frames hold the same non-missing value
# (NaN never compares equal, so this equals the count of non-missing similarity_agg)
print((data.similarity_agg==data_jul.similarity_agg).sum())

17355


In [67]:
# save the data as CSV, without the row index
# NOTE(review): this writes `data` (now including new_id and similarity_agg),
# not `data_jul` — confirm that this is the intended frame.
output_path = os.path.join(DATA_PATH, "pooled_us_aug2024.csv")
data.to_csv(output_path, index=False)