# Calculate Content Similarity Metric

In [2]:
import pandas as pd
import numpy as np
import os
from top2vec import Top2Vec
import pickle
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm
2024-07-01 15:37:02.065407: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-01 15:37:02.149619: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-01 15:37:02.150718: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
PATH = "/home/doosti@chapman.edu/projects/Facebook/top2vec/"
DATA_PATH = os.path.join(PATH,'data')
model_name = "top2vec_deeplearn_distiluse_notoken_2024-06-27.model"
model_path = os.path.join(DATA_PATH, model_name)
textdata_file = "data_all_text.csv"
textdata_path = os.path.join(DATA_PATH, textdata_file)
data_file = "pooled_us_aug2020.dta"
data_path = os.path.join(DATA_PATH, data_file)

In [5]:
# load model
model = Top2Vec.load(model_path)
topic_sizes, topic_nums = model.get_topic_sizes()
print(f"Number of topics: {len(topic_sizes)}")

Number of topics: 3936


In [6]:
# load text data
textdata = pd.read_csv(textdata_path, low_memory=False)
print(f"Text Data Size: {textdata.shape}")
# load data
data = pd.read_stata(data_path)
print(f"Data Size: {data.shape}")

Text Data Size: (820099, 8)
Data Size: (220033, 151)


In [17]:
# sanity checks
# control for video ids
print(f"Out of {data.shape[0]} rows, {data[data.video_id.astype(int).isin(textdata.video_id.values)].shape[0]} rows have video_id in textdata")
# control for creator ids
print(f"Out of {data.shape[0]} rows, {data[data.creator_id.isin(textdata.creator_id.values)].shape[0]} rows have creator_id in textdata")
# control for sponosr ids
print(f"Out of {data[data.sponsored==1].shape[0]} rows, {data[(data.sponsored==1) & (data.sponsor_id.isin(textdata.creator_id))].shape[0]} rows have sponsor_id in textdata")
# control for current similarity metrics
print(f"Number of rows with non-missing similarity: {data[data.similarity.notnull()].shape[0]}")

Out of 220033 rows, 138397 rows have video_id in textdata
Out of 220033 rows, 220033 rows have creator_id in textdata
Out of 34028 rows, 17337 rows have sponsor_id in textdata
Number of rows with non-missing similarity: 17273


In [None]:
with pd.option_context('display.float_format', '{:0.1f}'.format):
    print(data[~(data.video_id.astype(int)).isin(textdata.video_id.values)][['video_id','creator_id']])

In [31]:
# creating a function to check the corresponsing creator by video id
text_creator = dict(zip(textdata.video_id, textdata.creator_id))

In [29]:
# correcting for video id
data['new_id'] = data.video_id.astype(int)

  data['new_id'] = data.video_id.astype(int)


In [32]:
# those who match
mask = data.new_id.isin(textdata.video_id.values)
print(mask.sum())
for i,row in data[mask][['video_id','creator_id']].iterrows():
    creator = row.creator_id
    video_id = row.video_id
    if text_creator[video_id]!=creator:
        print(f"video_id: {video_id}, creator_id: {creator}, text_creator: {text_creator[video_id]}") 

138397


In [42]:
# one up or one down
new_ids = []
for i, row in data[~mask][['new_id','creator_id']].iterrows():
    creator = row.creator_id
    video_id = row.new_id
    if text_creator.get(video_id,None)==creator:
        raise "Something wrong happened!" # shouldn't happen
    elif text_creator.get(video_id+1,None)==creator:
        new_ids.append(video_id+1)
    elif text_creator.get(video_id-1,None)==creator:
        new_ids.append(video_id-1)
    else:
        new_ids.append(video_id)
data.loc[~mask,'new_id'] = new_ids

In [43]:
# check
data.new_id.isin(textdata.video_id.values).sum()

220033

### Calculate Sponsor Vectors

In [98]:
# calculate creator embeddings
document_vectors = model.document_vectors
creators = textdata['creator_id'].values

# Group document vectors by author
creator_to_vectors = defaultdict(list)

for creator, vector in zip(creators, document_vectors):
    creator_to_vectors[creator].append(vector)

creator_embeddings = {}

for creator, vectors in creator_to_vectors.items():
    # Calculate the mean vector (centroid) for each author
    creator_embeddings[creator] = np.mean(vectors, axis=0)
    # l2 normalization
    creator_embeddings[creator] /= np.linalg.norm(creator_embeddings[creator])

print(len(creator_embeddings))

TypeError: 'dict' object is not callable

In [104]:
# save the creator embeddings
creator_embeddings_path = os.path.join(DATA_PATH, 'creator_embeddings.pkl')
with open(creator_embeddings_path, 'wb') as f:
    pickle.dump(creator_embeddings, f)

In [46]:
# load the creator embeddings
creator_embeddings_path = os.path.join(DATA_PATH, 'creator_embeddings.pkl')
with open(creator_embeddings_path, 'rb') as f:
    creator_embeddings = pickle.load(f)

print(len(creator_embeddings))

creators_id2name = dict(zip(textdata['creator_id'], textdata['creator_name']))
creator_embeddings_names = {}

for creator, vectors in creator_embeddings.items():
    # two dict by id and name
    creator_embeddings_names[creators_id2name[creator]] = vectors


4596


In [47]:
# calculate cosine similarity for creators
def get_similar_creators(creator, creator_embeddings, top_n=5):
    creator_embedding = creator_embeddings[creator]
    similarities = {}
    for key, value in creator_embeddings.items():
        similarities[key] = cosine_similarity([creator_embedding], [value])[0][0]
    similar_creators = sorted(similarities.items(), key=lambda x: x[1], reverse=True)
    return similar_creators[:top_n]

def print_similar_creators(creator, similar_creators):
    print(f"Creators similar to {creator}:")
    for i, (creator, similarity) in enumerate(similar_creators):
        print(f"{i+1:2d}. ({similarity:4.2f}) {creator}")

In [48]:
# calculate cosine similarity between a document and a creator
def get_sponsorship_similarity(document_vector, creator_embedding):
    return cosine_similarity([document_vector], [creator_embedding])[0][0]

In [49]:
# get a dictionary of document embeddings
document_vectors = model.document_vectors
video_ids = textdata['video_id'].values

#video_to_vector = dict(zip(video_ids, document_vectors))
video_to_vector = {}
for i, video_id in enumerate(video_ids):
    video_to_vector[video_id] = document_vectors[i]

In [54]:
# calculate similarity between each document and each creator
new_metric = []
for i, row in data.iterrows():
    video_id = row['new_id']
    creator_id = row['creator_id']
    sponsor_id = row['sponsor_id']
    sponsor_name = row['sponsor_name']
    if row['sponsored'] == 0:
        new_metric.append(np.nan)
    elif (video_id not in video_to_vector):
        new_metric.append(np.nan)
    elif (sponsor_id not in creator_embeddings):
        if sponsor_name not in creator_embeddings_names:
            new_metric.append(np.nan)
            if ~np.isnan(row['similarity']):
                print(video_id, sponsor_id, sponsor_name, row['similarity'])
        else:
            new_metric.append(get_sponsorship_similarity(video_to_vector[video_id], creator_embeddings_names[sponsor_name]))
    else:
        new_metric.append(get_sponsorship_similarity(video_to_vector[video_id], creator_embeddings[sponsor_id]))


688593767965037 KFeKzFHyhL Metro PCS 0.7335498332977295
1162201353815813 KFeKzFHyhL Metro PCS 0.7335498332977295


In [55]:
data['similarity_new'] = new_metric

In [56]:
print(data[['similarity','similarity_new']].notnull().sum())
print(data[['similarity','similarity_new']].corr())
print(data[['similarity','similarity_new']].describe())

similarity        17273
similarity_new    17343
dtype: int64
                similarity  similarity_new
similarity        1.000000        0.332057
similarity_new    0.332057        1.000000
         similarity  similarity_new
count  17273.000000    17343.000000
mean       0.683697        0.338835
std        0.075628        0.160717
min        0.504807       -0.197543
25%        0.628146        0.223607
50%        0.675194        0.331094
75%        0.732961        0.452767
max        0.954277        0.853619


In [59]:
# save the data
data.to_csv(os.path.join(DATA_PATH,"pooled_us_jul2024.csv"), index=False)

In [58]:
data.to_stata(os.path.join(DATA_PATH,"pooled_us_jul2024.dta"), write_index=False)

/tmp/ipykernel_18847/3119501987.py:1: PossiblePrecisionLoss: 
Column converted from int64 to float64, and some data are outside of the lossless
conversion range. This may result in a loss of precision in the saved data.

  data.to_stata("pooled_us_jul2024.dta", write_index=False)


ValueError: 
Fixed width strings in Stata .dta files are limited to 244 (or fewer)
characters.  Column 'themes' does not satisfy this restriction. Use the
'version=117' parameter to write the newer (Stata 13 and later) format.
