In [1]:
from datasets import load_dataset
import pandas as pd
import time
import numpy as np

In [2]:
docs = load_dataset(f"Cohere/wikipedia-22-12-simple-embeddings", split="train").remove_columns('text')

Found cached dataset parquet (C:/Users/vomin/.cache/huggingface/datasets/Cohere___parquet/Cohere--wikipedia-22-12-simple-embeddings-94deea3d55a22093/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


In [3]:
docs

Dataset({
    features: ['id', 'title', 'url', 'wiki_id', 'views', 'paragraph_id', 'langs', 'emb'],
    num_rows: 485859
})

In [4]:
len(docs)

485859

In [5]:
import pyspark
from pyspark import SparkContext, SparkConf

# create the session
conf = SparkConf().set("spark.ui.port", "4050")

# create the context
sc = pyspark.SparkContext.getOrCreate(conf=conf)
spark = pyspark.sql.SparkSession.builder.getOrCreate()

spark

In [None]:
spark_df = spark.createDataFrame(docs)
spark_df.set_index('id', inplace=True)
print(spark_df.columns, len(spark_df))

In [None]:
df = pd.DataFrame(docs)

In [6]:
from typing import List, Tuple
import numpy as np
import math

# each editor has a list of (article_id, count) of the things they edited
# assuming all id in list is unique
def calculate_editor_variance(editors : List[Tuple[int, int]]):
    article_ids, weights = zip(*editors)
    matrix = [get_embedding(article_id) for article_id in article_ids]
    return calculate_multid_distance(matrix, weights)

# get embedding of an article. either LDA or Cohere's dataset
def get_embedding(article_id):
    pass

"""
Take in matrix of article x embedding and article's respective weight, return their variance in k-dimension
this is easier to interpret when using LDA (since you can see how far editors stray on certain topic, 
    assuming topic itself is indepedent)
With vaguer embedding, it's better to use the L2 norm and compress it to a single number
"""
def calculate_multid_variance(matrix, weights):
    average = np.average(matrix, weights=weights, axis=0)
    variance = np.average((matrix-average)**2, weights=weights, axis=0)
    return np.sqrt(variance)

"""
Calculate L2 distance for each d-dimension point
"""
def calculate_multid_distance(matrix, weights):
    average = np.average(matrix, weights=weights, axis=0)
    l2 = np.linalg.norm(matrix-average, axis=1)
    return np.average(l2)

In [7]:
test_data = pd.DataFrame(np.random.uniform(0,1,size=(4, 5)))
weights = np.random.randint(0,100,size=(1,4))[0]

In [8]:
test_data

Unnamed: 0,0,1,2,3,4
0,0.792927,0.756664,0.636127,0.690646,0.462928
1,0.478519,0.385372,0.199834,0.733163,0.594628
2,0.809893,0.868929,0.694977,0.983426,0.261241
3,0.47965,0.275843,0.062372,0.115108,0.666105


In [9]:
weights

array([ 9, 19, 57, 60])

In [10]:
print(calculate_multid_distance(test_data, weights))
print(calculate_multid_variance(test_data, weights))

0.5179129267868693
[0.16348741 0.27799026 0.29805536 0.39648004 0.18670579]
