In [2]:
# HPC deployment settings: Spark master URL plus the cluster scratch paths
# used for the input CSV and for the Hugging Face dataset cache.
SPARK_MASTER = 'spark://cm029:45471'
TARGET_PATH = '/scratch/tmv7269/datasets'
FILE_DIR = '/scratch/si2073/output'
FILENAME = f'{FILE_DIR}/enwiki-20230401-pages-meta-history10.xml-p4045403p4096288.csv'

In [3]:
import pandas as pd
import numpy as np
import re
import datasets
from datasets import load_dataset
from pathlib import Path

# Point both the datasets cache and the raw-download directory at TARGET_PATH.
datasets.config.HF_DATASETS_CACHE = datasets.config.DOWNLOADED_DATASETS_PATH = Path(TARGET_PATH)

In [4]:
# Load Cohere's full dataset, then discard every column listed in
# drop_columns so only the remaining fields are kept.
drop_columns = ['text', 'id', 'title', 'langs', 'paragraph_id', 'views']
docs = load_dataset("Cohere/wikipedia-22-12-en-embeddings", split="train")
docs = docs.remove_columns(drop_columns)

Found cached dataset parquet (/scratch/tmv7269/datasets/Cohere___parquet/Cohere--wikipedia-22-12-en-embeddings-735980cfcb568494/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


In [6]:
# Display the dataset summary (remaining feature names and row count).
docs

Dataset({
    features: ['url', 'wiki_id', 'emb'],
    num_rows: 35167920
})

In [7]:
# Inspect the first record; its 'emb' field is a long list of floats, so the output is lengthy.
docs[0]

{'url': 'https://en.wikipedia.org/wiki?curid=69407798',
 'wiki_id': 69407798,
 'emb': [0.2865696847438812,
  -0.03181683272123337,
  0.06668472290039062,
  0.03292645886540413,
  -0.008292825892567635,
  0.16873490810394287,
  -0.0008463106933049858,
  -0.36077880859375,
  0.33916592597961426,
  0.3886975646018982,
  -0.41489771008491516,
  0.20758016407489777,
  -0.11468425393104553,
  0.3873162567615509,
  -0.26252424716949463,
  0.007096240296959877,
  0.24166066944599152,
  -0.24653761088848114,
  0.060873936861753464,
  0.23045268654823303,
  -0.029800616204738617,
  0.5721306800842285,
  -0.051111623644828796,
  -0.09547730535268784,
  0.1097082868218422,
  -0.059516504406929016,
  -0.053682904690504074,
  0.23981636762619019,
  -0.33325839042663574,
  0.3685816824436188,
  0.18456950783729553,
  -0.05209290236234665,
  -0.006129484623670578,
  0.5033777952194214,
  -0.5287379026412964,
  0.5231741070747375,
  0.022464150562882423,
  -0.04248378053307533,
  0.3180341124534607,
  

In [None]:
# Materialise the embeddings as a pandas frame keyed by wiki_id, keeping only
# the first row seen for each article id.
df = (
    docs.to_pandas()
    .drop_duplicates(subset=['wiki_id'], keep='first')
    .set_index('wiki_id', drop=True)
)
df.head()

In [None]:
def lst_to_str(x):
    """Split a raw string into a list of whitespace-separated tokens.

    Every character other than a space, a word character or '+' is removed
    first. Despite the name, the return value is a list of strings.
    """
    kept_only = re.sub(r'[^ \w+]', '', x)
    return kept_only.split()

columns = ['title', 'text', 'username', 'article_id']
df = pd.read_csv(FILENAME, names=columns, skiprows=1).drop(columns='text')
df['username'] = df.username.apply(lst_to_str)

In [None]:
len(df)

In [None]:
# setting up spark
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
import pyspark

spark = SparkSession.builder.master(SPARK_MASTER) \
                            .config('spark.executor.memory', '4G') \
                            .config('spark.driver.memory', '60G') \
                            .config("spark.memory.offHeap.enabled", True) \
                            .config("spark.memory.offHeap.size","16g") \
                            .appName('ProcessData') \
                            .getOrCreate()

spark

In [None]:
spark_df = spark.createDataFrame(df)

In [None]:
# Peek at the first row to sanity-check the pandas-to-Spark conversion.
spark_df.head()

In [None]:
from pyspark.sql.functions import explode, collect_list, udf, count
from collections import Counter

# One output row per (article_id, username): each article's username list is
# unpacked so every name gets its own row.
exploded = spark_df.select(
    spark_df['article_id'],
    explode(spark_df['username']).alias('username'),
)

# Number of rows per (username, article_id) pair, stored in column 'count'.
per_editor_article = exploded.groupBy("username", "article_id")
collapsed = per_editor_article.agg(count("article_id").alias('count'))
collapsed.show()

In [None]:
from typing import List, Tuple
import numpy as np
import math
import itertools

def get_editors_list(editor):
    """Return the (article_id, count) tuples recorded for one username.

    Filters the module-level Spark frame ``collapsed`` on ``username`` and
    collects the matching rows to the driver as plain tuples.
    """
    matching_rows = collapsed.filter(collapsed.username == editor)
    pairs_rdd = matching_rows.select('article_id', 'count').rdd.map(tuple)
    return pairs_rdd.collect()

# each editor has a list of (article_id, count) of the things they edited
# assuming all id in list is unique
def calculate_editor_variance(editors : List[Tuple[int, int]]):
    """Measure how spread out an editor's articles are in embedding space.

    Args:
        editors: iterable of ``(article_id, count)`` pairs for one editor;
            any iterable works, including a one-shot ``zip``.

    Returns:
        The value of ``calculate_multid_distance`` over the articles that have
        an embedding, weighted by their counts, or ``-1`` when none of the
        articles has an embedding.
    """
    embeddings = []
    weights = []
    for article_id, edit_count in editors:
        embedding = get_embedding(article_id)
        # Identity check: `!= None` on a NumPy array is elementwise and cannot
        # be used as a truth value.
        if embedding is None:
            continue
        embeddings.append(embedding)
        weights.append(edit_count)

    if not embeddings: # we don't know anything about the articles they edit
        return -1

    return calculate_multid_distance(embeddings, weights)

# get embedding of an article. either LDA or Cohere's dataset
# using Cohere rn
def get_embedding(article_id):
    """Return the stored embedding for ``article_id``, or ``None`` if unknown.

    The lookup uses the module-level frame ``df``, which must be indexed by
    article (wiki) id and carry an ``emb`` column.
    """
    # `x in DataFrame` tests the column labels, so membership has to be
    # checked against the index to find article ids.
    if article_id not in df.index:
        return None
    return df.loc[article_id, 'emb']

def calculate_multid_variance(matrix, weights):
    """Per-dimension weighted spread of a set of article embeddings.

    Takes a matrix of article x embedding and each article's weight. The
    weighted variance is computed separately for every embedding dimension
    and its square root is returned, so the result has one value per
    dimension.

    This is easier to interpret with LDA, since you can see how far an editor
    strays on a given topic (assuming the topics are independent). With a
    vaguer embedding it is better to use the L2 norm and compress the spread
    into a single number.
    """
    points = np.asarray(matrix)
    centroid = np.average(points, axis=0, weights=weights)
    squared_deviation = np.square(points - centroid)
    return np.sqrt(np.average(squared_deviation, axis=0, weights=weights))

def calculate_multid_distance(matrix, weights):
    """Weighted mean L2 distance of d-dimensional points from their centroid.

    Args:
        matrix: sequence of points, one embedding vector per row.
        weights: one non-negative weight per row.

    Returns:
        A single number: the weighted average of each point's Euclidean
        distance to the weighted centroid.
    """
    points = np.asarray(matrix)
    centroid = np.average(points, weights=weights, axis=0)
    l2 = np.linalg.norm(points - centroid, axis=1)
    # Use the same weights for the mean distance as for the centroid, so that
    # heavily weighted points count proportionally in both steps.
    return np.average(l2, weights=weights)

In [None]:
# One row per username, carrying the article ids and the matching counts as
# two list columns ('ids' and 'counts').
editors_collapsed = (
    collapsed
    .groupby('username')
    .agg(
        collect_list('article_id').alias('ids'),
        collect_list('count').alias('counts'),
    )
)
editors_collapsed.show()

In [None]:
# Runs serially over the collected rows (original note: you cannot
# parallelize this because of a read lock).
for row in editors_collapsed.collect():
    variance = calculate_editor_variance(zip(row.ids, row.counts))
    if variance == -1:
        # -1 means none of this editor's articles had an embedding.
        continue
    print(row.username, variance)