In [1]:
# HPC deployment
# Cluster- and user-specific settings; everything below is an absolute location
# on the HPC system and must be adjusted before running elsewhere.
# Spark master URL handed to SparkSession.builder.master() further down.
SPARK_MASTER = 'spark://cm020:33010'
# Directory that holds the input CSV.
FILE_DIR = '/scratch/si2073/output'
# Full path of the CSV that is later read with pd.read_csv.
FILENAME = FILE_DIR + '/enwiki-20230401-pages-meta-history10.xml-p4045403p4096288.csv'
# Used as the Hugging Face `datasets` cache directory.
TARGET_PATH = '/scratch/tmv7269/datasets'

In [2]:
import pandas as pd
import numpy as np
import re
import datasets
from datasets import load_dataset
from pathlib import Path

# Point the Hugging Face `datasets` cache at TARGET_PATH so the large download
# performed by load_dataset is stored there.
datasets.config.HF_DATASETS_CACHE = Path(TARGET_PATH)

In [None]:
# Load Cohere's full dataset
# The columns listed here are removed right after loading; the later cells in
# this notebook read `wiki_id` and `emb` from what remains.
drop_columns = ['text', 'id', 'title', 'url', 'langs', 'paragraph_id']
docs = load_dataset(
    # Plain literal: the original f-string had no placeholders.
    "Cohere/wikipedia-22-12-en-embeddings",
    split="train",
).remove_columns(drop_columns)
# Embedding dimensionality, taken from the first row; used as the length of the
# fallback vector in get_embedding.
COL = len(docs[0]['emb'])

Downloading and preparing dataset wikipedia-22-12-embeddings/en (download: 28.06 GiB, generated: 117.48 GiB, post-processed: Unknown size, total: 145.54 GiB) to /scratch/tmv7269/datasets/Cohere___parquet/Cohere--wikipedia-22-12-en-embeddings-735980cfcb568494/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/475M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/475M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/475M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/475M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/475M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/475M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/475M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/475M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/475M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/475M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/475M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/475M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/475M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/475M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/475M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/475M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/475M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/475M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/475M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/475M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/475M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/474M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/475M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/474M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/474M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/474M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/474M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/474M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/474M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/474M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/474M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/474M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/474M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/474M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/474M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/474M [00:00<?, ?B/s]

In [None]:
# Materialise the dataset as a pandas frame, keep only the first row seen for
# each `wiki_id`, and index the result by `wiki_id`.
df = (
    docs
    .to_pandas()
    .drop_duplicates(subset=['wiki_id'], keep='first')
    .set_index('wiki_id', drop=True)
)
df.head()

In [None]:
def lst_to_str(x):
    r"""Split a raw ``username`` cell into a list of tokens.

    Every character other than a space, a word character (``\w``) or ``+`` is
    stripped, and what is left is split on whitespace. Despite the name, the
    return value is a list of strings, not a string.

    Args:
        x: The raw cell value. Presumably a stringified list of usernames
            (TODO confirm against the CSV producer). Non-string values, such
            as the NaN pandas produces for an empty CSV field, are accepted.

    Returns:
        list[str]: The tokens found in ``x``; an empty list when ``x`` is not a
        string.

    NOTE(review): a username that itself contains spaces or punctuation is
    split or altered by this cleaning step.
    """
    # re.sub raises TypeError on non-string input (e.g. float NaN for missing
    # cells), which would abort the whole .apply(); treat those as "no editors".
    if not isinstance(x, str):
        return []
    return re.sub(r'[^ \w+]', '', x).split()

# Read the CSV with explicit column names (the file's own first row is
# skipped), discard the bulky `text` column and tokenise `username`.
# NOTE(review): this rebinds `df`, which the earlier cell used for the Cohere
# embeddings table indexed by `wiki_id`.
columns = ['title', 'text', 'username', 'article_id']
df = (
    pd.read_csv(FILENAME, names=columns, skiprows=1)
    .drop(columns='text')
    .assign(username=lambda frame: frame['username'].apply(lst_to_str))
)

In [None]:
# Preview the CSV-derived frame (title, tokenised username, article_id).
df.head()

In [None]:
# setting up spark
from pyspark.sql import SparkSession
import pyspark

# Create the session (or reuse an existing one) against the configured master,
# with explicit executor/driver memory and off-heap memory enabled.
spark = (
    SparkSession.builder
    .master(SPARK_MASTER)
    .config('spark.executor.memory', '4G')
    .config('spark.driver.memory', '60G')
    .config("spark.memory.offHeap.enabled", True)
    .config("spark.memory.offHeap.size", "16g")
    .appName('ProcessData')
    .getOrCreate()
)

# Display the session summary as the cell output.
spark

In [None]:
# Convert the pandas frame into a Spark DataFrame for the explode/groupBy step
# that follows.
spark_df = spark.createDataFrame(df)

In [None]:
# Sanity check: look at the head of the Spark DataFrame.
spark_df.head()

In [None]:
from pyspark.sql.functions import explode, collect_list, udf, count
from collections import Counter

# One row per (article_id, username): the username list is unnested.
exploded = spark_df.select(
    spark_df.article_id,
    explode(spark_df.username).alias('username'),
)
# Number of rows for every (username, article_id) combination.
collapsed = (
    exploded
    .groupBy("username", "article_id")
    .agg(count("article_id").alias('count'))
)
collapsed.show()

In [None]:
def get_editors_list(editor):
    """Collect the (article_id, count) pairs stored for one editor.

    Filters the module-level ``collapsed`` Spark DataFrame down to the rows
    whose ``username`` equals ``editor`` and brings the matching
    ``article_id``/``count`` values back to the driver.

    Args:
        editor: Username to look up.

    Returns:
        list[tuple]: One ``(article_id, count)`` tuple per matching row.
    """
    rows_for_editor = collapsed.filter(collapsed.username == editor)
    id_and_count = rows_for_editor.select('article_id', 'count')
    return id_and_count.rdd.map(tuple).collect()
# Example: the (article_id, count) pairs for the editor 'Alaibot'.
get_editors_list('Alaibot')

In [None]:
from typing import List, Tuple
import numpy as np
import math

# each editor has a list of (article_id, count) of the things they edited
# assuming all id in list is unique
def calculate_editor_variance(editors : List[Tuple[int, int]]):
    """Score how widely one editor's articles are spread in embedding space.

    Looks up the embedding of every article via ``get_embedding`` and passes
    the embeddings, together with the edit counts as weights, to
    ``calculate_multid_distance``. Despite the name, the value returned is
    whatever ``calculate_multid_distance`` returns (an average L2 distance),
    not a variance.

    Args:
        editors: ``(article_id, count)`` pairs for a single editor, e.g. the
            output of ``get_editors_list``. Article ids are assumed unique.

    Returns:
        The result of ``calculate_multid_distance`` for this editor.

    Raises:
        ValueError: If ``editors`` is empty (for instance, an unknown editor).
    """
    editors = list(editors)
    # zip(*[]) cannot be unpacked into two names; fail with a message that
    # says what is actually wrong instead of "not enough values to unpack".
    if not editors:
        raise ValueError("editors must contain at least one (article_id, count) pair")
    article_ids, weights = zip(*editors)
    matrix = [get_embedding(article_id) for article_id in article_ids]
    return calculate_multid_distance(matrix, weights)

# get embedding of an article. either LDA or Cohere's dataset
# currently using Cohere
def get_embedding(article_id, embeddings=None):
    """Return the embedding vector stored for ``article_id``.

    Args:
        article_id: Article identifier; it is looked up in the row index of
            the embeddings table.
        embeddings: Optional pandas DataFrame indexed by article id with an
            ``'emb'`` column. When omitted, the module-level ``df`` is used.

    Returns:
        The ``'emb'`` entry for the article, or a random vector of length
        ``COL`` when the article is not in the table. The fallback uses the
        unseeded global NumPy generator, so it differs between runs.

    NOTE(review): another cell of this notebook rebinds ``df`` to the frame
    read from the CSV, which has no ``'emb'`` column. Make sure ``df`` refers
    to the embeddings table when this runs, or pass the table explicitly via
    ``embeddings``.
    """
    table = df if embeddings is None else embeddings
    # `x in DataFrame` tests the column labels, so the membership check must go
    # through the index; otherwise every article is reported as missing.
    if article_id not in table.index:
        print(f"Didn't find article {article_id}")
        return np.random.rand(COL)
    return table.loc[article_id , 'emb']

"""
Take in a matrix of article x embedding and each article's respective weight; return the weighted
standard deviation (the square root of the weighted variance) along each of the k dimensions.
This is easier to interpret when using LDA (since you can see how far editors stray on a certain topic,
    assuming the topics themselves are independent).
With a vaguer embedding, it's better to use the L2 norm and compress it to a single number.
"""
def calculate_multid_variance(matrix, weights):
    """Weighted per-dimension spread of a set of points.

    Args:
        matrix: Sequence of points, one row per point and one column per
            dimension.
        weights: One weight per row of ``matrix``.

    Returns:
        numpy.ndarray: For every dimension, the square root of the weighted
        mean squared deviation from the weighted mean (i.e. a weighted
        standard deviation, despite the function's name).
    """
    centroid = np.average(matrix, weights=weights, axis=0)
    squared_deviation = np.square(np.subtract(matrix, centroid))
    weighted_variance = np.average(squared_deviation, weights=weights, axis=0)
    return np.sqrt(weighted_variance)

"""
Calculate the L2 distance of each d-dimensional point from the weighted mean of all points, and return the average of those distances.
"""
def calculate_multid_distance(matrix, weights):
    """Weighted mean L2 distance of a set of points from their weighted centroid.

    Args:
        matrix: Sequence of points, one row per point and one column per
            dimension.
        weights: One weight per row of ``matrix``.

    Returns:
        float: The weighted average of each point's Euclidean distance to the
        weighted centroid.
    """
    points = np.asarray(matrix)
    centroid = np.average(points, weights=weights, axis=0)
    l2 = np.linalg.norm(points - centroid, axis=1)
    # The weights already define the centroid; apply them to the final mean as
    # well so a heavily weighted point counts proportionally, matching how
    # calculate_multid_variance uses the weights in both steps.
    return np.average(l2, weights=weights)

In [None]:
# Example: dispersion score for 'Alaibot', computed from that editor's
# (article_id, count) pairs.
calculate_editor_variance(get_editors_list('Alaibot'))