In [1]:
# HPC deployment
SPARK_MASTER = 'spark://cm015:15703'
FILE_DIR = '/scratch/si2073/output'
FILENAME = FILE_DIR + '/enwiki-20230401-pages-meta-history10.xml-p4045403p4096288.csv'
TARGET_PATH = '/scratch/si2073/datasets'
CACHE_PATH = '/scratch/si2073/huggingface_cache'

In [14]:
import pandas as pd
import numpy as np
import re
import datasets
from datasets import load_dataset
from huggingface_hub import snapshot_download
from pathlib import Path
import os
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.sql.functions import explode, collect_list, udf, count
from pyspark.sql.types import StructType, StructField, StringType, ArrayType
from collections import Counter
from tqdm import tqdm

os.environ["HF_HOME"] = TARGET_PATH
os.environ["HF_DATASETS_CACHE"] = CACHE_PATH

In [5]:
snapshot_download(repo_id="Cohere/wikipedia-22-12-en-embeddings", 
                  local_dir=TARGET_PATH,
                  cache_dir=CACHE_PATH,
                  repo_type="dataset")

Fetching 256 files:   0%|          | 0/256 [00:00<?, ?it/s]

'/scratch/si2073/datasets'

In [3]:
conf = SparkConf().setAppName("Wikipedia") \
                  .set("spark.driver.memory", "128g") \
                  .set("spark.executor.memory", "128g")


sc = SparkContext(master = SPARK_MASTER, conf=conf)
spark = SparkSession(sc)
spark

23/05/07 17:37:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
df = spark.read.parquet("/scratch/si2073/datasets/data")
df.show()

[Stage 1:>                                                          (0 + 1) / 1]

+-------+-------------+--------------------+--------------------+-------+---------+------------+-----+--------------------+
|     id|        title|                text|                 url|wiki_id|    views|paragraph_id|langs|                 emb|
+-------+-------------+--------------------+--------------------+-------+---------+------------+-----+--------------------+
|3197092|Rye, New York|Flooding has long...|https://en.wikipe...| 260184|1327.8843|          20|   31|[0.10047981, 0.09...|
|3197093|Rye, New York|The City's respon...|https://en.wikipe...| 260184|1327.8843|          21|   31|[0.41874716, 0.01...|
|3197094|Rye, New York|Starting on Septe...|https://en.wikipe...| 260184|1327.8843|          22|   31|[-0.17743246, -0....|
|3197095|Rye, New York|As of 2010, seven...|https://en.wikipe...| 260184|1327.8843|          23|   31|[0.28458962, -0.0...|
|3197096|Rye, New York|The presence of I...|https://en.wikipe...| 260184|1327.8843|          24|   31|[0.20566246, -0.2...|
|3197097

                                                                                

In [5]:
# Load Cohere's full dataset
drop_columns = ['text', 'id', 'title', 'url', 'langs', 'paragraph_id']
df = df.drop(*drop_columns)
df.show()

[Stage 2:>                                                          (0 + 1) / 1]

+-------+---------+--------------------+
|wiki_id|    views|                 emb|
+-------+---------+--------------------+
| 260184|1327.8843|[0.10047981, 0.09...|
| 260184|1327.8843|[0.41874716, 0.01...|
| 260184|1327.8843|[-0.17743246, -0....|
| 260184|1327.8843|[0.28458962, -0.0...|
| 260184|1327.8843|[0.20566246, -0.2...|
| 260184|1327.8843|[0.543221, -0.185...|
| 260184|1327.8843|[0.08567986, -0.1...|
| 260184|1327.8843|[0.10039139, -0.2...|
| 260184|1327.8843|[0.23149109, -0.4...|
| 260184|1327.8843|[0.30025, -0.3781...|
| 260184|1327.8843|[0.088352956, -0....|
| 260184|1327.8843|[0.13139942, -0.2...|
| 260184|1327.8843|[-0.009606386, -0...|
| 260184|1327.8843|[0.5128174, -0.09...|
| 260184|1327.8843|[0.29922315, -0.3...|
| 260184|1327.8843|[0.42379922, -0.2...|
| 260184|1327.8843|[-0.0057834624, -...|
| 260184|1327.8843|[0.15442298, -0.1...|
| 260184|1327.8843|[0.3555656, 0.360...|
| 260184|1327.8843|[0.329118, -0.414...|
+-------+---------+--------------------+
only showing top

                                                                                

In [6]:
df = df.drop_duplicates(subset=['wiki_id'])
df.show()



+-------+----------+--------------------+
|wiki_id|     views|                 emb|
+-------+----------+--------------------+
|   1088| 1287.1409|[0.24366963, -0.6...|
|   1580|101.960556|[0.0684566, 0.260...|
|   1645| 1889.2782|[0.3224766, 0.458...|
|   2122| 2863.4043|[0.111770935, 0.4...|
|   2866| 1365.1624|[-0.2080042, -0.2...|
|   3175|  713.8335|[0.16152096, 0.30...|
|   3794| 1814.0396|[0.051881086, 0.4...|
|   3997| 2391.4963|[0.27129447, 0.53...|
|   4101| 1018.6534|[0.34501484, -0.1...|
|   4519| 1874.5988|[0.33576313, 0.24...|
|   5300| 1882.7058|[0.3257199, -0.34...|
|   6336|   779.848|[0.38157865, 0.15...|
|   6357| 2248.5383|[0.39418507, -0.2...|
|   6466| 2987.0244|[-0.1551857, 1.00...|
|   6620| 372.99713|[0.12537971, 0.51...|
|   6654|  2384.627|[-0.105314195, 0....|
|   7253| 522.30115|[0.25702828, 0.55...|
|   7554| 1865.1218|[0.2620884, 0.488...|
|   7833|  659.7683|[0.24061485, 0.41...|
|   8086| 1174.7714|[0.3114842, -0.26...|
+-------+----------+--------------

                                                                                

In [9]:
COL = df.count()
COL

                                                                                

5745033

In [None]:
def lst_to_str(x):
    return re.sub(r'[^ \w+]', '', x).split()

columns = ['title', 'text', 'username', 'article_id']

schema = StructType([
    StructField("username", ArrayType(StringType()), True),
    StructField("title", StringType(), True),
    StructField("article_id", StringType(), True)
])

df_full = spark.createDataFrame([], schema)

for f in tqdm(os.listdir(FILE_DIR), desc="Processing Files"):
    
    df_temp = pd.read_csv(os.path.join(FILE_DIR, f), 
                          names=columns, 
                          skiprows=1).drop(columns="text")
    df_temp["username"] = df_temp["username"].apply(lst_to_str)
    df_part = spark.createDataFrame(df_temp[["username", "title", "article_id"]], schema=schema)

    df_full = df_full.union(df_part)

Processing Files:  40%|████      | 320/793 [18:32<23:21,  2.96s/it] 

In [17]:
df_full.show()

                                                                                

Row(title='Foster Air Force Base', username=['Dirtydan667', 'Dirtydan667', 'RussBot', 'Ktr101', 'Ktr101', 'Ktr101', 'Ktr101', 'Ndunruh', 'PigFlu', 'Oink', 'Bwmoll3', 'LilHelpa', 'ColoBill4', 'ColoBill4', 'Ground', 'Zero', 'Bwmoll3', 'Bwmoll3', 'Bwmoll3', 'Bwmoll3', 'Bwmoll3', 'Bwmoll3', 'Bwmoll3', 'Bwmoll3', 'Bwmoll3', 'Bwmoll3', 'Bwmoll3', 'Bwmoll3', 'Bwmoll3', 'Bwmoll3', 'Bwmoll3', 'Bwmoll3', 'Bwmoll3', 'Bwmoll3', 'Bwmoll3', 'Bwmoll3', 'Bwmoll3', 'Bwmoll3', 'Bwmoll3', 'Bwmoll3', 'Bwmoll3', 'Bwmoll3', 'Bwmoll3', 'Bwmoll3', 'Bwmoll3', 'Bwmoll3', 'Bwmoll3', 'Bwmoll3', 'Bwmoll3', 'Bwmoll3', 'Bwmoll3', 'Bwmoll3', 'Bwmoll3', 'Vanished', 'user', '31415926535897932384626433', 'Firsfron', 'Bwmoll3', 'Bwmoll3', 'Bwmoll3', 'Bwmoll3', 'Bwmoll3', 'Bwmoll3', 'DiiCinta', 'Bwmoll3', 'Bwmoll3', 'Backspace', 'Rich', 'Farmbrough', 'Mark', 'Sublette', 'Mark', 'Sublette', 'Mark', 'Sublette', 'Kumioko', 'renamed', 'Ulric1313', 'Mark', 'Sublette', 'Ulric1313', 'Mark', 'Sublette', 'Lightmouse', 'Gcstackmone

In [18]:


exploded = spark_df.select(spark_df.article_id, explode(spark_df.username).alias('username'))
collapsed = exploded.groupBy("username", "article_id").agg(count("article_id").alias('count'))
collapsed.show()



+--------------+----------+-----+
|      username|article_id|count|
+--------------+----------+-----+
|          Dude|   4045430|    2|
|       Ouṃkāra|   4045432|    1|
|         Hardy|   4045466|    1|
|           AWB|   4045519|    1|
|        NoNick|   4045554|    1|
|       Deville|   4045577|    1|
|       Cydebot|   4045581|    3|
|         Yobot|   4045592|    3|
|        native|   4045603|    1|
|     Cholmes75|   4045651|    1|
|        Namiba|   4045651|    1|
|     KasparBot|   4045651|    1|
|             D|   4045672|    1|
|           Red|   4045691|    1|
|        Bubige|   4045705|    1|
|       SineBot|   4045732|    4|
|          John|   4045760|    1|
|        Samf4u|   4045760|    1|
|12drinkminimum|   4045760|    1|
|       Brukner|   4045770|    1|
+--------------+----------+-----+
only showing top 20 rows



                                                                                

In [19]:
def get_editors_list(editor):
    return collapsed.filter(collapsed.username == editor).select('article_id', 'count').rdd.map(tuple).collect()
get_editors_list('Alaibot')

[(4058784, 1),
 (4093691, 1),
 (4066879, 1),
 (4048070, 1),
 (4075498, 1),
 (4056620, 1),
 (4072072, 1),
 (4073069, 1),
 (4083613, 1),
 (4047561, 1),
 (4091899, 1),
 (4049805, 1),
 (4079113, 1),
 (4069224, 1),
 (4069254, 1),
 (4093158, 1),
 (4046974, 1),
 (4063484, 1),
 (4069057, 1),
 (4049203, 1),
 (4067262, 1),
 (4060538, 1),
 (4092422, 1),
 (4059097, 1),
 (4059423, 1),
 (4049833, 1),
 (4072680, 1),
 (4063762, 1),
 (4074422, 1),
 (4076967, 1),
 (4047596, 1),
 (4050255, 1),
 (4094800, 1),
 (4073583, 1),
 (4072649, 1),
 (4055736, 1),
 (4079630, 1),
 (4059506, 1),
 (4062975, 1),
 (4048701, 1),
 (4085377, 1),
 (4061189, 1),
 (4088370, 1),
 (4083404, 1),
 (4059278, 2),
 (4059305, 2),
 (4058134, 1),
 (4078737, 1),
 (4094880, 1),
 (4063615, 1),
 (4047939, 1),
 (4056078, 1),
 (4065323, 1),
 (4091024, 1),
 (4079664, 1),
 (4049188, 1),
 (4082701, 1),
 (4072587, 1),
 (4092100, 1),
 (4063395, 1),
 (4082472, 1),
 (4058219, 1),
 (4058881, 1),
 (4067988, 1),
 (4066485, 1),
 (4087259, 1),
 (4090261,

In [28]:
from typing import List, Tuple
import numpy as np
import math

# each editor has a list of (article_id, count) of the things they edited
# assuming all id in list is unique
def calculate_editor_variance(editors : List[Tuple[int, int]]):
    article_ids, weights = zip(*editors)
    matrix = [get_embedding(article_id) for article_id in article_ids]
    return calculate_multid_distance(matrix, weights)

# get embedding of an article. either LDA or Cohere's dataset
# using Cohere rn
def get_embedding(article_id):
    if article_id not in df:
        print(f"Didn't find article {article_id}")
        return np.random.rand(COL)
    return df.loc[article_id , 'emb']

"""
Take in matrix of article x embedding and article's respective weight, return their variance in k-dimension
this is easier to interpret when using LDA (since you can see how far editors stray on certain topic, 
    assuming topic itself is indepedent)
With vaguer embedding, it's better to use the L2 norm and compress it to a single number
"""
def calculate_multid_variance(matrix, weights):
    average = np.average(matrix, weights=weights, axis=0)
    variance = np.average((matrix-average)**2, weights=weights, axis=0)
    return np.sqrt(variance)

"""
Calculate L2 distance for each d-dimension point
"""
def calculate_multid_distance(matrix, weights):
    average = np.average(matrix, weights=weights, axis=0)
    l2 = np.linalg.norm(matrix-average, axis=1)
    return np.average(l2)

In [29]:
calculate_editor_variance(get_editors_list('Alaibot'))

Didn't find article 4058784
Didn't find article 4093691
Didn't find article 4056620
Didn't find article 4072072
Didn't find article 4048070
Didn't find article 4075498
Didn't find article 4066879
Didn't find article 4083613
Didn't find article 4073069
Didn't find article 4079113
Didn't find article 4049805
Didn't find article 4047561
Didn't find article 4091899
Didn't find article 4046974
Didn't find article 4093158
Didn't find article 4069224
Didn't find article 4069254
Didn't find article 4063484
Didn't find article 4069057
Didn't find article 4049203
Didn't find article 4067262
Didn't find article 4060538
Didn't find article 4092422
Didn't find article 4059097
Didn't find article 4059423
Didn't find article 4049833
Didn't find article 4072680
Didn't find article 4063762
Didn't find article 4074422
Didn't find article 4076967
Didn't find article 4072649
Didn't find article 4073583
Didn't find article 4047596
Didn't find article 4050255
Didn't find article 4094800
Didn't find article 

50.16994188162785