In [1]:
import gc
import pathlib
import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import torch
from tqdm import tqdm
from typing import Dict, List, Tuple, NamedTuple, Callable, Any
import mylib

In [2]:
class Conf(NamedTuple):
    device: torch.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    pretrained_dir: str = "pretrained/"
    em_max_seq_length: int = 128
    em_batch_size: int = 1000
    em_models: Dict[str, str] = {
        "paraphrase-MiniLM-L6-v2": f"{pretrained_dir}sentence-transformers/paraphrase-MiniLM-L6-v2",
        #"paraphrase-MiniLM-L12-v2": f"{pretrained_dir}sentence-transformers/paraphrase-MiniLM-L12-v2",
        #"paraphrase-mpnet-base-v2": f"{pretrained_dir}sentence-transformers/paraphrase-mpnet-base-v2",
        #"msmarco-MiniLM-L-6-v3": f"{pretrained_dir}sentence-transformers/msmarco-MiniLM-L-6-v3",
    }
    index_file: str = "output/ruddit.index"
        
        
conf = Conf()
print(conf)
if conf.device.type == 'cuda':
    for i in range(torch.cuda.device_count()):
        print(f"device={i}, {torch.cuda.get_device_name(i)}")
        print('Mem Allocated:', round(torch.cuda.memory_allocated(i)/1024**3,1), 'GB')
        print('Mem Cached:   ', round(torch.cuda.memory_reserved(i)/1024**3,1), 'GB')

Conf(device=device(type='cuda'), pretrained_dir='pretrained/', em_max_seq_length=128, em_batch_size=1000, em_models={'paraphrase-MiniLM-L6-v2': 'pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2'}, index_file='output/ruddit.index')
device=0, NVIDIA GeForce GTX 1060 6GB
Mem Allocated: 0.0 GB
Mem Cached:    0.0 GB


In [3]:
pd.set_option("use_inf_as_na", True)
pd.set_option("max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option('max_colwidth', 9999)
tqdm.pandas()

# Ruddit dataset
- Seed dataset with BWS label
- Generate embeddings
- Build index

In [4]:
df = pd.read_parquet("input/pre_ruddit.parquet")
bws = list(df["bws"])
ruddit_text = list(df["text2"])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5710 entries, 0 to 5709
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   label   5710 non-null   int32  
 1   bws     5710 non-null   float32
 2   worker  5710 non-null   int8   
 3   text    5710 non-null   object 
 4   text1   5710 non-null   object 
 5   text2   5710 non-null   object 
 6   text3   5710 non-null   object 
dtypes: float32(1), int32(1), int8(1), object(4)
memory usage: 228.7+ KB


In [5]:
%%time
for name, directory in conf.em_models.items():
    print(name)
    model = SentenceTransformer(directory, device=conf.device)
    model.max_seq_length = conf.em_max_seq_length
    em = model.encode(sentences=ruddit_text, 
                      batch_size=conf.em_batch_size, show_progress_bar=True, convert_to_numpy=True)
    print(f"em.shape={em.shape}")
    faiss.normalize_L2(em)
    cols = [f"{name}_{i:04d}" for i in range(em.shape[1])]
    df[cols] = em
    df[cols] = df[cols].astype(np.float32)
df.to_parquet("output/em_ruddit.parquet", index=False)

paraphrase-MiniLM-L6-v2
[INFO|SentenceTransformer.py:60] 2022-02-06 09:15:09,564 >> Load pretrained SentenceTransformer: pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2
[INFO|SentenceTransformer.py:60] 2022-02-06 09:15:09,564 >> Load pretrained SentenceTransformer: pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2
[INFO|SentenceTransformer.py:60] 2022-02-06 09:15:09,564 >> Load pretrained SentenceTransformer: pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2
[INFO|SentenceTransformer.py:60] 2022-02-06 09:15:09,564 >> Load pretrained SentenceTransformer: pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2


Batches:   0%|          | 0/6 [00:00<?, ?it/s]

em.shape=(5710, 384)


  self[col] = igetitem(value, i)


Wall time: 7.97 s


In [6]:
%%time
d = em.shape[1]
nlist = 1000
m = 8  # number of subquantizers
quantizer = faiss.IndexFlatIP(d)  # this remains the same
index = faiss.IndexIVFPQ(quantizer, d, nlist, m, 8)
# 8 specifies that each sub-vector is encoded as 8 bits
index.verbose = True
index.train(em)
index.add(em)

Wall time: 765 ms


In [7]:
%%time
faiss.write_index(index, conf.index_file)

Wall time: 1 ms


In [8]:
%%time
index = faiss.read_index(conf.index_file)
print(f"ntotal={index.ntotal}, is_trained={index.is_trained}")

ntotal=5710, is_trained=True
Wall time: 27 ms


In [9]:
index.nprobe = 1
k = 4
distances, ids = index.search(em[:20], k)  # sanity check
print(f"I={repr(ids)}\nD={repr(distances)}")

I=array([[   0,    5,    4, 2886],
       [   1, 4966, 3859, 1648],
       [   2,    7, 3918,   16],
       [   3,   23, 3783,  881],
       [   4,   31, 3903,   10],
       [   5,   10, 2694, 3903],
       [   6, 1818,  316,   -1],
       [   7,    2,   16, 3905],
       [   8,   12, 1693,   18],
       [   9,  563, 2454, 2451],
       [  10,    4, 2694,    5],
       [  11,   16,    7,    2],
       [  12,    8, 2890,   18],
       [  13, 2443, 5021, 5626],
       [  14,   26,   33, 3913],
       [  15, 3135, 3515, 3130],
       [  16,   11,    7,    2],
       [  17,   19,   30, 3917],
       [  18,    8,  194, 3914],
       [  19,   30, 3917, 1446]], dtype=int64)
D=array([[4.3200651e-01, 6.8623567e-01, 6.9459498e-01, 7.3324448e-01],
       [4.5601249e-01, 8.4324878e-01, 8.6190856e-01, 9.0924489e-01],
       [2.2245906e-01, 2.9274365e-01, 4.3219829e-01, 4.6194431e-01],
       [3.1575781e-01, 5.9827942e-01, 6.0513955e-01, 6.4260429e-01],
       [2.5046504e-01, 4.3624276e-01, 4.394050

# Validation dataset
- Estimate BWS label based on kNN similarity search

In [10]:
df = pd.read_parquet("input/pre_val.parquet")
val_text = list(df["text"])
val_text2 = list(df["text2"])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14251 entries, 0 to 14250
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    14251 non-null  object
 1   text1   14251 non-null  object
 2   text2   14251 non-null  object
 3   text3   14251 non-null  object
dtypes: object(4)
memory usage: 445.5+ KB


In [11]:
%%time
for name, directory in conf.em_models.items():
    print(name)
    model = SentenceTransformer(directory, device=conf.device)
    model.max_seq_length = conf.em_max_seq_length
    em = model.encode(sentences=val_text2, 
                      batch_size=conf.em_batch_size, show_progress_bar=True, convert_to_numpy=True)
    print(f"em.shape={em.shape}")
    faiss.normalize_L2(em)
    cols = [f"{name}_{i:04d}" for i in range(em.shape[1])]
    df[cols] = em
    df[cols] = df[cols].astype(np.float32)
df.to_parquet("output/em_val.parquet", index=False)

paraphrase-MiniLM-L6-v2
[INFO|SentenceTransformer.py:60] 2022-02-06 09:15:18,459 >> Load pretrained SentenceTransformer: pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2
[INFO|SentenceTransformer.py:60] 2022-02-06 09:15:18,459 >> Load pretrained SentenceTransformer: pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2
[INFO|SentenceTransformer.py:60] 2022-02-06 09:15:18,459 >> Load pretrained SentenceTransformer: pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2
[INFO|SentenceTransformer.py:60] 2022-02-06 09:15:18,459 >> Load pretrained SentenceTransformer: pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2


Batches:   0%|          | 0/15 [00:00<?, ?it/s]

em.shape=(14251, 384)
Wall time: 18.2 s


In [12]:
k = 4
distances, ids = index.search(em, k)  # sanity check
print(f"I={repr(ids[:20])}\nD={repr(distances[:20])}")

I=array([[5052, 3384, 4340, 5687],
       [4412,  918, 5128,   -1],
       [5452,   -1,   -1,   -1],
       [5142,  289, 5094, 2466],
       [3327, 1382, 1982, 4337],
       [1383, 1392, 1358,   -1],
       [5685, 5276, 4514, 5354],
       [1639, 4134, 5075, 5177],
       [3907,  174,  133,  175],
       [5009, 3867, 3359,   -1],
       [3525, 4552, 2037, 3084],
       [5349, 5276, 4514, 5354],
       [5584, 5300,   -1,   -1],
       [4876, 5263, 4947, 5264],
       [3027, 4767, 3300, 2987],
       [5622, 2576, 5135, 4305],
       [5348, 4962, 5624, 5076],
       [3636,   -1,   -1,   -1],
       [4861, 2227, 4959, 4453],
       [5584, 5300,   -1,   -1]], dtype=int64)
D=array([[8.7736386e-01, 8.8288343e-01, 8.9030898e-01, 9.1270185e-01],
       [1.2094778e+00, 1.2513462e+00, 1.3002568e+00, 3.4028235e+38],
       [1.3297167e+00, 3.4028235e+38, 3.4028235e+38, 3.4028235e+38],
       [9.1943246e-01, 9.2947692e-01, 9.3595111e-01, 1.0086160e+00],
       [9.6102434e-01, 9.8325926e-01, 9.955806

In [13]:
for i in range(ids.shape[0]):
    print(f"=============\ni={i}, {val_text2[i]}")
    for j in range(ids.shape[1]):
        if ids[i][j] == -1:
            continue
        print(f"j={j}, {ruddit_text[ids[i][j]]}")

i=0, " Go rot in hell you evil liittle bastard. " "
j=0, [Wtf; what the fuck; exclamation of surprise or disbelief] is this skeletal demon doing, doing anything for America?
j=1, Fuck any so-called *Pro-Life* person who is silent during this fiasco. Worthless pieces of hypocritical shit as far as I am concerned.
j=2, Fucking slaughtered. I want this [dude; friend] to give a speech.
j=3, Stay off the sauce. Really. you are just talking gibberish. He is a racist fucktard, and that is is obvious and out in plain sight both in his speech and his action. Who he has in his cabinet is no defense for his actions or what he is said. it is utterly irrelevant. &#x200B; But I can see you are a simple man and of weak constitution, so I would rather you just lay off the sauce.
i=1, Bamafuck has to be stopped. Please help me keep the Styx pages integ intact!
j=0, This Togo erasure and I will not stand for it.
j=1, If it was unfixable and causing a lot of suffering to the animal, I would in fact find 

j=3, Holy fuck I did not realize Benny here could go THIS low, I thought he was just a racist shitbag, not a complete and utter moron.
i=385, So funny. Not really. I thought liberals did not care about what someone did in bed (remember Bill and Monica?). Seriously, he probably believed Haggard's original statements of not doing any of said accusations, and then once Haggard repented and admitted to his sins (homosexuality, drug use, lying, adultery, prostitution) Dobson obviously did not condone those actions. Liberals love to see people struggle with sin. Iamvery
j=0, Just a word man, your looking to much into it. Just a name we made up of to make fun of liberals
j=1, that is like saying, "Men of Reddit, while your ladies are not around, what are your thoughts on the Kennedy assination and 9/11?" what is with these random [af; as fuck; expression of annoyance, contempt or impatience] AskReddit questions?
j=2, No source other than cringey op-eds that lack any quotes, just liberals bein

j=3, Under what ill serving definition? Encouraging someone boycott businesses with bad practices is "extremism"? Do you even know what the word means? An extremist would say to burn down their offices, or kill members of the company. I am simply saying do not do business with them. Do you even care what the words mean that you use or are you just attempting to vilify me so you do not have to do the work of actually thinking for yourself?
i=437, Well they are certainly prettier than you Siddiqui ugly faces. I suggest you keep it that way. Certainly gives a much better impression then you miserable Siddiquis. As far as I can see you are the biggest intellectual dishonest here judging by all the comments on your talk.
j=0, I obviously know that. It is a satire because "these" (antivaxxers) people actually do write like that.
j=1, I thought i saw more comments/threads from you few months ago than nowadays. Well its depressing...
j=2, I mean, going from a few hundred likes per post to thou

j=1, Wasnt it a veteran team member that suggested Collin kneel instead of sit out the anthem out of respect for vets?
j=2, Forgiven will be packing his stuff for the army after the series. Lets go #REKKINCREW
i=1043, JamesBWatson has been vandalizing posts JamesBWatson has been vandalizing post for Olive amongst many others and is trying to shift blame on others simply because of his/hers ignorance and bigotry. One simple sentence which was added by me to the post in question is proven fact, there is also a link to page that would confirm it is accuracy and it is also relevant to the post. I have continually tried to undo vandalism which was commited by JamesBWatson and some other users, without resorting to accusations and name calling, but this user has obvioulsy taken law in his/hers hands and thinks that can bully other people on this public domain. I have even moved my addition to the topic to another part of the article but to no avail it has been vandalized again. Please see al

j=1, "It is harder for a camel to pass through the eye of a needle than a rich man to enter heaven" "Do unto others as you would have others do unto you" Does that sound like most Christian fundamentalists you have seen in the US?
j=2, "he is a borderline rapist and that is just what we know about. He cannot possibly win -- did you hear what he said about grabbing pussy? ZERO percent chance. he is a fucking trainwreck." Me. Before losing all faith in humanity.
j=3, "YOU WANMA FUCK WITH ME?" UMMM... HE IS CLEARLY SAYING NO.
i=1126, Verbal must be stopped Anything you can do to rein Verbal's rampant ego would be much appreciated. He has decimated at least one page I know of with domineering tactics. he is an ass.
j=0, We should never get used to it. It deserves to get mocked all the way until he is removed one easy or the other.
j=1, I agree with you but that last sentence seems dangerous man there is a lot more going on in media suppression
j=2, Comments like this should be criminalized

j=2, > it is gotten so bad that now he is starting to get [pissed; to be drunk or made very annoyed] at her, saying that she is being leading him on all this time This ironic considering that he was the one who was leading her on to think that they are only friends.
j=3, Now he is going to think she is pregnant every time they get pulled over
i=1740, I am back Yes I am back, but I will not do anything stupid you ass-[muppet; incompetent or foolish person].
j=0, Fuck, you are hilarious, but you are also a dick. But do not worry, I have learned from my early mistakes, I know my limits. There is no "Should have done" though, hindsight does not roll in the real world. Young me was a [twat; profanity for a woman's vagina], older me, less so.
j=1, And you do not have to be a total dickbag to other people to be funny.
j=2, [Dude; friend] you just fucking won!!
j=3, fuck you, you are just pretending to be blocked. get back to work RIGHT NOW you piece of shit piping
i=1741, Why did you fuck my 

j=0, I think every 6 string song on all of periphery's discography is in drop c and they are all challenging
j=1, A lot of awesome suggestions in this thread. If your top strings can handle it, tuning to C standard will allow you play most of The Sword's albums. Their Age of Winters album is loads of fun to play and not too bad to pick up by ear!
j=2, you could just transpose drop D songs.
j=3, Quite a few of their newer songs are in Drop B if you want to mess around in that tuning as well.
i=1783, Kubigula and you did destroy I am reallya angry with you because you did put speedy delete tag YOU ARE DESTROYING MY WORK Thank you Kubigula DID Delete my work which i start i HATE YOU Kubigula and Neil916 IT REALLY NOT FAIR YOU ARE RUDE TOO ME stupid head from Aiden2009
j=0, I got banned for subbing to T_D so fuck you [m8; greeting hello] ill fawkin knock ya upside the head if ya want to talk a da trash
j=1, Ok so maaaybe I got banned for posting a link to the wikipedia page for Treblinka, 

j=1, that is why my [gf; girlfriend] and I yell "naked race!" and see who can strip off their clothes first.
j=2, Those fucks that constantly annoy you by giving you all these tips and suggestions every other day and criticise your workout routine. That one guy who comes to the gym just to flirt with girls and sits on the machines. FUCK YOU MOTHERFUCKER I want to DO SOME ACTUAL WORKOUT. Sweaty fucks who do not wipe after they are done
i=2406, Could idiotic self-righteous people please refrain from editing or deleting my talk page in future. You may not like what has been written on it, but is not a Wikipedia article, it is a talk page, so you would do well to leave it alone. Thankyou.
j=0, Did you read what I wrote? The story is about Germany, I live in Denmark and we have a problem with objectively false stories being spread as the truth and people getting up in arms about it. You talk about deciding what is real news and that nobody should be able to tell the media what they can publ

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




i=11465, HI JohnHistory Yes, you will be blocked and your thoughtful post here deleted by someone like Tarc. The main thing to realize is that Wikipedia has always been, and will always be, a clusterfuck. If you are interested in finding out detailed histories of the lives of anime characters, it is a start, but other than that, it is virtual pages are only good for wiping virtual shit out of your virtual asshole. This place attracts the worst kind of assholes, and it is hard to trust in the good faith of something from which nothing but shit and hot air spews forth. They decide what a proper source is, they decide what is been discussed in the media, just like they decide which place in SF has the best glory holes. Any type of internet arrangement with a hierarchy attracts the worst type of vaginal discharge. Wikipedia's political content is governed by a bunch of aborted leftists who read a couple of pages of Das Kapital, decided it was nice and then put it down. They are a bunch of

i=12091, Propol is just a little coward that tries to block anyone that dares to disagree with his DNC talking points. Everything Propol tries to edit from now on will be reverted to its previous edit.
j=0, I am trying to make a nuanced point here.. not sure if I can dumb it down any more. I am trying to say, when you call them supporters, it implies they are supporting him out of normal political reasons, and that if he fails to live up to his promises they will abandon him. I am arguing that we stop calling them "supporters" because that normalizes fanaticism and personality cults. And no, Red Guards were not merely "Mao supporters" unless you are disingenuously applying the very broadest use of the word.
j=1, I hate doing it cause it is usually a cheap way that people try to win arguments by invalidating others just because they post in certain subs. However, in this case I think it is completely appropriate because it points out the hypocrisy of saying that Obama ignored the consti

j=1, Oh look, the daily mail...fuck off.
j=2, This fucking fake and you are all buying it [Lmfao; Laughing my fucking ass off]
j=3, > it is barely even humor All this "[LOL; laughing out loud] [DAE; Does Anybody Else] TRIGGERED APACHE HELICOPTER [SJW; social justice warrior; slur for a person with socially progressive, left-wing and liberal views]" shit that Reddit has just been upvoting madly has to be one of the least funny jokes I have ever seen on this website, and that is without the smugness of it taken into account. Just looking at it objectively as humor it is tired, low-effort, and boring.
i=12728, " Your sarcasm Hello Angr, I think that your recent addition to a user talk page of ""Your efforts to get yourself blocked from editing Wikipedia have been successful! If the block expires, you can come back to make constructive edits"" was frankly stupid. You people are trying to promote yourselves as the Wikipedia elite, and believe me it does not help your cause by writing garbag

j=2, advancement of religion can include such activities as hosting AA meetings. these are nominally religious, as they invoke a "higher power," and are charitable (provide a service), and advance some notion of religion.
j=3, Religion is protected by the first amendment in the same way that gun ownership is protected by the second. >Taxing religious institutions does not infringe on my right to practice religion any more than taxing the sale of firearms infringes on my right to own a gun. This analogy is inaccurate. A better analogy would be that taxing the operation costs of religious institutions would be similar to taxing the ownership of a firearm on annual basis, so as to disincentivize ownership (or religious activity) by adding an additional layer of financial obligation.
i=13344, - From someone that you totally wronged with your bullshit and managed to get me to stop contributing very significant contributions to this site nearly a year ago now. (And I contributed a LOT in a s

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [14]:
preds = {}
d_max = 2
for i in tqdm(range(ids.shape[0])):
    v = 0
    sims = []
    for j in range(ids.shape[1]):
        if ids[i][j] == -1 or distances[i][j] >= 1:
            break
        d_norm = distances[i][j] / d_max
        sims.append(1 - d_norm)
    sm = sum(sims)
    for j, s in enumerate(sims):
        v += s / sm * bws[ids[i][j]] 
    preds[val_text[i]] = v

100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 62174.65it/s]


In [15]:
vdf = pd.read_csv("input/validation_data.csv", engine="c", low_memory=False)
vdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30108 entries, 0 to 30107
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   worker      30108 non-null  int64 
 1   less_toxic  30108 non-null  object
 2   more_toxic  30108 non-null  object
dtypes: int64(1), object(2)
memory usage: 705.8+ KB


In [16]:
score = mylib.comp_metric(preds, validation_data=vdf)
print(f"Average Agreement with Annotators={score:.4f}")

Average Agreement with Annotators=0.5638
