In [2]:
import pandas as pd
pd.set_option('display.max_colwidth', 512)

import numpy as np
import matplotlib.pyplot as plt
import pickle
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
import cudf 
import cuml 


### Embedding

In [3]:
embeddings = pickle.load( open( "../processed/jc_embeddings.pickle", "rb" ) )

jc = pd.read_csv("../processed/jc.csv")


## KNN

In [4]:
%%time 

KNN = 500
model = cuml.neighbors.NearestNeighbors(n_neighbors=KNN)
model.fit(embeddings)
distances, indices = model.kneighbors(embeddings)

CPU times: user 13.1 s, sys: 2.59 s, total: 15.7 s
Wall time: 17.7 s


In [5]:
with open('../processed/jc_knn_distances.pickle', 'wb') as handle:
    pickle.dump(distances, handle)

with open('../processed/jc_knn_indices.pickle', 'wb') as handle:
    pickle.dump(indices, handle)

In [6]:
distances.shape, indices.shape

((223549, 500), (223549, 500))

In [7]:
text = jc["text"].values
toxic = jc["toxic"].values
jc.shape

(223549, 10)

In [8]:
CUTOFF = 0.15
mm = distances[:,1]
idx = np.where( (mm<CUTOFF) )[0]

group_id = 0

d = {}
inv_d = {}
for i in idx:
    
    if i in inv_d:
        mark = inv_d[i]
    else:
        mark = i
        d[mark] = []
    
    for j in range(distances.shape[1]):
        if distances [i,j] < CUTOFF:
            k = indices[i,j]
            if k not in d[mark]: 
                d[mark].append ( k)
            inv_d [k] = mark
        else:
            break
group_ids = {} 
for n, i in enumerate(d):
    
    for j in d[i]:
        group_ids[j] = n

        
def find_intersection (d):
    count = 0
    for i in d:
        for k in d:
            if i <  k:
                s1=set(d[i])
                s2=set(d[k])
                if len(s1.intersection(s2))>0:
                    count += 1
    return count
        
def aggregate (d):
    d2 = {}
    founds = []
    for i in d:
        if i not in founds:
            d2[i] = d[i]
            founds.append(i)
            for k in d:
                if i <  k:
                    s1=set(d[i])
                    s2=set(d[k])
                    if len(s1.intersection(s2))>0:
                        d2[i] = d[i] + d[k]
                        founds.append(k)    
    return d2


In [9]:
count = find_intersection(d)
while count >0:
    print (count)
    d = aggregate (d)
    count = find_intersection(d)
    

155
7


In [10]:
group_ids = {}
for n, i in enumerate(d):
    l = len(d[i])
    for j in d[i]:
        group_ids[j] = -n


In [11]:
label_encoder = LabelEncoder()

jc["group_id"] = jc.index.map(lambda x: group_ids[x] if x in group_ids else np.nan )
jc["group_id"] = jc["group_id"].fillna(jc.index.to_series())
jc["group_id"] = label_encoder.fit_transform(jc["group_id"])


In [12]:
df_g = jc.groupby(["group_id"]).agg ({
    "id" :"count",
    "toxic":"std", 	"severe_toxic":"std", 	"obscene":"std", 	"threat":"std", 	"insult":"std", 	"identity_hate":"std",
}
).reset_index().rename(columns={"id":"total"})

df_g["std"] = df_g["severe_toxic"]+df_g["toxic"]+df_g["obscene"]+df_g["threat"]+df_g["insult"]+df_g["identity_hate"]

df_g = df_g.fillna(0)


df_g.sort_values(by="total")

Unnamed: 0,group_id,total,toxic,severe_toxic,obscene,threat,insult,identity_hate,std
110416,110416,1,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000
147215,147215,1,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000
147216,147216,1,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000
147217,147217,1,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000
147218,147218,1,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...
1005,1005,53,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000
1093,1093,63,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000
1144,1144,81,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000
1114,1114,89,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000


In [13]:
jc.query("group_id == 1093")

Unnamed: 0,id,text,toxic,severe_toxic,obscene,threat,insult,identity_hate,layer,toxic_flag,group_id
3424,09388d2885d48e4a,"""Thank you for experimenting with Wikipedia. Your test worked, and has been reverted or removed. Please use the sandbox for any other tests you want to do. Take a look at the welcome page if you would like to learn more about contributing to our encyclopedia. (talk) """,0,0,0,0,0,0,train,0,1093
11708,1ef657dce4444cea,"""Thank you for experimenting with Wikipedia. Your test worked, and it has been reverted or removed. Please use the sandbox for any other tests you may want to do. Take a look at the welcome page to learn more about contributing to our encyclopedia.-OnBrains """,0,0,0,0,0,0,train,0,1093
11979,1fb75c63e10ac4a0,"""\nThank you for experimenting with Wikipedia. Your test worked, and it has been reverted or removed. Please use the sandbox for any other tests you may want to do. Take a look at the welcome page to learn more about contributing to our encyclopedia. 64 """,0,0,0,0,0,0,train,0,1093
15158,28030dc1c2fdbb02,"""Thank you for experimenting with Wikipedia. Your test worked, and it has been reverted or removed. Please use the sandbox for any other tests you may want to do. Take a look at the welcome page to learn more about contributing to our encyclopedia. Talk Contribs """,0,0,0,0,0,0,train,0,1093
16459,2b5f71a01d2ac359,"""Thanks for experimenting with Wikipedia. Your test worked, and has been reverted or removed. Please use the sandbox for any other tests you want to do. Take a look at the welcome page if you would like to learn more about contributing to our encyclopedia. Thanks. — talk """,0,0,0,0,0,0,train,0,1093
...,...,...,...,...,...,...,...,...,...,...,...
210129,c982dc9ef8d2c017,"""Thanks for experimenting with Wikipedia. Your test worked, and has been removed or reverted. Please use the sandbox for any other tests you want to do. Take a look at the welcome page if you would like to learn more about contributing to our encyclopedia. Thanks. | @ | - """,0,0,0,0,0,0,test,0,1093
211727,d017c1e4ba4ff345,""" \n Thank you for experimenting with Wikipedia. Your test worked, and it has been reverted or removed. Please use the sandbox for any other tests you may want to do. Take a look at the welcome page to learn more about contributing to our encyclopedia. """,0,0,0,0,0,0,test,0,1093
215755,e0411784c5233526,"""Thank you for experimenting with Wikipedia. Your test worked, and it has been reverted or removed. Please use the sandbox for any other tests you may want to do. Take a look at the welcome page to learn more about contributing to our encyclopedia. - """,0,0,0,0,0,0,test,0,1093
222438,fb726cccf1d374b3,"""Thank you for experimenting with Wikipedia. Your test worked, and it has been reverted or removed. Please use the sandbox for any other tests you may want to do. Take a look at the welcome page to learn more about contributing to our encyclopedia. ( | contrib) """,0,0,0,0,0,0,test,0,1093


In [14]:
df_g.query("severe_toxic != 0.0").sort_values(by="total")

Unnamed: 0,group_id,total,toxic,severe_toxic,obscene,threat,insult,identity_hate,std
83,83,2,0.0,0.707107,0.0,0.0,0.0,0.0,0.707107
959,959,2,0.0,0.707107,0.0,0.0,0.0,0.707107,1.414214
937,937,2,0.0,0.707107,0.0,0.0,0.0,0.0,0.707107
832,832,2,0.0,0.707107,0.0,0.0,0.0,0.0,0.707107
721,721,2,0.0,0.707107,0.0,0.0,0.0,0.0,0.707107
669,669,2,0.0,0.707107,0.0,0.0,0.0,0.0,0.707107
646,646,2,0.0,0.707107,0.0,0.0,0.0,0.0,0.707107
463,463,2,0.0,0.707107,0.0,0.0,0.0,0.0,0.707107
1083,1083,2,0.0,0.707107,0.0,0.0,0.0,0.0,0.707107
414,414,2,0.0,0.707107,0.0,0.0,0.707107,0.0,1.414214


In [15]:
jc.query("group_id == 959") 

Unnamed: 0,id,text,toxic,severe_toxic,obscene,threat,insult,identity_hate,layer,toxic_flag,group_id
12534,2134e802dccc7101,Trolling \n\nFacebook has been reported and is regullary trolled by Josh Ware of Nevada. This is because Josh Ware is a waste of life and a faggot who fucks his dad and can't figure anything else to do in his own existence then to perform these two hobbies continuously. Watch out for that guy. He's digustingly and sickeningly gay.,1,1,1,0,1,1,train,5,959
25278,42e5dbeb72e2e0b9,Trolling \n\nFacebook has been reported and is regullary trolled by Josh Ware of Nevada. This is because Josh Ware is a waste of life and a faggot who fucks his dad and can't figure anything else to do in his own existant then to perform these two hobbies continuously. Watch out for that guy. He's sickeningly gay.,1,0,1,0,1,0,train,3,959


In [16]:
jc.query("group_id == 83") 

Unnamed: 0,id,text,toxic,severe_toxic,obscene,threat,insult,identity_hate,layer,toxic_flag,group_id
163172,0ea8b5c8eb9037c1,"FUK YOU, THE TRAITOR VANDAL MUST DIE!FUK YOU, THE TRAITOR VANDAL MUST DIE!FUK YOU, THE TRAITOR VANDAL MUST DIE!FUK YOU, THE TRAITOR VANDAL MUST DIE!FUK YOU, THE TRAITOR VANDAL MUST DIE!FUK YOU, THE TRAITOR VANDAL MUST DIE!FUK YOU, THE TRAITOR VANDAL MUST DIE!FUK YOU, THE TRAITOR VANDAL MUST DIE!FUK YOU, THE TRAITOR VANDAL MUST DIE!FUK YOU, THE TRAITOR VANDAL MUST DIE!FUK YOU, THE TRAITOR VANDAL MUST DIE!FUK YOU, THE TRAITOR VANDAL MUST DIE!FUK YOU, THE TRAITOR VANDAL MUST DIE!FUK YOU, THE TRAITOR VANDAL...",1,1,1,1,1,0,test,5,83
196887,94e3118fe32d90aa,"FUK YOU, THE TRAITOR VANDAL MUST DIE!FUK YOU, THE TRAITOR VANDAL MUST DIE!FUK YOU, THE TRAITOR VANDAL MUST DIE!FUK YOU, THE TRAITOR VANDAL MUST DIE!FUK YOU, THE TRAITOR VANDAL MUST DIE!FUK YOU, THE TRAITOR VANDAL MUST DIE!FUK YOU, THE TRAITOR VANDAL MUST DIE!FUK YOU, THE TRAITOR VANDAL MUST DIE!FUK YOU, THE TRAITOR VANDAL MUST DIE!FUK YOU, THE TRAITOR VANDAL MUST DIE!FUK YOU, THE TRAITOR VANDAL MUST DIE!FUK YOU, THE TRAITOR VANDAL MUST DIE!FUK YOU, THE TRAITOR VANDAL MUST DIE!FUK YOU, THE TRAITOR VANDAL...",1,0,1,1,1,0,test,4,83


In [17]:
df_g.query("toxic != 0.0").sort_values(by="total")

Unnamed: 0,group_id,total,toxic,severe_toxic,obscene,threat,insult,identity_hate,std
32,32,2,0.707107,0.0,0.707107,0.00000,0.707107,0.0,2.121320
578,578,2,0.707107,0.0,0.000000,0.00000,0.000000,0.0,0.707107
621,621,2,0.707107,0.0,0.000000,0.00000,0.000000,0.0,0.707107
627,627,2,0.707107,0.0,0.000000,0.00000,0.000000,0.0,0.707107
691,691,2,0.707107,0.0,0.000000,0.00000,0.000000,0.0,0.707107
...,...,...,...,...,...,...,...,...,...
715,715,3,0.577350,0.0,0.000000,0.57735,0.000000,0.0,1.154701
132,132,3,0.577350,0.0,0.000000,0.00000,0.000000,0.0,0.577350
481,481,4,0.577350,0.0,0.000000,0.00000,0.000000,0.0,0.577350
526,526,4,0.577350,0.0,0.000000,0.00000,0.000000,0.0,0.577350


In [18]:
jc.query("group_id == 1139 and toxic != 0")

Unnamed: 0,id,text,toxic,severe_toxic,obscene,threat,insult,identity_hate,layer,toxic_flag,group_id
152180,890db12df368a8a0,"Welcome Faggot!\n\nWelcome!\n\nHello, , and welcome to Wikipedia! Thank you for your contributions. I hope you like the place and decide to stay. Here are a few good links for newcomers:\nThe five pillars of Wikipedia\nHow to edit a page\nHelp pages\nTutorial\nHow to write a great article\nManual of Style\nI hope you enjoy editing here and being a Wikipedian! Please sign your name on talk pages using four tildes (~~~~); this will automatically produce your name and the date. If you need help, check out ...",1,0,1,0,1,0,train,3,1139


In [19]:
jc.to_csv("../processed/jc.csv", index=False)

In [20]:
jc.head()

Unnamed: 0,id,text,toxic,severe_toxic,obscene,threat,insult,identity_hate,layer,toxic_flag,group_id
0,0000997932d777bf,"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",0,0,0,0,0,0,train,0,1150
1,000103f0d9cfb60f,"D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC)",0,0,0,0,0,0,train,0,1151
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",0,0,0,0,0,0,train,0,1152
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of """"types of accidents"""" -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\n\nThere appears to be a backlog on articles for review so I guess there may be ...",0,0,0,0,0,0,train,0,1153
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember what page that's on?",0,0,0,0,0,0,train,0,1154
