In [1]:
import os
import gc
import numpy as np
import pandas as pd
from tqdm import tqdm
from typing import Dict, List, Tuple, NamedTuple, Callable
import scml
import mylib

In [2]:
# Percentile cut-offs; presumably meant for DataFrame.describe(percentiles=...)
# further down - TODO confirm where this is consumed.
percentiles = [.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]

# Presumably silences the Hugging Face tokenizers parallelism/fork warning -
# TODO confirm which dependency reads this variable.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# pandas options are spelled with their fully-qualified names. Abbreviated
# names are resolved by pattern matching and can become ambiguous (and raise)
# once pandas adds another option with a similar name.
# NOTE(review): "mode.use_inf_as_na" is deprecated in recent pandas releases;
# confirm it is still accepted by the pinned pandas version.
pd.set_option("mode.use_inf_as_na", True)
pd.set_option("display.max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option("display.max_colwidth", 9999)

# Registers the progress_apply methods on pandas objects.
tqdm.pandas()

In [3]:
%%time
# Read the pairwise validation annotations and print a schema/memory summary.
df = pd.read_csv(
    "input/validation_data.csv",
    engine="c",
    low_memory=False,
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30108 entries, 0 to 30107
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   worker      30108 non-null  int64 
 1   less_toxic  30108 non-null  object
 2   more_toxic  30108 non-null  object
dtypes: int64(1), object(2)
memory usage: 705.8+ KB
Wall time: 225 ms


In [4]:
# Collapse the pairwise frame into one row per distinct comment text.
# Stacking both columns and calling drop_duplicates() keeps each text at its
# first appearance, so the resulting row order (and therefore the index) is
# identical on every run. Iterating a set of strings, by contrast, yields an
# order that depends on Python's per-process hash seed.
texts = (
    pd.concat([df["less_toxic"], df["more_toxic"]], ignore_index=True)
    .drop_duplicates()
    .tolist()
)
# NOTE: `df` is rebound from the pairwise frame to the single-column text frame.
df = pd.DataFrame(data={"text": texts})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14251 entries, 0 to 14250
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    14251 non-null  object
dtypes: object(1)
memory usage: 111.5+ KB


# Preprocess Text
Note: preprocessing throughput dropped from 1200 it/s to 1000 it/s.

In [5]:
def pre1(row) -> str:
    """Return ``mylib.pre1`` applied to the ``"text"`` field of ``row``.

    ``row`` is anything indexable by column name (this notebook passes
    DataFrame rows via ``progress_apply(..., axis=1)``).
    """
    raw_text = row["text"]
    return mylib.pre1(raw_text)


def pre2(row) -> str:
    """Return ``mylib.pre2`` applied to the ``"text1"`` field of ``row``.

    ``"text1"`` is the column this notebook fills from ``pre1``.
    """
    stage1_text = row["text1"]
    return mylib.pre2(stage1_text)


def pre3(row) -> str:
    """Return ``mylib.pre3`` applied to the ``"text2"`` field of ``row``.

    ``"text2"`` is the column this notebook fills from ``pre2``.
    """
    stage2_text = row["text2"]
    return mylib.pre3(stage2_text)


# Run the three preprocessing stages in order; each stage reads the column the
# previous one wrote (text -> text1 -> text2 -> text3).
#
# Each mylib function takes a single string, so it is applied to the source
# column directly (Series.progress_apply) rather than row-wise over the whole
# frame: this produces the same values without building a row object for every
# record just to read one field from it.
stages = [
    ("text", "text1", mylib.pre1),
    ("text1", "text2", mylib.pre2),
    ("text2", "text3", mylib.pre3),
]
for src, col, stage_fn in stages:
    print(col)  # label the progress bar that follows
    df[col] = df[src].progress_apply(stage_fn)

text1


100%|████████████████████████████████████████| 14251/14251 [00:50<00:00, 283.77it/s]


text2


100%|█████████████████████████████████████████| 14251/14251 [14:01<00:00, 16.94it/s]


text3


100%|█████████████████████████████████████████| 14251/14251 [03:15<00:00, 72.81it/s]


In [6]:
# Spot-check ten random rows across the raw text and every preprocessing stage.
# random_state pins the draw so the displayed rows are the same on every run;
# sample(10) already returns exactly ten rows, so no trailing head() is needed.
cols = ["text", "text1", "text2", "text3"]
df[cols].sample(10, random_state=0)

Unnamed: 0,text,text1,text2,text3
8746,Jeff G. ツ likes to rape goats while illegal aliens choke him and shove broken glass in his ass.,Jeff G. likes to rape goats while illegal aliens choke him and shove broken glass in his ass.,Jeff G. likes to rape goats while illegal aliens choke him and shove broken glass in his ass.,jeff g like to rape goat while illegal alien choke he and shove broken glass in his ass
5149,"""\n\n Your edit to C. \n\n Please stop. If you continue to vandalize Wikipedia, you will be blocked from editing. · Talk """,""" Your edit to C. Please stop. If you continue to vandalize Wikipedia, you will be blocked from editing. Talk """,""" Your edit to C. Please stop. If you continue to vandalize Wikipedia, you will be blocked from editing. Talk """,your edit to c please stop if you continue to vandalize wikipedia you will be block from edit talk
10696,"I wish it were that simple Sidd. I tried appealing to wafulz who seemed to not understand or care about what was going on. I think for the most part he is the overseer of the page. I am sorry if I barked at you. I have been blindsided by ignorant people here from the beginning. I am interested in the 'truth' of this as you seem to be. You are probably right about getting someones attention about Technocrate. He did these same exact reverts when I first arrived then disappeared for a while. He has never creatively done any thing.\n Would you mind doing it, blowing the whistle, or bringing it to the attention of wafulz, or better yet a different editor admin. I would appreciate that. He does not seem disposed toward me. I have tried many times to engage him.","I wish it were that simple Sidd. I tried appealing to wafulz who seemed to not understand or care about what was going on. I think for the most part he is the overseer of the page. I am sorry if I barked at you. I have been blindsided by ignorant people here from the beginning. I am interested in the 'truth' of this as you seem to be. You are probably right about getting someones attention about Technocrate. He did these same exact reverts when I first arrived then disappeared for a while. He has never creatively done any thing. Would you mind doing it, blowing the whistle, or bringing it to the attention of wafulz, or better yet a different editor admin. I would appreciate that. He does not seem disposed toward me. I have tried many times to engage him.","I wish it were that simple Sidd. I tried appealing to wafulz who seemed to not understand or care about what was going on. I think for the most part he is the overseer of the page. I am sorry if I barked at you. I have been blindsided by ignorant people here from the beginning. I am interested in the 'truth' of this as you seem to be. You are probably right about getting someones attention about Technocrate. 
He did these same exact reverts when I first arrived then disappeared for a while. He has never creatively done any thing. Would you mind doing it, blowing the whistle, or bring it to the attention of wafulz, or better yet a different editor admin. I would appreciate that. He does not seem disposed toward me. I have tried many times to engage him.",i wish it be that simple sidd i try appeal to wafulz who seem to not understand or care about what be go on i think for the most part he be the overseer of the page i be sorry if i bark at you i have be blindside by ignorant people here from the beginning i be interested in the truth of this as you seem to be you be probably right about get someone attention about technocrate he do these same exact revert when i first arrive then disappear for a while he have never creatively do any thing would you mind do it blow the whistle or bring it to the attention of wafulz or well yet a different editor admin i would appreciate that he do not seem dispose toward i i have try many time to engage he
6412,"I am very surprised and have to say that I think such action is both unnecessary and is being ridiculous. Having just read the complaint, I think it's very extreme to start calling me a meatpuppet. This is totally contrary to your other welcome messages earlier. I think you have some serious issues and trying to reignite and fan the flames isn't going to help. I can keep LK under control to get this article finished, but you do seem to have a very worrying issue about one small matter that he wanted included. Considering the amount of trouble that I have now gone to, to help settle this article, it would appear that you are now just absolutely intent on trying to score personal points against him. That does not give anyone a good opinion as to your real intentions here. Is there another agenda here? It seems that despite Alf's comments and other offers to resolve the actual problem, you seem intent on going behind our backs anyway to stir up a full scale war. Very disapponted.","I am very surprised and have to say that I think such action is both unnecessary and is being ridiculous. Having just read the complaint, I think it's very extreme to start calling me a meatpuppet. This is totally contrary to your other welcome messages earlier. I think you have some serious issues and trying to reignite and fan the flames isn't going to help. I can keep LK under control to get this article finished, but you do seem to have a very worrying issue about one small matter that he wanted included. Considering the amount of trouble that I have now gone to, to help settle this article, it would appear that you are now just absolutely intent on trying to score personal points against him. That does not give anyone a good opinion as to your real intentions here. Is there another agenda here? It seems that despite Alf's comments and other offers to resolve the actual problem, you seem intent on going behind our backs anyway to stir up a full scale war. 
Very disapponted.","I am very surprised and have to say that I think such action is both unnecessary and is being ridiculous. Having just read the complaint, I think it is very extreme to start calling me a meatpuppet. This totally contrary to your other welcome messages earlier. I think you have some serious issues and trying to reignite and fan the flames is not going to help. I can keep LK under control to get this article finished, but you do seem to have a very worrying issue about one small matter that he wanted included. Considering the amount of trouble that I have now gone to, to help settle this article, it would appear that you are now just absolutely intent on trying to score personal points against him. That does not give anyone a good opinion as to your real intentions here. Is there another agenda here? It seems that despite Alf's comments and other offers to resolve the actual problem, you seem intent on going behind our backs anyway to stir up a full scale war. Very disapponted.",i be very surprised and have to say that i think such action be both unnecessary and be be ridiculous having just read the complaint i think it be very extreme to start call i a meatpuppet this totally contrary to your other welcome message early i think you have some serious issue and try to reignite and fan the flame be not go to help i can keep lk under control to get this article finish but you do seem to have a very worrying issue about one small matter that he wanted include consider the amount of trouble that i have now go to to help settle this article it would appear that you be now just absolutely intent on try to score personal point against he that do not give anyone a good opinion as to your real intention here be there another agenda here it seem that despite alf s comment and other offer to resolve the actual problem you seem intent on go behind our back anyway to stir up a full scale war very disapponted
875,"(or, of course, you could try and recreate them from scratch with good sourcing, but that would be silly when userification is available)","(or, of course, you could try and recreate them from scratch with good sourcing, but that would be silly when userification is available)","(or, of course, you could try and recreate them from scratch with good sourcing, but that would be silly when userification is available)",or of course you could try and recreate they from scratch with good sourcing but that would be silly when userification be available
4432,"Urgent complain - Wikipedia \n\nAnswer given by you shows that you and whoever associated with WikipediaBold text don't care for the authenticity of the content displayed to the public. It's unfair and very disgusting that you folks are not taking this article or others linked to it directly or indirectly false. Give me your instant contact so that I could in detail talk to you. Because as memeber of your site, it's a right to ask you what the hell is going on.","Urgent complain - Wikipedia Answer given by you shows that you and whoever associated with WikipediaBold text don't care for the authenticity of the content displayed to the public. It's unfair and very disgusting that you folks are not taking this article or others linked to it directly or indirectly false. Give me your instant contact so that I could in detail talk to you. Because as memeber of your site, it's a right to ask you what the hell is going on.","Urgent complain - Wikipedia Answer given by you shows that you and whoever associated with WikipediaBold text do not care for the authenticity of the content displayed to the public. it is unfair and very disgusting that you folks are not taking this article or others linked to it directly or indirectly false. Give me your instant contact so that I could in detail talk to you. Because as memeber of your site, it is a right to ask you what the hell is going on.",urgent complain wikipedia answer give by you show that you and whoever associate with wikipediabold text do not care for the authenticity of the content display to the public it be unfair and very disgusting that you folk be not take this article or other link to it directly or indirectly false give i your instant contact so that i could in detail talk to you because as memeber of your site it be a right to ask you what the hell be go on
5422,"What the hell is this, I was asleep at 01:06 AM. Any chance I can contest this on the basis it looks like my account was hacked?","What the hell is this, I was asleep at 01:06 AM. Any chance I can contest this on the basis it looks like my account was hacked?","What the hell is this, I was asleep at 01:06 AM. Any chance I can contest this on the basis it looks like my account was hacked?",what the hell be this i be asleep at 01 06 am any chance i can contest this on the basis it look like my account be hack
10969,"Don't hide the incriminating photographs you Nazi Americans!!!!!! \n\nIf the photos are graphic its because the horrible deeds committed by the deranges,sadistic perverted soldiers are also equally disturing.I think the Yanks should stop trying to cover up these issues.","Don't hide the incriminating photographs you Nazi Americans!!!!!! If the photos are graphic its because the horrible deeds committed by the deranges,sadistic perverted soldiers are also equally disturing.I think the Yanks should stop trying to cover up these issues.","do not hide the incriminating photographs you Nazi Americans!!! If the photos are graphic its because the horrible deeds committed by the deranges,sadistic perverted soldiers are also equally disturing.I think the Yanks should stop trying to cover up these issues.",do not hide the incriminate photograph you nazi americans if the photo be graphic its because the horrible deed commit by the derange sadistic perverted soldier be also equally disture i think the yanks should stop try to cover up these issue
12307,"Where on earth was there offensive content directed towards or a personal attack on IrishGuy?? I appreciate you don't like my shenanigans, but there was nothing rude about them.","Where on earth was there offensive content directed towards or a personal attack on IrishGuy?? I appreciate you don't like my shenanigans, but there was nothing rude about them.","Where on earth was there offensive content directed towards or a personal attack on IrishGuy?? I appreciate you do not like my shenanigans, but there was nothing rude about them.",where on earth be there offensive content direct towards or a personal attack on irishguy i appreciate you do not like my shenanigan but there be nothing rude about they
13513,"""\nThat book is already cited for something else anyway. And really, do we have the authority to say to Jan Smaczny """"Tsk, tsk, tsk, you're so POVed?"""" Who the heck are we? Our best representative is someone who supposedly plays the piano yet knows next to nothing about music. Jan Smaczny, on the other hand, is a musician and a musicologist who has extensively studied Slavic music. If someone has earned the right to a POV on topics musical, it's him. """,""" That book is already cited for something else anyway. And really, do we have the authority to say to Jan Smaczny """"Tsk, tsk, tsk, you're so POVed?"""" Who the heck are we? Our best representative is someone who supposedly plays the piano yet knows next to nothing about music. Jan Smaczny, on the other hand, is a musician and a musicologist who has extensively studied Slavic music. If someone has earned the right to a POV on topics musical, it's him. """,""" That book is already cited for something else anyway. And really, do we have the authority to say to Jan Smaczny """"[Tsk; to utter a clicking sound; expression of disapproval], you are so POVed?"""" Who the heck are we? Our best representative is someone who supposedly plays the piano yet knows next to nothing about music. Jan Smaczny, on the other hand, is a musician and a musicologist who has extensively studied Slavic music. If someone has earned the right to a [POV; point of view] on topics musical, it is him. """,that book be already cite for something else anyway and really do we have the authority to say to jan smaczny tsk to utter a click sound expression of disapproval you be so poved who the heck be we our good representative be someone who supposedly play the piano yet know next to nothing about music jan smaczny on the other hand be a musician and a musicologist who have extensively study slavic music if someone have earn the right to a pov point of view on topic musical it be he


In [7]:
# Another spot-check of ten random rows; the fixed seed keeps the displayed
# rows reproducible, and sample(10) needs no trailing head().
df[cols].sample(10, random_state=1)

Unnamed: 0,text,text1,text2,text3
1316,poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. 
poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. \n\npoop. \npoop. \npoop.,poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. 
poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop. poop.,[poop; faeces]. [poop; faeces].,poop faece poop faece
9766,"What do you mean AndreaFox2?! Tito wasn't important ?! How could you say such a thing?? Do you know how many important people came to his funeral! Do you know that all of yugoslavia cried when tito croaked ? Hahahaha. It's great that you've noticed that no, in fact, tito is nobody and nothing but a DEMocidal totalitarian communist dictator. To top it off he was a locksmith by trade probably learned to read shortly after the war. He was a a puppet in regards to geopolitical relations between the east and west during the cold war years - his legacy and his psychopathic bloodthirsty trail ended with his sick national experiment which will never ever ever ever return ever. In the end, if people DID care - they would realize that this entire article which DIREKTOR has a WN:OWN on is a disgrace to the western world, freedom and democracy.","What do you mean AndreaFox2?! Tito wasn't important ?! How could you say such a thing?? Do you know how many important people came to his funeral! Do you know that all of yugoslavia cried when tito croaked ? Hahahaha. It's great that you've noticed that no, in fact, tito is nobody and nothing but a DEMocidal totalitarian communist dictator. To top it off he was a locksmith by trade probably learned to read shortly after the war. He was a a puppet in regards to geopolitical relations between the east and west during the cold war years - his legacy and his psychopathic bloodthirsty trail ended with his sick national experiment which will never ever ever ever return ever. In the end, if people DID care - they would realize that this entire article which DIREKTOR has a WN:OWN on is a disgrace to the western world, freedom and democracy.","What do you mean AndreaFox2?! Tito was not important ?! How could you say such a thing?? Do you know how many important people came to his funeral! Do you know that all of yugoslavia cried when tito croaked ? Haha. 
it is great that you have noticed that no, in fact, tito is nobody and nothing but a DEMocidal totalitarian communist dictator. Top it off he was a locksmith by trade probably learned to read shortly after the war. He was a a puppet in regards to geopolitical relations between the east and west during the cold war years - his legacy and his psychopathic bloodthirsty trail ended with his sick national experiment which will never return ever. In the end, if people DID care - they would realize that this entire article which DIREKTOR has a WN:OWN on is a disgrace to the western world, freedom and democracy.",what do you mean andreafox2 tito be not important how could you say such a thing do you know how many important people come to his funeral do you know that all of yugoslavia cry when tito croak haha it be great that you have notice that no in fact tito be nobody and nothing but a democidal totalitarian communist dictator top it off he be a locksmith by trade probably learn to read shortly after the war he be a a puppet in regard to geopolitical relation between the east and west during the cold war year his legacy and his psychopathic bloodthirsty trail end with his sick national experiment which will never return ever in the end if people do care they would realize that this entire article which direktor have a wn own on be a disgrace to the western world freedom and democracy
5577,"Hit 3 times for the same crime \n\nso I get 2 warnings and a block for the second time I called out a VANDAL. \n\nIf the idiots weren't constantly vandalizing articles, removing factual information on a whim, none of this would be taking place. As I said elsewhere, wikipedia has gotten a very bad reputation so just such whimsical reverting as the two idiots were constantly doing. \n\nAnd going by the things I've read about all of you, no doubt you'll now block me from posting even on my own page. \n\nEither re-instate me or block the two idiots as well. No matter what, I EXPECT someone to have a talk with those two about vandalizing other people's factual information.\n\n99.139.224.87","Hit 3 times for the same crime so I get 2 warnings and a block for the second time I called out a VANDAL. If the idiots weren't constantly vandalizing articles, removing factual information on a whim, none of this would be taking place. As I said elsewhere, wikipedia has gotten a very bad reputation so just such whimsical reverting as the two idiots were constantly doing. And going by the things I've read about all of you, no doubt you'll now block me from posting even on my own page. Either re-instate me or block the two idiots as well. No matter what, I E [Tongue sticking out, cheeky, playful or blowing a raspberry] ECT someone to have a talk with those two about vandalizing other people's factual information.","Hit 3 times for the same crime so I get 2 warnings and a block for the second time I called out a VANDAL. If the idiots were not constantly vandalizing articles, removing factual information a whim, none of this would be taking place. As I said elsewhere, wikipedia has gotten a very bad reputation so just such whimsical reverting as the two idiots were constantly doing. And going by the things I have read about all of you, no doubt you will now block me from posting even on my own page. Either re-instate me or block the two idiots as well. 
No matter what, I E [Tongue sticking out, cheeky, playful or blowing a raspberry] ECT someone to have a talk with those two about vandalizing other people's factual information.",hit 3 time for the same crime so i get 2 warning and a block for the second time i call out a vandal if the idiot be not constantly vandalize article remove factual information a whim none of this would be take place as i say elsewhere wikipedia have get a very bad reputation so just such whimsical reverting as the two idiot be constantly do and go by the thing i have read about all of you no doubt you will now block i from post even on my own page either re instate i or block the two idiot as well no matter what i e tongue stick out cheeky playful or blow a raspberry ect someone to have a talk with those two about vandalize other people s factual information
8283,"You're a smug, disruptive asshole. I can't believe you're monitoring my talk and userpage so closely as to be replying 10 minutes after I do.\n","You're a smug, disruptive asshole. I can't believe you're monitoring my talk and userpage so closely as to be replying 10 minutes after I do.","you are a smug, disruptive asshole. I cannot believe you are monitoring my talk and userpage so closely as to be replying 10 minutes after I do.",you be a smug disruptive asshole i can not believe you be monitor my talk and userpage so closely as to be reply 10 minute after i do
739,I am wrecking your Wikipedia. Do something.,I am wrecking your Wikipedia. Do something.,I am wrecking your Wikipedia. Do something.,i be wreck your wikipedia do something
10250,AIDS \n\nLess AIDS would be spread if you were to stop inserting your minuscule penis into little boys.,AIDS Less AIDS would be spread if you were to stop inserting your minuscule penis into little boys.,AIDS Less AIDS would be spread if you were to stop inserting your minuscule penis into little boys.,aids less aids would be spread if you be to stop insert your minuscule penis into little boy
2,"Image tagging \nLook mate, what IS your problem with where images come from? They make wikipedia a better place, regardless of source (and for the record I think that ALL of my images qualify as fair use). You're actually working against everything that is good about wikipedia, systematically going after people and questioning everything they do. Who told you to do this? Why take the 'law' into your own hands? Ask yourself this: who are you protecting? You wanna play cops and robbers, go and joing the freaking police. Do something useful, rather than wasting your life on here. What is your job? Cos this ain't real life buddy, and if its your only hobby, I feel bad for you. I mean really bad. I thinks it's quite sad that your only reason for using wikipedia is to actively reverse what other people do, rather than to actively create content. I will review all the images you 'helpfully' tagged for me, but I'm getting real tired of you ducking me man. Peace out.","Image tagging Look mate, what IS your problem with where images come from? They make wikipedia a better place, regardless of source (and for the record I think that ALL of my images qualify as fair use). You're actually working against everything that is good about wikipedia, systematically going after people and questioning everything they do. Who told you to do this? Why take the 'law' into your own hands? Ask yourself this: who are you protecting? You wanna play cops and robbers, go and joing the freaking police. Do something useful, rather than wasting your life on here. What is your job? Cos this ain't real life buddy, and if its your only hobby, I feel bad for you. I mean really bad. I thinks it's quite sad that your only reason for using wikipedia is to actively reverse what other people do, rather than to actively create content. I will review all the images you 'helpfully' tagged for me, but I'm getting real tired of you ducking me man. 
Peace out.","Image tagging Look mate, what IS your problem with where images come from? They make wikipedia a better place, regardless of source (and for the record I think that ALL of my images qualify as fair use). you are actually working against everything that is good about wikipedia, systematically going after people and questioning everything they do. Who told you to do this? Why take the 'law' into your own hands? Ask yourself this: who are you protecting? You want to play cops and robbers, go and joing the freaking police. Do something useful, rather than wasting your life on here. What is your job? Cos this are not real life buddy, and if its your only hobby, I feel bad for you. I mean really bad. I thinks it is quite sad that your only reason for using wikipedia is to actively reverse what other people do, rather than to actively create content. I will review all the images you 'helpfully' tagged for me, but I am getting real tired of you ducking me man. Peace out.",image tagging look mate what be your problem with where image come from they make wikipedia a well place regardless of source and for the record i think that all of my image qualify as fair use you be actually work against everything that be good about wikipedia systematically go after people and question everything they do who tell you to do this why take the law into your own hand ask yourself this who be you protect you want to play cop and robber go and joe the freaking police do something useful rather than waste your life on here what be your job cos this be not real life buddy and if its your only hobby i feel bad for you i mean really bad i think it be quite sad that your only reason for use wikipedia be to actively reverse what other people do rather than to actively create content i will review all the image you helpfully tag for i but i be get real tired of you duck i man peace out
12772,Stop deleting content from Wikipedia. You are removing valuable information and it will not be tolerated,Stop deleting content from Wikipedia. You are removing valuable information and it will not be tolerated,Stop deleting content from Wikipedia. You are removing valuable information and it will not be tolerated,stop delete content from wikipedia you be remove valuable information and it will not be tolerate
4211,"Replies to Zebedee\n\nZebedee, I WAS NOT USING THE SAME GOD-DAMN COMPUTER AS THE PERSON WHO DID IT!!!!!!!!!!'Bold text'\n\nI'm sorry if that seemed aggressive, but I hate it when people accuse you of something you haven't done.\n\nWhere is the evidence to suggest that the same computer was used??","Replies to Zebedee Zebedee, I WAS NOT USING THE SAME GOD-DAMN COMPUTER AS THE PERSON WHO DID IT!!!!!!!!!!'Bold text' I'm sorry if that seemed aggressive, but I hate it when people accuse you of something you haven't done. Where is the evidence to suggest that the same computer was used??","Replies to Zebedee, I WAS NOT USING THE SAME GOD-DAMN COMPUTER AS THE PERSON WHO DID IT!!!'Bold text' I am sorry if that seemed aggressive, but I hate it when people accuse you of something you have not done. Where is the evidence to suggest that the same computer was used??",reply to zebedee i be not use the same god damn computer as the person who do it bold text i be sorry if that seem aggressive but i hate it when people accuse you of something you have not do where be the evidence to suggest that the same computer be use
11804,"""\n\n Origins \n\nI think it should be pointed out in the article that Klezmer is """"influenced"""" and heavily borrowed from Balkan and other music of that region of our planet. \n\nIf you were to watch an older Ukrainian or Moldovan or Bulgarian movie, you'd think the background music is Klezmer... Incidentally, like with so many other things Jewish - The Israeli national anthem isn't """"Jewish"""" at all, but an old Italian song, if I'm not mistaken, with Hebrew lyrics. Btw, Hebrew is an artificially revived language. Up until last century it was like Latin - a dead language, used only in religion (which needs to die too). And then it was artificially resuscitated by Zionists. Which is funny, because the founders of teh modern state of Israel were atheists, it's a shame what they have done, giving a bad name to all atheists. """,""" Origins I think it should be pointed out in the article that Klezmer is """"influenced"""" and heavily borrowed from Balkan and other music of that region of our planet. If you were to watch an older Ukrainian or Moldovan or Bulgarian movie, you'd think the background music is Klezmer... Incidentally, like with so many other things Jewish - The Israeli national anthem isn't """"Jewish"""" at all, but an old Italian song, if I'm not mistaken, with Hebrew lyrics. Btw, Hebrew is an artificially revived language. Up until last century it was like Latin - a dead language, used only in religion (which needs to die too). And then it was artificially resuscitated by Zionists. Which is funny, because the founders of teh modern state of Israel were atheists, it's a shame what they have done, giving a bad name to all atheists. """,""" Origins I think it should be pointed out in the article that Klezmer is """"influenced"""" and heavily borrowed from Balkan and other music of that region of our planet. If you were to watch an older Ukrainian or Moldovan or Bulgarian movie, you would think the background music is Klezmer... 
Incidentally, like with so many other things Jewish - The Israeli national anthem is not """"Jewish"""" at all, but an old Italian song, if I am not mistaken, with Hebrew lyrics. [Btw; by the way], Hebrew is an artificially revived language. Up until last century it was like Latin - a dead language, used only in religion (which needs to die too). And then it was artificially resuscitated by Zionists. Which is funny, because the founders of teh modern state of Israel were atheists, it is a shame what they have done, giving a bad name to all atheists. """,origin i think it should be point out in the article that klezmer be influence and heavily borrow from balkan and other music of that region of our planet if you be to watch an old ukrainian or moldovan or bulgarian movie you would think the background music be klezmer incidentally like with so many other thing jewish the israeli national anthem be not jewish at all but an old italian song if i be not mistaken with hebrew lyric btw by the way hebrew be an artificially revive language up until last century it be like latin a dead language use only in religion which need to die too and then it be artificially resuscitate by zionists which be funny because the founder of teh modern state of israel be atheist it be a shame what they have do give a bad name to all atheist


In [8]:
# Spot-check 10 random rows across the raw and preprocessed text columns.
# A fixed seed keeps the displayed rows stable across Restart & Run All;
# the previous trailing .head(10) was a no-op on a 10-row sample.
df[cols].sample(10, random_state=42)

Unnamed: 0,text,text1,text2,text3
11886,"Cathel, one thing you ignore and One Night In Hackney has never been able to explain is why Bobby Sands and other Irish Republicans are the only people with pages on Wikipedia where this rule applies to. They are the only pages where factual informaiton is being left out. Cathel, you cannot deny that One Night In Hacknkey is targeting Irish Republican Catholics. You don't see him removing Jewish categories from any other pages, and there are many people in Jewish categories who are not notable for being Jewish. So then why does this rule only apply to Irish Republican Catholics? And of course, the church will condemn any form of violence. Mainstream Muslims leaders also condemn terrorism, does not mean Islamic terrorists should not be in Muslim categories? By your logic, yes. So if I start removing terrorists from Muslim categories, I can count on you to back me up when a person readds the categories?","Cathel, one thing you ignore and One Night In Hackney has never been able to explain is why Bobby Sands and other Irish Republicans are the only people with pages on Wikipedia where this rule applies to. They are the only pages where factual informaiton is being left out. Cathel, you cannot deny that One Night In Hacknkey is targeting Irish Republican Catholics. You don't see him removing Jewish categories from any other pages, and there are many people in Jewish categories who are not notable for being Jewish. So then why does this rule only apply to Irish Republican Catholics? And of course, the church will condemn any form of violence. Mainstream Muslims leaders also condemn terrorism, does not mean Islamic terrorists should not be in Muslim categories? By your logic, yes. 
So if I start removing terrorists from Muslim categories, I can count on you to back me up when a person readds the categories?","Cathel, one thing you ignore and One Night In Hackney has never been able to explain is why Bobby Sands and other Irish Republicans are the only people with pages on Wikipedia where this rule applies to. They are the only pages where factual informaiton is being left out. Cathel, you cannot deny that One Night In Hacknkey is targeting Irish Republican Catholics. You do not see him removing Jewish categories from any other pages, and there are many people in Jewish categories who are notable for being Jewish. So then why does this rule only apply to Irish Republican Catholics? And of course, the church will condemn any form of violence. Mainstream Muslims leaders also condemn terrorism, does not mean Islamic terrorists should not be in Muslim categories? By your logic, yes. So if I start removing terrorists from Muslim categories, I can count on you to back me up when a person readds the categories?",cathel one thing you ignore and one night in hackney have never be able to explain be why bobby sands and other irish republicans be the only people with page on wikipedia where this rule apply to they be the only page where factual informaiton be be leave out cathel you can not deny that one night in hacknkey be target irish republican catholics you do not see he remove jewish category from any other page and there be many people in jewish category who be notable for be jewish so then why do this rule only apply to irish republican catholics and of course the church will condemn any form of violence mainstream muslims leader also condemn terrorism do not mean islamic terrorist should not be in muslim category by your logic yes so if i start remove terrorist from muslim category i can count on you to back i up when a person readd the category
12459,Making Everbody's Life a Misery \n\nI see that I am not the first person that you have deliberately targetted in a smear campaign. You are a vandal and a disgrace to the good name of Wikpedia.\n\nShame on you.,Making Everbody's Life a Misery I see that I am not the first person that you have deliberately targetted in a smear campaign. You are a vandal and a disgrace to the good name of Wikpedia. Shame on you.,Making Everbody's Life a Misery I see that I am not the first person that you have deliberately targetted in a smear campaign. You are a vandal and a disgrace to the good name of Wikpedia. Shame on you.,make everbody s life a misery i see that i be not the first person that you have deliberately targette in a smear campaign you be a vandal and a disgrace to the good name of wikpedia shame on you
8832,"Speaking without knowing what I'm talking about? OK, how 'bout this: the original comment I made about Everlast's entry constantly being vandalized by Eminem fans was written a month ago before your dumb ass got involved with the site. The feud started with the Dilated Peoples track. That's why I believed that the verse Everlast contributed should be in there (and still do). The jabs taken back and forth after that deserve a synopsis only. Your idiotic little comment about album sales was excessive and unnecessary. It belongs in there no more than the fact that everyone who knows both artists agrees that if they ever got into it, Everlast would knock the living shit outta Eminem. And THAT came off an Eminem fan site.\n\n","Speaking without knowing what I'm talking about? OK, how 'bout this: the original comment I made about Everlast's entry constantly being vandalized by Eminem fans was written a month ago before your dumb ass got involved with the site. The feud started with the Dilated Peoples track. That's why I believed that the verse Everlast contributed should be in there (and still do). The jabs taken back and forth after that deserve a synopsis only. Your idiotic little comment about album sales was excessive and unnecessary. It belongs in there no more than the fact that everyone who knows both artists agrees that if they ever got into it, Everlast would knock the living shit outta Eminem. And THAT came off an Eminem fan site.","Speaking without knowing what I am talking about? OK, how 'bout this: the original comment I made about Everlast's entry constantly being vandalized by Eminem fans was written a month ago before your dumb ass got involved with the site. The feud started with the Dilated Peoples track. that is why I believed that the verse Everlast contributed should be in there (and still do). The jabs taken back and forth after that deserve a synopsis only. Your idiotic little comment about album sales was excessive and unnecessary. 
It belongs in there no more than the fact that everyone who knows both artists agrees that if they ever got into it, Everlast would knock the living shit outta Eminem. And THAT came off an Eminem fan site.",speak without know what i be talk about ok how bout this the original comment i make about everlast s entry constantly be vandalize by eminem fan be write a month ago before your dumb ass got involve with the site the feud start with the dilated peoples track that be why i believe that the verse everlast contribute should be in there and still do the jab take back and forth after that deserve a synopsis only your idiotic little comment about album sale be excessive and unnecessary it belong in there no more than the fact that everyone who know both artist agree that if they ever get into it everlast would knock the live shit outta eminem and that come off an eminem fan site
13646,"YOUR DISHONEST CONTRIBUTIONS TO THE DAHN YOGA PAGE HAVE BEEN NOTED. SOMEDAY, WHEN ALL THE WORLD SUFFERS, YOU WILL STILL BE ALIVE, A SURVIVOR OF A NUCLEAR WINTER, ON THE BASIS THAT KOREA IS THE MOST IMPORTANT PLACE IN THE WORLD, THAT KOREAN CIVILIZATION IS THE OLDEST AND MOST SUPERIOR, THAT THE KOREAN RACE IS SUPERIOR(ESPECIALLY OVER THOSE BLACK MEN WHO LISTEN TO RAP MUSIC AND SHOOT PEOPLE, THOSE SAVAGES) AND THAT THE NEXT SAVIOR WAS CHOSEN TO APPEAR IN KOREA. YES, MY FRIEND, KOREA IS THE HOME OF ALL, THE BEST, MOST WONDERFUL. ONLY HERE CAN A CIVILIZATION CREATE HANGUL and SOJU! OH YES, KOREA IS NUMBER ONE AND THE HONORABLE, HUMBLE, CELIBATE, POOR, AND SKINNY ILCHEE LEE SHALL BE THE ONE TO USHER IN A NEW ERA OF MANKIND WITH THE BEST PEOPLE IN THE WORLD- SUCCESSFUL KOREANS FROM KOREA TO TEACH DAHN YOGA, AND THE BRIGHTEST, MOST ATTRACTIVE AND WORTHY NORTH AMERICANS, TOGETHER WHO WILL USHER IN A NEW ERA. FOR NOW YOU MUST LIVE ON THE WAGES OF YOUR DAHN SALARY, BUT LATER. LATER. LATER. YOU WILL RULE THE WORLD.\n","YOUR DISHONEST CONTRIBUTIONS TO THE DAHN YOGA PAGE HAVE BEEN NOTED. SOMEDAY, WHEN ALL THE WORLD SUFFERS, YOU WILL STILL BE ALIVE, A SURVIVOR OF A NUCLEAR WINTER, ON THE BASIS THAT KOREA IS THE MOST IMPORTANT PLACE IN THE WORLD, THAT KOREAN CIVILIZATION IS THE OLDEST AND MOST SUPERIOR, THAT THE KOREAN RACE IS SUPERIOR (ESPECIALLY OVER THOSE BLACK MEN WHO LISTEN TO RAP MUSIC AND SHOOT PEOPLE, THOSE SAVAGES) AND THAT THE NEXT SAVIOR WAS CHOSEN TO APPEAR IN KOREA. YES, MY FRIEND, KOREA IS THE HOME OF ALL, THE BEST, MOST WONDERFUL. ONLY HERE CAN A CIVILIZATION CREATE HANGUL and SOJU! OH YES, KOREA IS NUMBER ONE AND THE HONORABLE, HUMBLE, CELIBATE, POOR, AND SKINNY ILCHEE LEE SHALL BE THE ONE TO USHER IN A NEW ERA OF MANKIND WITH THE BEST PEOPLE IN THE WORLD- SUCCESSFUL KOREANS FROM KOREA TO TEACH DAHN YOGA, AND THE BRIGHTEST, MOST ATTRACTIVE AND WORTHY NORTH AMERICANS, TOGETHER WHO WILL USHER IN A NEW ERA. 
FOR NOW YOU MUST LIVE ON THE WAGES OF YOUR DAHN SALARY, BUT LATER. LATER. LATER. YOU WILL RULE THE WORLD.","YOUR DISHONEST CONTRIBUTIONS TO THE DAHN YOGA PAGE HAVE BEEN NOTED. SOMEDAY, WHEN ALL THE WORLD SUFFERS, YOU WILL STILL BE ALIVE, A SURVIVOR OF A NUCLEAR WINTER, ON THE BASIS THAT KOREA IS THE MOST IMPORTANT PLACE IN THE WORLD, THAT KOREAN CIVILIZATION IS THE OLDEST AND MOST SUPERIOR, THAT THE KOREAN RACE IS SUPERIOR (ESPECIALLY OVER THOSE BLACK MEN WHO LISTEN TO RAP MUSIC AND SHOOT PEOPLE, THOSE SAVAGES) AND THAT THE NEXT SAVIOR WAS CHOSEN TO APPEAR IN KOREA. YES, MY FRIEND, KOREA IS THE HOME OF ALL, THE BEST, MOST WONDERFUL. ONLY HERE CAN A CIVILIZATION CREATE HANGUL and SOJU! OH YES, KOREA IS NUMBER ONE AND THE HONORABLE, HUMBLE, CELIBATE, POOR, AND SKINNY ILCHEE LEE SHALL BE THE ONE TO USHER IN A NEW ERA OF MANKIND WITH THE BEST PEOPLE IN THE WORLD- SUCCESSFUL KOREANS FROM KOREA TO TEACH DAHN YOGA, AND THE BRIGHTEST, MOST ATTRACTIVE AND WORTHY NORTH AMERICANS, TOGETHER WHO WILL USHER IN A NEW ERA. FOR NOW YOU MUST LIVE ON THE WAGES OF YOUR DAHN SALARY, BUT LATER. 
YOU WILL RULE THE WORLD.",your dishonest contribution to the dahn yoga page have be note someday when all the world suffer you will still be alive a survivor of a nuclear winter on the basis that korea be the most important place in the world that korean civilization be the oldest and most superior that the korean race be superior especially over those black men who listen to rap music and shoot people those savages and that the next savior be choose to appear in korea yes my friend korea be the home of all the good most wonderful only here can a civilization create hangul and soju oh yes korea be number one and the honorable humble celibate poor and skinny ilchee lee shall be the one to usher in a new era of mankind with the best people in the world successful koreans from korea to teach dahn yoga and the brightest most attractive and worthy north americans together who will usher in a new era for now you must live on the wage of your dahn salary but later you will rule the world
3056,"""\nBan one side of an argument by a bullshit nazi admin and you get no discussion because the islamist editors feel they """"won"""".""",""" Ban one side of an argument by a bullshit nazi admin and you get no discussion because the islamist editors feel they """"won"".""""",""" Ban one side of an argument by a bullshit nazi admin and you get no discussion because the islamist editors feel they """"won"".""""",ban one side of an argument by a bullshit nazi admin and you get no discussion because the islamist editor feel they win
11776,"I disagree, it is not missing the point because early translations do not find fault with homosexual acts unless done for the benefit of a competing god. 64.31.174.130 MRM 9/16/2008","I disagree, it is not missing the point because early translations do not find fault with homosexual acts unless done for the benefit of a competing god. MRM 9/16/2008","I disagree, it is not missing the point because early translations do not find fault with homosexual acts unless done for the benefit of a competing god. MRM 9/16/2008",i disagree it be not miss the point because early translation do not find fault with homosexual act unless do for the benefit of a compete god mrm 9 16 2008
5443,"""\n Awaaze read your pov """" 8000 cases of missing persons have been reported since the start of the war on terror from different parts of the country."""" scoop lol your basically lying througjh your teeth about azad kashmir """,""" Awaaze read your pov """" 8000 cases of missing persons have been reported since the start of the war on terror from different parts of the country."""" scoop lol your basically lying througjh your teeth about azad kashmir """,""" Awaaze read your [pov; point of view] """" 8000 cases of missing persons have been reported since the start of the war on terror from different parts of the country."""" scoop [lol; laughing out loud] your basically lying througjh your teeth about azad kashmir """,awaaze read your pov point of view 8000 case of miss person have be report since the start of the war on terror from different part of the country scoop lol laugh out loud your basically lie througjh your tooth about azad kashmir
3529,Problem editor\nI noticed that you blocked (talk),Problem editor I noticed that you blocked (talk),Problem editor I noticed that you blocked (talk),problem editor i notice that you block talk
6215,I'm citing your mother.,I'm citing your mother.,I am citing your mother.,i be cite your mother
4445,"""\n\n Sodomy \n\nHave you thought about creating a """"Wikipedians who enjoy sodomy"""" for yourself and other Wikipedia editors like you?""",""" Sodomy Have you thought about creating a """"Wikipedians who enjoy sodomy"""" for yourself and other Wikipedia editors like you?""",""" Sodomy Have you thought about creating a """"Wikipedians who enjoy sodomy"""" for yourself and other Wikipedia editors like you?""",sodomy have you think about create a wikipedians who enjoy sodomy for yourself and other wikipedia editor like you


# Review data

In [9]:
# Summarise dtypes and non-null counts for the selected text columns.
df.loc[:, cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14251 entries, 0 to 14250
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    14251 non-null  object
 1   text1   14251 non-null  object
 2   text2   14251 non-null  object
 3   text3   14251 non-null  object
dtypes: object(4)
memory usage: 445.5+ KB


In [10]:
%%time
df[cols].to_parquet("output/pre_val.parquet", index=False)

Wall time: 130 ms
