In [1]:
import os
import gc
import numpy as np
import pandas as pd
from tqdm import tqdm
from typing import Dict, List, Tuple, NamedTuple, Callable
import scml
import mylib

In [2]:
# Notebook-wide configuration.

# Quantile cut-points from the 1st to the 99th percentile. Not used in the cells
# nearby -- presumably passed to describe(percentiles=...) further down; verify.
percentiles = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]

# Environment switch read by the Hugging Face `tokenizers` library; presumably
# needed because scml/mylib load a tokenizer -- TODO confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# pandas options are spelled with their fully-qualified names: abbreviated keys
# are resolved by pattern matching against every registered option and can stop
# working (ambiguous match) when a pandas release adds a similarly named option.
# NOTE(review): `mode.use_inf_as_na` is deprecated in recent pandas (2.1+);
# convert inf to NaN explicitly before upgrading.
pd.set_option("mode.use_inf_as_na", True)
# Effectively remove the display limits so wide frames and long comment texts
# are shown in full.
pd.set_option("display.max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option("display.max_colwidth", 9999)

# Registers the `progress_apply` methods on pandas objects (used by the
# preprocessing cell).
tqdm.pandas()

In [3]:
%%time
df = pd.read_csv("input/validation_data.csv", engine="c", low_memory=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30108 entries, 0 to 30107
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   worker      30108 non-null  int64 
 1   less_toxic  30108 non-null  object
 2   more_toxic  30108 non-null  object
dtypes: int64(1), object(2)
memory usage: 705.8+ KB
CPU times: user 156 ms, sys: 15.6 ms, total: 172 ms
Wall time: 202 ms


In [4]:
# Collect every distinct comment that appears on either side of a pair.
# dict.fromkeys de-duplicates by hash/equality exactly like a set, but keeps
# first-appearance order. Iterating a set of str yields an order that changes
# between kernel sessions (string hash randomization), which made the
# row-index -> text mapping of the frame below irreproducible.
texts = list(dict.fromkeys([*df["less_toxic"], *df["more_toxic"]]))

# NOTE: `df` is rebound here -- from this cell on it holds one row per unique
# text (column "text"), no longer the raw worker/less_toxic/more_toxic pairs.
df = pd.DataFrame(data={"text": texts})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14251 entries, 0 to 14250
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    14251 non-null  object
dtypes: object(1)
memory usage: 111.5+ KB


# Preprocess Text
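Note: preprocessing throughput dropped from about 1200 to 1000 it/s (rows per second, as reported by the tqdm progress bar). The rates recorded in the output below (about 381, 52 and 95 it/s for the three stages) are lower still, so this figure may be stale.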

In [5]:
def pre1(row) -> str:
    """Row adapter for the first preprocessing stage.

    Reads the ``"text"`` field of ``row`` and returns whatever
    ``mylib.pre1`` produces for it. Intended for row-wise
    ``DataFrame.apply(..., axis=1)``.
    """
    raw_text = row["text"]
    return mylib.pre1(raw_text)


def pre2(row) -> str:
    """Row adapter for the second preprocessing stage.

    Reads the ``"text1"`` field of ``row`` (the column written by the
    first stage) and returns whatever ``mylib.pre2`` produces for it.
    """
    stage1_text = row["text1"]
    return mylib.pre2(stage1_text)


def pre3(row) -> str:
    """Row adapter for the third preprocessing stage.

    Reads the ``"text2"`` field of ``row`` (the column written by the
    second stage) and returns whatever ``mylib.pre3`` produces for it.
    """
    stage2_text = row["text2"]
    return mylib.pre3(stage2_text)


# Run the three preprocessing stages in order. Each stage reads the column
# written by the previous one (text -> text1 -> text2 -> text3), so the order
# of this table matters.
stages = (
    ("text1", pre1),
    ("text2", pre2),
    ("text3", pre3),
)
for col, stage_fn in stages:
    print(col)  # label the progress bar that follows
    df[col] = df.progress_apply(stage_fn, axis=1)

text1


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 14251/14251 [00:37<00:00, 381.07it/s]


text2


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 14251/14251 [04:32<00:00, 52.36it/s]


text3


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 14251/14251 [02:30<00:00, 95.00it/s]


In [6]:
# Original text followed by the output of each preprocessing stage.
cols = ["text", "text1", "text2", "text3"]
# Fixed seed so the same rows are shown on every Restart & Run All.
# sample(10) already returns exactly 10 rows, so no trailing head(10) is needed.
df[cols].sample(10, random_state=0)

Unnamed: 0,text,text1,text2,text3
13704,"Incidently, the ignorant only ignores the truth.","Incidently, the ignorant only ignores the truth.","Incidently, the ignorant only ignores the truth.",incidently the ignorant only ignore the truth
6429,"Fantastic! Give me a block and blantantly ignore the behaviours of the others. So, it's controversal to state why i'm so irate, but yet it's within guidelines to:\n Insult my intelligence\n Degrade my character\n and overall tell me to fuck off?\n\nWow. 4:21 15 January 2012 (UTC)","Fantastic! Give me a block and blantantly ignore the behaviours of the others. So, it's controversal to state why i'm so irate, but yet it's within guidelines to: Insult my intelligence Degrade my character and overall tell me to fuck off? Wow. 4:21 15 January 2012 (UTC)","Fantastic! Give me a block and blantly ignore the behaviours of the others. So, it is controversal to state why I am so irate, but yet it is within guidelines to: Insult my intelligence Degrade my character and overall tell me to fuck off? Wow. 4:21 15 January 2012 (UTC)",fantastic give i a block and blantly ignore the behaviour of the other so it be controversal to state why i be so irate but yet it be within guideline to insult my intelligence degrade my character and overall tell i to fuck off wow 4 21 15 january 2012 utc
5922,"Plot Hole\nSTOP MESSING AROUND WITH THE PLOT HOLE PART!!! IT DESERVES TO BE THERE!!! SO STOP ERASING IT!!!Tim, 6 January, 2006","Plot Hole STOP MESSING AROUND WITH THE PLOT HOLE PART!!! IT DESERVES TO BE THERE!!! SO STOP ERASING IT!!!Tim, 6 January, 2006","Plot Hole STOP MESSING AROUND WITH THE PLOT HOLE PART!!! IT DESERVES TO BE THERE!!! SO STOP ERASING IT!!!Tim, 6 January, 2006",plot hole stop mess around with the plot hole part it deserve to be there so stop erase it tim 6 january 2006
5543,You must be logged in to upload files. \n\nSo In other words I have to sign up just to add 3 things to 1 entry.\n\nTo hell with that!!!!! This place can ROT for all I care!!!\n\n75.8.35.177,You must be logged in to upload files. So In other words I have to sign up just to add 3 things to 1 entry. To hell with that!!!!! This place can ROT for all I care!!!,You must be logged in to upload files. So In other words I have to sign up just to add 3 things to 1 entry. To hell with that!!!! This place can ROT for all I care!!!,you must be log in to upload file so in other word i have to sign up just to add 3 thing to 1 entry to hell with that this place can rot for all i care
8339,"Can you explain why you are needed? Excuse me, but removing thigns I wrote on my page would be the same as me removing things from YOUR user page. timecop","Can you explain why you are needed? Excuse me, but removing thigns I wrote on my page would be the same as me removing things from YOUR user page. timecop","Can you explain why you are needed? Excuse me, but removing thigns I wrote on my page would be the same as me removing things from YOUR user page. timecop",can you explain why you be need excuse i but remove thign i write on my page would be the same as i remove thing from your user page timecop
13584,"the truth \n\nPLease explain to me, sentence by sentence, why you have edited out the following addition:\n\nThe main impact zone of the crash was approximately 19 metres (57 feet) in width. No wreckage was found from the airliner within this impact zone or inside the building. The lawn immediately in front of the crash site was unmarked. Within the main impact zone was a hole that the object punched in the building, approximately 9 metres (27 feet) in width. The Pentagon is composed of five concentric rings. Only the outer three rings were penetrated. The upper floors of the first ring collapsed, but the structural integrity of the second and third rings remained. At the end of the third ring was a circular 'punch out' hole, again 9 metres (27 feet) in diameter. The wingspan of a Boeing 757 is 38 metres (114 feet). The vast majority of the mass of the aircraft remains unnaccounted for. For flight 77 to have caused the damage observed at the Pentagon, the wings would have had to have been broken off, pushed into the fuselage, after which the fueselage and wings would have had to have been compressed sufficiently to fit through the 9 metre hole. No physical mechanism by which this process could have occurred is known.\n\nFlight 77 did not hit the pentagon. Its so obvious as to be comical. The capacity of human beings to believe what is demonstrably complete bullshit because they are afraid of going against the group consensus never ceases to amaze me. You are being lied to, day in, day out, by your media. Stop. Think. Please.\n\nIn America everybody is of the opinion that he has no social superiors, since all men are equal, but he does not admit that he has no social inferiors, for, from the time of Jefferson onward, the doctrine that all men are equal applies only upwards, not downwards. 
\nBertrand Russell\n","the truth PLease explain to me, sentence by sentence, why you have edited out the following addition: The main impact zone of the crash was approximately 19 metres (57 feet) in width. No wreckage was found from the airliner within this impact zone or inside the building. The lawn immediately in front of the crash site was unmarked. Within the main impact zone was a hole that the object punched in the building, approximately 9 metres (27 feet) in width. The Pentagon is composed of five concentric rings. Only the outer three rings were penetrated. The upper floors of the first ring collapsed, but the structural integrity of the second and third rings remained. At the end of the third ring was a circular 'punch out' hole, again 9 metres (27 feet) in diameter. The wingspan of a Boeing 757 is 38 metres (114 feet). The vast majority of the mass of the aircraft remains unnaccounted for. For flight 77 to have caused the damage observed at the Pentagon, the wings would have had to have been broken off, pushed into the fuselage, after which the fueselage and wings would have had to have been compressed sufficiently to fit through the 9 metre hole. No physical mechanism by which this process could have occurred is known. Flight 77 did not hit the pentagon. Its so obvious as to be comical. The capacity of human beings to believe what is demonstrably complete bullshit because they are afraid of going against the group consensus never ceases to amaze me. You are being lied to, day in, day out, by your media. Stop. Think. Please. In America everybody is of the opinion that he has no social superiors, since all men are equal, but he does not admit that he has no social inferiors, for, from the time of Jefferson onward, the doctrine that all men are equal applies only upwards, not downwards. 
Bertrand Russell","the truth PLease explain to me, sentence by sentence, why you have edited out the following addition: The main impact zone of the crash was approximately 19 metres (57 feet) in width. No wreckage was found from the airliner within this impact zone or inside the building. The lawn immediately in front of the crash site was unmarked. Within the main impact zone was a hole that the object punched in the building, approximately 9 metres (27 feet) in width. The Pentagon is composed of five concentric rings. Only the outer three rings were penetrated. The upper floors of the first ring collapsed, but the structural integrity of the second and third rings remained. At the end of the third ring was a circular 'punch out' hole, again 9 metres (27 feet) in diameter. The wingspan of a Boeing 757 is 38 metres (114 feet). The vast majority of the mass of the aircraft remains unnaccounted for. For flight 77 to have caused the damage observed at the Pentagon, the wings would have had to have been broken off, pushed into the fuselage, after which the fueselage and wings would have had to have been compressed sufficiently to fit through the 9 metre hole. No physical mechanism by which this process could have occurred is known. Flight 77 did not hit the pentagon. Its so obvious as to be comical. The capacity of human beings to believe what is demonstrably complete bullshit because they are afraid of going against the group consensus never ceases to amaze me. You are being lied to, day in, day out, by your media. Stop. Think. Please. In America everybody is of the opinion that he has no social superiors, since all men are equal, but he does not admit that he has no social inferiors, for, from the time of Jefferson onward, the doctrine that all men are equal applies only upwards, not downwards. 
Bertrand Russell",the truth please explain to i sentence by sentence why you have edit out the following addition the main impact zone of the crash be approximately 19 metre 57 foot in width no wreckage be find from the airliner within this impact zone or inside the building the lawn immediately in front of the crash site be unmarked within the main impact zone be a hole that the object punch in the building approximately 9 metre 27 foot in width the pentagon be compose of five concentric ring only the outer three ring be penetrate the upper floor of the first ring collapse but the structural integrity of the second and third ring remain at the end of the third ring be a circular punch out hole again 9 metre 27 foot in diameter the wingspan of a boeing 757 be 38 metre 114 foot the vast majority of the mass of the aircraft remain unnaccounte for for flight 77 to have cause the damage observe at the pentagon the wing would have have to have be break off push into the fuselage after which the fueselage and wing would have have to have be compress sufficiently to fit through the 9 metre hole no physical mechanism by which this process could have occur be know flight 77 do not hit the pentagon its so obvious as to be comical the capacity of human being to believe what be demonstrably complete bullshit because they be afraid of go against the group consensus never cease to amaze i you be be lie to day in day out by your medium stop think please in america everybody be of the opinion that he have no social superior since all man be equal but he do not admit that he have no social inferior for from the time of jefferson onward the doctrine that all man be equal apply only upwards not downwards bertrand russell
8338,"__________________________\nListen piece of bastard, if yo have a chance to read this , read carefully:\nYOU DARE SPOIL MY USERPAGE AND YOU WILL DIE FOR THAT.\nI ALREADY TRACKED YOU AND I KNOW WHO YOU ARE, BEFORE KILLING YOU , I WILL RAPE AND KILL THE BITCH OF AYOUR MOTHER AND THAN TORTURE \nTHE WHOLE OF YOUR FAMILY.\nWHAT A PITY YOU DON T HAVE A SMALL DAUGHTER IN ORDER TO RAPE ,TORTURE TO DEATH AND BURN HER ALIVE.\nYOU ARE A DEAD MAN.\nTHERE IS NO PLACE TO HIDE IN TEH WORLD FOR YOU","__________________________ Listen piece of bastard, if yo have a chance to read this , read carefully: YOU DARE SPOIL MY USERPAGE AND YOU WILL DIE FOR THAT. I ALREADY TRACKED YOU AND I KNOW WHO YOU ARE, BEFORE KILLING YOU , I WILL RAPE AND KILL THE BITCH OF AYOUR MOTHER AND THAN TORTURE THE WHOLE OF YOUR FAMILY. WHAT A PITY YOU DON T HAVE A SMALL DAUGHTER IN ORDER TO RAPE ,TORTURE TO DEATH AND BURN HER ALIVE. YOU ARE A DEAD MAN. THERE IS NO PLACE TO HIDE IN TEH WORLD FOR YOU","____ Listen piece of bastard, if yo have a chance to read this , read carefully: YOU DARE SPOIL MY USERPAGE AND YOU WILL DIE FOR THAT. I ALREADY TRACKED YOU AND I KNOW WHO YOU ARE, BEFORE KILLING YOU , I WILL RAPE AND KILL THE BITCH OF AYOUR MOTHER AND THAN TORTURE THE WHOLE OF YOUR FAMILY. WHAT A PITY YOU DON T HAVE A SMALL DAUGHTER IN ORDER TO RAPE ,TORTURE TO DEATH AND BURN HER ALIVE. YOU ARE A DEAD MAN. THERE IS NO PLACE TO HIDE IN TEH WORLD FOR YOU",listen piece of bastard if yo have a chance to read this read carefully you dare spoil my userpage and you will die for that i already track you and i know who you be before kill you i will rape and kill the bitch of ayour mother and than torture the whole of your family what a pity you don t have a small daughter in order to rape torture to death and burn she alive you be a dead man there be no place to hide in teh world for you
6652,You're still a snivelling little cunt and a foul fruit,You're still a snivelling little cunt and a foul fruit,you are still a snivelling little [profanity for a woman's vagina] and a foul fruit,you be still a snivelling little profanity for a woman s vagina and a foul fruit
379,"i need to leave fisherQueen a message...WE'RE BACK!!!!!!!!!!!!!! well only on this page.. but soon we will be after you!!!!!!!!!!!! Just kidding...maybe...you will never know! THEY are after you and your lesbian friends!!!!!! And merope, you cant call that comment pearsonal offence because it says on her user page!!!!!!!!!!!!!!!!!!!!!","i need to leave fisherQueen a message...WE'RE BACK!!!!!!!!!!!!!! well only on this page.. but soon we will be after you!!!!!!!!!!!! Just kidding...maybe...you will never know! THEY are after you and your lesbian friends!!!!!! And merope, you cant call that comment pearsonal offence because it says on her user page!!!!!!!!!!!!!!!!!!!!!","i need to leave fisherQueen a message...we are BACK!!!! well only on this page.. but soon we will be after you!!! Just kidding...maybe...you will never know! THEY are after you and your lesbian friends!!! And merope, you cant call that comment pearsonal offence because it says on her user page!!!",i need to leave fisherqueen a message we be back well only on this page but soon we will be after you just kid maybe you will never know they be after you and your lesbian friend and merope you ca nt call that comment pearsonal offence because it say on her user page
10028,"""\n\n Donaldduckfan101 \n\nI have filed an SPI case against him in connection to Bambifan101. ╟─►the name is a lie.─╢ """,""" Donaldduckfan101 I have filed an SPI case against him in connection to Bambifan101. the name is a lie. """,""" Donaldduckfan101 I have filed an SPI case against him in connection to Bambifan101. the name is a lie. """,donaldduckfan101 i have file an spi case against he in connection to bambifan101 the name be a lie


In [7]:
# Second spot-check: a different fixed seed gives a different, but reproducible,
# set of 10 rows (sample(10) already returns exactly 10 rows).
df[cols].sample(10, random_state=1)

Unnamed: 0,text,text1,text2,text3
1092,"""\n\nps: Having an anonymous IP does not mean being """"everybody`s sockpuppet""""""",""" ps: Having an anonymous IP does not mean being """"everybody's sockpuppet""""""",""" ps: Having anonymous IP does not mean being """"everybody is sockpuppet""""""",p have anonymous ip do not mean be everybody be sockpuppet
332,"Adming is the one who abuses Wikipedia to force his stupid personal taste for just that one particular article. He is an admin and tries to make just one article out of line with wikipedia standards and style. Instead of bothering me, go ask that idiot why he tries to make Wikipedia articles inconsistent. 02:47, 27 Mar 2005 (UTC)","Adming is the one who abuses Wikipedia to force his stupid personal taste for just that one particular article. He is an admin and tries to make just one article out of line with wikipedia standards and style. Instead of bothering me, go ask that idiot why he tries to make Wikipedia articles inconsistent. 02:47, 27 Mar 2005 (UTC)","Adming is the one who abuses Wikipedia to force his stupid personal taste for just that one particular article. He is an admin and tries to make just one article out of line with wikipedia standards and style. Instead of bothering me, go ask that idiot why he tries to make Wikipedia articles inconsistent. 02:47, 27 Mar 2005 (UTC)",adming be the one who abuse wikipedia to force his stupid personal taste for just that one particular article he be an admin and try to make just one article out of line with wikipedia standard and style instead of bother i go ask that idiot why he try to make wikipedia article inconsistent 02 47 27 mar 2005 utc
10639,"""\nI don't know why are you so obsessed with Hindu script that you keep adding it to every article that comes in your way. Stop doing it. Stop trying to Indianize every Pakistan and Islam related article. Inida is not a Muslim country. It is a Hindu country. If your logic is followed, we need hundreds of transliterations of """"purdah"""" for every language of Purdah observing people. It's ridiculous. """,""" I don't know why are you so obsessed with Hindu script that you keep adding it to every article that comes in your way. Stop doing it. Stop trying to Indianize every Pakistan and Islam related article. Inida is not a Muslim country. It is a Hindu country. If your logic is followed, we need hundreds of transliterations of """"purdah"""" for every language of Purdah observing people. It's ridiculous. """,""" I do not know why are you so obsessed with Hindu script that you keep adding it to every article that comes in your way. Stop doing it. Stop trying to Indianize every Pakistan and Islam related article. Inida is not a Muslim country. It is a Hindu country. If your logic is followed, we need hundreds of transliterations of """"purdah"""" for every language of Purdah observing people. it is ridiculous. """,i do not know why be you so obsessed with hindu script that you keep add it to every article that come in your way stop do it stop try to indianize every pakistan and islam relate article inida be not a muslim country it be a hindu country if your logic be follow we need hundred of transliteration of purdah for every language of purdah observe people it be ridiculous
1799,"suck my dick first, maybe then i'll consider what you have to say.","suck my dick first, maybe then i'll consider what you have to say.","suck my dick first, maybe then I will consider what you have to say.",suck my dick first maybe then i will consider what you have to say
12123,"And go ahead and delete my account, I can't wait for it, I'm sick of having to check every retarded message you people post in my page!... If it gets deleted it gets permanently blanked!...","And go ahead and delete my account, I can't wait for it, I'm sick of having to check every retarded message you people post in my page!... If it gets deleted it gets permanently blanked!...","And go ahead and delete my account, I cannot wait for it, I am sick of having to check every retarded message you people post in my page!... If it gets deleted it gets permanently blanked!...",and go ahead and delete my account i can not wait for it i be sick of have to check every retarded message you people post in my page if it gets delete it gets permanently blank
5296,"""\n\n Euthanasia \n\nPlease revert your reversion of my edits. I am an experienced editor who has edited under other accounts in the past, and I know what I'm doing. Thank you. \n\n I have already used the Talk page and my deletions actually follow consensus. The Aktion T4 material was added to the page by an editor who is currently the subject of a POV pushing complaint. The Nazis used """"euthanasia"""" as a smokescreen to hide what experts agree was murder, plain and simple. I am allows to be bold without you reverting for the only reason that you think i am a newbie. I've told you I am not a newbie, so revert please. """,""" Euthanasia Please revert your reversion of my edits. I am an experienced editor who has edited under other accounts in the past, and I know what I'm doing. Thank you. I have already used the Talk page and my deletions actually follow consensus. The Aktion T4 material was added to the page by an editor who is currently the subject of a POV pushing complaint. The Nazis used """"euthanasia"""" as a smokescreen to hide what experts agree was murder, plain and simple. I am allows to be bold without you reverting for the only reason that you think i am a newbie. I've told you I am not a newbie, so revert please. """,""" Euthanasia Please revert your reversion of my edits. I am an experienced editor who has edited under other accounts in the past, and I know what I am doing. Thank you. I have already used the Talk page and my deletions actually follow consensus. The Aktion T4 material was added to the page by an editor who is currently the subject of a [point of view] pushing complaint. The Nazis used """"euthanasia"""" as a smokescreen to hide what experts agree was murder, plain and simple. I am allows to be bold without you reverting for the only reason that you think i am a [new joiner; an inexperienced or unskilled person, especially in video games]. 
I have told you I am not a [new joiner; an inexperienced or unskilled person, especially in video games], so revert please. """,euthanasia please revert your reversion of my edit i be an experienced editor who have edit under other account in the past and i know what i be do thank you i have already use the talk page and my deletion actually follow consensus the aktion t4 material be add to the page by an editor who be currently the subject of a point of view push complaint the nazis use euthanasia as a smokescreen to hide what expert agree be murder plain and simple i be allow to be bold without you revert for the only reason that you think i be a new joiner an inexperienced or unskilled person especially in video game i have tell you i be not a new joiner an inexperienced or unskilled person especially in video game so revert please
8877,"""\n\n Don't miss it! \n\nI am trying to get those pesky unofficial signs taken down from the Taxpayer March on Washington page. Once again gave you credit on the Talk page. Hope you can join the discussion and stop Wikipedia from """"smearing"""" the tens of thousands of normal people with pictures of a few crazy people with signs. In Peace, """,""" Don't miss it! I am trying to get those pesky unofficial signs taken down from the Taxpayer March on Washington page. Once again gave you credit on the Talk page. Hope you can join the discussion and stop Wikipedia from """"smearing"""" the tens of thousands of normal people with pictures of a few crazy people with signs. In Peace, """,""" do not miss it! I am trying to get those pesky unofficial signs taken down from the Taxpayer March on Washington page. Once again gave you credit on the Talk page. Hope you can join the discussion and stop Wikipedia from """"smearing"""" the tens of thousands of normal people with pictures of a few crazy people with signs. In Peace, """,do not miss it i be try to get those pesky unofficial sign take down from the taxpayer march on washington page once again give you credit on the talk page hope you can join the discussion and stop wikipedia from smear the ten of thousand of normal people with picture of a few crazy people with sign in peace
10707,"""\n\n What Does She Acctually Own??? \n\nCos, I haven't got a clue her occupation here and here, clearly state """"Businesswoman"""" but what the hell does she own???, does anyone know?? — """,""" What Does She Acctually Own??? Cos, I haven't got a clue her occupation here and here, clearly state """"Businesswoman"""" but what the hell does she own???, does anyone know?? - """,""" What Does She Acctually Own??? Cos, I have not got a clue her occupation here and here, clearly state """"Businesswoman"""" but what the hell does she own???, does anyone know?? - """,what do she acctually own cos i have not get a clue her occupation here and here clearly state businesswoman but what the hell do she own do anyone know
14132,"Horseshit, I didn't vandalize that page. Any idiot could see that wasn't vandalism. WIKIPEDIA IS A FUCKING JOKE!71.232.157.145","Horseshit, I didn't vandalize that page. Any idiot could see that wasn't vandalism. WIKIPEDIA IS A FUCKING JOKE!","Horseshit, I did not vandalize that page. Any idiot could see that was not vandalism. WIKIPEDIA IS A FUCKING JOKE!",horseshit i do not vandalize that page any idiot could see that be not vandalism wikipedia be a fucking joke
3710,"Please stop. If you continue to vandalise Wikipedia, you will be blocked. —","Please stop. If you continue to vandalise Wikipedia, you will be blocked. -","Please stop. If you continue to vandalise Wikipedia, you will be blocked. -",please stop if you continue to vandalise wikipedia you will be block


In [8]:
# Third spot-check: a different fixed seed gives a different, but reproducible,
# set of 10 rows (sample(10) already returns exactly 10 rows).
df[cols].sample(10, random_state=2)

Unnamed: 0,text,text1,text2,text3
9654,"could have fooled me, hens!\nlast time i checked you come from the low lands and have edited tape","could have fooled me, hens! last time i checked you come from the low lands and have edited tape","could have fooled me, hens! last time i checked you come from the low lands and have edited tape",could have fool i hen last time i check you come from the low land and have edit tape
736,"""\nI'm ignorant of these matters, but it looks like Germany has laws against denying the Holocaust or approving Nazi actions in a way that """"disturbs the peace"""", but not """"anti-Semitism"""" broadly drawn. I don't know whether German prosecutors are this unreasonable, but I know in the U.S. they have actually prosecuted children for making """"child pornography"""" of themselves ... if the comment actually is illegal to make there, I'm not sure it is safe even for a victim to keep reposting it. But Wikipedia itself should not be eager to gather up censorship laws from around the world.\nIt's too hard for me to puzzle through all the German text relevant to this case (which Google Translate handles far more poorly than Chinese or Arabic, alas). But I think as we're seeing in the present Fae ArbCom case, there's a huge divide in perceptions of bias between those who are members of an ethnic group, and those who are outside of it. Is it anti-gay to say that marriage should be between one man and one woman, or anti-Semitic to say that the U.S. should end all foreign aid to Israel? Or to keep a street named after a historical figure with a notable animosity to either group? Your answer depends on who you are. If we are to continue to have one Wikipedia for every nation and background, we'll have to be willing to accommodate that terms like """"homophobic"""" and """"anti-Semitic"""" mean very different things to very different people, and not punish anyone for using them the way they personally perceive them. """,""" I'm ignorant of these matters, but it looks like Germany has laws against denying the Holocaust or approving Nazi actions in a way that """"disturbs the peace"","" but not """"anti-Semitism"""" broadly drawn. I don't know whether German prosecutors are this unreasonable, but I know in the U.S. they have actually prosecuted children for making """"child pornography"""" of themselves ... 
if the comment actually is illegal to make there, I'm not sure it is safe even for a victim to keep reposting it. But Wikipedia itself should not be eager to gather up censorship laws from around the world. It's too hard for me to puzzle through all the German text relevant to this case (which Google Translate handles far more poorly than Chinese or Arabic, alas). But I think as we're seeing in the present Fae ArbCom case, there's a huge divide in perceptions of bias between those who are members of an ethnic group, and those who are outside of it. Is it anti-gay to say that marriage should be between one man and one woman, or anti-Semitic to say that the U.S. should end all foreign aid to Israel? Or to keep a street named after a historical figure with a notable animosity to either group? Your answer depends on who you are. If we are to continue to have one Wikipedia for every nation and background, we'll have to be willing to accommodate that terms like """"homophobic"""" and """"anti-Semitic"""" mean very different things to very different people, and not punish anyone for using them the way they personally perceive them. """,""" I am ignorant of these matters, but it looks like Germany has laws against denying the Holocaust or approving Nazi actions in a way that """"disturbs the peace"","" but not """"anti-Semitism"""" broadly drawn. I do not know whether German prosecutors are this unreasonable, but I know in the U.S. they have actually prosecuted children for making """"child pornography"""" of themselves ... if the comment actually is illegal to make there, I am not sure it is safe even for a victim to keep reposting it. But Wikipedia itself should not be eager to gather up censorship laws from around the world. it is too hard for me to puzzle through all the German text relevant to this case (which Google Translate handles far more poorly than Chinese or Arabic, alas). 
But I think as we are seeing in the present Fae ArbCom case, there is a huge divide in perceptions of bias between those who are members of an ethnic group, and those who are outside of it. Is it anti-gay to say that marriage should between one man and one woman, or anti-Semitic to say that the U.S. should end all foreign aid to Israel? Or to keep a street named after a historical figure with a notable animosity to either group? Your answer depends on who you are. If we are to continue to have one Wikipedia for every nation and background, we will have to be willing to accommodate that terms like """"homophobic"""" and """"anti-Semitic"""" mean very different things to very different people, and not punish anyone for using them the way they personally perceive them. """,i be ignorant of these matter but it look like germany have law against deny the holocaust or approve nazi action in a way that disturb the peace but not anti semitism broadly draw i do not know whether german prosecutor be this unreasonable but i know in the u s they have actually prosecute child for make child pornography of themselves if the comment actually be illegal to make there i be not sure it be safe even for a victim to keep reposte it but wikipedia itself should not be eager to gather up censorship law from around the world it be too hard for i to puzzle through all the german text relevant to this case which google translate handle far more poorly than chinese or arabic alas but i think as we be see in the present fae arbcom case there be a huge divide in perception of bias between those who be member of an ethnic group and those who be outside of it be it anti gay to say that marriage should between one man and one woman or anti semitic to say that the u s should end all foreign aid to israel or to keep a street name after a historical figure with a notable animosity to either group your answer depend on who you be if we be to continue to have one wikipedia for every nation and 
background we will have to be willing to accommodate that term like homophobic and anti semitic mean very different thing to very different people and not punish anyone for use they the way they personally perceive they
12521,Stan Kemp \n\nIt's also the site's biggest shortfall. Any retard who can push a button can edit it.,Stan Kemp It's also the site's biggest shortfall. Any retard who can push a button can edit it.,Stan Kemp it is also the site's biggest shortfall. Any retard who can push a button can edit it.,stan kemp it be also the site s big shortfall any retard who can push a button can edit it
4532,Your are this naked Woman!\n\n,Your are this naked Woman!,Your are this naked Woman!,your be this naked woman
11562,Why not? The man (Bkonrad) is an asshole. Look at how rudely he treats other editors.,Why not? The man (Bkonrad) is an asshole. Look at how rudely he treats other editors.,Why not? The man (Bkonrad) is an asshole. Look at how rudely he treats other editors.,why not the man bkonrad be an asshole look at how rudely he treat other editor
3318,"you wanna know what I think tony? \n\nand yeh\n\ngo ahed n label this az Iamandrewrice\n\nit wil only make me laugh at ur ignorance even mor\n\nur just a homofobe yeh blatez!\n\n'sup blud?\n\nshor mayte!\n\nwanna no wot the prob iz with u?\n\nu need 2 f'in get a lyf mayt!\n\nget a gf!\n\nget a bf!\n\nbut for f's sake!\n\ndont take out ur ugliness on other peeps!\n\nf yeah!\n\nryte then,\n\na'm off ter vandalize now\n\nblatez lolz\n\nyeh babe!","you wanna know what I think tony? and yeh go ahed n label this az Iamandrewrice it wil only make me laugh at ur ignorance even mor ur just a homofobe yeh blatez! 'sup blud? shor mayte! wanna no wot the prob iz with u? u need 2 f'in get a lyf mayt! get a gf! get a bf! but for f's sake! dont take out ur ugliness on other peeps! f yeah! ryte then, a'm off ter vandalize now blatez lolz yeh babe!","you want to know what I think tony? and yeh go ahed n label this az Iamandrewrice it wil only make me laugh at ur ignorance even mor ur just a homofobe yeh blatez! 'sup blud? shor mayte! want to no wot the prob iz with u? u need 2 f'in get a lyf mayt! get a [girlfriend]! get a [boyfriend or best friend]! but for f's sake! dont take out ur ugliness on other [people]! f yeah! ryte then, a'm off ter vandalize now blatez [laughing out loud] yeh babe!",you want to know what i think tony and yeh go ahed n label this az iamandrewrice it wil only make i laugh at ur ignorance even mor ur just a homofobe yeh blatez sup blud shor mayte want to no wot the prob iz with u u need 2 f in get a lyf mayt get a girlfriend get a boyfriend or good friend but for f s sake do nt take out ur ugliness on other people f yeah ryte then a m off ter vandalize now blatez laugh out loud yeh babe
10822,"""\nThanks for picking up the silly errors. Ponting only has two siblings, while he was 14 when he hurt his arm. Interesting that it was his right arm.... '''' (talk'') Review me! """,""" Thanks for picking up the silly errors. Ponting only has two siblings, while he was 14 when he hurt his arm. Interesting that it was his right arm.... "" "" (talk "" ) Review me! """,""" Thanks for picking up the silly errors. Ponting only has two siblings, while he was 14 when he hurt his arm. Interesting that it was his right arm... "" "" (talk "" ) Review me! """,thank for pick up the silly error ponting only have two sibling while he be 14 when he hurt his arm interesting that it be his right arm talk review i
2688,Bad accent \n\nHer attempts to do a working-class accent are laughable; half the vowels are as posh as can be.,Bad accent Her attempts to do a working-class accent are laughable; half the vowels are as posh as can be.,Bad accent Her attempts to do a working-class accent are laughable; half the vowels are as posh as can be.,bad accent her attempt to do a work class accent be laughable half the vowel be as posh as can be
1787,"""\n\nwikipedia's censorship\n\nI looked up Bash Back (The sodomite terrorist group that attacked the Church in Michigan) on wikipedia (http://en.wikipedia.org/wiki/Bash_Back) just to see what liberal bias they would have. Under the section """"Antifascist action at 2008 Milwaukee Pridefest"""" Wakopedia says """"In response, Bash Back! Milwaukee planned a confrontation of the hate group."""" the """"hate group they are talking about is the NAZIs but then if the NAZIs are a hate group then Bash back must also be a hate group so I edited it to say """"In response, The hate group Bash Back! Milwaukee planned a confrontation of the hate group.""""\n\nSince they also didn't have any topic for the new terrorist attack on the church I added a few lines & was attacked by the Admins. So they added the following paragraph\n2008 Disruption at Mt. Hope Church in Lansing, MI\n\nBash Back! members disrupted a Sunday sermon at Mt. Hope Church in Lansing, Michigan on November 9th, 2008. They dropped a banner in the church, threw thousands of fliers, made out in front of the congregation, yelled """"Jesus was a Homo"""", pulled the fire alarm and initiated pro-gay chants. Mt. Hope is an Assemblies of God church which holds and promotes """"ex-gay"""" events and preaches that homosexuality is a sin.\n\nI edited the first line of the paragraph to read """"Christaphobic Bash Back! members disrupted.... & added the following footnote \n\n""""Christphobia (from Greek Christós: Messiah; phóbos: fear, phobia) is an irrational fear of, aversion to, or discrimination against Christianity.""""\n\nI took the defanition word for wors from Wakopedia's definition for Homophobia & just changed the Homo/Christos & Homo/ Christian parts.\n\nFor this I received the following note from the admin\n\nThis is the last warning you will receive for your disruptive edits, such as those you made to Bash Back. If you vandalize Wikipedia again, you will be blocked from editing. 
Please maintain a neutral point of view. ukexpat (talk) \n\nFunny how liberals love to censor everyone & then call us evil""",""" wikipedia's censorship I looked up Bash Back (The sodomite terrorist group that attacked the Church in Michigan) on wikipedia (just to see what liberal bias they would have. Under the section """"Antifascist action at 2008 Milwaukee Pridefest"""" Wakopedia says """"In response, Bash Back! Milwaukee planned a confrontation of the hate group."""" the """"hate group they are talking about is the NAZIs but then if the NAZIs are a hate group then Bash back must also be a hate group so I edited it to say """"In response, The hate group Bash Back! Milwaukee planned a confrontation of the hate group."""" Since they also didn't have any topic for the new terrorist attack on the church I added a few lines & was attacked by the Admins. So they added the following paragraph 2008 Disruption at Mt. Hope Church in Lansing, MI Bash Back! members disrupted a Sunday sermon at Mt. Hope Church in Lansing, Michigan on November 9th, 2008. They dropped a banner in the church, threw thousands of fliers, made out in front of the congregation, yelled """"Jesus was a Homo"","" pulled the fire alarm and initiated pro-gay chants. Mt. Hope is an Assemblies of God church which holds and promotes """"ex-gay"""" events and preaches that homosexuality is a sin. I edited the first line of the paragraph to read """"Christaphobic Bash Back! members disrupted.... & added the following footnote """"Christphobia (from Greek Christos: Messiah; phobos: fear, phobia) is an irrational fear of, aversion to, or discrimination against Christianity."""" I took the defanition word for wors from Wakopedia's definition for Homophobia & just changed the Homo/Christos & Homo/ Christian parts. For this I received the following note from the admin This is the last warning you will receive for your disruptive edits, such as those you made to Bash Back. 
If you vandalize Wikipedia again, you will be blocked from editing. Please maintain a neutral point of view. ukexpat (talk) Funny how liberals love to censor everyone & then call us evil""",""" wikipedia's censorship I looked up Bash Back (The sodomite terrorist group that attacked the Church in Michigan) on wikipedia (just to see what liberal bias they would have. Under the section """"Antifascist action at 2008 Milwaukee Pridefest"""" Wakopedia says """"In response, Bash Back! Milwaukee planned a confrontation of the hate group."""" the """"hate group they are talking about is the NAZIs but then if the NAZIs are a hate group then Bash back must also be a hate group so I edited it to say """"In response, The hate group Bash Back! Milwaukee planned a confrontation of the hate group."""" Since they also did not have any topic for the new terrorist attack on the church I added a few lines & was attacked by the Admins. So they added the following paragraph 2008 Disruption at Mt. Hope Church in Lansing, MI Bash Back! members disrupted a Sunday sermon at Mt. Hope Church in Lansing, Michigan on November 9th, 2008. They dropped a banner in the church, threw thousands of fliers, made out in front of the congregation, yelled """"Jesus was a [slur for homosexuals]"","" pulled the fire alarm and initiated pro-gay chants. Mt. Hope is an Assemblies of God church which holds and promotes """"ex-gay"""" events and preaches that homosexuality is a sin. I edited the first line of the paragraph to read """"Christaphobic Bash Back! members disrupted... & added the following footnote """"Christphobia (from Greek Christos: Messiah; phobos: fear, phobia) is an irrational fear of, aversion to, or discrimination against Christianity."""" I took the defanition word for wors from Wakopedia's definition for Homophobia & just changed the [slur for homosexuals]/Christos & [slur for homosexuals]/ Christian parts. 
For this I received the following note from the admin This the last warning you will receive for your disruptive edits, such as those you made to Bash Back. If you vandalize Wikipedia again, you will be blocked from editing. Please maintain a neutral point of view. ukexpat (talk) Funny how liberals love to censor everyone & then call us evil""",wikipedia s censorship i look up bash back the sodomite terrorist group that attack the church in michigan on wikipedia just to see what liberal bias they would have under the section antifascist action at 2008 milwaukee pridefest wakopedia say in response bash back milwaukee plan a confrontation of the hate group the hate group they be talk about be the nazis but then if the nazis be a hate group then bash back must also be a hate group so i edit it to say in response the hate group bash back milwaukee plan a confrontation of the hate group since they also do not have any topic for the new terrorist attack on the church i add a few line be attack by the admins so they add the follow paragraph 2008 disruption at mt hope church in lansing mi bash back member disrupt a sunday sermon at mt hope church in lansing michigan on november 9th 2008 they drop a banner in the church throw thousand of flier make out in front of the congregation yell jesus be a slur for homosexual pull the fire alarm and initiate pro gay chant mt hope be an assemblies of god church which hold and promote ex gay event and preach that homosexuality be a sin i edit the first line of the paragraph to read christaphobic bash back member disrupt add the following footnote christphobia from greek christos messiah phobos fear phobia be an irrational fear of aversion to or discrimination against christianity i take the defanition word for wor from wakopedia s definition for homophobia just change the slur for homosexual christos slur for homosexual christian part for this i receive the follow note from the admin this the last warning you will receive for your 
disruptive edit such as those you make to bash back if you vandalize wikipedia again you will be block from edit please maintain a neutral point of view ukexpat talk funny how liberal love to censor everyone then call we evil
580,"Speedy Deletion Nomination\n\nI see that you tagged one of my pages for speedy deletion, despite it being of acceptable Wikipedia quality (crap, but still better then the vast majority on this appalling site). It seems that there is a hierarchy at Wikipedia that just shows the high level of discrimination us ordinary users have to put up with. The admins use bullying tactics to stop acceptable quality articles, such as my latest entry . This must be stopped if Wikipedia is to move forward.","Speedy Deletion Nomination I see that you tagged one of my pages for speedy deletion, despite it being of acceptable Wikipedia quality (crap, but still better then the vast majority on this appalling site). It seems that there is a hierarchy at Wikipedia that just shows the high level of discrimination us ordinary users have to put up with. The admins use bullying tactics to stop acceptable quality articles, such as my latest entry . This must be stopped if Wikipedia is to move forward.","Speedy Deletion Nomination I see that you tagged one of my pages for speedy deletion, despite it being of acceptable Wikipedia quality (crap, but still better then the vast majority on this appalling site). It seems that there is a hierarchy at Wikipedia that just shows the high level of discrimination us ordinary users have to put up with. The admins use bullying tactics to stop acceptable quality articles, such as my latest entry . This must be stopped if Wikipedia is to move forward.",speedy deletion nomination i see that you tag one of my page for speedy deletion despite it be of acceptable wikipedia quality crap but still well then the vast majority on this appalling site it seem that there be a hierarchy at wikipedia that just show the high level of discrimination we ordinary user have to put up with the admin use bully tactic to stop acceptable quality article such as my late entry this must be stop if wikipedia be to move forward


# Review data

In [9]:
# Sanity-check the columns selected by `cols` (defined in an earlier cell)
# before persisting them: the summary should report the same non-null count
# for the raw text and every preprocessed variant.
df[cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14251 entries, 0 to 14250
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    14251 non-null  object
 1   text1   14251 non-null  object
 2   text2   14251 non-null  object
 3   text3   14251 non-null  object
dtypes: object(4)
memory usage: 445.5+ KB


In [10]:
%%time
df[cols].to_parquet("output/pre_val.parquet", index=False)

CPU times: user 109 ms, sys: 31.2 ms, total: 141 ms
Wall time: 159 ms
