In [1]:
import os
import gc
import numpy as np
import pandas as pd
from tqdm import tqdm
from typing import Dict, List, Tuple, NamedTuple, Callable
import scml
import mylib

In [2]:
percentiles=[.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]
os.environ["TOKENIZERS_PARALLELISM"] = "false"
pd.set_option("use_inf_as_na", True)
pd.set_option("max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option('max_colwidth', 9999)
tqdm.pandas()

In [3]:
%%time
df = pd.read_csv("input/validation_data.csv", engine="c", low_memory=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30108 entries, 0 to 30107
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   worker      30108 non-null  int64 
 1   less_toxic  30108 non-null  object
 2   more_toxic  30108 non-null  object
dtypes: int64(1), object(2)
memory usage: 705.8+ KB
Wall time: 216 ms


In [4]:
texts = set(df["less_toxic"]) | set(df["more_toxic"])
texts = list(texts)
df = pd.DataFrame(data={"text": texts})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14251 entries, 0 to 14250
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    14251 non-null  object
dtypes: object(1)
memory usage: 111.5+ KB


# Preprocess Text
Speed dropped from 1200 to 1000 it/s

In [5]:
def pre1(row) -> str:
    return mylib.pre1(row["text"])


def pre2(row) -> str:
    return mylib.pre2(row["text1"])


col = "text1"
print(col)
df[col] = df.progress_apply(pre1, axis=1)
col = "text2"
print(col)
df[col] = df.progress_apply(pre2, axis=1)

text1


100%|████████████████████████████████████████| 14251/14251 [00:46<00:00, 307.12it/s]


text2


100%|█████████████████████████████████████████| 14251/14251 [04:15<00:00, 55.88it/s]


In [6]:
cols = ["text", "text1", "text2"]
df[cols].sample(10).head(10)

Unnamed: 0,text,text1,text2
6894,tide rolls is a fucking moron he needs to go to HELL,tide rolls is a fucking moron he needs to go to HELL,tide rolls is a fucking moron he needs to go to HELL
10884,"except the Gay Republican bit, which is made up)","except the Gay Republican bit, which is made up)","except the Gay Republican bit, which is made up)"
6684,that you are a homosexual who is very gay.,that you are a homosexual who is very gay.,that you are a homosexual who is very gay.
2132,"""Disgusting==\nThis guy is a disgusting sex columnist. He revels in it. He does it on purpose, and it's what gives him and his column their edge. It seperates him from every other sex advice giver I've ever read and it's his signature attitude, tone, and approach to the issues. But there seems to be no way to adduce this highly pertinent and relevant information without violating the lame, loose, sloppy, ill-thought-out NPOV rule. Hilariously (and disgustingly), people tell me providing this information violates article """"neutrality."""" Any help here by any person of quality, integrity, and intelligence would be appreciated. —Preceding unsigned comment added by 24.44.129.156 \nImagine that a sex columnist who enjoys sex. How can we work that into the article? ;-) \n\n==""","""Disgusting== This guy is a disgusting sex columnist. He revels in it. He does it on purpose, and it's what gives him and his column their edge. It seperates him from every other sex advice giver I've ever read and it's his signature attitude, tone, and approach to the issues. But there seems to be no way to adduce this highly pertinent and relevant information without violating the lame, loose, sloppy, ill-thought-out NPOV rule. Hilariously (and disgustingly), people tell me providing this information violates article """"neutrality."""" Any help here by any person of quality, integrity, and intelligence would be appreciated. - Preceding unsigned comment added by 24.44.129.156 Imagine that a sex columnist who enjoys sex. How can we work that into the article? [Wink or smirk] ==""","""Disgusting== This guy is a disgusting sex columnist. He revels in it. He does it on purpose, and it is what gives him and his column their edge. It seperates him from every other sex advice giver I have ever read and it is his signature attitude, tone, and approach to the issues. But there seems to be no way to adduce this highly pertinent and relevant information without violating the lame, loose, sloppy, ill-thought-out NPOV rule. Hilariously (and disgustingly), people tell me providing this information violates article """"neutrality."""" Any help here by any person of quality, integrity, and intelligence would be appreciated. - Preceding unsigned comment added by 24.44.129.156 Imagine that a sex columnist who enjoys sex. How can we work that into the article? [Wink or smirk] =="""
2475,"Your intelligence, insight, and manners are quite poor. And YOU'RE the vandal, obviously. Don't waste my time, loser.","Your intelligence, insight, and manners are quite poor. And YOU'RE the vandal, obviously. Don't waste my time, loser.","Your intelligence, insight, and manners are quite poor. And you are the vandal, obviously. do not waste my time, loser."
6598,"Her blatant bias needs to be mentioned in the article \n\nMegan Kelly is, without a doubt, the most vitriolic, pro-Republican, anti-anything/anyone-who-isn't-Republican female pundit on Fox News. It is sickening to me (and to most rational human beings) that she is even considered a journalist. Brit Hume is a journalist. Shepard Smith is a journalist. George Will is a journalist. Megyn Kelly is a Republican Kool Aid drinker with a microphone and a camera. Now, I'm anticipating that some pro-Megyn zealot will revert my edit that she is a conservative so please consider why I made the edit - because it's valid. And lest you think I'm some liberal Fox hater, I would also like to see Keith Olbermann be called a liberal once and for all on Wikipedia.","Her blatant bias needs to be mentioned in the article Megan Kelly is, without a doubt, the most vitriolic, pro-Republican, anti-anything/anyone-who-isn't-Republican female pundit on Fox News. It is sickening to me (and to most rational human beings) that she is even considered a journalist. Brit Hume is a journalist. Shepard Smith is a journalist. George Will is a journalist. Megyn Kelly is a Republican Kool Aid drinker with a microphone and a camera. Now, I'm anticipating that some pro-Megyn zealot will revert my edit that she is a conservative so please consider why I made the edit - because it's valid. And lest you think I'm some liberal Fox hater, I would also like to see Keith Olbermann be called a liberal once and for all on Wikipedia.","Her blatant bias needs to be mentioned in the article Megan Kelly is, without a doubt, the most vitriolic, pro-Republican, anti-anything/anyone-who-is not-Republican female pundit on Fox News. It is sickening to me (and to most rational human beings) that she is even considered a journalist. Brit Hume is a journalist. Shepard Smith is a journalist. George Will is a journalist. Megyn Kelly is a Republican Kool Aid drinker with a microphone and a camera. Now, I am anticipating that some pro-Megyn zealot will revert my edit that she is a conservative so please consider why I made the edit - because it is valid. And lest you think I am some liberal Fox hater, I would also like to see Keith Olbermann be called a liberal once and for all on Wikipedia."
7090,"Incidently, the ignorant only ignores the truth.","Incidently, the ignorant only ignores the truth.","Incidently, the ignorant only ignores the truth."
5881,"98.248.32.178 I will set you on fire, I will shoot your ass up. I will cut your penis off and I will shove it down your throat and choke you. I will cut you up big time motherfucker.","98.248.32.178 I will set you on fire, I will shoot your ass up. I will cut your penis off and I will shove it down your throat and choke you. I will cut you up big time motherfucker.","98.248.32.178 I will set you on fire, I will shoot your ass up. I will cut your penis off and I will shove it down your throat and choke you. I will cut you up big time motherfucker."
1658,ZHANZHAO IS A MOTHERFUCKING PUSSY INDIAN-CHINESE BASTARD! http://twitter.com/phoenixreporter \n\n{{unblock|http://twitter.com/phoenixreporter\nhttp://twitter.com/phoenixreporter\nhttp://twitter.com/phoenixreporter\nhttp://twitter.com/phoenixreporter\n\nI AM GONNA FLOOD THIS PAGE!!! WAHAHHAHAAHAH ZHAN ZHAO IS A MOTHERFUCKING }|,ZHANZHAO IS A MOTHERFUCKING PUSSY INDIAN-CHINESE BASTARD! {{unblock| I AM GONNA FLOOD THIS PAGE!!! WAHAHHAHAAHAH ZHAN ZHAO IS A MOTHERFUCKING }|,ZHANZHAO IS A MOTHERFUCKING PUSSY INDIAN-CHINESE BASTARD! {{unblock| I AM going to FLOOD THIS PAGE!!! WAHAHHAHAAHAH ZHAN ZHAO IS A MOTHERFUCKING }|
9506,"you're id.iot, vascoamaral is s'tu.pi.d","you're id.iot, vascoamaral is s'tu.pi.d","you are id.iot, vascoamaral is s'tu.pi.d"


In [7]:
df[cols].sample(10).head(10)

Unnamed: 0,text,text1,text2
1508,"By saying Jubilees is not canonical in any mainstream denomination, you are revealing your abject ignorance and bias POV, because while your personal religion may not consider it canonical, the Ethiopian Orthodox Christians and Ethiopian Jews do, and they are Abrahamic. The Orthodox Christians make up a majority in Ethiopia. The Constantinian Christians in the Roman Empire, and the Pharisee Sanhedrin both tried to do away with Jubilees, but wikipedia is neutral and does not subscribe to any POV in disputes like that, nor will it assist you in attemting to marginalize Ethiopians' religious beliefs.","By saying Jubilees is not canonical in any mainstream denomination, you are revealing your abject ignorance and bias POV, because while your personal religion may not consider it canonical, the Ethiopian Orthodox Christians and Ethiopian Jews do, and they are Abrahamic. The Orthodox Christians make up a majority in Ethiopia. The Constantinian Christians in the Roman Empire, and the Pharisee Sanhedrin both tried to do away with Jubilees, but wikipedia is neutral and does not subscribe to any POV in disputes like that, nor will it assist you in attemting to marginalize Ethiopians' religious beliefs.","By saying Jubilees is not canonical in any mainstream denomination, you are revealing your abject ignorance and bias [point of view], because while your personal religion may not consider it canonical, the Ethiopian Orthodox Christians and Ethiopian Jews do, and they are Abrahamic. The Orthodox Christians make up a majority in Ethiopia. The Constantinian Christians in the Roman Empire, and the Pharisee Sanhedrin both tried to do away with Jubilees, but wikipedia is neutral and does not subscribe to any [point of view] in disputes like that, nor will it assist you in attemting to marginalize Ethiopians' religious beliefs."
4318,"Don't you have anything better to do? \n\nI mean really, you love to shove your opinion down other's throats and go around censoring completely relevant comments. Get off my dick.","Don't you have anything better to do? I mean really, you love to shove your opinion down other's throats and go around censoring completely relevant comments. Get off my dick.","do not you have anything better to do? I mean really, you love to shove your opinion down other's throats and go around censoring completely relevant comments. Get off my dick."
11382,"""\n\n request for arbitrage by non greek people \nI request arbitrage for the following:\nMore then once Greek nationalistic editors removed my contributions\nI want non greek people to look into this. It can't be that Wikipedia turns into a greek extremistic and nationalistic propaganda forum\nI bought an atlas in witch you can see that the cham area belonged to Albania. (NOM ROBBE, M. Géographie de M. Robbe. Méthode pour apprendre facilement la Géographie, divisée en deux Tomes. (Tome Premier). Contenant un Abbrégé de la Sphère, la Division de la Terre)\nThe article below describes how the cham were expelled by the greek.\n\n Napoleon Zervas \n\n""""Zervas was forced to resign in 1947, when details of his contacts with the German occupation authorities surfaced"""" This quote came from Wikepedia's biography of Napoleon Zervas. Other sites are more explicit and call him a Nazi collaborateur. Nowadays there are still a lot of greek people that believe that the Cham people were expelled because they collaborated with the Nazi's. However it was Zervas thet collaborated with the Nazi's that expelled them. Shouldn't this be more explicit in this article. There is a UN resolution which asks the Greek government to repatriate the cham and to give back there properties.The rest of this article is lame. I miss the following:\n\nCopyright © 2006 Illyrians.org.\nDuring the Conference of Ambassadors in London in 1913, the southern part of the region was cut off from the motherland and annexed to the Greek state despite the fact that people of the southern Epirus were Albanians of Orthodox and the Muslim faith. While the orthodox Albanians were targets of hellenization, the muslim Albanians were either exterminated or expelled from their ancestral lands by the Greek government.\n\nChams who lived in Southern Epirus (Chameria or Thesprotia as it is called by the Greeks) were the victims of the first ethnic cleansing in Europe at the end of the Second World War. The Cham tragedy is one of the most painful tragedies of the European continent. Statistical yearbook of the Greek government in 1936 showed that 26.000 Chams lived in Chameria region in Greece at that time.\n\nAs a result of the 1944-1945 ethnic cleansing and genocide, 30.000 Albanian Muslims were violently expelled from the Chameria region, and sought refuge in the Republic of Albania, where they still live. Today, there are 150.000 members of this population in Albania, a figure that has grown because of the high birth rate of the population. On the other hand, current number of Cham Albanians living in Greece is estimated at around 100.000. Yet these people are deprived of every sort of minority rights like other minorities living in Greece. To cite but one example, they can speak Albanian only in their homes.\n\nThe population of Chameria has always been ethnically Albanian: - A lot of voyagers and foreign historians wrote that Chameria had been populated by Albanians. Even the Greek historian Herodotus underscored this fact in his book Historias and called Albanians of the Chameria barbarians, a term used by the ancient Greeks to distinguish non-greek people. - The census held by the Turkish Administration in 1910 established that there were 83.000 orthodox and muslim Albanians in the region. The demographic map of the British military mission sent to the British government in London indicates that on the eve of the second World War, 75% of Chameria's population was Albanian. - The pro-Greek historian Spiro Muselimi, in his book """"Historical Sight Through Thesprotia"""", edited in Joannina on 1974, wrote that """"The bishop of Thesprotia in the year 1870 translated some parts of Bible into Albanian, as the people of orthodox faith of the region did not understand any word in Greek"""" .\n\nThe Greek authorities, sticking to the concept of absolute denial of the existence of ethnic groups on Greek territory, have followed a well-established chauvinistic policy and, as history recorded, they committed genocide against the Albanians of muslim faith. The racial assault on Chameria's muslim Albanians began to be first applied at the end of the Second World War, in 1944-1945, when criminal bands of the notorious General Napoleon Zervas perpetrated ethnic cleansing against them.\n\nOn June 27, 1944, Greek criminal bands resorted to the worst atrocities witnessed in this region. The terror committed against this population was beyond description. It included killings, rapes, inhuman treatment, massacre of women, babies and pregnant women. More than 1400 men, women and children were killed within 24 hours in the town of Paramithy, on Tuesday, June 27, 1944, which happened to be the date of St.Bartholomeus day for the whole Chameria.\n\nDuring the June 1944-March 1945 period, 1286 persons were killed in Filat, 192 people were killed",""" request for arbitrage by non greek people I request arbitrage for the following: More then once Greek nationalistic editors removed my contributions I want non greek people to look into this. It can't be that Wikipedia turns into a greek extremistic and nationalistic propaganda forum I bought an atlas in witch you can see that the cham area belonged to Albania. (NOM ROBBE, M. Geographie de M. Robbe. Methode pour apprendre facilement la Geographie, divisee en deux Tomes. (Tome Premier). Contenant un Abbrege de la Sphere, la Division de la Terre) The article below describes how the cham were expelled by the greek. Napoleon Zervas """"Zervas was forced to resign in 1947, when details of his contacts with the German occupation authorities surfaced"""" This quote came from Wikepedia's biography of Napoleon Zervas. Other sites are more explicit and call him a Nazi collaborateur. Nowadays there are still a lot of greek people that believe that the Cham people were expelled because they collaborated with the Nazi's. However it was Zervas thet collaborated with the Nazi's that expelled them. Shouldn't this be more explicit in this article. There is a UN resolution which asks the Greek government to repatriate the cham and to give back there properties.The rest of this article is lame. I miss the following (Copyright) copyright: 2006 Illyrians.org. During the Conference of Ambassadors in London in 1913, the southern part of the region was cut off from the motherland and annexed to the Greek state despite the fact that people of the southern Epirus were Albanians of Orthodox and the Muslim faith. While the orthodox Albanians were targets of hellenization, the muslim Albanians were either exterminated or expelled from their ancestral lands by the Greek government. Chams who lived in Southern Epirus (Chameria or Thesprotia as it is called by the Greeks) were the victims of the first ethnic cleansing in Europe at the end of the Second World War. The Cham tragedy is one of the most painful tragedies of the European continent. Statistical yearbook of the Greek government in 1936 showed that 26.000 Chams lived in Chameria region in Greece at that time. As a result of the 1944-1945 ethnic cleansing and genocide, 30.000 Albanian Muslims were violently expelled from the Chameria region, and sought refuge in the Republic of Albania, where they still live. Today, there are 150.000 members of this population in Albania, a figure that has grown because of the high birth rate of the population. On the other hand, current number of Cham Albanians living in Greece is estimated at around 100.000. Yet these people are deprived of every sort of minority rights like other minorities living in Greece. To cite but one example, they can speak Albanian only in their homes. The population of Chameria has always been ethnically Albanian: - A lot of voyagers and foreign historians wrote that Chameria had been populated by Albanians. Even the Greek historian Herodotus underscored this fact in his book Historias and called Albanians of the Chameria barbarians, a term used by the ancient Greeks to distinguish non-greek people. - The census held by the Turkish Administration in 1910 established that there were 83.000 orthodox and muslim Albanians in the region. The demographic map of the British military mission sent to the British government in London indicates that on the eve of the second World War, 75% of Chameria's population was Albanian. - The pro-Greek historian Spiro Muselimi, in his book """"Historical Sight Through Thesprotia"","" edited in Joannina on 1974, wrote that """"The bishop of Thesprotia in the year 1870 translated some parts of Bible into Albanian, as the people of orthodox faith of the region did not understand any word in Greek"""" . The Greek authorities, sticking to the concept of absolute denial of the existence of ethnic groups on Greek territory, have followed a well-established chauvinistic policy and, as history recorded, they committed genocide against the Albanians of muslim faith. The racial assault on Chameria's muslim Albanians began to be first applied at the end of the Second World War, in 1944-1945, when criminal bands of the notorious General Napoleon Zervas perpetrated ethnic cleansing against them. On June 27, 1944, Greek criminal bands resorted to the worst atrocities witnessed in this region. The terror committed against this population was beyond description. It included killings, rapes, inhuman treatment, massacre of women, babies and pregnant women. More than 1400 men, women and children were killed within 24 hours in the town of Paramithy, on Tuesday, June 27, 1944, which happened to be the date of St.Bartholomeus day for the whole Chameria. During the June 1944-March 1945 period, 1286 persons were killed in Filat, 192 people were killed",""" request for arbitrage by non greek people I request arbitrage for the following: More then once Greek nationalistic editors removed my contributions I want non greek people to look into this. It cannot be that Wikipedia turns into a greek extremistic and nationalistic propaganda forum I bought an atlas in witch you can see that the cham area belonged to Albania. ([sound made when eating something] ROBBE, M. Geographie de M. Robbe. Methode pour apprendre facilement la Geographie, divisee en deux Tomes. (Tome Premier). Contenant un Abbrege de la Sphere, la Division de la Terre) The article below describes how the cham were expelled by the greek. Napoleon Zervas """"Zervas was forced to resign in 1947, when details of his contacts with the German occupation authorities surfaced"""" This quote came from Wikepedia's biography of Napoleon Zervas. Other sites are more explicit and call him a Nazi collaborateur. Nowadays there are still a lot of greek people that believe that the Cham people were expelled because they collaborated with the Nazi's. However it was Zervas thet collaborated with the Nazi's that expelled them. should not this be more explicit in this article. There is a UN resolution which asks the Greek government to repatriate the cham and to give back there properties.The rest of this article is lame. I miss the following (Copyright) copyright: 2006 Illyrians.org. During the Conference of Ambassadors in London in 1913, the southern part of the region was cut off from the motherland and annexed to the Greek state despite the fact that people of the southern Epirus were Albanians of Orthodox and the Muslim faith. While the orthodox Albanians were targets of hellenization, the muslim Albanians were either exterminated or expelled from their ancestral lands by the Greek government. Chams who lived in Southern Epirus (Chameria or Thesprotia as it is called by the Greeks) were the victims of the first ethnic cleansing in Europe at the end of the Second World War. The Cham tragedy is one of the most painful tragedies of the European continent. Statistical yearbook of the Greek government in 1936 showed that 26.000 Chams lived in Chameria region in Greece at that time. As a result of the 1944-1945 ethnic cleansing and genocide, 30.000 Albanian Muslims were violently expelled from the Chameria region, and sought refuge in the Republic of Albania, where they still live. Today, there are 150.000 members of this population in Albania, a figure that has grown because of the high birth rate of the population. On the other hand, current number of Cham Albanians living in Greece is estimated at around 100.000. Yet these people are deprived of every sort of minority rights like other minorities living in Greece. To cite but one example, they can speak Albanian only in their homes. The population of Chameria has always been ethnically Albanian: - A lot of voyagers and foreign historians wrote that Chameria had been populated by Albanians. Even the Greek historian Herodotus underscored this fact in his book Historias and called Albanians of the Chameria barbarians, a term used by the ancient Greeks to distinguish non-greek people. - The census held by the Turkish Administration in 1910 established that there were 83.000 orthodox and muslim Albanians in the region. The demographic map of the British military mission sent to the British government in London indicates that on the eve of the second World War, 75% of Chameria's population was Albanian. - The pro-Greek historian Spiro Muselimi, in his book """"Historical Sight Through Thesprotia"","" edited in Joannina on 1974, wrote that """"The bishop of Thesprotia in the year 1870 translated some parts of Bible into Albanian, as the people of orthodox faith of the region did not understand any word in Greek"""" . The Greek authorities, sticking to the concept of absolute denial of the existence of ethnic groups on Greek territory, have followed a well-established chauvinistic policy and, as history recorded, they committed genocide against the Albanians of muslim faith. The racial assault on Chameria's muslim Albanians began to be first applied at the end of the Second World War, in 1944-1945, when criminal bands of the notorious General Napoleon Zervas perpetrated ethnic cleansing against them. On June 27, 1944, Greek criminal bands resorted to the worst atrocities witnessed in this region. The terror committed against this population was beyond description. It included killings, rapes, inhuman treatment, massacre of women, babies and pregnant women. More than 1400 men, women and children were killed within 24 hours in the town of Paramithy, on Tuesday, June 27, 1944, which happened to be the date of St.Bartholomeus day for the whole Chameria. During the June 1944-March 1945 period, 1286 persons were killed in Filat, 192 people were killed"
11888,"JOSSI FRESCO IS A HYPOCRITE SOMEONE SAVE THIS ARTICLE!!!!!! \n\nIf there's a single Wiki admins out there with a bit of backbone willing to stand up to one of your fellow Wiki admins, who just happens to be a follower of a man many still worhship as God in human form, who just happens to be editing all articles about that man as anyone might expect, won't you please do something here?!\n\nIf Wikipedia does not yet have a rule for barring individuals from working on articles where they're persistently demonstrated lack of good faith, it should. We all know that this is one of the serious weaknesses of Wikipedia. That doesn't mean you should be resigned to the status quo. Jossi Fresco's involvement re Rawat should read as the textbook example of what went wrong and with a little luck what was eventually done to fix it. When cult leaders or other infamous characters, especially wealthy ones who can afford the service, send their paid webmasters to stand guard over their articles, it's a problem. Jossi Fresco here is a problem. If this were Brittanica they'd lock the door. It's not but a little humility please, Wiki admins. Brittanica's been around a lot, lot longer than this site and they might have done something right over the last hundred years or so.","JOSSI FRESCO IS A HYPOCRITE SOMEONE SAVE THIS ARTICLE!!!!!! If there's a single Wiki admins out there with a bit of backbone willing to stand up to one of your fellow Wiki admins, who just happens to be a follower of a man many still worhship as God in human form, who just happens to be editing all articles about that man as anyone might expect, won't you please do something here?! If Wikipedia does not yet have a rule for barring individuals from working on articles where they're persistently demonstrated lack of good faith, it should. We all know that this is one of the serious weaknesses of Wikipedia. That doesn't mean you should be resigned to the status quo. Jossi Fresco's involvement re Rawat should read as the textbook example of what went wrong and with a little luck what was eventually done to fix it. When cult leaders or other infamous characters, especially wealthy ones who can afford the service, send their paid webmasters to stand guard over their articles, it's a problem. Jossi Fresco here is a problem. If this were Brittanica they'd lock the door. It's not but a little humility please, Wiki admins. Brittanica's been around a lot, lot longer than this site and they might have done something right over the last hundred years or so.","JOSSI FRESCO IS A HYPOCRITE SOMEONE SAVE THIS ARTICLE!!! If there is a single [a website or database developed collaboratively by an online community] admins out there with a bit of backbone willing to stand up to one of your fellow [a website or database developed collaboratively by an online community] admins, who just happens to be a follower of a man many still worhship as God in human form, who just happens to be editing all articles about that man as anyone might expect, will not you please do something here?! If Wikipedia does not yet have a rule for barring individuals from working on articles where they are persistently demonstrated lack of good faith, it should. We all know that this is one of the serious weaknesses of Wikipedia. That does not mean you should be resigned to the status quo. Jossi Fresco's involvement re Rawat should read as the textbook example of what went wrong and with a little luck what was eventually done to fix it. When cult leaders or other infamous characters, especially wealthy ones who can afford the service, send their paid webmasters to stand guard over their articles, it is a problem. Jossi Fresco here is a problem. If this were Brittanica they would lock the door. it is not but a little humility please, [a website or database developed collaboratively by an online community] admins. Brittanica's been around a lot, lot longer than this site and they might have done something right over the last hundred years or so."
13298,"""\n{{userbox \n TABTAB| id = \n TABTAB| id-c = White \n TABTAB| info = This user is a faggot. \n TABTAB| info-c = LightBlue \n TABTAB| border-c = Black \n TABTAB| usercategory = LGBT Wikipedians \n TABTAB| nocat = \n TABTAB}} \n TABTAB\n TABTAB- ''This template automatically categorizes the user in LGBT""",""" {{userbox TABTAB| id = TABTAB| id-c = White TABTAB| info = This user is a faggot. TABTAB| info-c = LightBlue TABTAB| border-c = Black TABTAB| usercategory = LGBT Wikipedians TABTAB| nocat = TABTAB}} TABTAB TABTAB- "" This template automatically categorizes the user in LGBT""",""" {{userbox TABTAB| id = TABTAB| id-c = White TABTAB| info = This user is a faggot. TABTAB| info-c = LightBlue TABTAB| border-c = Black TABTAB| usercategory = [lesbian, gay, bisexual, transgender and queer or questioning community] Wikipedians TABTAB| nocat = TABTAB}} TABTAB TABTAB- "" This template automatically categorizes the user in [lesbian, gay, bisexual, transgender and queer or questioning community]"""
5423,Photo \n\nYou can see her tit.69.243.42.251,Photo You can see her tit.69.243.42.251,Photo You can see her tit.69.243.42.251
13346,"Delicious carbuncle, please read my statementit is no joke. The fun has stopped now. I apologized and wanted to make peace with the people who followed us for the past three years, ever since I exposed them on Blogcritics on my first blog.\nAfter having received emails which threatened me and which also asked me to kill myself, I think this entire case has taken on another level.\n\nThis is no joke, Delicious carbuncle and it is sad that you are still hiding behind a silly name.\nWhat is the sense behind this. Do you not realize what harm a Lori Drew did to others?\n\nWe all know how you defend your right to remain ANONYMOUS, but I have been threatened with physical harm, and I have reason to believe that YOU, Delicious carbuncle are one of the six people who wanted to come to my home, tie me up on a chair and duct tape me, since another member just revealed all the members of this vicious Marilyn Monroe fan group gang of obsessive fanatics with the psychopath characteristics.\n\nPlease stop your phony edits, and stick to the truth. \n\nThank you.\n\nMark Bellinghaus\n\nhttp://blog.ourmarilyn.com/2008/12/05/mark-bellinghaus.aspx\n\n76.168.70.147","Delicious carbuncle, please read my statementit is no joke. The fun has stopped now. I apologized and wanted to make peace with the people who followed us for the past three years, ever since I exposed them on Blogcritics on my first blog. After having received emails which threatened me and which also asked me to kill myself, I think this entire case has taken on another level. This is no joke, Delicious carbuncle and it is sad that you are still hiding behind a silly name. What is the sense behind this. Do you not realize what harm a Lori Drew did to others? We all know how you defend your right to remain ANONYMOUS, but I have been threatened with physical harm, and I have reason to believe that YOU, Delicious carbuncle are one of the six people who wanted to come to my home, tie me up on a chair and duct tape me, since another member just revealed all the members of this vicious Marilyn Monroe fan group gang of obsessive fanatics with the psychopath characteristics. Please stop your phony edits, and stick to the truth. Thank you. Mark Bellinghaus 76.168.70.147","Delicious carbuncle, please read my statementit is no joke. The fun has stopped now. I apologized and wanted to make peace with the people who followed us for the past three years, ever since I exposed them on Blogcritics on my first blog. After having received emails which threatened me and which also asked me to kill myself, I think this entire case has taken on another level. This is no joke, Delicious carbuncle and it is sad that you are still hiding behind a silly name. What is the sense behind this. Do you not realize what harm a Lori Drew did to others? We all know how you defend your right to remain ANONYMOUS, but I have been threatened with physical harm, and I have reason to believe that YOU, Delicious carbuncle are one of the six people who wanted to come to my home, tie me up on a chair and duct tape me, since another member just revealed all the members of this vicious Marilyn Monroe fan group gang of obsessive fanatics with the psychopath characteristics. Please stop your phony edits, and stick to the truth. Thank you. Mark Bellinghaus 76.168.70.147"
768,"Excuse me \n\nPardon me, but I was accused of spamming for linking articles together like we are supposed to here at Wikipedia. If that is not a personal attack on me, I do not know what is. I can respond any way I please to an attack on my character. And I have HAD IT with power drunk folks as yourself threatening me. There was nothing to discuss with those people. He BLANKED a section of information. BLANKING is vandalism. If you do not understand that, your credentials should be revoked. You do not deserve your position.","Excuse me Pardon me, but I was accused of spamming for linking articles together like we are supposed to here at Wikipedia. If that is not a personal attack on me, I do not know what is. I can respond any way I please to an attack on my character. And I have HAD IT with power drunk folks as yourself threatening me. There was nothing to discuss with those people. He BLANKED a section of information. BLANKING is vandalism. If you do not understand that, your credentials should be revoked. You do not deserve your position.","Excuse me Pardon me, but I was accused of spamming for linking articles together like we are supposed to here at Wikipedia. If that is not a personal attack on me, I do not know what is. I can respond any way I please to an attack on my character. And I have HAD IT with power drunk folks as yourself threatening me. There was nothing to discuss with those people. He BLANKED a section of information. BLANKING is vandalism. If you do not understand that, your credentials should be revoked. You do not deserve your position."
11962,"Ah come on. Don't bite the newcomer! Besides, I'm not doing any harm now...while some would argue Wikipedia is doing harm and is in fact communism. I certainly don't publically ascribe to an opinion such as this, which can only really be described as pelican shit.","Ah come on. Don't bite the newcomer! Besides, I'm not doing any harm now...while some would argue Wikipedia is doing harm and is in fact communism. I certainly don't publically ascribe to an opinion such as this, which can only really be described as pelican shit.","Ah come on. do not bite the newcomer! Besides, I am not doing any harm now...while some would argue Wikipedia is doing harm and is in fact communism. I certainly do not publically ascribe to an opinion such as this, which can only really be described as pelican shit."
10611,I Hate You\n\nAnd I hope you die in a fire. 128.61.128.159,I Hate You And I hope you die in a fire. 128.61.128.159,I Hate You And I hope you die in a fire. 128.61.128.159


In [8]:
df[cols].sample(10).head(10)

Unnamed: 0,text,text1,text2
12309,"other rules. I take allegations of homosexuality about English monarchs very seriously, but you don't care, do you?","other rules. I take allegations of homosexuality about English monarchs very seriously, but you don't care, do you?","other rules. I take allegations of homosexuality about English monarchs very seriously, but you do not care, do you?"
8312,I went ahead and shortened the stuff on abortion and euthanasia.,I went ahead and shortened the stuff on abortion and euthanasia.,I went ahead and shortened the stuff on abortion and euthanasia.
7584,Hello I am the one known as the vandal person...you may call me kanden7 pf from now on. I see you mean business ronbo ). Don't you think putting me on that page with the Mega vandals is a bit extreme???,Hello I am the one known as the vandal person...you may call me kanden7 pf from now on. I see you mean business ronbo). Don't you think putting me on that page with the Mega vandals is a bit extreme???,Hello I am the one known as the vandal person...you may call me kanden7 pf from now on. I see you mean business ronbo). do not you think putting me on that page with the Mega vandals is a bit extreme???
7656,"""\nI'm afraid I don't see the problem you seem to have. The table looks OK even at low (<900px wide) resolutions. Your comment that """"adding extra columns which have no content other than N/A is ridiculous as it shows absolutely no information of any kind"""" ignores the fact that there is content in these columns. Obviously there is presently none for seasons that haven't been released on Blu-ray but this is something that exists now in many tables for DVD entries. As Blu-ray versions are released, this information will be supplied. A separate Blu-ray table would unnecessarily duplicate content ad somebody would no doubt complain about that. An all inclusive table seems the best compromise. """,""" I'm afraid I don't see the problem you seem to have. The table looks OK even at low (<900px wide) resolutions. Your comment that """"adding extra columns which have no content other than N/A is ridiculous as it shows absolutely no information of any kind"""" ignores the fact that there is content in these columns. Obviously there is presently none for seasons that haven't been released on Blu-ray but this is something that exists now in many tables for DVD entries. As Blu-ray versions are released, this information will be supplied. A separate Blu-ray table would unnecessarily duplicate content ad somebody would no doubt complain about that. An all inclusive table seems the best compromise. """,""" I am afraid I do not see the problem you seem to have. The table looks OK even at low (<900px wide) resolutions. Your comment that """"adding extra columns which have no content other than N/A is ridiculous as it shows absolutely no information of any kind"""" ignores the fact that there is content in these columns. Obviously there is presently none for seasons that have not been released on Blu-ray but this is something that exists now in many tables for DVD entries. As Blu-ray versions are released, this information will be supplied. A separate Blu-ray table would unnecessarily duplicate content ad somebody would no doubt complain about that. An all inclusive table seems the best compromise. """
436,"Thomas Cannon \n\nLaughable commentary from another one of Wikipedia's ignorant, lazy, and arrogant. He deleted my reply on his own Talk page. But of course! He should consider himself lucky to get it. See Point 4 on my User Page","Thomas Cannon Laughable commentary from another one of Wikipedia's ignorant, lazy, and arrogant. He deleted my reply on his own Talk page. But of course! He should consider himself lucky to get it. See Point 4 on my User Page","Thomas Cannon Laughable commentary from another one of Wikipedia's ignorant, lazy, and arrogant. He deleted my reply on his own Talk page. But of course! He should consider himself lucky to get it. See Point 4 on my User Page"
11765,"Get a grip on your trigger happy CSD finger, please \n\nYou put a completely unwarrented CSD notice on Visual Researchers' Society of Canada. Might I suggest instead of spending your time writing off the work of others, you contribute something useful yourself.","Get a grip on your trigger happy CSD finger, please You put a completely unwarrented CSD notice on Visual Researchers' Society of Canada. Might I suggest instead of spending your time writing off the work of others, you contribute something useful yourself.","Get a grip on your trigger happy CSD finger, please You put a completely unwarrented CSD notice on Visual Researchers' Society of Canada. Might I suggest instead of spending your time writing off the work of others, you contribute something useful yourself."
6560,3rd warning \n\nDon't remove warnings with respect to your article deletions! \n 4th warning \nSTOP REMOVING WARNINGS!,3rd warning Don't remove warnings with respect to your article deletions! 4th warning STOP REMOVING WARNINGS!,3rd warning do not remove warnings with respect to your article deletions! 4th warning STOP REMOVING WARNINGS!
14089,wow its obvious koreans cannot make a single edit without cursing,wow its obvious koreans cannot make a single edit without cursing,wow its obvious koreans cannot make a single edit without cursing
2410,"""\n\n """"JdtriI"""" \n\nDo you know what A101 is up to with this account? They seem darn sure its tCv but I haven't seen any evidence beyond coincidence of naming style. """,""" """"JdtriI"""" Do you know what A101 is up to with this account? They seem darn sure its tCv but I haven't seen any evidence beyond coincidence of naming style. """,""" """"JdtriI"""" Do you know what A101 is up to with this account? They seem darn sure its tCv but I have not seen any evidence beyond coincidence of naming style. """
10515,"youre a dumbass liar hectorine, that was about kinross and i already cant change the article..","youre a dumbass liar hectorine, that was about kinross and i already cant change the article..","youre a dumbass liar hectorine, that was about kinross and i already cant change the article.."


# Review data

In [9]:
df[cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14251 entries, 0 to 14250
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    14251 non-null  object
 1   text1   14251 non-null  object
 2   text2   14251 non-null  object
dtypes: object(3)
memory usage: 334.1+ KB


In [10]:
%%time
df[cols].to_parquet("output/pre_val.parquet", index=False)

Wall time: 101 ms
