In [1]:
import os
import gc
import numpy as np
import pandas as pd
from tqdm import tqdm
from typing import Dict, List, Tuple, NamedTuple, Callable
import scml
import mylib

In [2]:
# Percentile grid handed to `describe()` further down the notebook.
percentiles = [.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]

# Options are addressed by their fully-qualified names. pandas resolves
# abbreviated names by pattern matching, which stops working as soon as an
# abbreviation matches more than one option.
try:
    # Treat +/-inf like missing values in isna()/notna()-style checks.
    pd.set_option("mode.use_inf_as_na", True)
except KeyError:
    # pandas' OptionError derives from KeyError. The option is deprecated in
    # recent pandas releases; if it is unavailable, say so instead of
    # aborting the whole configuration cell.
    print("pandas option 'mode.use_inf_as_na' is unavailable; inf values are not treated as NA")

# Effectively lift the display limits so wide/long frames and long comment
# texts are shown in full.
pd.set_option("display.max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option("display.max_colwidth", 9999)
# Register tqdm with pandas so the `progress_apply` calls used in the
# preprocessing cell are available and show a progress bar.
tqdm.pandas()

In [3]:
%%time
# Stack the train and test comments into a single frame. `DataFrame.append`
# was removed in pandas 2.0, so the frames are combined with `pd.concat`.
df = pd.concat(
    [
        pd.read_csv(path, engine="c", low_memory=False)
        for path in ("input/js18/train.csv", "input/js18/test.csv")
    ],
    ignore_index=True,
)
cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
# Only the comment text is kept: the id and the six label columns are dropped,
# so no label-based row filtering is applied. Exact duplicate texts are
# collapsed to their first occurrence and the index is renumbered.
df = (
    df.rename(columns={"comment_text": "text"})
    .drop(columns=["id"] + cols)
    .drop_duplicates(["text"], keep="first", ignore_index=True)
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312735 entries, 0 to 312734
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    312735 non-null  object
dtypes: object(1)
memory usage: 2.4+ MB
Wall time: 2.14 s


In [4]:
# Attach the character count of every raw comment as a `length` column, then
# summarise its distribution over the notebook's percentile grid.
df = df.assign(length=df["text"].str.len())
df["length"].describe(percentiles=percentiles)

count    312735.000000
mean        379.773262
std         591.767791
min           1.000000
1%           20.000000
5%           30.000000
10%          42.000000
20%          71.000000
30%         105.000000
40%         145.000000
50%         193.000000
60%         260.000000
70%         350.000000
80%         502.000000
90%         848.000000
95%        1314.000000
99%        3487.000000
max        5000.000000
Name: length, dtype: float64

In [5]:
# Length window, exclusive on both ends. The bounds equal the 5% (30) and
# 90% (848) percentiles of the length distribution printed by the previous cell.
min_len, max_len = 30, 848
# Lengths are recomputed from the text so this cell does not rely on a helper
# column and can be re-run without raising a KeyError.
text_len = df["text"].str.len()
df = df[(text_len > min_len) & (text_len < max_len)].copy()
# Remove the helper column if an earlier cell added it.
df = df.drop(columns=["length"], errors="ignore")
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 265087 entries, 0 to 312734
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    265087 non-null  object
dtypes: object(1)
memory usage: 4.0+ MB


In [6]:
# Report missing values per column via the scml helper; the displayed table
# has Total / Percent / Type columns for each column of `df`.
scml.find_missing_values(df)

Unnamed: 0,Total,Percent,Type
text,0,0.0,object


# Preprocess Text
Speed dropped from 2400 to 600 it/s

In [7]:
def pre1(row) -> str:
    """Run ``mylib.pre1`` on the ``text`` field of ``row`` and return the result."""
    raw_text = row["text"]
    return mylib.pre1(raw_text)


def pre2(row) -> str:
    """Run ``mylib.pre2`` on the ``text1`` field of ``row`` and return the result."""
    stage1_text = row["text1"]
    return mylib.pre2(stage1_text)


def pre3(row) -> str:
    """Run ``mylib.pre3`` on the ``text2`` field of ``row`` and return the result."""
    stage2_text = row["text2"]
    return mylib.pre3(stage2_text)


# Three chained preprocessing stages: each one reads the column produced by
# the previous stage and writes a new column.
#   text -> text1 (mylib.pre1) -> text2 (mylib.pre2) -> text3 (mylib.pre3)
# The source column is passed to `progress_apply` directly. The row-wise
# `apply(axis=1)` form hands every row to the function as a Series, which is
# pure overhead here because each stage reads exactly one column.
stages = [
    ("text", "text1", mylib.pre1),
    ("text1", "text2", mylib.pre2),
    ("text2", "text3", mylib.pre3),
]
for src, col, step in stages:
    print(col)
    df[col] = df[src].progress_apply(step)

text1


100%|██████████████████████████████████████| 265087/265087 [09:24<00:00, 469.36it/s]


text2


100%|█████████████████████████████████████| 265087/265087 [3:23:01<00:00, 21.76it/s]


text3


100%|██████████████████████████████████████| 265087/265087 [44:06<00:00, 100.16it/s]


In [8]:
# Keep only rows whose first-stage text (`text1`) is longer than 20 characters.
df = df.loc[df["text1"].str.len().gt(20)].copy()

In [9]:
# Columns to eyeball side by side: the raw text and each preprocessing stage.
cols = ["text", "text1", "text2", "text3"]
# Random spot check of 20 rows (a 20-row sample needs no further truncation).
df.loc[:, cols].sample(n=20)

Unnamed: 0,text,text1,text2,text3
151999,"♥Valentine's Day Couples♥\n(alpahbetical order by last names)\nALTIERI, ANTHONY & ALLYSON FETZER\nBENEDETTO, DAVID & RONALD VARN\nCRIBBEN, EVAN & CHING SHIN SHOO\nDeLONG, ANDREW & JOEL TEAL\nFETZER, ALLISON & MITCH MUIR\nGILL, JEFFERY & MARY J. WANA\nHARTZELL, TIMOTHY & LAURA LEVITT\nICHIKAWA, SHIZUYO & NATHANIEL HUMPHREY\nKARNUTH, GREGORY & MICHEAL KOHLER\nLASLO, ERICA & DANIEL LEEK\nMADDAMMA, JAMES & DENNY LEEK\nPELLEGRINI, JORDAN & KATIE BARTON\nQIN, SIMON & SHIZUYO ICHIKAWA\nRARESHEID, KARA & BRIAN RASILE\nSCHAUB, MEGAN & JOHN RUTH\nSCHILSTRA, KEITH & DYLAN ROSSETT\nSICH, ANNA & HOMER RICHARDS\nTEAL, JOEL & NICHOLAS SINCERE\nUITTO, MATTHEW & VID. E. OJUEGOS","(heart suit) Valentine's Day Couples (heart suit) (alpahbetical order by last names) ALTIERI, ANTHONY & ALLYSON FETZER BENEDETTO, DAVID & RONALD VARN CRIBBEN, EVAN & CHING SHIN SHOO DeLONG, ANDREW & JOEL TEAL FETZER, ALLISON & MITCH MUIR GILL, JEFFERY & MARY J. WANA HARTZELL, TIMOTHY & LAURA LEVITT ICHIKAWA, SHIZUYO & NATHANIEL HUMPHREY KARNUTH, GREGORY & MICHEAL KOHLER LASLO, ERICA & DANIEL LEEK MADDAMMA, JAMES & DENNY LEEK PELLEGRINI, JORDAN & KATIE BARTON QIN, SIMON & SHIZUYO ICHIKAWA RARESHEID, KARA & BRIAN RASILE SCHAUB, MEGAN & JOHN RUTH SCHILSTRA, KEITH & DYLAN ROSSETT SICH, ANNA & HOMER RICHARDS TEAL, JOEL & NICHOLAS SINCERE UITTO, MATTHEW & VID. E. OJUEGOS","(heart suit) Valentine's Day Couples (heart suit) (alpahbetical order by last names) ALTIERI, ANTHONY & ALLYSON FETZER BENEDETTO, DAVID & RONALD VARN CRIBBEN, EVAN & CHING SHIN SHOO DeLONG, ANDREW & JOEL TEAL FETZER, ALLISON & MITCH MUIR GILL, JEFFERY & MARY J. 
WANA HARTZELL, TIMOTHY & LAURA LEVITT ICHIKAWA, SHIZUYO & NATHANIEL HUMPHREY KARNUTH, GREGORY & MICHEAL KOHLER LASLO, ERICA & DANIEL LEEK MADDAMMA, JAMES & DENNY LEEK PELLEGRINI, JORDAN & KATIE BARTON QIN, SIMON & SHIZUYO ICHIKAWA RARESHEID, KARA & BRIAN RASILE SCHAUB, MEGAN & JOHN RUTH SCHILSTRA, KEITH & DYLAN ROSSETT SICH, ANNA & HOMER RICHARDS TEAL, JOEL & NICHOLAS SINCERE UITTO, MATTHEW & VID. E. OJUEGOS",heart suit valentine s day couples heart suit alpahbetical order by last name altieri anthony allyson fetzer benedetto david ronald varn cribben evan ching shin shoo delong andrew joel teal fetzer allison mitch muir gill jeffery mary j wana hartzell timothy laura levitt ichikawa shizuyo nathaniel humphrey karnuth gregory micheal kohler laslo erica daniel leek maddamma james denny leek pellegrini jordan katie barton qin simon shizuyo ichikawa raresheid kara brian rasile schaub megan john ruth schilstra keith dylan rossett sich anna homer richards teal joel nicholas sincere uitto matthew vid e ojuegos
193499,"Huge confusion here. The Silvia sung by Bjorling, Tauber , McCormack et alii is a Schubert song with Shakespeare's words. Oley Speaks may well have written a song entitled Sylvia, but it was not that one.","Huge confusion here. The Silvia sung by Bjorling, Tauber , McCormack et alii is a Schubert song with Shakespeare's words. Oley Speaks may well have written a song entitled Sylvia, but it was not that one.","Huge confusion here. The Silvia sung by Bjorling, Tauber , McCormack et alii is a Schubert song with Shakespeare's words. Oley Speaks may well have written a song entitled Sylvia, but it was not that one.",huge confusion here the silvia sing by bjorle tauber mccormack et alii be a schubert song with shakespeare s word oley speak may well have write a song entitle sylvia but it be not that one
13600,"Reply -Had the redirects been left in place, the redirect bots would have fixed them. Now that they are gone, they must now be recreated.","Reply -Had the redirects been left in place, the redirect bots would have fixed them. Now that they are gone, they must now be recreated.","Reply -Had the redirects been left in place, the redirect bots would have fixed them. Now that they are gone, they must now be recreated.",reply have the redirect be leave in place the redirect bot would have fix they now that they be go they must now be recreate
30143,"It's not a content dispute. We have a content dispute at slavery in modern africa. After that started (s)he began wikistalking me, undoing my edits and calling all of them vandalism. 79.97.171.208","It's not a content dispute. We have a content dispute at slavery in modern africa. After that started (s) he began wikistalking me, undoing my edits and calling all of them vandalism.","it is not a content dispute. We have a content dispute at slavery in modern africa. After that started (s) he began wikistalking me, undoing my edits and calling all of them vandalism.",it be not a content dispute we have a content dispute at slavery in modern africa after that start s he begin wikistalke i undo my edit and call all of they vandalism
74223,"i am using the sandbox, ass wipe","i am using the sandbox, ass wipe","i am using the sandbox, ass wipe",i be use the sandbox ass wipe
73967,"""\nIt wasn't retconned history. It was just a normal mistake. This is similar to how WWE.com sometimes said that Saliva performed """"Always"""" at WrestleMania X8 when they really performed """"Superstar"""". It’s just that sometimes we all forget things that may have happened in the past. Just a normal mistake. 3:16 """,""" It wasn't retconned history. It was just a normal mistake. This is similar to how WWE.com sometimes said that Saliva performed """"Always"""" at WrestleMania X8 when they really performed """"Superstar""."" It's just that sometimes we all forget things that may have happened in the past. Just a normal mistake. 3:16 """,""" It was not retconned history. It was just a normal mistake. This similar to how WWE.com sometimes said that Saliva performed """"Always"""" at WrestleMania X8 when they really performed """"Superstar""."" it is just that sometimes we all forget things that may have happened in the past. Just a normal mistake. 3:16 """,it be not retconne history it be just a normal mistake this similar to how wwe com sometimes say that saliva perform always at wrestlemania x8 when they really perform superstar it be just that sometimes we all forget thing that may have happen in the past just a normal mistake 3 16
277613,==User:Neillty== \n That user has some very strange article moves. Usual thing with images but doesn't seem to be editing the same articles. Best to just watch for now.,==User:Neillty== That user has some very strange article moves. Usual thing with images but doesn't seem to be editing the same articles. Best to just watch for now.,==User:Neillty== That user has some very strange article moves. Usual thing with images but does not seem to be editing the same articles. Best to just watch for now.,user neillty that user have some very strange article move usual thing with image but do not seem to be edit the same article best to just watch for now
80053,"""\n\n Help? \n\nThe user's literally just said, """"you might as well block me now so as the administrator that responded, surely there is something you can do.... """,""" Help? The user's literally just said, """"you might as well block me now so as the administrator that responded, surely there is something you can do.... """,""" Help? The user's literally just said, """"you might as well block me now so as the administrator that responded, surely there is something you can do... """,help the user s literally just say you might as well block i now so as the administrator that respond surely there be something you can do
215737,"Thank you, 172Talk. It is nice having a debate with you in an unemotional and impersonal manner. \n\n Bublick439","Thank you, 172Talk. It is nice having a debate with you in an unemotional and impersonal manner. Bublick439","Thank you, 172Talk. It is nice having a debate with you in an unemotional and impersonal manner. Bublick439",thank you 172talk it be nice have a debate with you in an unemotional and impersonal manner bublick439
167060,"::Anyone, of course, can post an RfC. I would urge that if it is done, in the spirit of neutrality, the explanation should include a clear distinction between sourcing that an item is true (e.g., the earth is not flat) and sourcing that it is a common misconception.","nyone, of course, can post an RfC. I would urge that if it is done, in the spirit of neutrality, the explanation should include a clear distinction between sourcing that an item is true (e.g., the earth is not flat) and sourcing that it is a common misconception.","nyone, of course, can post an RfC. I would urge that if it is done, in the spirit of neutrality, the explanation should include a clear distinction between sourcing that an item is true (e.g., the earth is not flat) and sourcing that is a common misconception.",nyone of course can post an rfc i would urge that if it be do in the spirit of neutrality the explanation should include a clear distinction between source that an item be true e g the earth be not flat and source that be a common misconception


In [10]:
# Another random spot check of 20 rows across the raw and preprocessed columns.
df.loc[:, cols].sample(n=20)

Unnamed: 0,text,text1,text2,text3
269166,""" \n\n == A brownie for you! == \n\n {| style=""""background-color: #fdffe7; border: 1px solid #fceb92;"""" \n |style=""""vertical-align: middle; padding: 5px;"""" | \n |style=""""vertical-align: middle; padding: 3px;"""" | Hot Brownies \n |}""",""" == A brownie for you! == {| style=""""background-color: #fdffe7; border: 1px solid #fceb92;"""" |style=""""vertical-align: middle; padding: 5px;"""" | |style=""""vertical-align: middle; padding: 3px;"""" | Hot Brownies |}""",""" == A brownie for you! == {| style=""""background-color: #fdffe7; border: 1px solid #fceb92;"""" |style=""""vertical-align: middle; padding: 5px;"""" | |style=""""vertical-align: middle; padding: 3px;"""" | Hot Brownies |}""",a brownie for you style background color fdffe7 border 1px solid fceb92 style vertical align middle padding 5px style vertical align middle padding 3px hot brownies
165962,"The male shane \n What are you planning on doing here? If your intention is simply to create a nonsense page, or defame someone, I'm just going to delete it.","The male shane What are you planning on doing here? If your intention is simply to create a nonsense page, or defame someone, I'm just going to delete it.","The male shane What are you planning on doing here? If your intention is simply to create a nonse page, or defame someone, I am just going to delete it.",the male shane what be you plan on do here if your intention be simply to create a nonse page or defame someone i be just go to delete it
220283,"2005 (UTC) \n :::::::::I was going to write almost exactly the same reply, but I chickened out. ) — 03:33, 17 August","2005 (UTC):I was going to write almost exactly the same reply, but I chickened out.) - 03:33, 17 August","2005 (UTC):I was going to write almost exactly the same reply, but I chickened out.) - 03:33, 17 August",2005 utc i be go to write almost exactly the same reply but i chickene out 03 33 17 august
129350,"""\n\nYou're ignoring this """"real life"""" fact we keep trying to tell you about. We know (or at least I know) Daffy Duck is NOT a real person, Plucky Duck is NOT a real person, they have NO fathers, mothers, sisters, brothers. So because Plucky Duck is NOT a real person, Plucky (just like every single other cartoon character that says he/she/it does) can't be signed to a contract. However when he implies he can, he implies it that he has signed this contract in the """"real life"""" we keep trying to tell you about. And it is in this """"real life"""" where Plucky is Daffy's REAL father, while the one in the baby Plucky cartoon is just an """"actor"""" (hired in the """"real world"""" where this cartoon character """"signed a contract"""") """"playing"""" his father. """,""" You're ignoring this """"real life"""" fact we keep trying to tell you about. We know (or at least I know) Daffy Duck is NOT a real person, Plucky Duck is NOT a real person, they have NO fathers, mothers, sisters, brothers. So because Plucky Duck is NOT a real person, Plucky (just like every single other cartoon character that says he/she/it does) can't be signed to a contract. However when he implies he can, he implies it that he has signed this contract in the """"real life"""" we keep trying to tell you about. And it is in this """"real life"""" where Plucky is Daffy's REAL father, while the one in the baby Plucky cartoon is just an """"actor"""" (hired in the """"real world"""" where this cartoon character """"signed a contract"""") """"playing"""" his father. """,""" you are ignoring this """"real life"""" fact we keep trying to tell you about. We know (or at least I know) Daffy Duck is NOT a real person, Plucky Duck is NOT a real person, they have NO fathers, mothers, sisters, brothers. So because Plucky Duck is NOT a real person, Plucky (just like every single other cartoon character that says he/she/it does) cannot be signed to a contract. 
However when he implies he can, he implies it that he has signed this contract in the """"real life"""" we keep trying to tell you about. And it is in this """"real life"""" where Plucky is Daffy's REAL father, while the one in the baby Plucky cartoon is just an """"actor"""" (hired in the """"real world"""" where this cartoon character """"signed a contract"""") """"playing"""" his father. """,you be ignore this real life fact we keep try to tell you about we know or at least i know daffy duck be not a real person plucky duck be not a real person they have no father mother sister brother so because plucky duck be not a real person plucky just like every single other cartoon character that say he she it do can not be sign to a contract however when he imply he can he imply it that he have sign this contract in the real life we keep try to tell you about and it be in this real life where plucky be daffy s real father while the one in the baby plucky cartoon be just an actor hire in the real world where this cartoon character sign a contract play his father
17354,"""cursor:help;"""">Α⇔Ω]] ¦ ⇒✉)'' ""","""cursor:help;"""">]] (envelope)) "" ""","""cursor:help;"""">]] (envelope)) "" """,cursor help envelope
300824,"==Buckdharma.png== \n I have tagged Image:Buckdharma.png as . If you wish to dispute this assertion, please add {{Replaceable fair use disputed}} to the image description page and a comment explaining your reasoning to the the image talk page. – (random)","==Buckdharma.png== I have tagged Image:Buckdharma.png as . If you wish to dispute this assertion, please add {{Replaceable fair use disputed}} to the image description page and a comment explaining your reasoning to the the image talk page. - (random)","==Buckdharma.png== I have tagged Image:Buckdharma.png as . If you wish to dispute this assertion, please add {{Replaceable fair use disputed}} to the image description page and a comment explaining your reasoning to the image talk page. - (random)",buckdharma png i have tag image buckdharma png as if you wish to dispute this assertion please add replaceable fair use dispute to the image description page and a comment explain your reasoning to the image talk page random
134524,"""\n\n Please do not vandalize pages, as you did with this edit to Oakdale High School (California). If you continue to do so, you will be blocked from editing. """,""" Please do not vandalize pages, as you did with this edit to Oakdale High School (California). If you continue to do so, you will be blocked from editing. """,""" Please do not vandalize pages, as you did with this edit to Oakdale High School (California). If you continue to do so, you will be blocked from editing. """,please do not vandalize page as you do with this edit to oakdale high school california if you continue to do so you will be block from edit
45733,"ObRoy, I believe you don't understand english. You asked me not to make any changes unless I participate in the debate in the talk page. I did - and EVERYBODY on the talk page disagrees with you and believes that the article should be known as Maria Vladimirnovna of Russia. Get over it.","ObRoy, I believe you don't understand english. You asked me not to make any changes unless I participate in the debate in the talk page. I did - and EVERYBODY on the talk page disagrees with you and believes that the article should be known as Maria Vladimirnovna of Russia. Get over it.","ObRoy, I believe you do not understand english. You asked me not to make any changes unless I participate in the debate in the talk page. I did - and EVERYBODY on the talk page disagrees with you and believes that the article should be known as Maria Vladimirnovna of Russia. Get over it.",obroy i believe you do not understand english you ask i not to make any change unless i participate in the debate in the talk page i do and everybody on the talk page disagree with you and believe that the article should be know as maria vladimirnovna of russia get over it
260173,Yikes! Thanks for bringing that up. This is what I've done:,Yikes! Thanks for bringing that up. This is what I've done:,[Yikes; expression of shock and alarm]! Thanks for bring that up. This what I have done:,yikes expression of shock and alarm thank for bring that up this what i have do
53161,"Just letting you know, I have started creating a series of categories Category:Deaths at age x, which can be found in Category:Deaths by age. These categories are far from complete and are still being built. Along with the category, I started a policy page titled Wikipedia:Death by age. You may be interested in giving your input on the new categories at Wikipedia talk:Death by age.","Just letting you know, I have started creating a series of categories Category:Deaths at age x, which can be found in Category:Deaths by age. These categories are far from complete and are still being built. Along with the category, I started a policy page titled Wikipedia:Death by age. You may be interested in giving your input on the new categories at Wikipedia talk:Death by age.","Just letting you know, I have started creating a series of categories Category:Deaths at age x, which can be found in Category:Deaths by age. These categories are far from complete and are still being built. Along with the category, I started a policy page titled Wikipedia:Death by age. You may be interested in giving your input on the new categories at Wikipedia talk:Death by age.",just let you know i have start create a series of category category death at age x which can be find in category death by age these category be far from complete and be still be build along with the category i start a policy page title wikipedia death by age you may be interested in give your input on the new category at wikipedia talk death by age


In [11]:
# Third random spot check of 20 rows across the raw and preprocessed columns.
df.loc[:, cols].sample(n=20)

Unnamed: 0,text,text1,text2,text3
223055,I'm guessing this an album cover... what Mike do you want regoniction that you uploaded a fair use album seen on every other fucking website on the entire web?,I'm guessing this an album cover... what Mike do you want regoniction that you uploaded a fair use album seen on every other fucking website on the entire web?,I am guessing this an album cover... what Mike do you want regoniction that you uploaded a fair use album seen on every other fucking website on the entire web?,i be guess this an album cover what mike do you want regoniction that you upload a fair use album see on every other fucking website on the entire web
284708,because J.delanoy likes cock in his mouth}},because J.delanoy likes cock in his mouth}},because J.delanoy likes cock in his mouth}},because j delanoy like cock in his mouth
186815,"Discussion of Shit Cuisine \n There are many people who eat shit...such as my sister. In some countries, shit is another word for poop. \n E.g. Jane eats a lot of dog shit.","Discussion of Shit Cuisine There are many people who eat shit...such as my sister. In some countries, shit is another word for poop. E.g. Jane eats a lot of dog shit.","Discussion of Shit Cuisine There are many people who eat shit...such as my sister. In some countries, shit is another word for [poop; faeces]. E.g. Jane eats a lot of dog shit.",discussion of shit cuisine there be many people who eat shit such as my sister in some country shit be another word for poop faece e g jane eat a lot of dog shit
194671,Who keeps on removing the fact that he's a climate denier? Stupid removing it: he is one of the most prominent climate deniers out there. (),Who keeps on removing the fact that he's a climate denier? Stupid removing it: he is one of the most prominent climate deniers out there. (),Who keeps on removing the fact that he is a climate denier? Stupid removing it: he is one of the most prominent climate deniers out there. (),who keep on remove the fact that he be a climate denier stupid remove it he be one of the most prominent climate denier out there
281223,I am awaiting a response from FT2.,I am awaiting a response from FT2.,I am awaiting a response from FT2.,i be await a response from ft2
72892,I am telling you for a last time: Do not get my words from the context. And do not make silly yourself that the name is somewhere written as it is now. Thant name on the Serbian Wiki is written on the Serbian alphabet. Is it clear to you now. And also on the bulgarian Wikipedia is said that he is Macedonian. Is it clear to you now. And also on the bulgarian Wikipedia is said that he is Macedonian. P.S. Give me sources or proofs that he was writing on Bulgarian,I am telling you for a last time: Do not get my words from the context. And do not make silly yourself that the name is somewhere written as it is now. Thant name on the Serbian Wiki is written on the Serbian alphabet. Is it clear to you now. And also on the bulgarian Wikipedia is said that he is Macedonian. Is it clear to you now. And also on the bulgarian Wikipedia is said that he is Macedonian. P.S. Give me sources or proofs that he was writing on Bulgarian,I am telling you for a last time: Do not get my words from the context. And do not make silly yourself that the name is somewhere written as it is now. Thant name on the Serbian [Wiki; a website or database developed collaboratively by an online community] is written on the Serbian alphabet. Is it clear to you now. And also on the bulgarian Wikipedia is said that he is Macedonian. P.S. Give me sources or proofs that he was writing on Bulgarian,i be tell you for a last time do not get my word from the context and do not make silly yourself that the name be somewhere write as it be now thant name on the serbian wiki a website or database develop collaboratively by an online community be write on the serbian alphabet be it clear to you now and also on the bulgarian wikipedia be say that he be macedonian p s give i source or proof that he be write on bulgarian
144980,"Don't post on my profile page, you fucking cunt. (Talk)","Don't post on my profile page, you fucking cunt. (Talk)","do not post on my profile page, you fucking [cunt; profanity for a woman's vagina]. (Talk)",do not post on my profile page you fucking cunt profanity for a woman s vagina talk
87607,"Krenair,\n\nThis is untrue! You, as an entity, have the power to discover truth and influence decision (others). If you feel that the Bwilkins' action was justified, then please explain your reasoning, so that you may improve me (odd choice of words, maybe, but that's how it is). If you feel, as I, that the article contained real information (RE: The citations and further improvement of facts), then I encourage you to mention this to Bwilkins for the same reason we are speaking.\n\nSam","Krenair, This is untrue! You, as an entity, have the power to discover truth and influence decision (others). If you feel that the Bwilkins' action was justified, then please explain your reasoning, so that you may improve me (odd choice of words, maybe, but that's how it is). If you feel, as I, that the article contained real information (RE: The citations and further improvement of facts), then I encourage you to mention this to Bwilkins for the same reason we are speaking. Sam","Krenair, This untrue! You, as an entity, have the power to discover truth and influence decision (others). If you feel that the Bwilkins' action was justified, then please explain your reasoning, so that you may improve me (odd choice of words, maybe, but that is how it is). If you feel, as I, that the article contained real information (RE: The citations and further improvement of facts), then I encourage you to mention this to Bwilkins for the same reason we are speaking. Sam",krenair this untrue you as an entity have the power to discover truth and influence decision other if you feel that the bwilkins action be justify then please explain your reasoning so that you may improve i odd choice of word maybe but that be how it be if you feel as i that the article contain real information re the citation and further improvement of fact then i encourage you to mention this to bwilkins for the same reason we be speak sam
274547,""" \n\n :::I never said Syriac was an extinct language; don't put words in my mouth. You still have yet to provide any reliable sources that state that Syriac is still spoken in Mount Lebanon. Your assurances mean nothing to mean, and the sources you've cited are garbage. Likewise, calling another editor """"dishonest"""" and """"stupid"""" is a violation of Wikipedia's code of conduct. Incivility is likely to get you banned from Wikipedia. You are correct, however, that this discussion is quickly going no where. It's become evident that your intent is to use this article as a soapbox. I'll looking into filing an RfC on the issue in the coming days. ← [] """,""":I never said Syriac was an extinct language; don't put words in my mouth. You still have yet to provide any reliable sources that state that Syriac is still spoken in Mount Lebanon. Your assurances mean nothing to mean, and the sources you've cited are garbage. Likewise, calling another editor """"dishonest"""" and """"stupid"""" is a violation of Wikipedia's code of conduct. Incivility is likely to get you banned from Wikipedia. You are correct, however, that this discussion is quickly going no where. It's become evident that your intent is to use this article as a soapbox. I'll looking into filing an RfC on the issue in the coming days. [] """,""":I never said Syriac was an extinct language; do not put words in my mouth. You still have yet to provide any reliable sources that state that Syriac is still spoken in Mount Lebanon. Your assurances mean nothing to mean, and the sources you have cited are garbage. Likewise, calling another editor """"dishonest"""" and """"stupid"""" is a violation of Wikipedia's code of conduct. Incivility is likely to get you banned from Wikipedia. You are correct, however, that this discussion is quickly going no where. it is become evident that your intent is to use this article as a soapbox. I will looking into filing an RfC on the issue in the coming days. 
[] """,i never say syriac be an extinct language do not put word in my mouth you still have yet to provide any reliable source that state that syriac be still speak in mount lebanon your assurance mean nothing to mean and the source you have cite be garbage likewise call another editor dishonest and stupid be a violation of wikipedia s code of conduct incivility be likely to get you ban from wikipedia you be correct however that this discussion be quickly go no where it be become evident that your intent be to use this article as a soapbox i will look into file an rfc on the issue in the come day
189257,::: How is wikiapedia commercial?,: How is wikiapedia commercial?,: How is wikiapedia commercial?,how be wikiapedia commercial


# Review data

In [12]:
# Tag every row with a random integer in [1, 49] (the upper bound 50 is
# exclusive) in a `worker` column. The generator is seeded so that re-running
# the notebook reproduces the same assignment.
col = "worker"
rng = np.random.default_rng(42)
df[col] = rng.integers(1, 50, size=len(df), dtype=np.int8)
cols = ["worker", "text", "text1", "text2", "text3"]
df[cols].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 264106 entries, 0 to 312734
Data columns (total 5 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   worker  264106 non-null  int8  
 1   text    264106 non-null  object
 2   text1   264106 non-null  object
 3   text2   264106 non-null  object
 4   text3   264106 non-null  object
dtypes: int8(1), object(4)
memory usage: 10.3+ MB


In [13]:
%%time
# Create the target directory first so the export cannot fail on a missing
# folder after the long preprocessing run.
os.makedirs("output", exist_ok=True)
# Persist the selected columns; the DataFrame index is not written.
df[cols].to_parquet("output/pre_js18.parquet", index=False)

Wall time: 1.26 s
