In [1]:
import os
import gc
import numpy as np
import pandas as pd
from tqdm import tqdm
from typing import Dict, List, Tuple, NamedTuple, Callable
import scml
import mylib

In [2]:
# Percentile grid kept at module level for later summary calls.
percentiles = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]

# Pandas options, applied in the same order as before.
# NOTE(review): "use_inf_as_na" is deprecated in recent pandas releases --
# confirm it is still accepted by the pandas version this notebook runs on.
for option_name, option_value in (
    ("use_inf_as_na", True),
    ("max_info_columns", 9999),
    ("display.max_columns", 9999),
    ("display.max_rows", 9999),
    ('max_colwidth', 9999),
):
    pd.set_option(option_name, option_value)

# Registers the `progress_apply` family on pandas objects (used further down).
tqdm.pandas()

In [3]:
# Lookup table: "<post_id>_<comment_id>" -> offensiveness score from Ruddit.csv.
score_map: Dict[str, float] = {}
df = pd.read_csv("input/ruddit/Ruddit.csv", engine="c", low_memory=False)
for post_id, comment_id, score in zip(
    df["post_id"], df["comment_id"], df["offensiveness_score"]
):
    # Later rows overwrite earlier ones when the composite key repeats.
    score_map[post_id + "_" + comment_id] = score

In [4]:
%%time
# Load the comment texts, drop unusable rows, and attach the score looked up
# in `score_map` by the "<post_id>_<comment_id>" key.
df = pd.read_csv("input/ruddit/ruddit_with_text.csv", engine="c", low_memory=False)
blacklist = {"[deleted]", "[removed]"}
rows = []
for t in df.itertuples():
    text = getattr(t, "txt")
    # read_csv turns empty fields into NaN (a float), which has no .strip();
    # treat any non-string value as an empty comment and skip it.
    if not isinstance(text, str):
        continue
    s = text.strip().lower()
    if len(s) == 0 or s in blacklist:
        continue
    k = getattr(t, "post_id") + "_" + getattr(t, "comment_id")
    # The original (unstripped) text is kept; only the filter uses `s`.
    rows.append({"bws": score_map[k], "text": text})
# Explicit columns keep the frame well-formed even if every row was filtered out.
df = pd.DataFrame(rows, columns=["bws", "text"])
df["bws"] = df["bws"].astype(np.float32)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5710 entries, 0 to 5709
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   bws     5710 non-null   float32
 1   text    5710 non-null   object 
dtypes: float32(1), object(1)
memory usage: 67.0+ KB
Wall time: 63 ms


In [5]:
# Missing-value report from the project helper `scml`; the value it returns is
# shown as the cell output (rendered below with Total / Percent / Type columns).
scml.find_missing_values(df)

Unnamed: 0,Total,Percent,Type
bws,0,0.0,float32
text,0,0.0,object


# Preprocess Text
Note: preprocessing speed dropped from 2400 it/s to 600 it/s.

In [7]:
def pre1(row) -> str:
    """Run `mylib.pre1` on the row's "text" field and return the result."""
    raw_text = row["text"]
    return mylib.pre1(raw_text)


def pre2(row) -> str:
    """Run `mylib.pre2` on the row's "text1" field and return the result."""
    stage1_text = row["text1"]
    return mylib.pre2(stage1_text)


def pre3(row) -> str:
    """Run `mylib.pre3` on the row's "text2" field and return the result."""
    stage2_text = row["text2"]
    return mylib.pre3(stage2_text)


# Run the three preprocessing stages in order. Each stage reads the column the
# previous one wrote (pre2 reads "text1", pre3 reads "text2"), so the sequence
# must not be reordered.
for col, stage in (("text1", pre1), ("text2", pre2), ("text3", pre3)):
    print(col)
    df[col] = df.progress_apply(stage, axis=1)

text1


100%|██████████████████████████████████████████| 5710/5710 [00:10<00:00, 555.64it/s]


text2


100%|███████████████████████████████████████████| 5710/5710 [04:06<00:00, 23.13it/s]


text3


100%|██████████████████████████████████████████| 5710/5710 [00:53<00:00, 107.23it/s]


In [8]:
cols = ["bws", "text", "text1", "text2", "text3"]
# Spot-check 20 random rows, ordered by score. The fixed seed makes the
# displayed rows identical after Restart & Run All; `.sample(20)` already
# returns exactly 20 rows, so no further truncation is needed.
df[cols].sample(20, random_state=0).sort_values("bws")

Unnamed: 0,bws,text,text1,text2,text3
3326,-0.604,I see you too are a man of culture;),I see you too are a man of culture [Wink or smirk],I see you too are a man of culture [Wink or smirk],i see you too be a man of culture wink or smirk
693,-0.522,"Not completely open, but a lot more open than they are right now.","Not completely open, but a lot more open than they are right now.","Not completely open, but a lot more open than they are right now.",not completely open but a lot more open than they be right now
4898,-0.521,"Wait, so how did you get here then?","Wait, so how did you get here then?","Wait, so how did you get here then?",wait so how do you get here then
1763,-0.417,Took a call from her ex.,Took a call from her ex.,Took a call from her ex.,take a call from her ex
4745,-0.326,"this one is a beast. Runs for three days straight, then you just charge it for about one hour at lunch time and it's 100% again, ready for another three days. Absolutely love it.\n\nNot that great of a camera though, but I couldn't care less","this one is a beast. Runs for three days straight, then you just charge it for about one hour at lunch time and it's 100% again, ready for another three days. Absolutely love it. Not that great of a camera though, but I couldn't care less","this one is a beast. Runs for three days straight, then you just charge it for about one hour at lunch time and it is 100% again, ready for another three days. Absolutely love it. Not that great of a camera though, but I could not care less",this one be a beast runs for three day straight then you just charge it for about one hour at lunch time and it be 100 again ready for another three day absolutely love it not that great of a camera though but i could not care less
5280,-0.292,I bumped into Martin O'Malley once. At least I think I did.,I bumped into Martin O'Malley once. At least I think I did.,I bumped into Martin O'Malley once. At least I think I did.,i bump into martin o malley once at least i think i do
3222,-0.208,"Additionally, adjacent can mean on the flat side of a square or a corner. The highest number that could be seen is ""8"", that would mean that that square is completely surrounded by mines.\n\nEdit: It is the most basic deduction game, but, in my experience, there is a ton of luck involved too.","Additionally, adjacent can mean on the flat side of a square or a corner. The highest number that could be seen is ""8,"" that would mean that that square is completely surrounded by mines. Edit: It is the most basic deduction game, but, in my experience, there is a ton of luck involved too.","Additionally, adjacent can mean on the flat side of a square or a corner. The highest number that could be seen is ""8,"" that would mean that square is completely surrounded by mines. Edit: It is the most basic deduction game, but, in my experience, there is a ton of luck involved too.",additionally adjacent can mean on the flat side of a square or a corner the high number that could be see be 8 that would mean that square be completely surround by mine edit it be the most basic deduction game but in my experience there be a ton of luck involve too
5297,-0.167,The exact reason she should never get it.,The exact reason she should never get it.,The exact reason she should never get it.,the exact reason she should never get it
4974,-0.104,"It seems highly illogical to believe a. Water retains a “memory” of its previous solvent, and b. That some how the smaller the amount of the solvent the greater the potency, which inversely exponentially increases.","It seems highly illogical to believe a. Water retains a ""memory"" of its previous solvent, and b. That some how the smaller the amount of the solvent the greater the potency, which inversely exponentially increases.","It seems highly illogical to believe a. Water retains a ""memory"" of its previous solvent, and b. That some how the smaller the amount of the solvent the greater the potency, which inversely exponentially increases.",it seem highly illogical to believe a water retain a memory of its previous solvent and b that some how the small the amount of the solvent the great the potency which inversely exponentially increase
2858,-0.062,The AP called CA for Clinton the day before primary voting... It was very disheartening.,The AP called CA for Clinton the day before primary voting... It was very disheartening.,The AP called CA for Clinton the day before primary voting... It was very disheartening.,the ap call ca for clinton the day before primary voting it be very disheartening


In [9]:
# Second spot-check of 20 random rows, ordered by score; fixed seed so the
# same rows are shown on every rerun.
df[cols].sample(20, random_state=1).sort_values("bws")

Unnamed: 0,bws,text,text1,text2,text3
2636,-0.34,Xiaomi Hybrid has a nice and good sounding v-shape to it. I like mine a lot for electronic music and anything that comes with a punchy bass.\n\nHaven't heard anything good about the Youth edition.,Xiaomi Hybrid has a nice and good sounding v-shape to it. I like mine a lot for electronic music and anything that comes with a punchy bass. Haven't heard anything good about the Youth edition.,"Xiaomi Hybrid has a nice and good sounding v-shape to it. I like mine a lot for electronic music and anything that comes with a [punchy; having punch; forceful, spirited] bass. have not heard anything good about the Youth edition.",xiaomi hybrid have a nice and good sounding v shape to it i like mine a lot for electronic music and anything that come with a punchy having punch forceful spirited bass have not hear anything good about the youth edition
3844,-0.333,"Yours is the Girl Scout Pledge. What u/hagnonbg wrote was the Girl Scout Law. 2 different things, both correct.\n\nand the something something is ""to help people at all times""","Yours is the Girl Scout Pledge. What u/hagnonbg wrote was the Girl Scout Law. 2 different things, both correct. and the something something is ""to help people at all times""","Yours is the Girl Scout Pledge. What u/hagnonbg wrote was the Girl Scout Law. 2 different things, both correct. and the something is ""to help people at all times""",yours be the girl scout pledge what u hagnonbg write be the girl scout law 2 different thing both correct and the something be to help people at all time
1096,-0.312,"Humans do inherit traits, but they do NOT inherit learned behaviours. They may mimic, but that’s entirely different and can be unlearned as well as it was learned.","Humans do inherit traits, but they do NOT inherit learned behaviours. They may mimic, but that's entirely different and can be unlearned as well as it was learned.","Humans do inherit traits, but they do NOT inherit learned behaviours. They may mimic, but that is entirely different and can be unlearned as well as it was learned.",human do inherit trait but they do not inherit learn behaviour they may mimic but that be entirely different and can be unlearn as well as it be learn
3457,-0.191,"You said you recently installed geopandas. Will you share how did you install it?\n\nI tried conda install -n py36 -c conda-forge install geopandas.\n\nIt installs good, but when i import geopandas it gives ImportError relating to fiona.\n\n dlopen some .so file, ... referenced from, ... library not found.\n\n\nDid you get any library linking error, if so how did you fixed it?\n\nMy computer:\nMacos mojave\nAnaconda \nPy36 is anaconda environment i created.\n\nThanks.","You said you recently installed geopandas. Will you share how did you install it? I tried conda install -n py36 -c conda-forge install geopandas. It installs good, but when i import geopandas it gives ImportError relating to fiona. dlopen some .so file, ... referenced from, ... library not found. Did you get any library linking error, if so how did you fixed it? My computer: Macos mojave Anaconda Py36 is anaconda environment i created. Thanks.","You said you recently installed geopandas. Will you share how did you install it? I tried conda install -n py36 -c conda-forge install geopandas. It installs good, but when i import geopandas it gives ImportError relating to fiona. dlopen some .so file, ... referenced from, ... library not found. Did you get any library linking error, if so how did you fixed it? My computer: Macos mojave Anaconda Py36 is anaconda environment i created. Thanks.",you say you recently instal geopandas will you share how do you install it i try conda install n py36 c conda forge install geopandas it install good but when i import geopandas it give importerror relate to fiona dlopen some so file reference from library not find do you get any library link error if so how do you fix it my computer macos mojave anaconda py36 be anaconda environment i create thank
906,-0.146,"I think the key word you used was eventually. Eventually the marshmallow will melt. I will be long done with my hot chocolate by the time it even gets softened. Maybe as an after drink treat but then you missed out on the cream and chocolate mix. Plus whipped cream has basically 3 ingredients if you make it at home, cream, powdered sugar and a bit of vanilla. Whereas marshmallow has about half a dozen or more. some, once you learn where it comes from, you might not want in your drink.","I think the key word you used was eventually. Eventually the marshmallow will melt. I will be long done with my hot chocolate by the time it even gets softened. Maybe as an after drink treat but then you missed out on the cream and chocolate mix. Plus whipped cream has basically 3 ingredients if you make it at home, cream, powdered sugar and a bit of vanilla. Whereas marshmallow has about half a dozen or more. some, once you learn where it comes from, you might not want in your drink.","I think the key word you used was eventually. Eventually the marshmallow will melt. I will be long done with my hot chocolate by the time it even gets softened. Maybe as an after drink treat but then you missed out on the cream and chocolate mix. Plus whipped cream has basically 3 ingredients if you make it at home, cream, powdered sugar and a bit of vanilla. Whereas marshmallow has about half a dozen or more. some, once you learn where it comes from, you might not want in your drink.",i think the key word you use be eventually eventually the marshmallow will melt i will be long do with my hot chocolate by the time it even gets soften maybe as an after drink treat but then you miss out on the cream and chocolate mix plus whip cream have basically 3 ingredient if you make it at home cream powdered sugar and a bit of vanilla whereas marshmallow have about half a dozen or more some once you learn where it come from you might not want in your drink
353,-0.104,"The biggest problem with American politics is that our politicians are all bought. ""Government is the entertainment division of the military industrial complex."" What this means is that the ""liberals vs conservatives"" stuff is a farce. It's a show they put on to make us think our opinions matter when all that really matters is money.","The biggest problem with American politics is that our politicians are all bought. ""Government is the entertainment division of the military industrial complex."" What this means is that the ""liberals vs conservatives"" stuff is a farce. It's a show they put on to make us think our opinions matter when all that really matters is money.","The biggest problem with American politics is that our politicians are all bought. ""Government is the entertainment division of the military industrial complex."" What this means is that the ""liberals vs conservatives"" stuff is a farce. it is a show they put on to make us think our opinions matter when all that really matters is money.",the big problem with american politic be that our politician be all buy government be the entertainment division of the military industrial complex what this mean be that the liberal vs conservatives stuff be a farce it be a show they put on to make we think our opinion matter when all that really matter be money
1259,-0.083,"Ironically healthcare and schooling are the least capitalist industries in the country, its why theres problems. Theres a laundry list of anticompetitive rules in healthcare. Companies cant even compete across state lines...ofc prices are going to be insanely high. If salaries for teachers were set by the market instead of the government maybe we wouldnt have such shitty schools.","Ironically healthcare and schooling are the least capitalist industries in the country, its why theres problems. Theres a laundry list of anticompetitive rules in healthcare. Companies cant even compete across state lines...ofc prices are going to be insanely high. If salaries for teachers were set by the market instead of the government maybe we wouldnt have such shitty schools.","Ironically healthcare and schooling are the least capitalist industries in the country, its why theres problems. Theres a laundry list of anticompetitive rules in healthcare. Companies cant even compete across state lines...[ofc; of course] prices are going to be insanely high. If salaries for teachers were set by the market instead of the government maybe wouldnt have such [shitty; shit; an exclamation of disgust, anger, or annoyance] schools.",ironically healthcare and schooling be the least capitalist industry in the country its why there s problem there s a laundry list of anticompetitive rule in healthcare companies ca nt even compete across state line ofc of course price be go to be insanely high if salary for teacher be set by the market instead of the government maybe would nt have such shitty shit an exclamation of disgust anger or annoyance school
1236,-0.021,"When you create a new fat cell, your body keeps it for life. Therefore, it makes it even easier to gain weight after losing the weight from before (childhood in my case). So for people like me, who do work hard even just to maintain my weight after losing some, the question is... was it a choice for me as a kid to create all those new fat cells. Because I’m stuck with them now... they just lay there waiting to be inflated by any calorie I eat and don’t burn. It sucks man it’s a lot harder than people think.","When you create a new fat cell, your body keeps it for life. Therefore, it makes it even easier to gain weight after losing the weight from before (childhood in my case). So for people like me, who do work hard even just to maintain my weight after losing some, the question is... was it a choice for me as a kid to create all those new fat cells. Because I'm stuck with them now... they just lay there waiting to be inflated by any calorie I eat and don't burn. It sucks man it's a lot harder than people think.","When you create a new fat cell, your body keeps it for life. Therefore, it makes it even easier to gain weight after losing the weight from before (childhood in my case). So for people like me, who do work hard even just to maintain my weight after losing some, the question is... was it a choice for me as a kid to create all those new fat cells. Because I am stuck with them now... they just lay there waiting to be inflated by any calorie I eat and do not burn. 
It [sucks; be very bad or unpleasant] man it is a lot harder than people think.",when you create a new fat cell your body keep it for life therefore it make it even easy to gain weight after lose the weight from before childhood in my case so for people like i who do work hard even just to maintain my weight after lose some the question be be it a choice for i as a kid to create all those new fat cell because i be stuck with they now they just lie there wait to be inflate by any calorie i eat and do not burn it suck be very bad or unpleasant man it be a lot hard than people think
4395,0.0,They won’t die. They are vaccinated. They just don’t vaccinate their kids,They won't die. They are vaccinated. They just don't vaccinate their kids,They will not die. They are vaccinated. They just do not vaccinate their kids,they will not die they be vaccinate they just do not vaccinate their kid
4031,0.0,It's threefold:\n\n* Lower frequency of sex (across all age groups)\n* Later (and later) time of first sexual experiences\n* Increased contraception use \n,It's threefold: * Lower frequency of sex (across all age groups) * Later (and later) time of first sexual experiences * Increased contraception use,it is threefold: * Lower frequency of sex (across all age groups) * Later (and later) time of first sexual experiences * Increased contraception use,it be threefold low frequency of sex across all age group later and later time of first sexual experience increase contraception use


In [10]:
# Third spot-check of 20 random rows, ordered by score; fixed seed so the
# same rows are shown on every rerun.
df[cols].sample(20, random_state=2).sort_values("bws")

Unnamed: 0,bws,text,text1,text2,text3
4842,-0.457,Exposing the number of bots would crash their stock,Exposing the number of bots would crash their stock,Exposing the number of bots would crash their stock,expose the number of bot would crash their stock
121,-0.438,"Still not 2.5 inches. 1 inch is about as long as their canine teeth get, and the older they are, the shorter the teeth.","Still not 2.5 inches. 1 inch is about as long as their canine teeth get, and the older they are, the shorter the teeth.","Still not 2.5 inches. 1 inch is about as long as their canine teeth get, and the older they are, the shorter the teeth.",still not 2 5 inch 1 inch be about as long as their canine tooth get and the old they be the short the tooth
4204,-0.396,For anyone that didn’t bother actually reading the article this is a satire bill.,For anyone that didn't bother actually reading the article this is a satire bill.,For anyone that did not bother actually reading the article this a satire bill.,for anyone that do not bother actually read the article this a satire bill
1220,-0.34,I agree. Some locations just have a ton of relatively healthy food to eat for cheap. Some places are literally food deserts where you need to travel a bit to even buy produce. Theres a big gradient between them that make it harder or easier to eat healthy.,I agree. Some locations just have a ton of relatively healthy food to eat for cheap. Some places are literally food deserts where you need to travel a bit to even buy produce. Theres a big gradient between them that make it harder or easier to eat healthy.,I agree. Some locations just have a ton of relatively healthy food to eat for cheap. Some places are literally food deserts where you need to travel a bit to even buy produce. Theres a big gradient between them that make it harder or easier to eat healthy.,i agree some location just have a ton of relatively healthy food to eat for cheap some place be literally food desert where you need to travel a bit to even buy produce there s a big gradient between they that make it hard or easy to eat healthy
1112,-0.333,"Ok, then I would argue I do not believe any current evidence of the contrary to my assertion.\n\nThere is no evidence that I am aware of that tells me otherwise.","Ok, then I would argue I do not believe any current evidence of the contrary to my assertion. There is no evidence that I am aware of that tells me otherwise.","Ok, then I would argue I do not believe any current evidence of the contrary to my assertion. There is no evidence that I am aware of that tells me otherwise.",ok then i would argue i do not believe any current evidence of the contrary to my assertion there be no evidence that i be aware of that tell i otherwise
148,-0.255,But you're not weakening them with the ban. You are strengthening them by validating their persecution narrative. If your house is on fire you have a choice. Use a water to douse the fire or try to protect yourself by throwing logs at the flames. We don't get safer by adding fuel to the fire. \n\n,But you're not weakening them with the ban. You are strengthening them by validating their persecution narrative. If your house is on fire you have a choice. Use a water to douse the fire or try to protect yourself by throwing logs at the flames. We don't get safer by adding fuel to the fire.,But you are not weakening them with the ban. You are strengthening them by validating their persecution narrative. If your house is on fire you have a choice. Use a water to douse the fire or try to protect yourself by throwing logs at the flames. We do not get safer by adding fuel to the fire.,but you be not weaken they with the ban you be strengthen they by validate their persecution narrative if your house be on fire you have a choice use a water to douse the fire or try to protect yourself by throw log at the flame we do not get safe by add fuel to the fire
366,-0.229,"To be honest, fostering national or international amateur sports competition seemed a bit odd to me as well. Perhaps that should be re-evaluated as well, but that seems kinda tangential to this CMV. \n\nIn order to qualify for 501c3 classification on the basis of education, the institution must ""provide instruction or train individuals to improve their capabilities, or provide public education on items that are beneficial to the community."" I would welcome churches to submit their requests for that status, but I don't think they would qualify.","To be honest, fostering national or international amateur sports competition seemed a bit odd to me as well. Perhaps that should be re-evaluated as well, but that seems kinda tangential to this CMV. In order to qualify for 501c3 classification on the basis of education, the institution must ""provide instruction or train individuals to improve their capabilities, or provide public education on items that are beneficial to the community."" I would welcome churches to submit their requests for that status, but I don't think they would qualify.","To be honest, fostering national or international amateur sports competition seemed a bit odd to me as well. Perhaps that should be re-evaluated as well, but that seems kind of tangential to this CMV. 
In order to qualify for 501c3 classification the basis of education, the institution must ""provide instruction or train individuals to improve their capabilities, or provide public education items that are beneficial to the community."" I would welcome churches to submit their requests for that status, but I do not think they would qualify.",to be honest foster national or international amateur sport competition seem a bit odd to i as well perhaps that should be re evaluate as well but that seem kind of tangential to this cmv in order to qualify for 501c3 classification the basis of education the institution must provide instruction or train individual to improve their capability or provide public education item that be beneficial to the community i would welcome church to submit their request for that status but i do not think they would qualify
2569,-0.167,"Because what happens in one part of the world can often affect things in other parts.\n\nSee, for example;\n\n* the current Syria migrant crisis,\n* the US sub-prime mortgage crisis,\n* the Second World War.","Because what happens in one part of the world can often affect things in other parts. See, for example; * the current Syria migrant crisis, * the US sub-prime mortgage crisis, * the Second World War.","Because what happens in one part of the world can often affect things in other parts. See, for example; * the current Syria migrant crisis, * the US sub-prime mortgage crisis, * the Second World War.",because what happen in one part of the world can often affect thing in other part see for example the current syria migrant crisis the us sub prime mortgage crisis the second world war
2072,-0.152,"I've spent a lot of time thinking about the idea of ""greenwashing"", so much that I've got a useless MBA degree in sustainable business. \n\nI think you should be more forgiving to companies who do ""plaster"" this information out there. It's a marketable value that would presumably create more demand and increase revenue...which then encourages more reinvestment into cause-driven efforts. \n\nYeah, ideally, we would want the donation to come from a sincere place, but in the end, what difference does it make if a charity receives $1,000,000 from an anonymous donor or if they receive $1,000,000 from someone who hopes to share that information? Why not go for the win-win option?","I've spent a lot of time thinking about the idea of ""greenwashing,"" so much that I've got a useless MBA degree in sustainable business. I think you should be more forgiving to companies who do ""plaster"" this information out there. It's a marketable value that would presumably create more demand and increase revenue...which then encourages more reinvestment into cause-driven efforts. Yeah, ideally, we would want the donation to come from a sincere place, but in the end, what difference does it make if a charity receives $1,000,000 from an anonymous donor or if they receive $1,000,000 from someone who hopes to share that information? Why not go for the win-win option?","I have spent a lot of time thinking about the idea of ""greenwashing,"" so much that I have got a useless MBA degree in sustainable business. I think you should be more forgiving to companies who do ""plaster"" this information out there. it is a marketable value that would presumably create more demand increase revenue...which then encourages more reinvestment into cause-driven efforts. 
Yeah, ideally, we would want the donation to come from a sincere place, but in the end, what difference does it make if a charity receives $1,000,000 from anonymous donor if they receive $1,000,000 from someone who hopes to share that information? Why not go for the win-win option?",i have spend a lot of time think about the idea of greenwashe so much that i have get a useless mba degree in sustainable business i think you should be more forgiving to company who do plaster this information out there it be a marketable value that would presumably create more demand increase revenue which then encourage more reinvestment into cause drive effort yeah ideally we would want the donation to come from a sincere place but in the end what difference do it make if a charity receive 1 000 000 from anonymous donor if they receive 1 000 000 from someone who hope to share that information why not go for the win win option
1326,-0.083,"The goal is to stop the behavior in the moment to create a safe situation (if the child is doing something that puts them in imminent danger, the sole focus should be on stopping that behavior.) Beyond that, the focus should be on creating a well adjusted adult. \n\nSpanking correctly is spanking that is warranted (without a doubt), understood, and after other methods have been exhausted.","The goal is to stop the behavior in the moment to create a safe situation (if the child is doing something that puts them in imminent danger, the sole focus should be on stopping that behavior.) Beyond that, the focus should be on creating a well adjusted adult. Spanking correctly is spanking that is warranted (without a doubt), understood, and after other methods have been exhausted.","The goal is to stop the behavior in the moment to create a safe situation (if the child is doing something that puts them in imminent danger, the sole focus should be on stopping that behavior.) Beyond that, the focus should be on creating a well adjusted adult. Spanking correctly is spanking that is warranted (without a doubt), understood, and after other methods have been exhausted.",the goal be to stop the behavior in the moment to create a safe situation if the child be do something that put they in imminent danger the sole focus should be on stop that behavior beyond that the focus should be on create a well adjust adult spanking correctly be spank that be warrant without a doubt understand and after other method have be exhaust


# Review data

In [11]:
# Add a constant all-zero "worker" column stored as int8, then summarise the
# columns selected for export.
col = "worker"
df[col] = np.zeros(len(df), dtype=np.int8)
cols = ["bws", "worker", "text", "text1", "text2", "text3"]
df.loc[:, cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5710 entries, 0 to 5709
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   label   5710 non-null   int32  
 1   bws     5710 non-null   float32
 2   worker  5710 non-null   int8   
 3   text    5710 non-null   object 
 4   text1   5710 non-null   object 
 5   text2   5710 non-null   object 
 6   text3   5710 non-null   object 
dtypes: float32(1), int32(1), int8(1), object(4)
memory usage: 228.7+ KB


In [12]:
%%time
# Create the destination folder first so the write cannot fail on a fresh
# checkout after the slow preprocessing cells have already run.
os.makedirs("output", exist_ok=True)
# Persist only the selected columns; the index is not written.
df[cols].to_parquet("output/pre_ruddit.parquet", index=False)

Wall time: 53 ms
