In [1]:
import os
import gc
import numpy as np
import pandas as pd
from scipy.stats import rankdata
from tqdm import tqdm
from typing import Dict, List, Tuple, NamedTuple, Callable
import scml
import mylib

In [2]:
# Percentile cut-points; not referenced by any other cell visible in this notebook.
percentiles = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]

# Notebook-wide pandas settings, spelled with their fully-qualified option keys.
# NOTE(review): `mode.use_inf_as_na` is deprecated in recent pandas releases;
# confirm the pinned pandas version before upgrading.
for option_name, option_value in (
    ("mode.use_inf_as_na", True),
    ("display.max_info_columns", 9999),
    ("display.max_columns", 9999),
    ("display.max_rows", 9999),
    ("display.max_colwidth", 9999),
):
    pd.set_option(option_name, option_value)

# Register the `progress_apply` family on pandas objects.
tqdm.pandas()

In [3]:
# Build the lookup "<post_id>_<comment_id>" -> offensiveness score from the score file.
df = pd.read_csv("input/ruddit/Ruddit.csv", engine="c", low_memory=False)
score_map: Dict[str, float] = {
    post_id + "_" + comment_id: score
    for post_id, comment_id, score in zip(
        df["post_id"], df["comment_id"], df["offensiveness_score"]
    )
}

In [4]:
%%time
# Load the comment texts, drop unusable comments, and attach each comment's
# score from `score_map` (keyed by "<post_id>_<comment_id>").
df = pd.read_csv("input/ruddit/ruddit_with_text.csv", engine="c", low_memory=False)
blacklist = {"[deleted]", "[removed]"}
rows = []
for t in df.itertuples():
    text = t.txt
    # read_csv parses an empty cell as NaN (a float), which has no .strip();
    # treat missing text the same way as whitespace-only text and skip it.
    if not isinstance(text, str):
        continue
    s = text.strip().lower()
    # Skip blank comments and the placeholder texts listed in `blacklist`.
    if not s or s in blacklist:
        continue
    k = t.post_id + "_" + t.comment_id
    # The original (unstripped, original-case) text is kept; `s` is only used for filtering.
    rows.append({"bws": score_map[k], "text": text})
df = pd.DataFrame.from_records(rows)
df["bws"] = df["bws"].astype(np.float32)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5710 entries, 0 to 5709
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   bws     5710 non-null   float32
 1   text    5710 non-null   object 
dtypes: float32(1), object(1)
memory usage: 67.0+ KB
Wall time: 66 ms


In [5]:
# Per-column missing-value summary from the project-local `scml` helper; as the
# cell's last expression, the returned table is rendered as the cell output.
scml.find_missing_values(df)

Unnamed: 0,Total,Percent,Type
bws,0,0.0,float32
text,0,0.0,object


# Severity rating label

In [6]:
# Ordinal ranking assigns every row a distinct rank, even when `bws` scores tie.
col = "label"
df[col] = pd.Series(
    rankdata(df["bws"], method="ordinal"), index=df.index
).astype(np.int32)

# Stage 1: Preprocess Text
Note: preprocessing throughput dropped from 2400 it/s to 600 it/s.

In [7]:
def pre1(row) -> str:
    """Apply `mylib.pre1` to the row's raw `text` field."""
    return mylib.pre1(row["text"])


def pre2(row) -> str:
    """Apply `mylib.pre2` to the row's `text1` field (the output of `pre1`)."""
    return mylib.pre2(row["text1"])


# Run the stages in order: `text2` is derived from `text1`, so `pre1` must go first.
for col, stage in (("text1", pre1), ("text2", pre2)):
    print(col)
    df[col] = df.progress_apply(stage, axis=1)

text1


100%|██████████████████████████████████████████| 5710/5710 [00:10<00:00, 543.58it/s]


text2


100%|███████████████████████████████████████████| 5710/5710 [01:00<00:00, 95.15it/s]


In [8]:
cols = ["bws", "text", "text1", "text2"]
# Random 20-row preview ordered by score; the sample is already 20 rows, so no head() is needed.
df[cols].sample(n=20).sort_values(by="bws")

Unnamed: 0,bws,text,text1,text2
4241,-0.417,Is this for the preorder jersey that is showing as only available in XS? I can’t find the jersey for sale anywhere in a Men’s Large.,Is this for the preorder jersey that is showing as only available in XS? I can't find the jersey for sale anywhere in a Men's Large.,Is this for the preorder jersey that is showing as only available in XS? I cannot find the jersey for sale anywhere in a Men's Large.
1756,-0.312,"She started describing her days in college, and how when she got lonely, she would climb up into the trees to sing to the squirrels.","She started describing her days in college, and how when she got lonely, she would climb up into the trees to sing to the squirrels.","She started describing her days in college, and how when she got lonely, she would climb up into the trees to sing to the squirrels."
2277,-0.25,"I set my first alarm to 30 minutes before I need to wake up, and I take my Ritalin. I dont even turn on the lights, I just grab it, swallow it, back to sleep.\nWhen the alarm to wake up comes on, the ritalin is kicking in and it's much easier. Then I sit up and I loudly proclaim ""yeeeah baby!"",like Ronnie Coleman about to do a set, cause a guy once told me to do that when I woke up to just boost my energy and morale and it works.","I set my first alarm to 30 minutes before I need to wake up, and I take my Ritalin. I dont even turn on the lights, I just grab it, swallow it, back to sleep. When the alarm to wake up comes on, the ritalin is kicking in and it's much easier. Then I sit up and I loudly proclaim ""yeeeah baby!,""like Ronnie Coleman about to do a set, cause a guy once told me to do that when I woke up to just boost my energy and morale and it works.","I set my first alarm to 30 minutes before I need to wake up, and I take my Ritalin. I dont even turn on the lights, I just grab it, swallow it, back to sleep. When the alarm to wake up comes on, the ritalin is kicking in and it is much easier. Then I sit up and I loudly proclaim ""yeeeah baby!,""like Ronnie Coleman about to do a set, cause a guy once told me to do that when I woke up to just boost my energy and morale and it works."
3488,-0.25,"If ypu want to show your woman appreciation, make a nice dinner date with her. In your place, 100% home-made dishes, your specialties, and basically pamper her home style. Enjoy some wine and cuddle. Just let her know you appreciate her. Just return the favor by being sweet. I'm sure this will melt your girl right off of her feet. A hard-day's work is worth it if she sees you being sweet to her. Anyways, good luck to your relation ship and congratulations for finding each other! Seems like you two already know you positions if uou kno' what I mean.","If ypu want to show your woman appreciation, make a nice dinner date with her. In your place, 100% home-made dishes, your specialties, and basically pamper her home style. Enjoy some wine and cuddle. Just let her know you appreciate her. Just return the favor by being sweet. I'm sure this will melt your girl right off of her feet. A hard-day's work is worth it if she sees you being sweet to her. Anyways, good luck to your relation ship and congratulations for finding each other! Seems like you two already know you positions if uou kno' what I mean.","If ypu want to show your woman appreciation, make a nice dinner date with her. In your place, 100% home-made dishes, your specialties, and basically pamper home style. Enjoy some wine and cuddle. Just let her know you appreciate her. Just return the favor by being sweet. I am sure this will melt your girl right off of her feet. A hard-day's work is worth it if she sees you being sweet to her. Anyways, good luck to your relation ship and congratulations for finding each other! Seems like you two already know you positions if uou kno' what I mean."
3118,-0.229,Maybe substitute poisson for the chickens and work with a baby camel and lamb.Ovbiously the serves 80-100 is going to be less but fun times ahead with only a modest industrial size oven.,Maybe substitute poisson for the chickens and work with a baby camel and lamb.Ovbiously the serves 80-100 is going to be less but fun times ahead with only a modest industrial size oven.,Maybe substitute poisson for the chickens and work with a baby camel and lamb.Ovbiously the serves 80-100 is going to be less but fun times ahead with only a modest industrial size oven.
3256,-0.208,"USBN is tomorrow boiss.\n\nWish everyone who run it luck.\n\nedit : ""luck"" = nyontek xD","USBN is tomorrow boiss. Wish everyone who run it luck. edit: ""luck"" = nyontek xD","USBN is tomorrow boiss. Wish everyone who run it luck. edit: ""luck"" = nyontek xD"
1406,-0.208,How do you judge what is considered emotional or physical abuse?,How do you judge what is considered emotional or physical abuse?,How do you judge what is considered emotional or physical abuse?
1210,-0.188,"Really it’s not that big an issue, but if there happens to be a line, men can use the restroom much faster than women, and it’s kinda an inconvenience to wait 2-3 minutes per person for 20 seconds of bathroom time.\n\nThis is assuming a line forms at a single stall bathroom though... and that barely ever happens","Really it's not that big an issue, but if there happens to be a line, men can use the restroom much faster than women, and it's kinda an inconvenience to wait 2-3 minutes per person for 20 seconds of bathroom time. This is assuming a line forms at a single stall bathroom though... and that barely ever happens","Really it is not that big an issue, but if there happens to be a line, men can use the restroom much faster than women, and it is kind of an inconvenience to wait 2-3 minutes person for 20 seconds of bathroom time. This assuming a line forms at a single stall bathroom though... and that barely ever happens"
5153,-0.188,This is the best form of teen rebellion...but so sad that is not considered teen rebellion.,This is the best form of teen rebellion...but so sad that is not considered teen rebellion.,This the best form of teen rebellion...but so sad that is not considered teen rebellion.
1156,-0.146,Once went to a place that had a gender neutral bathroom...\n\nand a men’s bathroom beside it?\n🤔,Once went to a place that had a gender neutral bathroom... and a men's bathroom beside it? (thinking face),Once went to a place that had a gender neutral bathroom... and a men's bathroom beside it? (thinking face)


In [9]:
# Another random 20-row preview of the selected columns, ordered by score.
df[cols].sample(n=20).sort_values(by="bws")

Unnamed: 0,bws,text,text1,text2
1508,-0.729,A good set of kitchen knives - Japanese or German steel,A good set of kitchen knives - Japanese or German steel,A good set of kitchen knives - Japanese or German steel
1893,-0.479,Isn’t this the plot of every season of The Bachelor?,Isn't this the plot of every season of The Bachelor?,is not this the plot of every season of The Bachelor?
4602,-0.438,"wait, you guys are getting paid?","wait, you guys are getting paid?","wait, you guys are getting paid?"
875,-0.292,"Marshmallows get more disgusting when you start thinking about what's in them...boiled animal cartilage isn't exactly appetizing.\n\nAlso, presenting the false dichotomy between marshmallows and whipped cream completely disregards the best thing to add to hot chocolate...peppermint schnapps!","Marshmallows get more disgusting when you start thinking about what's in them...boiled animal cartilage isn't exactly appetizing. Also, presenting the false dichotomy between marshmallows and whipped cream completely disregards the best thing to add to hot chocolate...peppermint schnapps!","Marshmallows get more disgusting when you start thinking about what is in them...boiled animal cartilage is not exactly appetizing. Also, presenting the false dichotomy between marshmallows and whipped cream completely disregards the best thing to add to hot chocolate...peppermint schnapps!"
836,-0.292,"I know that's what they're referring to, but I think it's more complicated than that. I think these are legitimate thought experiment questions that shows it's more complicated, so I'm trying to get to the heart of the issue. If it's a mutually agreed upon exchange of value where everyone is being honest, I personally think it's moral, so I'm wondering where the line is so I can understand their point of view.","I know that's what they're referring to, but I think it's more complicated than that. I think these are legitimate thought experiment questions that shows it's more complicated, so I'm trying to get to the heart of the issue. If it's a mutually agreed upon exchange of value where everyone is being honest, I personally think it's moral, so I'm wondering where the line is so I can understand their point of view.","I know that is what they are referring to, but I think it is more complicated than that. I think these are legitimate thought experiment questions that shows it is more complicated, so I am trying to get to the heart of the issue. If it is a mutually agreed upon exchange of value where everyone is being honest, I personally think it is moral, so I am wondering where the line is so I can understand their point of view."
893,-0.292,"> You'll notice none of my reasons related to taste.\n\nIn that case, salt is a superior topping to both whipped cream and marshmallows:\n\n* Salt is shelf stable and effectively never expires. In fact, it is anti-microbial.\n* Salt is dense and uses space efficiently.\n* Salt is cheap and widely available.\n* Salt can be used for many other things such as making icy roads safer to drive on, food preservation, etc.","> You'll notice none of my reasons related to taste. In that case, salt is a superior topping to both whipped cream and marshmallows: * Salt is shelf stable and effectively never expires. In fact, it is anti-microbial. * Salt is dense and uses space efficiently. * Salt is cheap and widely available. * Salt can be used for many other things such as making icy roads safer to drive on, food preservation, etc.","> you will notice none of my reasons related to taste. In that case, salt is a superior topping to both whipped cream and marshmallows: * Salt is shelf stable and effectively never expires. In fact, it is anti-microbial. * Salt is dense and uses space efficiently. * Salt is cheap and widely available. * Salt can be used for many other things such as making icy roads safer to drive on, food preservation, etc."
386,-0.234,"The central idea is that the power to tax is the power to destroy. Theoretically, if the government wanted to destroy a disfavored religion (or it decided that all religions were disfavored), it could contrive a reason to impose really burdensome taxes on it to the point where it can no longer function (priests need to eat, and churches need to keep the lights on). This, of course, would violate the free exercise and establishment clauses of the First Amendment. So the government has decided, in an abundance of caution, to just not tax any religion for any purpose in the interest of maintaining the separation between church and state and honoring the First Amendment.","The central idea is that the power to tax is the power to destroy. Theoretically, if the government wanted to destroy a disfavored religion (or it decided that all religions were disfavored), it could contrive a reason to impose really burdensome taxes on it to the point where it can no longer function (priests need to eat, and churches need to keep the lights on). This, of course, would violate the free exercise and establishment clauses of the First Amendment. So the government has decided, in an abundance of caution, to just not tax any religion for any purpose in the interest of maintaining the separation between church and state and honoring the First Amendment.","The central idea is that the power to tax is the power to destroy. Theoretically, if the government wanted to destroy a disfavored religion (or it decided that all religions were disfavored), it could contrive a reason to impose really burdensome taxes on it to the point where it can no longer function (priests need to eat, and churches need to keep the lights on). This, of course, would violate the free exercise and establishment clauses of the First Amendment. 
So the government has decided, in abundance of caution, to just not tax any religion for any purpose in the interest of maintaining the separation between church and state and honoring the First Amendment."
3118,-0.229,Maybe substitute poisson for the chickens and work with a baby camel and lamb.Ovbiously the serves 80-100 is going to be less but fun times ahead with only a modest industrial size oven.,Maybe substitute poisson for the chickens and work with a baby camel and lamb.Ovbiously the serves 80-100 is going to be less but fun times ahead with only a modest industrial size oven.,Maybe substitute poisson for the chickens and work with a baby camel and lamb.Ovbiously the serves 80-100 is going to be less but fun times ahead with only a modest industrial size oven.
1035,-0.213,Free mentoring for the intern is a bad thing? I guess you haven't ever wanted to learn anything outside of what you already know. You gotta pay for courses or someone to teach you most of the time.,Free mentoring for the intern is a bad thing? I guess you haven't ever wanted to learn anything outside of what you already know. You gotta pay for courses or someone to teach you most of the time.,Free mentoring for the intern is a bad thing? I guess you have not ever wanted to learn anything outside of what you already know. You got to pay for courses or someone to teach you most of the time.
1831,-0.106,"An open palette means an open mind. Cheesy, but true. Have always ended up cutting things off with picky eaters as well. Food is so important!","An open palette means an open mind. Cheesy, but true. Have always ended up cutting things off with picky eaters as well. Food is so important!","An open palette means an open mind. Cheesy, but true. Have always ended up cutting things off with picky eaters as well. Food is so important!"


In [10]:
# Another random 20-row preview of the selected columns, ordered by score.
df[cols].sample(n=20).sort_values(by="bws")

Unnamed: 0,bws,text,text1,text2
3506,-0.522,Best post I have read in a while. Enjoy your life dude. Nothing matters as long as you are happy,Best post I have read in a while. Enjoy your life dude. Nothing matters as long as you are happy,Best post I have read in a while. Enjoy your life [friend]. Nothing matters as long as you are happy
2213,-0.521,Monty Python and the Holy Grail broski,Monty Python and the Holy Grail broski,Monty Python and the Holy Grail broski
2244,-0.438,"The rhino's name is Justin! He's the cutest.\n\nAnd I feel you. I worked all day in the sun there and I'm definitely still feeling overheated lol.\n\nI didn't get to see the giraffe though; super jealous you got so close!!\n\nAnd wait, how are you there now? The Zoo closed at 9 and the Park (where the giraffe is) closed at 7...","The rhino's name is Justin! He's the cutest. And I feel you. I worked all day in the sun there and I'm definitely still feeling overheated lol. I didn't get to see the giraffe though; super jealous you got so close!! And wait, how are you there now? The Zoo closed at 9 and the Park (where the giraffe is) closed at 7...","The rhino's name is Justin! he is the cutest. And I feel you. I worked all day in the sun there and I am definitely still feeling overheated [laughing out loud]. I did not get to see the giraffe though; super jealous you got so close!! And wait, how are you there now? The Zoo closed at 9 and the Park (where the giraffe is) closed at 7..."
1506,-0.438,"I bought a pocket knife for camping and now I use it at home probably once a week. Mostly for opening boxes and cutting up the cardbord for recycling. Also for opening bottles, and as a quick handy screwdriver.","I bought a pocket knife for camping and now I use it at home probably once a week. Mostly for opening boxes and cutting up the cardbord for recycling. Also for opening bottles, and as a quick handy screwdriver.","I bought a pocket knife for camping and now I use it at home probably once a week. Mostly for opening boxes and cutting up the cardbord for recycling. Also for opening bottles, and as a quick handy screwdriver."
4459,-0.333,at the same time the transmission is very nerfed compared to real life.,at the same time the transmission is very nerfed compared to real life.,"at the same time the transmission is very [To make worse or weaken, usually in the context of weakening something in order to balance out a game] compared to real life."
2181,-0.312,"What did, then? The kid shouting?","What did, then? The kid shouting?","What did, then? The kid shouting?"
931,-0.292,"If you really are open to changing your view, consider this: what constitutes a ""genetic defect"" is highly subjective.","If you really are open to changing your view, consider this: what constitutes a ""genetic defect"" is highly subjective.","If you really are open to changing your view, consider this: what constitutes a ""genetic defect"" is highly subjective."
5249,-0.188,"Don't worry, our politicians go to America to lobby.","Don't worry, our politicians go to America to lobby.","do not worry, our politicians go to America to lobby."
3837,-0.083,What about people who post revenge open? Literally exact same thing. Or people who post general voyeur shots?,What about people who post revenge open? Literally exact same thing. Or people who post general voyeur shots?,What about people who post revenge open? Literally exact same thing. Or people who post general voyeur shots?
518,-0.083,"It's up to the morality of journalists how to portray the perpetrator. An interesting option I've seen is TV stations using a strange picture of the Las Vegas shooter, not a glamorous one. However, everything that was not openly published to the public allowed for various conspirational theories to emerge, which is possibly one of the worst outcomes out of a shooting.","It's up to the morality of journalists how to portray the perpetrator. An interesting option I've seen is TV stations using a strange picture of the Las Vegas shooter, not a glamorous one. However, everything that was not openly published to the public allowed for various conspirational theories to emerge, which is possibly one of the worst outcomes out of a shooting.","it is up to the morality of journalists how to portray the perpetrator. An interesting option I have seen is TV stations using a strange picture of the Las Vegas shooter, not a glamorous one. However, everything that was not openly published to the public allowed for various conspirational theories to emerge, which is possibly one of the worst outcomes out of a shooting."


# Review data

In [11]:
# Constant `worker` column: all zeros, stored as int8.
col = "worker"
df[col] = np.zeros(len(df), dtype=np.int8)

# Columns (and their order) that make up the exported dataset.
cols = ["label", "bws", "worker", "text", "text1", "text2"]
df[cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5710 entries, 0 to 5709
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   label   5710 non-null   int32  
 1   bws     5710 non-null   float32
 2   worker  5710 non-null   int8   
 3   text    5710 non-null   object 
 4   text1   5710 non-null   object 
 5   text2   5710 non-null   object 
dtypes: float32(1), int32(1), int8(1), object(3)
memory usage: 184.1+ KB


In [12]:
%%time
# Make sure the target directory exists: to_parquet does not create missing
# parent directories, and a failure here would discard the slow preprocessing run.
os.makedirs("output", exist_ok=True)
# Persist only the selected columns, without the RangeIndex.
df[cols].to_parquet("output/pre_ruddit.parquet", index=False)

Wall time: 144 ms
