In [1]:
import os
import gc
import numpy as np
import pandas as pd
from scipy.stats import rankdata
from tqdm import tqdm
from typing import Dict, List, Tuple, NamedTuple, Callable
import scml
import mylib

In [2]:
# Percentile cut-points kept at module level.
# NOTE(review): presumably passed to DataFrame.describe() elsewhere; not used in the cells visible here.
percentiles = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]

# Options are spelled with their full names: pandas resolves abbreviated names by
# pattern matching, which can break once a release adds a similarly named option.
# NOTE(review): "mode.use_inf_as_na" is deprecated in recent pandas releases; if the
# environment is upgraded, replace +/-inf with NaN explicitly instead.
pd.set_option("mode.use_inf_as_na", True)
pd.set_option("display.max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option("display.max_colwidth", 9999)

# Registers the progress_apply method used by the preprocessing cell.
tqdm.pandas()

In [3]:
# Read the score table and index each offensiveness score by "<post_id>_<comment_id>".
# If a key occurs more than once, the last row wins.
df = pd.read_csv("input/ruddit/Ruddit.csv", engine="c", low_memory=False)
score_map: Dict[str, float] = {
    post_id + "_" + comment_id: score
    for post_id, comment_id, score in zip(
        df["post_id"], df["comment_id"], df["offensiveness_score"]
    )
}

In [4]:
%%time
# Load the comment bodies and pair every usable one with its score from score_map.
df = pd.read_csv("input/ruddit/ruddit_with_text.csv", engine="c", low_memory=False)
# Placeholder bodies treated as "no text" (compared after strip + lower-casing).
blacklist = {"[deleted]", "[removed]"}
rows = []
for post_id, comment_id, text in zip(df["post_id"], df["comment_id"], df["txt"]):
    # read_csv turns empty fields (and tokens such as "NA" or "null") into a float NaN,
    # which has no .strip(); such rows carry no usable text, so skip them instead of
    # aborting the whole load with an AttributeError.
    if not isinstance(text, str):
        continue
    s = text.strip().lower()
    if not s or s in blacklist:
        continue
    k = post_id + "_" + comment_id
    # A key missing from score_map still raises KeyError on purpose: a comment
    # without a score would silently corrupt the labels.
    rows.append({"bws": score_map[k], "text": text})
# Explicit columns keep the frame well-formed even if every row was filtered out.
df = pd.DataFrame.from_records(rows, columns=["bws", "text"])
df["bws"] = df["bws"].astype(np.float32)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5710 entries, 0 to 5709
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   bws     5710 non-null   float32
 1   text    5710 non-null   object 
dtypes: float32(1), object(1)
memory usage: 67.0+ KB
Wall time: 51.3 ms


In [5]:
# Sanity check after loading: per-column missing-value report
# (the rendered output lists Total, Percent and Type for each column).
scml.find_missing_values(df)

Unnamed: 0,Total,Percent,Type
bws,0,0.0,float32
text,0,0.0,object


# Severity rating label

In [6]:
# Turn the scores into unique integer ranks: method="ordinal" gives every row a
# distinct rank, so tied scores do not share a label.
col = "label"
ordinal_ranks = rankdata(df["bws"], method="ordinal")
df[col] = ordinal_ranks.astype(np.int32)

# Stage 1: Preprocess Text
Note: preprocessing throughput dropped from roughly 2400 it/s to roughly 600 it/s (the `text1` pass below runs at about 590 it/s).

In [7]:
def pre1(row) -> str:
    """Run the first preprocessing stage on one row.

    Args:
        row: mapping-like record (a DataFrame row when used with ``apply(axis=1)``)
            that has a ``"text"`` entry.

    Returns:
        Whatever ``mylib.pre1`` returns for the row's ``"text"`` value.
    """
    raw_text = row["text"]
    return mylib.pre1(raw_text)


def pre2(row) -> str:
    """Run the second preprocessing stage on one row.

    Args:
        row: mapping-like record (a DataFrame row when used with ``apply(axis=1)``)
            that has a ``"text1"`` entry, i.e. the output of the first stage.

    Returns:
        Whatever ``mylib.pre2`` returns for the row's ``"text1"`` value.
    """
    stage_one_text = row["text1"]
    return mylib.pre2(stage_one_text)


# Run the two stages in order: pre2 reads the "text1" column that pre1 writes,
# so the sequence matters. Each column name is printed ahead of its progress bar.
for col, stage in (("text1", pre1), ("text2", pre2)):
    print(col)
    df[col] = df.progress_apply(stage, axis=1)

text1


100%|██████████████████████████████████████████| 5710/5710 [00:09<00:00, 591.72it/s]


text2


100%|██████████████████████████████████████████| 5710/5710 [00:50<00:00, 112.00it/s]


In [8]:
# Spot-check raw text against both preprocessing stages on 20 random rows,
# ordered by score. The sample is unseeded, so the rows differ on every run.
cols = ["bws", "text", "text1", "text2"]
df[cols].sample(n=20).sort_values(by="bws")

Unnamed: 0,bws,text,text1,text2
2591,-0.625,That delay makes me even more hyped! :P,"That delay makes me even more hyped! [Tongue sticking out, cheeky, playful or blowing a raspberry]","That delay makes me even more hyped! [Tongue sticking out, cheeky, playful or blowing a raspberry]"
3348,-0.5,He just starts running away,He just starts running away,He just starts running away
249,-0.375,"There are 18 year old high school students, and there are also colleges and universities where legal adults attend school.","There are 18 year old high school students, and there are also colleges and universities where legal adults attend school.","There are 18 year old high school students, and there are also colleges and universities where legal adults attend school."
2998,-0.34,"`LD_PRELOAD=/lib/libc.so.x.y.z-old mv /lib/libc.so.x.y.z-old /lib/libc.so.x.y.z`\n\nNow, when you lose access to `/usr/lib/ld-x.y.so`, **that's** when the real fun starts.","'LD_PRELOA [Great dismay] /lib/libc.so.x.y.z-old mv /lib/libc.so.x.y.z-old /lib/libc.so.x.y.z' Now, when you lose access to '/usr/lib/ld-x.y.so', **that's** when the real fun starts.","'LD_PRELOA [Great dismay] /lib/libc.so.x.y.z-old mv /lib/libc.so.x.y.z-old /lib/libc.so.x.y.z' Now, when you lose access to '/usr/lib/ld-x.y.so', **that is** when the real fun starts."
4899,-0.34,But then how will he participate in the meme wars?,But then how will he participate in the meme wars?,"But then how will he participate in the [an image, video, piece of text, etc., typically humorous in nature and shared widely] wars?"
414,-0.333,"You raise some interesting points, but I think they could raise doubts about the child getting infected elsewhere in public (grocery store, church, etc)","You raise some interesting points, but I think they could raise doubts about the child getting infected elsewhere in public (grocery store, church, etc)","You raise some interesting points, but I think they could raise doubts about the child getting infected elsewhere in public (grocery store, church, etc)"
3119,-0.333,Camel meat is actually pretty commonly eaten in the middle east.,Camel meat is actually pretty commonly eaten in the middle east.,Camel meat is actually pretty commonly eaten in the middle east.
3604,-0.292,Was this a post on Reddit or did everyone get this ad? I’m confused.,Was this a post on Reddit or did everyone get this ad? I'm confused.,Was this a post on Reddit or did everyone get this ad? I am confused.
2265,-0.25,That JK Rowling despite having all that wealth most of the time looks miserable.,That JK Rowling despite having all that wealth most of the time looks miserable.,That [just kidding] Rowling despite having all that wealth most of the time looks miserable.
2287,-0.188,yeah it's not like they're mosquitos,yeah it's not like they're mosquitos,yeah it is not like they are mosquitos


In [9]:
# Second unseeded 20-row spot check of the preprocessing output, ordered by score.
df[cols].sample(n=20).sort_values(by="bws")

Unnamed: 0,bws,text,text1,text2
2317,-0.667,The Bus is Late by Satellite High,The Bus is Late by Satellite High,The Bus is Late by Satellite High
2977,-0.354,"Nixos does atomic upgrades, can install and use multiple versions of the same package also.","Nixos does atomic upgrades, can install and use multiple versions of the same package also.","Nixos does atomic upgrades, can install and use multiple versions of the same package also."
4534,-0.312,"That poor little baby, this measles crisis is getting way out of hand. I hope she's okay now. 🙁","That poor little baby, this measles crisis is getting way out of hand. I hope she's okay now. (slightly frowning face)","That poor little baby, this measles crisis is getting way out of hand. I hope she is okay now. (slightly frowning face)"
4294,-0.292,The government also made it law we wear seat belts. How is this different?\n\nThe government dictates a lot of things with warning labels or whatever. This should be law anyway.,The government also made it law we wear seat belts. How is this different? The government dictates a lot of things with warning labels or whatever. This should be law anyway.,The government also made it law we wear seat belts. How is this different? The government dictates a lot of things with warning labels or whatever. This should be law anyway.
2305,-0.208,"Then invest that money into one hell of a lemonade stand, hire somebody to oversee production, and parlay it into a national chain.","Then invest that money into one hell of a lemonade stand, hire somebody to oversee production, and parlay it into a national chain.","Then invest that money into one hell of a lemonade stand, hire somebody to oversee production, and parlay it into a national chain."
2287,-0.188,yeah it's not like they're mosquitos,yeah it's not like they're mosquitos,yeah it is not like they are mosquitos
1073,-0.146,Maybe you should be more open to new experiences. Trying something doesn't mean you'll be pounding 40's erry night.,Maybe you should be more open to new experiences. Trying something doesn't mean you'll be pounding 40's erry night.,Maybe you should be more open to new experiences. Trying something does not mean you will be pounding 40's erry night.
2439,-0.104,Yeah but you can be courteous without having to outwardly state it all the time so people know you're a good person. That's what I'm getting at.,Yeah but you can be courteous without having to outwardly state it all the time so people know you're a good person. That's what I'm getting at.,Yeah but you can be courteous without having to outwardly state it all the time so people know you are a good person. that is what I am getting at.
2062,-0.064,"That's like saying, ""Men of Reddit, while your ladies aren't around, what are your thoughts on the Kennedy assassination and 9/11?""\n\nWhat's with these random af Ask AskReddit questions?","That's like saying, ""Men of Reddit, while your ladies aren't around, what are your thoughts on the Kennedy assassination and 9/11?"" What's with these random af Ask AskReddit questions?","that is like saying, ""Men of Reddit, while your ladies are not around, what are your thoughts on the Kennedy assassination and 9/11?"" what is with these random [as fuck; expression of annoyance, contempt or impatience] Ask AskReddit questions?"
899,-0.062,"Whipped cream is so much more versatile when it comes to flavoring. If you make it yourself you can add: coffee syrup, bourbon, maple syrup, orange zest, vanilla, dark rum, smore shnops, or whatever flavors you want to add. With marshmallows you get marshmallow flavor and that's it. \n\nRegardless, of the arguments presented here, including mine, I'm going to challenge your premise that there is a best hot chocolate topper that works for everyone. That's a bullshit premise. It's your hot chocolate, put whatever the hell you want into it, and let other people have their hot chocolate however they want it. To each their own, but don't be afraid to try out someone else's way just in case they're on to something.","Whipped cream is so much more versatile when it comes to flavoring. If you make it yourself you can add: coffee syrup, bourbon, maple syrup, orange zest, vanilla, dark rum, smore shnops, or whatever flavors you want to add. With marshmallows you get marshmallow flavor and that's it. Regardless, of the arguments presented here, including mine, I'm going to challenge your premise that there is a best hot chocolate topper that works for everyone. That's a bullshit premise. It's your hot chocolate, put whatever the hell you want into it, and let other people have their hot chocolate however they want it. To each their own, but don't be afraid to try out someone else's way just in case they're on to something.","Whipped cream is so much more versatile when it comes to flavoring. If you make it yourself you can add: coffee syrup, bourbon, maple syrup, orange zest, vanilla, dark rum, smore shnops, or whatever flavors you want to add. With marshmallows you get marshmallow flavor and that is it. Regardless, of the arguments presented here, including mine, I am going to challenge your premise that there is a best hot chocolate topper that works for everyone. that is a bullshit premise. 
it is your hot chocolate, put whatever the hell you want into it, and let other people have their hot chocolate however they want it. To each their own, but do not be afraid to try out someone else's way just in case they are on to something."


In [10]:
# Third unseeded 20-row spot check of the preprocessing output, ordered by score.
df[cols].sample(n=20).sort_values(by="bws")

Unnamed: 0,bws,text,text1,text2
2636,-0.34,Xiaomi Hybrid has a nice and good sounding v-shape to it. I like mine a lot for electronic music and anything that comes with a punchy bass.\n\nHaven't heard anything good about the Youth edition.,Xiaomi Hybrid has a nice and good sounding v-shape to it. I like mine a lot for electronic music and anything that comes with a punchy bass. Haven't heard anything good about the Youth edition.,"Xiaomi Hybrid has a nice and good sounding v-shape to it. I like mine a lot for electronic music and anything that comes with a [having punch; forceful, spirited] bass. have not heard anything good about the Youth edition."
5383,-0.333,> Because they wont release the data to her.\n\n[citation needed]\n\n,> Because they wont release the data to her. [citation needed],> Because they wont release the data to her. [citation needed]
3089,-0.312,My experience is that most mathematicians are some sort of platonist hybrids and accept that the numbers are real in their own sense.,My experience is that most mathematicians are some sort of platonist hybrids and accept that the numbers are real in their own sense.,My experience is that most mathematicians are some sort of platonist hybrids and accept that the numbers are real in their own sense.
2356,-0.292,It’s cringy but a gamer is willing to take that risk. You have no idea how many external storage things I’d get along with what would seem like thousands of games,It's cringy but a gamer is willing to take that risk. You have no idea how many external storage things I'd get along with what would seem like thousands of games,it is cringy but a gamer is willing to take that risk. You have no idea how many external storage things I would get along with what would seem like thousands of games
413,-0.292,"Yes. Morally, if not legally, you absolutely are responsible.","Yes. Morally, if not legally, you absolutely are responsible.","Yes. Morally, if not legally, you absolutely are responsible."
3231,-0.271,"I got to teach someone how to play it in high school, and it was so much fun. She didn't believe she'd ever be fast at it like I was, but of course, it just took a few days for her to get there. I really hadn't thought there was anyone my age who didn't know, because we were in the perfect range where we grew up with the game.\n\nI guess you're one of today's lucky 10,000!","I got to teach someone how to play it in high school, and it was so much fun. She didn't believe she'd ever be fast at it like I was, but of course, it just took a few days for her to get there. I really hadn't thought there was anyone my age who didn't know, because we were in the perfect range where we grew up with the game. I guess you're one of today's lucky 10,000!","I got to teach someone how to play it in high school, and it was so much fun. She did not believe she would ever be fast at it like I was, but of course, it just took a few days for her to get there. I really had not thought there was anyone my age who did not know, because we were in the perfect range where we grew up with the game. I guess you are one of today's lucky 10,000!"
4057,-0.191,"I wouldn't be surprised if an over-zealous manager has purposefully ""misplaced"" them somewhere in the stockroom... Or the dumpster.","I wouldn't be surprised if an over-zealous manager has purposefully ""misplaced"" them somewhere in the stockroom... Or the dumpster.","I would not be surprised if an over-zealous manager has purposefully ""misplaced"" them somewhere in the stockroom... Or the dumpster."
3927,-0.188,"I had one a year ago. I don't regret it for many reasons, mainly because of the person the father turned out to be, but it was either be sad but keep going or be sad with a new baby (which made me feel like Id turn out to be more sad). If its what you need to do, or even just what you want to do, its your body and your rules. I know I made the right choice, but I still feel the sad. You come to accept it. Its okay.","I had one a year ago. I don't regret it for many reasons, mainly because of the person the father turned out to be, but it was either be sad but keep going or be sad with a new baby (which made me feel like Id turn out to be more sad). If its what you need to do, or even just what you want to do, its your body and your rules. I know I made the right choice, but I still feel the sad. You come to accept it. Its okay.","I had one a year ago. I do not regret it for many reasons, mainly because of the person the father turned out to be, but it was either be sad but keep going or be sad with a new baby (which made me feel like Id turn out to be more sad). If its what you need to do, or even just what you want to do, its your body and your rules. I know I made the right choice, but I still feel the sad. You come to accept it. Its okay."
4266,-0.167,Zombie fallout by Mark Tufo is about this very thing,Zombie fallout by Mark Tufo is about this very thing,Zombie fallout by Mark Tufo is about this very thing
27,-0.109,"There is a culture of exclusion in many higher paying fields like tech and science wherein women don't feel like they belong, are equals, etc. to men. And so some elect to enter another field after experiencing this directly, whereas others opt never to enter such fields in the first place and take up lesser paying jobs. \n\nSociological forces are at play, is my point. Also, the burden of children is something you shouldn't neglect. There are all sorts of reasons why a child came unplanned into her life, something which had a detrimental impact on her potential job earnings. And if you count the total number of caloric hours of labor undertaken by men and women, women work between 20 to 30 percent more per week than men, on average. Because, yes, child care and home making is labor, albeit unpaid.","There is a culture of exclusion in many higher paying fields like tech and science wherein women don't feel like they belong, are equals, etc. to men. And so some elect to enter another field after experiencing this directly, whereas others opt never to enter such fields in the first place and take up lesser paying jobs. Sociological forces are at play, is my point. Also, the burden of children is something you shouldn't neglect. There are all sorts of reasons why a child came unplanned into her life, something which had a detrimental impact on her potential job earnings. And if you count the total number of caloric hours of labor undertaken by men and women, women work between 20 to 30 percent more per week than men, on average. Because, yes, child care and home making is labor, albeit unpaid.","There is a culture of exclusion in many higher paying fields like tech and science wherein women do not feel like they belong, are equals, etc. to men. And so some elect to enter another field after experiencing this directly, whereas others opt never to enter such fields in the first place and take up lesser paying jobs. 
Sociological forces are at play, is my point. Also, the burden of children is something you should not neglect. There are all sorts of reasons why a child came unplanned into her life, something which had a detrimental impact on her potential job earnings. And if you count the total number of caloric hours of labor undertaken by men and women, women work between 20 to 30 percent more per week than men, on average. Because, yes, child care and home making is labor, albeit unpaid."


# Review data

In [11]:
# Every row gets worker id 0, stored as int8.
# NOTE(review): the meaning of "worker" is not visible in this notebook; presumably
# it matches a column expected by a downstream consumer; confirm.
col = "worker"
df[col] = np.int8(0)

# Final column selection and order for the export; info() confirms dtypes and null counts.
cols = ["label", "bws", "worker", "text", "text1", "text2"]
df[cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5710 entries, 0 to 5709
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   label   5710 non-null   int32  
 1   bws     5710 non-null   float32
 2   worker  5710 non-null   int8   
 3   text    5710 non-null   object 
 4   text1   5710 non-null   object 
 5   text2   5710 non-null   object 
dtypes: float32(1), int32(1), int8(1), object(3)
memory usage: 184.1+ KB


In [12]:
%%time
# to_parquet does not create missing parent folders, so make sure "output/" exists;
# otherwise a fresh checkout fails on the very last cell after all preprocessing ran.
os.makedirs("output", exist_ok=True)
# Persist only the reviewed columns; the RangeIndex carries no information, so drop it.
df[cols].to_parquet("output/pre_ruddit.parquet", index=False)

Wall time: 40 ms
