In [1]:
import os
import gc
import numpy as np
import pandas as pd
from scipy.stats import rankdata
from tqdm import tqdm
from typing import Dict, List, Tuple, NamedTuple, Callable
import scml
import mylib

In [2]:
# Percentile grid kept available for describe()-style summaries.
percentiles = [.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]

# Use fully-qualified option keys: shorthand keys are resolved by pattern
# matching and can become ambiguous when pandas adds similarly named options.
# NOTE(review): "mode.use_inf_as_na" is deprecated in recent pandas releases;
# confirm the pinned pandas version still supports it.
pd.set_option("mode.use_inf_as_na", True)
pd.set_option("display.max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option("display.max_colwidth", 9999)

# Register the progress_* methods on pandas objects.
tqdm.pandas()

In [3]:
# Lookup from "<post_id>_<comment_id>" to the comment's offensiveness score.
df = pd.read_csv("input/ruddit/Ruddit.csv", engine="c", low_memory=False)
score_map: Dict[str, float] = {
    post_id + "_" + comment_id: score
    for post_id, comment_id, score in zip(
        df["post_id"], df["comment_id"], df["offensiveness_score"]
    )
}

In [4]:
%%time
# Load the comment bodies, drop unusable ones, and attach the score looked up
# in score_map under the key "<post_id>_<comment_id>".
df = pd.read_csv("input/ruddit/ruddit_with_text.csv", engine="c", low_memory=False)
blacklist = {"[deleted]", "[removed]"}
rows = []
for post_id, comment_id, text in zip(df["post_id"], df["comment_id"], df["txt"]):
    # read_csv parses empty cells as NaN (a float), which has no .strip();
    # treat those the same as empty text.
    if not isinstance(text, str):
        continue
    s = text.strip().lower()
    if not s or s in blacklist:
        continue
    # A missing key raises KeyError on purpose: it means the two input files
    # disagree about which comments exist.
    rows.append({"bws": score_map[post_id + "_" + comment_id], "text": text})
# Passing columns keeps "bws"/"text" present even when every row was filtered out.
df = pd.DataFrame.from_records(rows, columns=["bws", "text"])
df["bws"] = df["bws"].astype(np.float32)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5710 entries, 0 to 5709
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   bws     5710 non-null   float32
 1   text    5710 non-null   object 
dtypes: float32(1), object(1)
memory usage: 67.0+ KB
Wall time: 51 ms


In [5]:
# Per-column missing-value report; the rendered result lists Total, Percent and Type.
scml.find_missing_values(df)

Unnamed: 0,Total,Percent,Type
bws,0,0.0,float32
text,0,0.0,object


# Severity rating label

In [6]:
# Rank rows by "bws" with the "ordinal" method so every row gets its own rank
# (no shared ranks for tied scores), then store the ranks as int32.
col = "label"
ordinal_ranks = rankdata(df["bws"], method="ordinal")
df[col] = ordinal_ranks.astype(np.int32)

# Stage 1: Preprocess Text
Note: preprocessing throughput dropped from 2400 it/s to 600 it/s.

In [7]:
def pre1(row) -> str:
    """Return ``mylib.pre1`` applied to the ``"text"`` field of ``row``."""
    raw_text = row["text"]
    return mylib.pre1(raw_text)


def pre2(row) -> str:
    """Return ``mylib.pre2`` applied to the ``"text1"`` field of ``row``."""
    stage1_text = row["text1"]
    return mylib.pre2(stage1_text)


# Each stage reads exactly one column, so apply it to that column directly.
# Series.progress_apply passes the function one string at a time, whereas
# DataFrame.progress_apply(axis=1) has to build a row object for every record
# just to read a single field from it.
col = "text1"
print(col)
df[col] = df["text"].progress_apply(mylib.pre1)
col = "text2"
print(col)
df[col] = df["text1"].progress_apply(mylib.pre2)

text1


100%|██████████████████████████████████████████| 5710/5710 [00:06<00:00, 910.46it/s]


text2


100%|██████████████████████████████████████████| 5710/5710 [00:31<00:00, 178.46it/s]


In [13]:
cols = ["bws", "text", "text1", "text2"]
# Fixed seed so the reviewed rows are the same on every re-run. sample(20)
# already returns exactly 20 rows, so no head() is needed afterwards.
df[cols].sample(20, random_state=0).sort_values("bws")

Unnamed: 0,bws,text,text1,text2
2408,-0.604,Took me way too long to understand that this question arose from the username.,Took me way too long to understand that this question arose from the username.,Took me way too long to understand that this question arose from the username.
3415,-0.468,It’s not that convenient,It's not that convenient,it is not that convenient
1018,-0.319,I appreciate your comment because it strikes at the heart of the problem. Student loans are the real class barrier.,I appreciate your comment because it strikes at the heart of the problem. Student loans are the real class barrier.,I appreciate your comment because it strikes at the heart of the problem. Student loans are the real class barrier.
2393,-0.255,"Not the driver, but a witness\n\nOne day I was in vacation in São Paulo, there were at least six people with me, all of them were family. We clearly wouldn't be able to fit in a single cab, so we called two. Me, my dad and my cousin went in the second cab, when we got in, my dad said:\n\n""I always wanted to say this, but follow that car!""","Not the driver, but a witness One day I was in vacation in Sao Paulo, there were at least six people with me, all of them were family. We clearly wouldn't be able to fit in a single cab, so we called two. Me, my dad and my cousin went in the second cab, when we got in, my dad said: ""I always wanted to say this, but follow that car!""","Not the driver, but a witness One day I was in vacation in Sao Paulo, there were at least six people with me, all of them were family. We clearly would not be able to fit in a single cab, so we called two. Me, my dad and my cousin went in the second cab, when we got in, my dad said: ""I always wanted to say this, but follow that car!"""
1134,-0.239,"Yeah I can see the harassment angle, but it just feels like it would be too infrequent to be a real issue.","Yeah I can see the harassment angle, but it just feels like it would be too infrequent to be a real issue.","Yeah I can see the harassment angle, but it just feels like it would be too infrequent to be a real issue."
2700,-0.234,Try living in Myanmar... Oh my god the internet..,Try living in Myanmar... Oh my god the internet..,Try living in Myanmar... Oh my god the internet..
932,-0.149,I think animal breeeding without some kind of license and oversight should be outlawed. Violators pay a fine that covers the cost of the oversight with the remainder going to animal shelters and advocacy.,I think animal breeeding without some kind of license and oversight should be outlawed. Violators pay a fine that covers the cost of the oversight with the remainder going to animal shelters and advocacy.,I think animal breeeding without some kind of license and oversight should be outlawed. Violators pay a fine that covers the cost of the oversight with the remainder going to animal shelters and advocacy.
3501,-0.104,"I’m in a similar situation with my male partner, I don’t think it needs to be a gendered thing anymore, having a situation where someone has more time or willingness to do those types of things is a great situation to be in. My partner often says he’d make a great “trophy husband” should I ever make it to the top of my career. We balance each other’s roles perfectly","I'm in a similar situation with my male partner, I don't think it needs to be a gendered thing anymore, having a situation where someone has more time or willingness to do those types of things is a great situation to be in. My partner often says he'd make a great ""trophy husband"" should I ever make it to the top of my career. We balance each other's roles perfectly","I am in a similar situation with my male partner, I do not think it needs to be a gendered thing anymore, having a situation where someone has more time or willingness to do those types of things is a great situation to be in. My partner often says he would make a great ""trophy husband"" should I ever make it to the top of my career. We balance each other's roles perfectly"
1323,-0.083,"How would a parent know what works? Is the goal to stop the child doing what they are doing in the moment or to create a well adjusted adult? If both, what percentage would you place on those. \n\nHow do you spank correctly? And how do you measure efficiency? \n\nI agree generally that spanking isn’t always abusive but I have a lot of doubts about how effective it is and how good parents would be at knowing what is effective.","How would a parent know what works? Is the goal to stop the child doing what they are doing in the moment or to create a well adjusted adult? If both, what percentage would you place on those. How do you spank correctly? And how do you measure efficiency? I agree generally that spanking isn't always abusive but I have a lot of doubts about how effective it is and how good parents would be at knowing what is effective.","How would a parent know what works? Is the goal to stop the child doing what they are doing in the moment or to create a well adjusted adult? If both, what percentage would you place on those. How do you spank correctly? And how do you measure efficiency? I agree generally that spanking is not always abusive but I have a lot of doubts about how effective it is and how good parents would be at knowing what is effective."
2858,-0.062,The AP called CA for Clinton the day before primary voting... It was very disheartening.,The AP called CA for Clinton the day before primary voting... It was very disheartening.,The AP called CA for Clinton the day before primary voting... It was very disheartening.


In [9]:
# Second spot check: its own fixed seed gives a reproducible draw; sample(20)
# already returns exactly 20 rows, so no head() is needed.
df[cols].sample(20, random_state=1).sort_values("bws")

Unnamed: 0,bws,text,text1,text2
5227,-0.532,Now I regret not visiting it.,Now I regret not visiting it.,Now I regret not visiting it.
4893,-0.522,"🎵And we 'bout it every day, every day, every day 🎵\n\n","(musical note) And we 'bout it every day, every day, every day (musical note)","(musical note) And we 'bout it every day, every day, every day (musical note)"
3168,-0.489,too real man...too real.,too real man...too real.,too real man...too real.
439,-0.378,I have no interest in changing your view because you are correct,I have no interest in changing your view because you are correct,I have no interest in changing your view because you are correct
431,-0.375,Just had my post removed for speaking truth . Lmao 😄,Just had my post removed for speaking truth . Lmao (grinning face with smiling eyes),Just had my post removed for speaking truth . [laughing my ass off] (grinning face with smiling eyes)
1820,-0.312,I started lifting and taking care of myself late. The way women treat me changed 180. Even guys are doing me favours now.,I started lifting and taking care of myself late. The way women treat me changed 180. Even guys are doing me favours now.,I started lifting and taking care of myself late. The way women treat me changed 180. Even guys are doing me favours now.
1116,-0.167,I don’t know if hey are becoming a culture as much as we are becoming more aware of them and their effects. I am in adult leadership in cub scouts and Boy Scouts and we see kids with high anxiety and now that we know what that is we know how to properly deal with it and treat it. \n\nWhen I was a kid in the 80s these kids were just called “bad” and were never given a real chance.,"I don't know if hey are becoming a culture as much as we are becoming more aware of them and their effects. I am in adult leadership in cub scouts and Boy Scouts and we see kids with high anxiety and now that we know what that is we know how to properly deal with it and treat it. When I was a kid in the 80s these kids were just called ""bad"" and were never given a real chance.","I do not know if hey are becoming a culture as much as we are becoming more aware of them and their effects. I am in adult leadership in cub scouts and Boy Scouts and we see kids with high anxiety and now that we know what that is we know how to properly deal with it and treat it. When I was a kid in the 80s these kids were just called ""bad"" and were never given a real chance."
1388,-0.167,"Hell, I'm not American and I also miss Obama lol. The world was a lot more peaceful back then.","Hell, I'm not American and I also miss Obama lol. The world was a lot more peaceful back then.","Hell, I am not American and I also miss Obama [laughing out loud]. The world was a lot more peaceful back then."
2806,-0.146,It's a damn shame. The VICE documentaries from crazy warzones were solid gold. Some of the best on-the-ground journalism I've ever seen. \n\nNow the whole site is becoming Buzzfeed with better PR.,It's a damn shame. The VICE documentaries from crazy warzones were solid gold. Some of the best on-the-ground journalism I've ever seen. Now the whole site is becoming Buzzfeed with better PR.,it is a damn shame. The VICE documentaries from crazy warzones were solid gold. Some of the best on-the-ground journalism I have ever seen. Now the whole site is becoming Buzzfeed with better PR.
1227,-0.125,"Obesity is essentially an addiction to food. The obese get a greater dopamine kick from eating than the average person which makes it extremely difficult for them to eat within a healthy range. But here's the real trouble: unlike every other addiction, there is ZERO possibility of abstinence - the only way to truly end an addiction. Imagine a heroin addict who needs to give himself a tiny dose of heroin every day to stay alive. Like all addictions, it's partly moral choice, but there is a massive unchosen biological component they can't do a thing about.","Obesity is essentially an addiction to food. The obese get a greater dopamine kick from eating than the average person which makes it extremely difficult for them to eat within a healthy range. But here's the real trouble: unlike every other addiction, there is ZERO possibility of abstinence - the only way to truly end an addiction. Imagine a heroin addict who needs to give himself a tiny dose of heroin every day to stay alive. Like all addictions, it's partly moral choice, but there is a massive unchosen biological component they can't do a thing about.","Obesity is essentially an addiction to food. The obese get a greater dopamine kick from eating than the average person which makes it extremely difficult for them to eat within a healthy range. But here is the real trouble: unlike every other addiction, there is ZERO possibility of abstinence - the only way to truly end an addiction. Imagine a heroin addict who needs to give himself a tiny dose of heroin every day to stay alive. Like all addictions, it is partly moral choice, but there is a massive unchosen biological component they cannot do a thing about."


In [14]:
# Third spot check: its own fixed seed gives a reproducible draw; sample(20)
# already returns exactly 20 rows, so no head() is needed.
df[cols].sample(20, random_state=2).sort_values("bws")

Unnamed: 0,bws,text,text1,text2
2708,-0.468,"Michigan, Clemson, or Washington will be outside the top 4 for either Louisville, Ohio State, or Texas A&M","Michigan, Clemson, or Washington will be outside the top 4 for either Louisville, Ohio State, or Texas A&M","Michigan, Clemson, or Washington will be outside the top 4 for either Louisville, Ohio State, or Texas A&M"
98,-0.468,"Obviously I'm not looking to test this theory out in real life, but I'm pretty awesome in my daydreams.","Obviously I'm not looking to test this theory out in real life, but I'm pretty awesome in my daydreams.","Obviously I am not looking to test this theory out in real life, but I am pretty awesome in my daydreams."
1757,-0.438,Thy're not particulary religious.\n\nI guess that's just how they roll.,Thy're not particulary religious. I guess that's just how they roll.,Thy're not particulary religious. I guess that is just how they roll.
1332,-0.417,Tradition isn't evidence. We used to lots of things that would be considered abhorrent today.,Tradition isn't evidence. We used to lots of things that would be considered abhorrent today.,Tradition is not evidence. We used to lots of things that would be considered abhorrent today.
2080,-0.396,This was the late 90's so maybe the view had shifted a bit.,This was the late 90's so maybe the view had shifted a bit.,This was the late 90's so maybe the view had shifted a bit.
3142,-0.271,Let me know when k0nfig is IGLing.,Let me know when k0nfig is IGLing.,Let me know when k0nfig is IGLing.
2404,-0.25,People with viral posts become incapacitated\n\nEdit: Wakes up incapacitated,People with viral posts become incapacitated Edit: Wakes up incapacitated,People with viral posts become incapacitated Edit: Wakes up incapacitated
299,-0.208,"I would suggest that its fine if localities help fund them, but if the team wishes that to happen they should give the city a stake in the franchise. That way costs are mitigated by the profit.","I would suggest that its fine if localities help fund them, but if the team wishes that to happen they should give the city a stake in the franchise. That way costs are mitigated by the profit.","I would suggest that its fine if localities help fund them, but if the team wishes that to happen they should give the city a stake in the franchise. That way costs are mitigated by the profit."
5333,-0.146,Part of the Chavista movement is coopting Simon Bolivar. They call their movement the 'bolivarian revolution' or something like that.,Part of the Chavista movement is coopting Simon Bolivar. They call their movement the 'bolivarian revolution' or something like that.,Part of the Chavista movement is coopting Simon Bolivar. They call their movement the 'bolivarian revolution' or something like that.
957,-0.104,Who’s going to allow them? The wives that “aren’t providing enough sex”?,"Who's going to allow them? The wives that ""aren't providing enough sex""?","who is going to allow them? The wives that ""are not providing enough sex""?"


# Review data

In [11]:
# Add a constant int8 "worker" column (all zeros), then check the dtypes and
# non-null counts of the columns selected in `cols`.
col = "worker"
df[col] = np.zeros(len(df), dtype=np.int8)
cols = ["label", "bws", "worker", "text", "text1", "text2"]
df.loc[:, cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5710 entries, 0 to 5709
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   label   5710 non-null   int32  
 1   bws     5710 non-null   float32
 2   worker  5710 non-null   int8   
 3   text    5710 non-null   object 
 4   text1   5710 non-null   object 
 5   text2   5710 non-null   object 
dtypes: float32(1), int32(1), int8(1), object(3)
memory usage: 184.1+ KB


In [12]:
%%time
# Create the output directory first so the notebook runs end to end on a
# fresh checkout; writing the file does not create missing parent folders.
os.makedirs("output", exist_ok=True)
df[cols].to_parquet("output/pre_ruddit.parquet", index=False)

Wall time: 40 ms
