In [1]:
import os
import gc
import numpy as np
import pandas as pd
from scipy.stats import rankdata
from tqdm import tqdm
from typing import Dict, List, Tuple, NamedTuple, Callable
import scml
import mylib

In [2]:
# Percentile grid kept at module level for summary statistics.
percentiles = [0.01, 0.05, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 0.99]

# pandas options: treat +/-inf as missing and lift the display limits so wide
# frames and long comment texts are shown in full.
for option_name, option_value in (
    ("use_inf_as_na", True),
    ("max_info_columns", 9999),
    ("display.max_columns", 9999),
    ("display.max_rows", 9999),
    ('max_colwidth', 9999),
):
    pd.set_option(option_name, option_value)

# Register the progress_apply helpers on pandas objects.
tqdm.pandas()

In [3]:
# Build the lookup "<post_id>_<comment_id>" -> offensiveness score from the
# Ruddit score file. When a key repeats, the last row wins.
df = pd.read_csv("input/ruddit/Ruddit.csv", engine="c", low_memory=False)
score_map: Dict[str, float] = {
    post_id + "_" + comment_id: score
    for post_id, comment_id, score in zip(
        df["post_id"], df["comment_id"], df["offensiveness_score"]
    )
}

In [4]:
%%time
# Load the comment texts and pair each usable comment with its score from score_map.
df = pd.read_csv("input/ruddit/ruddit_with_text.csv", engine="c", low_memory=False)
# Placeholder bodies that carry no usable text (compared after strip + lowercase).
blacklist = {"[deleted]", "[removed]"}
rows = []
for t in df.itertuples():
    text = t.txt
    # read_csv parses empty cells as NaN (a float), which has no .strip();
    # treat any non-string value as an empty comment and skip it.
    if not isinstance(text, str):
        continue
    s = text.strip().lower()
    if not s or s in blacklist:
        continue
    # Same key format as used when score_map was built.
    k = t.post_id + "_" + t.comment_id
    # The original (unstripped, original-case) text is kept; `s` is only used for filtering.
    rows.append({"bws": score_map[k], "text": text})
df = pd.DataFrame.from_records(rows)
df["bws"] = df["bws"].astype(np.float32)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5710 entries, 0 to 5709
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   bws     5710 non-null   float32
 1   text    5710 non-null   object 
dtypes: float32(1), object(1)
memory usage: 67.0+ KB
CPU times: user 46.9 ms, sys: 15.6 ms, total: 62.5 ms
Wall time: 47.1 ms


In [5]:
# Missing-value report for the filtered frame via scml.find_missing_values;
# the rendered output lists Total, Percent and Type for each column.
scml.find_missing_values(df)

Unnamed: 0,Total,Percent,Type
bws,0,0.0,float32
text,0,0.0,object


# Severity rating label

In [6]:
# Force unique ranks: the "ordinal" method gives every row a distinct rank,
# so tied scores are separated by their order of occurrence.
col = "label"
ordinal_ranks = rankdata(df["bws"], method='ordinal')
df[col] = ordinal_ranks.astype(np.int32)

# Stage 1: Preprocess Text
Note: preprocessing speed dropped from 2400 to 600 it/s.

In [7]:
def pre1(row) -> str:
    """Apply ``mylib.pre1`` to the ``text`` entry of ``row`` and return the result."""
    raw_text = row["text"]
    return mylib.pre1(raw_text)


def pre2(row) -> str:
    """Apply ``mylib.pre2`` to the ``text1`` entry of ``row`` and return the result."""
    stage1_text = row["text1"]
    return mylib.pre2(stage1_text)


def pre3(row) -> str:
    """Apply ``mylib.pre3`` to the ``text2`` entry of ``row`` and return the result."""
    stage2_text = row["text2"]
    return mylib.pre3(stage2_text)


# Run the three preprocessing stages in order; each stage reads the column
# written by the previous one (text -> text1 -> text2 -> text3).
# The mylib function is applied to the source Series directly instead of using
# DataFrame.progress_apply(..., axis=1), which builds a row Series for every
# record only to read a single field from it.
for src, col, stage in (
    ("text", "text1", mylib.pre1),
    ("text1", "text2", mylib.pre2),
    ("text2", "text3", mylib.pre3),
):
    print(col)  # label the progress bar that follows
    df[col] = df[src].progress_apply(stage)

text1


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5710/5710 [00:07<00:00, 741.91it/s]


text2


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5710/5710 [00:44<00:00, 129.28it/s]


text3


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5710/5710 [00:42<00:00, 135.16it/s]


In [8]:
cols = ["bws", "text", "text1", "text2", "text3"]
# Spot-check 20 random rows ordered by score. The fixed seed keeps the
# displayed rows reproducible across kernel restarts; sample(20) already
# returns exactly 20 rows, so no trailing head(20) is needed.
df[cols].sample(20, random_state=0).sort_values("bws")

Unnamed: 0,bws,text,text1,text2,text3
3544,-0.875,So so awesome. Really want to see more.,So so awesome. Really want to see more.,So awesome. Really want to see more.,so awesome really want to see more
2677,-0.617,too many clothes to choose from,too many clothes to choose from,too many clothes to choose from,too many clothe to choose from
1410,-0.542,What are you talking about?,What are you talking about?,What are you talking about?,what be you talk about
1119,-0.458,"Its true, Abraham Hicks, exercise, and meditation helped me a lot.","Its true, Abraham Hicks, exercise, and meditation helped me a lot.","Its true, Abraham Hicks, exercise, and meditation helped me a lot.",its true abraham hicks exercise and meditation help i a lot
2621,-0.191,"> take with me to occupy my time on the train or planes.\n\nThe MSR7 don't isolate that well in loud environments. The 1A would be better from that perspective, comfort, and provides more bass.\n\nThe Oppo PM3 and NAD VISIO HP50 are both good isolators, but I just saw your comment about how overpriced they are, plus the PM3 wouldn't have enough bass if you're worried about bass on the MSR7.\n\nI've had a lot of portables and I think the 1A may be best for you, especially considering reasonable Japan prices. The Momentums are really laid back up top, the the m50x are more v-shaped that the 1A.\n","> take with me to occupy my time on the train or planes. The MSR7 don't isolate that well in loud environments. The 1A would be better from that perspective, comfort, and provides more bass. The Oppo PM3 and NAD VISIO HP50 are both good isolators, but I just saw your comment about how overpriced they are, plus the PM3 wouldn't have enough bass if you're worried about bass on the MSR7. I've had a lot of portables and I think the 1A may be best for you, especially considering reasonable Japan prices. The Momentums are really laid back up top, the the m50x are more v-shaped that the 1A.","> take with me to occupy my time on the train or planes. The MSR7 do not isolate that well in loud environments. The 1A would better from that perspective, comfort, and provides more bass. The Oppo PM3 and NAD VISIO HP50 are both good isolators, but I just saw your comment about how overpriced they are, plus the PM3 would not have enough bass if you are worried about bass on the MSR7. I have had a lot of portables and I think the 1A may best for you, especially considering reasonable Japan prices. 
The Momentums are really laid back up top, the m50x are more v-shaped that the 1A.",take with i to occupy my time on the train or plane the msr7 do not isolate that well in loud environment the 1a would well from that perspective comfort and provide more bass the oppo pm3 and nad visio hp50 be both good isolator but i just see your comment about how overpriced they be plus the pm3 would not have enough bass if you be worried about bass on the msr7 i have have a lot of portable and i think the 1a may well for you especially consider reasonable japan price the momentums be really lay back up top the m50x be more v shape that the 1a
3064,-0.188,I think they mean ‘uncomfortable’ as in the comedy style of between two ferns.,I think they mean 'uncomfortable' as in the comedy style of between two ferns.,I think they mean 'uncomfortable' as in the comedy style of between two ferns.,i think they mean uncomfortable as in the comedy style of between two fern
4356,0.0,"Genuine question, can you take out life insurance on an unvaccinated baby? I doubt insurers would take that obvious risk","Genuine question, can you take out life insurance on an unvaccinated baby? I doubt insurers would take that obvious risk","Genuine question, can you take out life insurance on an unvaccinated baby? I doubt insurers would take that obvious risk",genuine question can you take out life insurance on an unvaccinated baby i doubt insurer would take that obvious risk
369,0.0,"I firmly disagree. A cost is a cost, it doesn't matter if it's up front or spread out, doesn't make a huge difference. In fact, it's easier for most people/organizations to pay a cost spread out over than all up front. Regardless, gun ownership does have a recurring cost in license renewal. \n\nThe analogy doesn't need to be 100% the same, I see no significant differences. \n\nAnother analogy that might suit you better is that in order to assemble in protest you need a permit, and in many places that comes with a non-refundable application fee. Here is an example of the government actively putting a paywall in-front of exercising a first amendment right.","I firmly disagree. A cost is a cost, it doesn't matter if it's up front or spread out, doesn't make a huge difference. In fact, it's easier for most people/organizations to pay a cost spread out over than all up front. Regardless, gun ownership does have a recurring cost in license renewal. The analogy doesn't need to be 100% the same, I see no significant differences. Another analogy that might suit you better is that in order to assemble in protest you need a permit, and in many places that comes with a non-refundable application fee. Here is an example of the government actively putting a paywall in-front of exercising a first amendment right.","I firmly disagree. A cost is a cost, it does not matter if it is up front or spread out, does not make a huge difference. In fact, it is easier for most people/organizations to pay a cost spread out over than all up front. Regardless, gun ownership does have a recurring cost in license renewal. The analogy does not need to be 100% the same, I see no significant differences. Another analogy that might suit you better is that in order to assemble in protest you need a permit, and in many places that comes with a non-refundable application fee. 
Here is an example of the government actively putting a paywall in-front of exercising a first amendment right.",i firmly disagree a cost be a cost it do not matter if it be up front or spread out do not make a huge difference in fact it be easy for most people organization to pay a cost spread out over than all up front regardless gun ownership do have a recur cost in license renewal the analogy do not need to be 100 the same i see no significant difference another analogy that might suit you well be that in order to assemble in protest you need a permit and in many place that come with a non refundable application fee here be an example of the government actively put a paywall in front of exercise a first amendment right
4945,0.0,What the hell did I just read.,What the hell did I just read.,What the hell did I just read.,what the hell do i just read
4313,0.021,Show her that Jim Jeffries piece on autism,Show her that Jim Jeffries piece on autism,Show her that Jim Jeffries piece on autism,show she that jim jeffries piece on autism


In [9]:
# Second spot check of 20 random rows ordered by score. The fixed seed keeps
# this cell's rows reproducible across kernel restarts; sample(20) already
# returns exactly 20 rows, so no trailing head(20) is needed.
df[cols].sample(20, random_state=1).sort_values("bws")

Unnamed: 0,bws,text,text1,text2,text3
3073,-0.396,Gordon Keith has a calm dark sense of humor. He is a radio host during the day with The Ticket and does these type of interviews as a side gig.,Gordon Keith has a calm dark sense of humor. He is a radio host during the day with The Ticket and does these type of interviews as a side gig.,Gordon Keith has a calm dark sense of humor. He is a radio host during the day with The Ticket and does these type of interviews a side gig.,gordon keith have a calm dark sense of humor he be a radio host during the day with the ticket and do these type of interview a side gig
5095,-0.375,How else are ya gonna eat all that poutine?,How else are ya gonna eat all that poutine?,How else are ya going to eat all that poutine?,how else be ya go to eat all that poutine
4658,-0.34,At least part of it if I was walking the same 30 miles for some reason. The bigger issue is he was obviously there as part of some humanitarian aid and so likely had transportation. So to watch her carry the kid he either drove next to her or was going on a random 30 mile stroll.,At least part of it if I was walking the same 30 miles for some reason. The bigger issue is he was obviously there as part of some humanitarian aid and so likely had transportation. So to watch her carry the kid he either drove next to her or was going on a random 30 mile stroll.,At least part of it if I was walking the same 30 miles for some reason. The bigger issue is he was obviously there as part of some humanitarian aid and so likely had transportation. So to watch her carry the kid he either drove next to her or was going on a random 30 mile stroll.,at least part of it if i be walk the same 30 mile for some reason the big issue be he be obviously there as part of some humanitarian aid and so likely have transportation so to watch she carry the kid he either drive next to she or be go on a random 30 mile stroll
3507,-0.312,"Well what is weird is subjective. Everyone has a different definition of what is normal and what is weird. As long as she is happy and you truly are happy, then what else matters? Your own happiness is absolutely much more of a priority than what other people think. When we find partners, we are finding a match - which means you both take on a role in each other’s lives that the both of you embrace.","Well what is weird is subjective. Everyone has a different definition of what is normal and what is weird. As long as she is happy and you truly are happy, then what else matters? Your own happiness is absolutely much more of a priority than what other people think. When we find partners, we are finding a match - which means you both take on a role in each other's lives that the both of you embrace.","Well what is weird is subjective. Everyone has a different definition of what is normal and what is weird. As long as she is happy and you truly are happy, then what else matters? Your own happiness is absolutely much more of a priority than what other people think. When we find partners, we are finding a match - which means you both take on a role in each other's lives that the both of you embrace.",well what be weird be subjective everyone have a different definition of what be normal and what be weird as long as she be happy and you truly be happy then what else matter your own happiness be absolutely much more of a priority than what other people think when we find partner we be find a match which mean you both take on a role in each other s life that the both of you embrace
4829,-0.213,"The Government gets paid in tax money, the people IN government have multiple sources of income.","The Government gets paid in tax money, the people IN government have multiple sources of income.","The Government gets paid in tax money, the people IN government have multiple sources of income.",the government gets pay in tax money the people in government have multiple source of income
2284,-0.188,"Not to mention it isn’t exactly suddenly, he’s been learning from the force alone for years","Not to mention it isn't exactly suddenly, he's been learning from the force alone for years","Not to mention it is not exactly suddenly, he is been learning from the force alone for years",not to mention it be not exactly suddenly he be be learn from the force alone for year
4659,-0.146,"It's not like I was joking or anything... I mean, who would take a serious subject and make light of it because of awkward wording?","It's not like I was joking or anything... I mean, who would take a serious subject and make light of it because of awkward wording?","it is not like I was joking or anything... I mean, who would take a serious subject and make light of it because of awkward wording?",it be not like i be joke or anything i mean who would take a serious subject and make light of it because of awkward wording
4730,-0.125,I personally can't believe this title. The Doctor has traveled enough through time and space that he would very well know that vaccines don't cause autism...,I personally can't believe this title. The Doctor has traveled enough through time and space that he would very well know that vaccines don't cause autism...,I personally cannot believe this title. The Doctor has traveled enough through time and space that he would very well know that vaccines do not cause autism...,i personally can not believe this title the doctor have travel enough through time and space that he would very well know that vaccine do not cause autism
1744,-0.104,"Yeah, if a woman seriously began hindering a guys social life and became controlling, I'd hope that guy would be able to think for himself and put his foot down. That sort of thing is scary.","Yeah, if a woman seriously began hindering a guys social life and became controlling, I'd hope that guy would be able to think for himself and put his foot down. That sort of thing is scary.","Yeah, if a woman seriously began hindering a guys social life and became controlling, I would hope that guy would be able to think for himself and put his foot down. That sort of thing is scary.",yeah if a woman seriously begin hinder a guy social life and became control i would hope that guy would be able to think for himself and put his foot down that sort of thing be scary
4691,-0.104,In Belgium we don't vaccinate for chicken pox... But if my kids don't get it naturally by the time they're 5-6 I will go vaccinate them.,In Belgium we don't vaccinate for chicken pox... But if my kids don't get it naturally by the time they're 5-6 I will go vaccinate them.,In Belgium we do not vaccinate for chicken pox... But if my kids do not get it naturally by the time they are 5-6 I will go vaccinate them.,in belgium we do not vaccinate for chicken pox but if my kid do not get it naturally by the time they be 5 6 i will go vaccinate they


In [10]:
# Third spot check of 20 random rows ordered by score. The fixed seed keeps
# this cell's rows reproducible across kernel restarts; sample(20) already
# returns exactly 20 rows, so no trailing head(20) is needed.
df[cols].sample(20, random_state=2).sort_values("bws")

Unnamed: 0,bws,text,text1,text2,text3
3657,-0.667,Thanks and good luck! You can do it!!,Thanks and good luck! You can do it!!,Thanks and good luck! You can do it!!,thank and good luck you can do it
5144,-0.562,This is so sexy! Love it!,This is so sexy! Love it!,This so sexy! Love it!,this so sexy love it
4040,-0.458,You deserve a fresh start! Enjoy what comes when the pain dissipates.,You deserve a fresh start! Enjoy what comes when the pain dissipates.,You deserve a fresh start! Enjoy what comes when the pain dissipates.,you deserve a fresh start enjoy what come when the pain dissipate
4431,-0.375,Adults that think they can meme politics,Adults that think they can meme politics,"Adults that think they can [an image, video, piece of text, etc., typically humorous in nature and shared widely] politics",adult that think they can an image video piece of text etc typically humorous in nature and share widely politic
4187,-0.375,he's a waitress - progress?,he's a waitress - progress?,he is a waitress - progress?,he be a waitress progress
2922,-0.326,If i had a penny for every time ive visited the farm. Id have 3 pennys.,If i had a penny for every time ive visited the farm. Id have 3 pennys.,If i had a penny for every time ive visited the farm. Id have 3 pennys.,if i have a penny for every time i ve visit the farm i d have 3 penny
2404,-0.25,People with viral posts become incapacitated\n\nEdit: Wakes up incapacitated,People with viral posts become incapacitated Edit: Wakes up incapacitated,People with viral posts become incapacitated Edit: Wakes up incapacitated,people with viral post become incapacitated edit wake up incapacitated
3636,-0.25,He has the arms of a folding chair,He has the arms of a folding chair,He has the arms of a folding chair,he have the arm of a fold chair
4148,-0.125,180 hours is like a week and a half. No one is working that long in one go. I'm on call 24/7. The only time I'm not on call is if I'm on a vacation. I don't plan on being hospitalized any time soon. She should have managed her downtime better.,180 hours is like a week and a half. No one is working that long in one go. I'm on call 24/7. The only time I'm not on call is if I'm on a vacation. I don't plan on being hospitalized any time soon. She should have managed her downtime better.,180 hours is like a week and a half. No one is working that long in one go. I am on call 24/7. The only time I am not on call is if I am on a vacation. I do not plan on being hospitalized any time soon. She should have managed her downtime better.,180 hour be like a week and a half no one be work that long in one go i be on call 24 7 the only time i be not on call be if i be on a vacation i do not plan on be hospitalize any time soon she should have manage her downtime well
5646,-0.062,"""Flex""? It's a ""flex"" to acknowledge the blame is with corrupt capitalism in government, and that ""communists"" have absolutely nothing to do with it?","""Flex""? It's a ""flex"" to acknowledge the blame is with corrupt capitalism in government, and that ""communists"" have absolutely nothing to do with it?","""Flex""? it is a ""flex"" to acknowledge the blame is with corrupt capitalism in government, and that ""communists"" have absolutely nothing to do with it?",flex it be a flex to acknowledge the blame be with corrupt capitalism in government and that communist have absolutely nothing to do with it


# Review data

In [11]:
# Add a constant "worker" column (all zeros, int8), then summarise the
# columns selected for export.
col = "worker"
df[col] = np.zeros(len(df), dtype=np.int8)
cols = ["label", "bws", "worker", "text", "text1", "text2", "text3"]
df[cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5710 entries, 0 to 5709
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   label   5710 non-null   int32  
 1   bws     5710 non-null   float32
 2   worker  5710 non-null   int8   
 3   text    5710 non-null   object 
 4   text1   5710 non-null   object 
 5   text2   5710 non-null   object 
 6   text3   5710 non-null   object 
dtypes: float32(1), int32(1), int8(1), object(4)
memory usage: 228.7+ KB


In [12]:
%%time
# Write the selected columns to parquet without the index. The target folder
# is created first because to_parquet does not create missing directories.
os.makedirs("output", exist_ok=True)
df[cols].to_parquet("output/pre_ruddit.parquet", index=False)

CPU times: user 46.9 ms, sys: 15.6 ms, total: 62.5 ms
Wall time: 66.6 ms
