In [1]:
import os
import gc
import numpy as np
import pandas as pd
from scipy.stats import rankdata
from tqdm import tqdm
from typing import Dict, List, Tuple, NamedTuple, Callable
import scml
import mylib

In [2]:
# Percentile cut points; not referenced in the cells visible here —
# presumably meant for describe(percentiles=...), TODO confirm.
percentiles = [.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]

# Option keys are spelled out in full: pandas resolves shorthand keys by
# pattern matching, so a shorthand can stop working if a later release adds
# another option with a similar name.
# NOTE(review): "mode.use_inf_as_na" is deprecated in recent pandas releases —
# confirm against the pandas version this notebook is pinned to.
pd.set_option("mode.use_inf_as_na", True)
pd.set_option("display.max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option("display.max_colwidth", 9999)

# Needed for the progress_apply call used in the preprocessing stage.
tqdm.pandas()

In [3]:
# Lookup from "<post_id>_<comment_id>" to that comment's offensiveness score.
df = pd.read_csv("input/ruddit/Ruddit.csv", engine="c", low_memory=False)
score_map: Dict[str, float] = {
    post + "_" + comment: score
    for post, comment, score in zip(
        df["post_id"], df["comment_id"], df["offensiveness_score"]
    )
}

In [4]:
%%time
# Load the comment bodies and pair each usable one with its score from score_map.
df = pd.read_csv("input/ruddit/ruddit_with_text.csv", engine="c", low_memory=False)
# Placeholder bodies that carry no usable text.
blacklist = {"[deleted]", "[removed]"}
rows = []
for t in df.itertuples():
    text = t.txt
    # read_csv turns empty cells into NaN (a float), which has no .strip();
    # treat any non-string body as missing text and skip it instead of crashing.
    if not isinstance(text, str):
        continue
    s = text.strip().lower()
    if len(s) == 0 or s in blacklist:
        continue
    k = t.post_id + "_" + t.comment_id
    # A KeyError here means the text file has a comment with no score entry.
    rows.append({"bws": score_map[k], "text": text})
df = pd.DataFrame.from_records(rows)
df["bws"] = df["bws"].astype(np.float32)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5710 entries, 0 to 5709
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   bws     5710 non-null   float32
 1   text    5710 non-null   object 
dtypes: float32(1), object(1)
memory usage: 67.0+ KB
Wall time: 49 ms


In [5]:
# Per-column missing-value report; the rendered table lists Total, Percent
# and Type for each column of df.
scml.find_missing_values(df)

Unnamed: 0,Total,Percent,Type
bws,0,0.0,float32
text,0,0.0,object


# Severity rating label

In [6]:
# Rank the bws scores so that every row gets its own rank: the "ordinal"
# method hands tied scores distinct ranks instead of a shared one.
col = "label"
df[col] = rankdata(df["bws"].to_numpy(), method="ordinal").astype(np.int32)

# Stage 1: Preprocess Text
Note: preprocessing throughput dropped from about 2400 it/s to about 600 it/s (tqdm iterations, i.e. rows, per second).

In [7]:
def pre1(row) -> str:
    """Run mylib.pre1 over the "text" field of a single row."""
    raw_text = row["text"]
    cleaned = mylib.pre1(raw_text)
    return cleaned


col = "text1"
# Apply the cleaner to the "text" column directly. A row-wise apply (axis=1)
# builds a Series for every row only to read one field from it; mapping over
# the column yields the same values without that per-row overhead.
df[col] = df["text"].progress_apply(mylib.pre1)

100%|██████████████████████████████████████████| 5710/5710 [00:08<00:00, 670.86it/s]


In [8]:
# Columns to compare side by side: the score, the raw text and the cleaned text.
cols = ["bws", "text", "text1"]
# Draw 30 random rows and show them ordered by ascending bws.
(
    df[cols]
    .sample(n=30)
    .sort_values(by="bws")
    .head(30)
)

Unnamed: 0,bws,text,text1
2365,-0.354,It has 409 upvotes as of now and only 60 comments. That means 349 people are lurkers to this post.\nEdit:speling,It has 409 upvotes as of now and only 60 comments. That means 349 people are lurkers to this post. Edit:speling
2974,-0.304,It's a shame OSes don't do atomic updates yet but you should be able to press ctrl+alt+F2 and finish the update from the cli,it is a shame OSes do not do atomic updates yet but you should be able to press ctrl+alt+F2 and finish the update from the cli
5029,-0.271,"Well right now, I have neither. For the last 20 years I've had neither.\n\nThe world has kept on spinning.","Well right now, I have neither. For the last 20 years I have had neither. The world has kept on spinning."
2150,-0.25,I have an extremely mild allergy to bee stings that gets worse every time I’m stung by a bee. (Three stings over my lifetime so far). It now takes a couple of months for localised swelling to go down from a sting.,I have an extremely mild allergy to bee stings that gets worse every time I am stung by a bee. (Three stings over my lifetime so far). It now takes a couple of months for localised swelling to go down from a sting.
2983,-0.234,Linux is an iterative learning process. You do smth over and over an learn or understand smth new every time.,Linux is an iterative learning process. You do smth over and over an learn or understand smth new every time.
1640,-0.229,Exhibiting lesser integrity or a lack of loyalty to important people in their lives.,Exhibiting lesser integrity or a lack of loyalty to important people in their lives.
2091,-0.229,"On time yachting across the to the Isle of Mann at night the skipper asked us about the vessel near us that had come up on the radar , there was a full moon as well as bioluminescence in the water and we were alone, skipper comes out yelling to adjust course, but stops when she sees that we're alone, then the depth meter drops down to 5 meters, and then the mysterious blip vanished, was probably a sub but was pretty freaky","On time yachting across the to the Isle of Mann at night the skipper asked us about the vessel near us that had come up on the radar , there was a full moon as well as bioluminescence in the water and we were alone, skipper comes out yelling to adjust course, but stops when she sees that we are alone, then the depth meter drops down to 5 meters, and then the mysterious blip vanished, was probably a sub but was pretty freaky"
2039,-0.208,"Yeah when that episode aired for the first time I legit thought vegeta was a goner and then the next episode was titled ""end of earth"" so I freaked out even more","Yeah when that episode aired for the first time I legit thought vegeta was a goner and then the next episode was titled ""end of earth"" so I freaked out even more"
4558,-0.17,Took them a while to get forced injections right,Took them a while to get forced injections right
1863,-0.167,"The question was about sex work in general (so escorts, porn stars, dommes etc...). i.e. people who have sex for money. If you're fine with watching porn and your respect porn stars, I don't see why it should be any different for FSSWs. \n\nIf you don't know them how can you judge them?\n\nWhy is it sad? We all have a need for intimacy (see the frequent questions about dating etc. on this sub or r/askwomen for proof of that). What is sad is that so many people are lonely in this world. It's not for everyone, but for some seeing a professional can be a real help.\n","The question was about sex work in general (so escorts, porn stars, dommes etc...). i.e. people who have sex for money. If you are fine with watching porn and your respect porn stars, I do not see why it should be any different for FSSWs. If you do not know them how can you judge them? Why is it sad? We all have a need for intimacy (see the frequent questions about dating etc. on this sub or r/askwomen for proof of that). What is sad is that so many people are lonely in this world. it is not for everyone, but for some seeing a professional can be a real help."


In [9]:
# Another random draw of 30 rows, ordered by ascending bws.
(
    df[cols]
    .sample(n=30)
    .sort_values(by="bws")
    .head(30)
)

Unnamed: 0,bws,text,text1
2103,-0.542,Went up the Willis Tower Skydeck and stood on the glass looking down over Chicago. Quite an experience!,Went up the Willis Tower Skydeck and stood on the glass looking down over Chicago. Quite an experience!
416,-0.362,"An egg allergy would prevent a flu vaccine? That's interesting, I never knew that.","An egg allergy would prevent a flu vaccine? that is interesting, I never knew that."
2179,-0.292,No but a sugar tax would help.,No but a sugar tax would help.
4803,-0.271,> If the USA can turn itself around \n\nIt's going to be a loooooooooooooooong two years. :/,> If the USA can turn itself around it is going to be a loooooooooooooooong two years.:/
1130,-0.271,"Id rather have the acknowledgment of a pest in front of me rather than having an unexpected disturbance which would cause a sudden reaction, eg started by a spider and spilling a drink or food item.","Id rather have the acknowledgment of a pest in front of me rather than having an unexpected disturbance which would cause a sudden reaction, eg started by a spider and spilling a drink or food item."
2700,-0.234,Try living in Myanmar... Oh my god the internet..,Try living in Myanmar... Oh my god the internet..
4802,-0.208,"This is most certainly true. But my reasoning is China might jump at this chance with whatever it can. But it is still only a might.\n\nIf the USA can turn itself around and get voters out there to actually vote, and start some major economic reforms in the next election. Then certainly it will keep it's status. \n\nBut as it stands now the ball is in Chinas court, and it is taking advantage while it can. So for now it's definitely a wait and see, and the next presidential election will be critical","This is most certainly true. But my reasoning is China might jump at this chance with whatever it can. But it is still only a might. If the USA can turn itself around and get voters out there to actually vote, and start some major economic reforms in the next election. Then certainly it will keep it is status. But as it stands now the ball is in Chinas court, and it is taking advantage while it can. So for now it is definitely a wait and see, and the next presidential election will be critical"
3219,-0.208,The numbers indicate how many mines are adjacent to the cell. The goal is to mark all of them without clicking on a cell hiding a mine.,The numbers indicate how many mines are adjacent to the cell. The goal is to mark all of them without clicking on a cell hiding a mine.
1946,-0.196,"I broke up with a high school girlfriend the month before Halloween, then that Halloween her costume was a jilted bride. She came to school in a wedding dress. We dated for 3 weeks.","I broke up with a high school girlfriend the month before Halloween, then that Halloween her costume was a jilted bride. She came to school in a wedding dress. We dated for 3 weeks."
4864,-0.167,My question is how can we hit the pockets of these companies in case net neutrality is defeated. We need to try our best to hit them where it'll hurt.,My question is how can we hit the pockets of these companies in case net neutrality is defeated. We need to try our best to hit them where it will hurt.


In [10]:
# A further random draw of 30 rows, ordered by ascending bws.
(
    df[cols]
    .sample(n=30)
    .sort_values(by="bws")
    .head(30)
)

Unnamed: 0,bws,text,text1
5102,-0.553,Thank you everyone for the discussions and for the 11 gild! It's now my 24th birthday!,Thank you everyone for the discussions and for the 11 gild! it is now my 24th birthday!
5327,-0.479,And that's exactly what Johnson did.,And that is exactly what Johnson did.
3727,-0.375,I'm not confident about this and it worries me,I am not confident about this and it worries me
4459,-0.333,at the same time the transmission is very nerfed compared to real life.,at the same time the transmission is very nerfed compared to real life.
2395,-0.312,"Watched a guy do it, then the driver panicked and crashed into the car.","Watched a guy do it, then the driver panicked and crashed into the car."
3102,-0.298,"No, option 40 is your bonus, especially as a GED.","No, option 40 is your bonus, especially as a GED."
948,-0.292,You realize that ALL domesticated animals are a result of this right.,You realize that ALL domesticated animals are a result of this right.
2910,-0.25,People who agree with you are down-voting you because they don't understand your point.,People who agree with you are down-voting you because they do not understand your point.
2866,-0.229,> I know that atheists who fight for what is good go to the best place of all in heaven because they never expected any reward.\n\nThis is why I am not concerned about my beliefs and only want to treat people well.,> I know that atheists who fight for what is good go to the best place of all in heaven because they never expected any reward. This is why I am not concerned about my beliefs and only want to treat people well.
2766,-0.208,I don't mind people wanting to keep yeezys or Jordan's nice because they're for showing off. Ultra Boosts are actually just running shoes and not wearing them outside or through the elements is like buying a Lamborghini only so you can show it off at car meets. It's up to you and your decision but there's a whole other world out there you could be experiencing,I do not mind people wanting to keep yeezys or Jordan's nice because they are for showing off. Ultra Boosts are actually just running shoes and not wearing them outside or through the elements is like buying a Lamborghini only so you can show it off at car meets. it is up to you and your decision but there is a whole other world out there you could be experiencing


# Review data

In [11]:
# Give every row the constant worker id 0, stored as int8.
col = "worker"
df[col] = np.zeros(len(df), dtype=np.int8)
# Columns kept for the final dataset, in output order.
cols = ["label", "bws", "worker", "text", "text1"]
df[cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5710 entries, 0 to 5709
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   label   5710 non-null   int32  
 1   bws     5710 non-null   float32
 2   worker  5710 non-null   int8   
 3   text    5710 non-null   object 
 4   text1   5710 non-null   object 
dtypes: float32(1), int32(1), int8(1), object(2)
memory usage: 139.5+ KB


In [12]:
%%time
# The parquet writer does not create missing directories, so make sure the
# target folder exists; otherwise a run on a fresh checkout fails here.
os.makedirs("output", exist_ok=True)
df[cols].to_parquet("output/pre_ruddit.parquet", index=False)

Wall time: 31.8 ms
