In [1]:
import os
import gc
import numpy as np
import pandas as pd
from scipy.stats import rankdata
from tqdm import tqdm
from typing import Dict, List, Tuple, NamedTuple, Callable
import scml
import mylib

In [2]:
# Percentile cut points kept at notebook level.
# NOTE(review): not referenced in the visible cells -- presumably meant for
# `describe(percentiles=...)`; confirm before removing.
percentiles = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]

# Notebook-wide pandas options: treat +/-inf as missing and lift the display
# limits so wide/long frames and long text cells are shown in full.
# NOTE(review): "use_inf_as_na" is deprecated in recent pandas releases --
# confirm the pinned pandas version still accepts it.
for _option_name, _option_value in (
    ("use_inf_as_na", True),
    ("max_info_columns", 9999),
    ("display.max_columns", 9999),
    ("display.max_rows", 9999),
    ("max_colwidth", 9999),
):
    pd.set_option(_option_name, _option_value)

# Enable the tqdm-backed `progress_apply` methods used by later cells.
tqdm.pandas()

In [3]:
# Lookup table: "<post_id>_<comment_id>" -> offensiveness score, filled from
# the score-only Ruddit CSV. Later rows overwrite earlier ones on a key clash.
score_map: Dict[str, float] = {}
df = pd.read_csv("input/ruddit/Ruddit.csv", engine="c", low_memory=False)
score_map.update(
    (post_id + "_" + comment_id, score)
    for post_id, comment_id, score in zip(
        df["post_id"], df["comment_id"], df["offensiveness_score"]
    )
)

In [4]:
%%time
# Load the comment bodies and pair each usable comment with its score from
# `score_map` (keyed by "<post_id>_<comment_id>").
df = pd.read_csv("input/ruddit/ruddit_with_text.csv", engine="c", low_memory=False)
# Placeholder bodies that carry no real comment text.
blacklist = {"[deleted]", "[removed]"}
rows = []
for t in df.itertuples():
    text = t.txt
    # read_csv parses an empty cell as NaN (a float); calling .strip() on it
    # would raise AttributeError, so such rows are skipped like blank text.
    if not isinstance(text, str):
        continue
    s = text.strip().lower()
    if not s or s in blacklist:
        continue
    k = t.post_id + "_" + t.comment_id
    # A missing key raises KeyError on purpose: a comment without a score
    # means the two input files are out of sync.
    rows.append({"bws": score_map[k], "text": text})
# Explicit columns keep the frame well-formed even when every row was skipped,
# so the dtype cast below cannot fail on a missing "bws" column.
df = pd.DataFrame(rows, columns=["bws", "text"])
df["bws"] = df["bws"].astype(np.float32)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5710 entries, 0 to 5709
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   bws     5710 non-null   float32
 1   text    5710 non-null   object 
dtypes: float32(1), object(1)
memory usage: 67.0+ KB
Wall time: 69.3 ms


In [5]:
# Display the missing-value report for the filtered frame. Judging by the
# rendered output it lists Total / Percent / Type per column -- the helper's
# exact semantics live in `scml` and are not visible here.
scml.find_missing_values(df)

Unnamed: 0,Total,Percent,Type
bws,0,0.0,float32
text,0,0.0,object


# Severity rating label

In [6]:
# Derive a strict ordering from the scores: the "ordinal" method gives every
# row its own rank, so tied `bws` values never share a label.
col = "label"
df[col] = rankdata(df["bws"], method="ordinal").astype(np.int32)

# Stage 1: Preprocess Text
Note: processing speed in this stage dropped from 2400 it/s to 600 it/s.

In [7]:
def pre1(row) -> str:
    """Run `mylib.pre1` on the row's "text" field and return the result."""
    raw_text = row["text"]
    return mylib.pre1(raw_text)


def pre2(row) -> str:
    """Run `mylib.pre2` on the row's "text1" field and return the result."""
    stage1_text = row["text1"]
    return mylib.pre2(stage1_text)


# Two-step preprocessing: text -> text1 (mylib.pre1), then text1 -> text2
# (mylib.pre2). Each function is applied over its single source column rather
# than with DataFrame.apply(axis=1), which materialises a Series for every row
# only to read one field from it; the resulting columns are the same.
col = "text1"
print(col)
df[col] = df["text"].progress_apply(mylib.pre1)
col = "text2"
print(col)
df[col] = df["text1"].progress_apply(mylib.pre2)

text1


100%|██████████████████████████████████████████| 5710/5710 [00:06<00:00, 913.09it/s]


text2


100%|██████████████████████████████████████████| 5710/5710 [00:47<00:00, 120.71it/s]


In [8]:
cols = ["bws", "text", "text1", "text2"]
# Spot-check 20 random rows, ordered by score, to compare the raw text with
# both preprocessed versions. The fixed seed keeps the displayed rows stable
# across Restart & Run All; sample(20) already returns exactly 20 rows, so no
# extra head() is needed.
df[cols].sample(20, random_state=0).sort_values("bws")

Unnamed: 0,bws,text,text1,text2
2399,-0.667,"I kinda want to make a space marine costume, but that’ll be difficult to do","I kinda want to make a space marine costume, but that'll be difficult to do","I kind of want to make a space marine costume, but that will be difficult to do"
1580,-0.522,"Agreed go no contact, get a lawyer, hit the gym, and find a hobby.","Agreed go no contact, get a lawyer, hit the gym, and find a hobby.","Agreed go no contact, get a lawyer, hit the gym, and find a hobby."
3749,-0.458,"Very wise decision , you would probably get 35-50 K in dividends per year. Either you can reinvent it , travel or go part time at work. Either way selling and investing it in market would be great . Sell and and wait couple years for recession/ market crash. You’ll get great value of stocks and a higher dividend. Enjoy early retirement if that’s what your looking for !","Very wise decision , you would probably get 35-50 K in dividends per year. Either you can reinvent it , travel or go part time at work. Either way selling and investing it in market would be great . Sell and and wait couple years for recession/ market crash. You'll get great value of stocks and a higher dividend. Enjoy early retirement if that's what your looking for !","Very wise decision , you would probably get 35-50 K in dividends per year. Either you can reinvent it , travel or go part time at work. Either way selling and investing it in market would be great . Sell and and wait couple years for recession/ market crash. you will get great value of stocks and a higher dividend. Enjoy early retirement if that is what your looking for !"
729,-0.396,"I agree with your assessment, but I think you are mistaken about spiderbro.\n\nSpecifically, a big reason that sub is popular is that siders are everywhere and mostly, easy to photograph. I do not think it's indicative of any relative arthropod preferences.","I agree with your assessment, but I think you are mistaken about spiderbro. Specifically, a big reason that sub is popular is that siders are everywhere and mostly, easy to photograph. I do not think it's indicative of any relative arthropod preferences.","I agree with your assessment, but I think you are mistaken about spiderbro. Specifically, a big reason that sub is popular is that siders are everywhere and mostly, easy to photograph. I do not think it is indicative of any relative arthropod preferences."
2488,-0.239,"You know, I wouldn't be suprised if Sanji cut ties with his family because they kept fighting over girls.","You know, I wouldn't be suprised if Sanji cut ties with his family because they kept fighting over girls.","You know, I would not be suprised if Sanji cut ties with his family because they kept fighting over girls."
3139,-0.174,"With that logic Niko wasn't a good player in mouz because he didn't win a lot.\n\nWinning is about team performance. If your team isn't good you will still lose more than you win even if you are one of the most skilled players. You can be an insane player and lose.\n\nWinning isn't an issue but it still means you are very likely to die less, have less ecos where yout chances of fragging are low, get more eco kills etc. Having good stats in a game you win, often even very clearly, is going to be a lot easier than when you are losing.","With that logic Niko wasn't a good player in mouz because he didn't win a lot. Winning is about team performance. If your team isn't good you will still lose more than you win even if you are one of the most skilled players. You can be an insane player and lose. Winning isn't an issue but it still means you are very likely to die less, have less ecos where yout chances of fragging are low, get more eco kills etc. Having good stats in a game you win, often even very clearly, is going to be a lot easier than when you are losing.","With that logic Niko was not a good player in mouz because he did not win a lot. Winning is about team performance. If your team is not good you will still lose more than you win even if you are one of the most skilled players. You can be an insane player and lose. Winning is not an issue but it still means you are very likely to die less, have less ecos where yout chances of fragging are low, get more eco kills etc. Having good stats in a game you win, often even very clearly, is going to be a lot easier than when you are losing."
1249,-0.167,"Do you choose the food you like? Do you choose the desire to eat such foods? Do you choose the knowledge you have to combat these desires? Do you choose the desire to get up and go to the gym? \n\nNothing is a choice. Sometimes you get provoked into doing one thing or another. What we need to do is set up an incentive system prodding people into happy, healthy lives.","Do you choose the food you like? Do you choose the desire to eat such foods? Do you choose the knowledge you have to combat these desires? Do you choose the desire to get up and go to the gym? Nothing is a choice. Sometimes you get provoked into doing one thing or another. What we need to do is set up an incentive system prodding people into happy, healthy lives.","Do you choose the food you like? Do you choose the desire to eat such foods? Do you choose the knowledge you have to combat these desires? Do you choose the desire to get up and go to the gym? Nothing is a choice. Sometimes you get provoked into doing one thing or another. What we need to do is set up an incentive system prodding people into happy, healthy lives."
2182,-0.125,"I would beg to differ. He had been expelled for almost a year. He planned and waited. That is pretty adult thinking. Not emotional and spur of the moment, you know, like a child.\n\nI think that the law will see him as an adult as well.","I would beg to differ. He had been expelled for almost a year. He planned and waited. That is pretty adult thinking. Not emotional and spur of the moment, you know, like a child. I think that the law will see him as an adult as well.","I would beg to differ. He had been expelled for almost a year. He planned and waited. That is pretty adult thinking. Not emotional and spur of the moment, you know, like a child. I think that the law will see him as an adult as well."
808,-0.062,"It's just not that deep. Some people are insecure and immature. There is nothing mystical or mysterious at work here, it's kids yelling lame insults.","It's just not that deep. Some people are insecure and immature. There is nothing mystical or mysterious at work here, it's kids yelling lame insults.","it is just not that deep. Some people are insecure and immature. There is nothing mystical or mysterious at work here, it is kids yelling lame insults."
3814,-0.062,"The trick is kegals exercise. When you feel the baby trying to get out, just clinch those muscles and they'll stay in for another 10 years or so of gestation.","The trick is kegals exercise. When you feel the baby trying to get out, just clinch those muscles and they'll stay in for another 10 years or so of gestation.","The trick is kegals exercise. When you feel the baby trying to get out, just clinch those muscles and they will stay in for another 10 years or so of gestation."


In [9]:
# Another 20-row spot-check ordered by score. The fixed seed keeps the
# displayed rows reproducible across re-runs; sample(20) already returns
# exactly 20 rows, so no extra head() is needed.
df[cols].sample(20, random_state=1).sort_values("bws")

Unnamed: 0,bws,text,text1,text2
2387,-0.708,Both at the same time,Both at the same time,Both at the same time
3958,-0.542,..Scotland is giving out free toilet paper and soap? Awesome!,..Scotland is giving out free toilet paper and soap? Awesome!,..Scotland is giving out free toilet paper and soap? Awesome!
896,-0.489,Whipped cream is cooler since it’s in the fridge. It also blends with the chocolate a lot more since it’s a really light foam to begin with. It’s taste goes along with hot chocolate better IMO.,Whipped cream is cooler since it's in the fridge. It also blends with the chocolate a lot more since it's a really light foam to begin with. It's taste goes along with hot chocolate better IMO.,Whipped cream is cooler since it is in the fridge. It also blends with the chocolate a lot more since it is a really light foam to begin with. it is taste goes along with hot chocolate better IMO.
288,-0.391,"There's not much really keeping a sports team in a specific city, so if one city decides not to give incentive, they'll just move to the next city that does. The cities are eager to keep them around for the tourist exposure as well as the pride/bragging rights (if there's an overall net gain or not, I'd be curious for links to some studies).\n\nConceptually, I totally agree, especially when it's at the expense of public parks, but sadly it won't mean anything unless everyone agrees not to give incentives.","There's not much really keeping a sports team in a specific city, so if one city decides not to give incentive, they'll just move to the next city that does. The cities are eager to keep them around for the tourist exposure as well as the pride/bragging rights (if there's an overall net gain or not, I'd be curious for links to some studies). Conceptually, I totally agree, especially when it's at the expense of public parks, but sadly it won't mean anything unless everyone agrees not to give incentives.","There is not much really keeping a sports team in a specific city, so if one city decides not to give incentive, they will just move to the next city that does. The cities are eager to keep them around for the tourist exposure as well as the pride/bragging rights (if there is an overall net gain or not, I would be curious for links to some studies). Conceptually, I totally agree, especially when it is at the expense of public parks, but sadly it will not mean anything unless everyone agrees not to give incentives."
2030,-0.354,The opportunity to hit the reset button.,The opportunity to hit the reset button.,The opportunity to hit the reset button.
3844,-0.333,"Yours is the Girl Scout Pledge. What u/hagnonbg wrote was the Girl Scout Law. 2 different things, both correct.\n\nand the something something is ""to help people at all times""","Yours is the Girl Scout Pledge. What u/hagnonbg wrote was the Girl Scout Law. 2 different things, both correct. and the something something is ""to help people at all times""","Yours is the Girl Scout Pledge. What u/hagnonbg wrote was the Girl Scout Law. 2 different things, both correct. and the something something is ""to help people at all times"""
5566,-0.312,Just wait til they remove pot from Schedule 1...all the pharmaceuticals will undercut the dispensaries and jack prices back up. Capitalism isn't always the answer.,Just wait til they remove pot from Schedule 1...all the pharmaceuticals will undercut the dispensaries and jack prices back up. Capitalism isn't always the answer.,Just wait til they remove pot from Schedule 1...all the pharmaceuticals will undercut the dispensaries and jack prices back up. Capitalism is not always the answer.
226,-0.271,"Do most families have weird power dynamics going on? In my experiences, all of my family members were on equal footing.","Do most families have weird power dynamics going on? In my experiences, all of my family members were on equal footing.","Do most families have weird power dynamics going on? In my experiences, all of my family members were on equal footing."
2646,-0.271,well unfortunately they can do anything. they have all of the power,well unfortunately they can do anything. they have all of the power,well unfortunately they can do anything. they have all of the power
3238,-0.239,Risky click of the day,Risky click of the day,Risky click of the day


In [10]:
# Another 20-row spot-check ordered by score. The fixed seed keeps the
# displayed rows reproducible across re-runs; sample(20) already returns
# exactly 20 rows, so no extra head() is needed.
df[cols].sample(20, random_state=2).sort_values("bws")

Unnamed: 0,bws,text,text1,text2
3502,-0.333,Wow! Congratulations on finding a loving respectful relationship! That can be a rare thing in someone’s life. Embrace it. There is no such thing as a normal role and even if there was and you’re comfortable with how your relationship is working then F normal! Good on you.,Wow! Congratulations on finding a loving respectful relationship! That can be a rare thing in someone's life. Embrace it. There is no such thing as a normal role and even if there was and you're comfortable with how your relationship is working then F normal! Good on you.,Wow! Congratulations on finding a loving respectful relationship! That can be a rare thing in someone is life. Embrace it. There is no such thing as a normal role and even if there was and you are comfortable with how your relationship is working then F normal! Good on you.
4197,-0.333,Not in Colorado you're not!,Not in Colorado you're not!,Not in Colorado you are not!
2133,-0.25,Simpsons Hit & Run\n\nStill the best GTA game. Fight me.,Simpsons Hit & Run Still the best GTA game. Fight me.,Simpsons Hit & Run Still the best GTA game. Fight me.
2490,-0.167,"I personally think it won't be revealed until she falls in the water and Pedro says something like ""don't worry she's an excellent swimmer."" ""...**bubble bubble**..."" ""Umm...""","I personally think it won't be revealed until she falls in the water and Pedro says something like ""don't worry she's an excellent swimmer."" ...""**bubble bubble**..."" ""Umm...""","I personally think it will not be revealed until she falls in the water and Pedro says something like ""do not worry she is an excellent swimmer."" ...""**bubble bubble**..."" ""Umm..."""
5136,-0.167,Apples literally the only company I remotely trust with privacy. \n\nExact opposite about google.,Apples literally the only company I remotely trust with privacy. Exact opposite about google.,Apples literally the only company I remotely trust with privacy. Exact opposite about google.
2435,-0.104,This is hilarious and sad. As if I dont understand more about womens struggles and the history behind them from my GWS classes....,This is hilarious and sad. As if I dont understand more about womens struggles and the history behind them from my GWS classes....,This is hilarious and sad. As if I dont understand more about womens struggles and the history behind them from my GWS classes....
2904,-0.085,It's weird. I just saw a post on the front page the other day how there were like 1.2 million concurrent players and I'd say 90% of the time the public events are completely empty.,It's weird. I just saw a post on the front page the other day how there were like 1.2 million concurrent players and I'd say 90% of the time the public events are completely empty.,it is weird. I just saw a post on the front page the other day how there were like 1.2 million concurrent players and I would say 90% of the time the public events are completely empty.
3070,-0.064,You can hear Gordon Keith every morning (M-F) at 8:40 am CST doing some sort of wild ass impersonation on 1310AM/96.7 FM . (DFW area)\n\n,You can hear Gordon Keith every morning (M-F) at 8:40 am CST doing some sort of wild ass impersonation on 1310AM/96.7 FM . (DFW area),You can hear Gordon Keith every morning (M-F) at 8:40 am CST doing some sort of wild ass impersonation on 1310AM/96.7 FM . ([psyched to chill and hang out with friends] area)
3122,-0.042,2kg of pine nuts! Jesus you would have to be rich,2kg of pine nuts! Jesus you would have to be rich,2kg of pine nuts! Jesus you would have to be rich
237,-0.042,It is not a victimless crime. The victim is the potential child who has to live with disorders.,It is not a victimless crime. The victim is the potential child who has to live with disorders.,It is not a victimless crime. The victim is the potential child who has to live with disorders.


# Review data

In [11]:
# Add a constant int8 "worker" column (all zeros), then review the schema of
# the columns that will be exported.
# NOTE(review): the meaning of `worker` is not visible in this notebook.
col = "worker"
df[col] = np.zeros(len(df), dtype=np.int8)
cols = ["label", "bws", "worker", "text", "text1", "text2"]
df[cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5710 entries, 0 to 5709
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   label   5710 non-null   int32  
 1   bws     5710 non-null   float32
 2   worker  5710 non-null   int8   
 3   text    5710 non-null   object 
 4   text1   5710 non-null   object 
 5   text2   5710 non-null   object 
dtypes: float32(1), int32(1), int8(1), object(3)
memory usage: 184.1+ KB


In [12]:
%%time
# Create the destination directory first so a fresh checkout does not fail on
# the write, then persist the selected columns without the index.
os.makedirs("output", exist_ok=True)
df[cols].to_parquet("output/pre_ruddit.parquet", index=False)

Wall time: 136 ms
