In [1]:
import os
import gc
import numpy as np
import pandas as pd
from scipy.stats import rankdata
from tqdm import tqdm
from typing import Dict, List, Tuple, NamedTuple, Callable
import scml
import mylib

In [2]:
# Percentile cut-points; not referenced by any cell visible here
# (presumably meant for `describe(percentiles=...)` -- TODO confirm or remove).
percentiles = [.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]

# Options are spelled with their full dotted names: pandas resolves short
# names by pattern matching, which can become ambiguous if a later release
# adds a similarly named option.
# NOTE(review): `mode.use_inf_as_na` is deprecated in recent pandas releases;
# confirm the pinned pandas version still supports it.
pd.set_option("mode.use_inf_as_na", True)  # treat +/-inf as missing values
pd.set_option("display.max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option("display.max_colwidth", 9999)

# Register the `progress_apply` family of methods on pandas objects.
tqdm.pandas()

In [3]:
# Build a lookup from "<post_id>_<comment_id>" to the offensiveness score
# read from Ruddit.csv. If a key repeats, the last occurrence wins.
df = pd.read_csv("input/ruddit/Ruddit.csv", engine="c", low_memory=False)
score_map: Dict[str, float] = {
    rec.post_id + "_" + rec.comment_id: rec.offensiveness_score
    for rec in df.itertuples()
}

In [4]:
%%time
# Load the comment bodies and attach the score looked up in `score_map`.
df = pd.read_csv("input/ruddit/ruddit_with_text.csv", engine="c", low_memory=False)
# Placeholder texts to drop; compared after strip + lowercase.
blacklist = {"[deleted]", "[removed]"}
rows = []
for t in df.itertuples():
    text = t.txt
    # An empty CSV field is parsed as NaN (a float), not "", so it has to be
    # skipped before any string method is called on it.
    if not isinstance(text, str):
        continue
    s = text.strip().lower()
    if len(s) == 0 or s in blacklist:
        continue
    k = t.post_id + "_" + t.comment_id
    # A KeyError here means the text file has a comment with no score entry.
    rows.append({"bws": score_map[k], "text": text})
# Explicit columns keep the frame well-formed even if every row was filtered out.
df = pd.DataFrame(rows, columns=["bws", "text"])
df["bws"] = df["bws"].astype(np.float32)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5710 entries, 0 to 5709
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   bws     5710 non-null   float32
 1   text    5710 non-null   object 
dtypes: float32(1), object(1)
memory usage: 67.0+ KB
Wall time: 51 ms


In [5]:
# Per-column missing-value report from the project's `scml` helper; the
# displayed table lists Total, Percent and Type for each column.
scml.find_missing_values(df)

Unnamed: 0,Total,Percent,Type
bws,0,0.0,float32
text,0,0.0,object


# Severity rating label

In [6]:
# Derive an integer rank label from the continuous `bws` score.
# method='ordinal' gives every row a distinct rank, so tied scores
# still end up with different labels.
col = "label"
df[col] = rankdata(df["bws"], method='ordinal').astype(np.int32)

# Stage 1: Preprocess Text
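Note: preprocessing throughput dropped from about 2400 it/s to about 600 it/s. This appears to refer to the `pre1` step, whose progress bar below reports ~603 it/s; `pre2` runs at ~114 it/s.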

In [7]:
def pre1(row) -> str:
    """Row-wise adapter: pass the row's ``text`` field to ``mylib.pre1``."""
    raw_text = row["text"]
    return mylib.pre1(raw_text)


def pre2(row) -> str:
    """Row-wise adapter: pass the row's ``text1`` field to ``mylib.pre2``."""
    stage1_text = row["text1"]
    return mylib.pre2(stage1_text)


# Run the two preprocessing stages in order; `pre2` reads the `text1`
# column written by `pre1`, so the sequence matters.
for col, stage in (("text1", pre1), ("text2", pre2)):
    print(col)
    df[col] = df.progress_apply(stage, axis=1)

text1


100%|██████████████████████████████████████████| 5710/5710 [00:09<00:00, 603.26it/s]


text2


100%|██████████████████████████████████████████| 5710/5710 [00:50<00:00, 113.73it/s]


In [8]:
# Eyeball raw vs. preprocessed text on a random draw of 20 rows
# (unseeded, so the rows differ on every run), ordered by `bws`.
cols = ["bws", "text", "text1", "text2"]
spot_check = df[cols].sample(20)
spot_check.sort_values("bws").head(20)

Unnamed: 0,bws,text,text1,text2
5171,-0.532,Thats what they WANT you to think.,Thats what they WANT you to think.,Thats what they WANT you to think.
3349,-0.521,Call me old fashioned. Usually when I feel like saying “hopefully definitely” I just shorten it to “probably.”,"Call me old fashioned. Usually when I feel like saying ""hopefully definitely"" I just shorten it to ""probably.""","Call me old fashioned. Usually when I feel like saying ""hopefully definitely"" I just shorten it to ""probably."""
3050,-0.438,I am crazy and don't consider 0 a number.,I am crazy and don't consider 0 a number.,I am crazy and do not consider 0 a number.
1892,-0.229,"When I was in college I spent around 4-5 hours baking a 2-layer chocolate cake with dark chocolate ganache, and another 4ish hours hand-drawing a card for a girl's birthday I was friends with and had a massive crush on. She didn't feel the same way, broke my heart, ghosted me and now she is married. Oh well.","When I was in college I spent around 4-5 hours baking a 2-layer chocolate cake with dark chocolate ganache, and another 4ish hours hand-drawing a card for a girl's birthday I was friends with and had a massive crush on. She didn't feel the same way, broke my heart, ghosted me and now she is married. Oh well.","When I was in college I spent around 4-5 hours baking a 2-layer chocolate cake with dark chocolate ganache, and another 4ish hours hand-drawing a card for a girl's birthday I was friends with and had a massive crush on. She did not feel the same way, broke my heart, ghosted me and now she is married. Oh well."
2994,-0.229,"Sorry about the community. Linux tends to have some very pedantic people. I think some just like to ""validate"" their own intelligence by correcting the most trivial of things.\n\nAnyway, it is a milestone and you are well on your way. You'll find things are just ""different"" but when re-learning how to be an advanced user, you'll find it is much more rewarding, in my opinion.","Sorry about the community. Linux tends to have some very pedantic people. I think some just like to ""validate"" their own intelligence by correcting the most trivial of things. Anyway, it is a milestone and you are well on your way. You'll find things are just ""different"" but when re-learning how to be an advanced user, you'll find it is much more rewarding, in my opinion.","Sorry about the community. Linux tends to have some very pedantic people. I think some just like to ""validate"" their own intelligence by correcting the most trivial of things. Anyway, it is a milestone and you are well on your way. you will find things are just ""different"" but when re-learning how to be an advanced user, you will find it is much more rewarding, in my opinion."
3523,-0.208,do you not watch playoff basketball? he put out a historical run dragging yet another lottery team to the finals..,do you not watch playoff basketball? he put out a historical run dragging yet another lottery team to the finals..,do you not watch playoff basketball? he put out a historical run dragging yet another lottery team to the finals..
5301,-0.188,"Beautiful.\n\nI am a leftist capitalist as well. I feel as if everyone human should be provided the basic needs of shelter, food, water, and entertainment and the progressiveness and merit-seeking drive of the human soul will fill in the rest of the gaps when they don't have to worry about their basic foundation.","Beautiful. I am a leftist capitalist as well. I feel as if everyone human should be provided the basic needs of shelter, food, water, and entertainment and the progressiveness and merit-seeking drive of the human soul will fill in the rest of the gaps when they don't have to worry about their basic foundation.","Beautiful. I am a leftist capitalist as well. I feel as if everyone human should be provided the basic needs of shelter, food, water, and entertainment and the progressiveness and merit-seeking drive of the human soul will fill in the rest of the gaps when they do not have to worry about their basic foundation."
1213,-0.106,None of what you said indicates your conclusion that it isn’t a choice though. You merely made it clear that it is a harder choice to make for those who have already been obese. All the more reason to not get there in the first place.,None of what you said indicates your conclusion that it isn't a choice though. You merely made it clear that it is a harder choice to make for those who have already been obese. All the more reason to not get there in the first place.,None of what you said indicates your conclusion that it is not a choice though. You merely made it clear that it is a harder choice to make for those who have already been obese. All the more reason to not get there in the first place.
1100,-0.021,"I honestly think it's just a lack of relations and an easy place to meet to meet other like minded people. I think the problem with psychology is a catch al test and liability issues that prevent the psychologist from being able to make his or her personal opinion. If the boss can listen to conversations and passed patients can be audited, a human will try to make the most ""Empathetic"" choice to Thier patients, regardless of the tough love they think the patient needs.","I honestly think it's just a lack of relations and an easy place to meet to meet other like minded people. I think the problem with psychology is a catch al test and liability issues that prevent the psychologist from being able to make his or her personal opinion. If the boss can listen to conversations and passed patients can be audited, a human will try to make the most ""Empathetic"" choice to Thier patients, regardless of the tough love they think the patient needs.","I honestly think it is just a lack of relations and an easy place to meet to meet other like minded people. I think the problem with psychology is a catch al test and liability issues that prevent the psychologist from being able to make his or her personal opinion. If the boss can listen to conversations and passed patients can be audited, a human will try to make the most ""Empathetic"" choice to Thier patients, regardless of the tough love they think the patient needs."
3363,-0.021,He's just glad he didn't get shot.,He's just glad he didn't get shot.,he is just glad he did not get shot.


In [9]:
# Another random draw of 20 rows (unseeded, so it differs on every run),
# shown in ascending `bws` order.
spot_check = df[cols].sample(20)
spot_check.sort_values("bws").head(20)

Unnamed: 0,bws,text,text1,text2
1462,-0.521,"Whether it's a disability, loss of job, or poor spending habits, I'm terrified of not being able to provide for my wife and future family.","Whether it's a disability, loss of job, or poor spending habits, I'm terrified of not being able to provide for my wife and future family.","Whether it is a disability, loss of job, or poor spending habits, I am terrified of not being able to provide for my wife and future family."
3969,-0.417,That's how the world works. It's a sine wave.,That's how the world works. It's a sine wave.,that is how the world works. it is a sine wave.
4145,-0.417,180 hours is 7.5 days. This has to be a mistake,180 hours is 7.5 days. This has to be a mistake,180 hours is 7.5 days. This has to be a mistake
560,-0.375,Does it actually cost the NFL money though? I'm legitimately asking. I haven't seen any analytics that can attribute a loss of revenue to kneeling.,Does it actually cost the NFL money though? I'm legitimately asking. I haven't seen any analytics that can attribute a loss of revenue to kneeling.,Does it actually cost the NFL money though? I am legitimately asking. I have not seen any analytics that can attribute a loss of revenue to kneeling.
3003,-0.312,I got qualified for the army yesterday but decided not to enlist because the mos I want (15t) wasn't available. My recruiter was alright with this decision but a few of the other recruiters talked to me and said I was making a mistake and that I should instead take a job I don't really want. They told me it could be years until the job is open and that it's not worth it. What do you guys think about this? Is it a big mistake and could it really be that long until the job opens up?,I got qualified for the army yesterday but decided not to enlist because the mos I want (15t) wasn't available. My recruiter was alright with this decision but a few of the other recruiters talked to me and said I was making a mistake and that I should instead take a job I don't really want. They told me it could be years until the job is open and that it's not worth it. What do you guys think about this? Is it a big mistake and could it really be that long until the job opens up?,I got qualified for the army yesterday but decided not to enlist because the mos I want (15t) was not available. My recruiter was alright with this decision but a few of the other recruiters talked to me and said I was making a mistake and that I should instead take a job I do not really want. They told me it could be years until the job is open and that it is not worth it. What do you guys think about this? Is it a big mistake and could it really be that long until the job opens up?
1209,-0.25,"I work in the charter school industry. Schools receive their charter from an authorizer. They are all non-profit. For-profit schools can’t get authorized as charter schools, at least in my state.","I work in the charter school industry. Schools receive their charter from an authorizer. They are all non-profit. For-profit schools can't get authorized as charter schools, at least in my state.","I work in the charter school industry. Schools receive their charter from an authorizer. They are all non-profit. For-profit schools cannot get authorized as charter schools, at least in my state."
1419,-0.208,"They left for a good reason. Just not a ""at fault"" reason, or they couldn't prove the ""at fault"" reason in a court room.","They left for a good reason. Just not a ""at fault"" reason, or they couldn't prove the ""at fault"" reason in a court room.","They left for a good reason. Just not a ""at fault"" reason, or they could not prove the ""at fault"" reason in a court room."
1584,-0.208,"Don't worry, I take these things pretty seriously. I won't shame /u/quartilius for their choices, but neither will I play Izanagi to their Izanami. 😜","Don't worry, I take these things pretty seriously. I won't shame /u/quartilius for their choices, but neither will I play Izanagi to their Izanami. (winking face with tongue)","do not worry, I take these things pretty seriously. I will not shame /u/quartilius for their choices, but neither will I play Izanagi to their Izanami. (winking face with tongue)"
2069,-0.021,"When people are racist, homophobic, xenophobic etc. I hate any form of discrimination. \n\n\nI hate what separates us, but I love what makes us different.","When people are racist, homophobic, xenophobic etc. I hate any form of discrimination. I hate what separates us, but I love what makes us different.","When people are racist, homophobic, xenophobic etc. I hate any form of discrimination. I hate what separates us, but I love what makes us different."
1680,-0.021,"I've never had a one night stand, but I'm not sure I'd want to. I prefer sex with people I've got a vested interest in.\n\nThe closest to a ONS I've had involved a weekend with this woman that I turned out to have no interest in, and that cemented my disinterest in further casual hook-ups.","I've never had a one night stand, but I'm not sure I'd want to. I prefer sex with people I've got a vested interest in. The closest to a ONS I've had involved a weekend with this woman that I turned out to have no interest in, and that cemented my disinterest in further casual hook-ups.","I have never had a one night stand, but I am not sure I would want to. I prefer sex with people I have got a vested interest in. The closest to a ONS I have had involved a weekend with this woman that I turned out to have no interest in, and that cemented my disinterest in further casual hook-ups."


In [10]:
# One more random draw of 20 rows (unseeded, so it differs on every run),
# shown in ascending `bws` order.
spot_check = df[cols].sample(20)
spot_check.sort_values("bws").head(20)

Unnamed: 0,bws,text,text1,text2
3663,-0.604,That’s Awesome!! I got 6:41 today on the strider and thought I was jamming.,That's Awesome!! I got 6:41 today on the strider and thought I was jamming.,that is Awesome!! I got 6:41 today on the strider and thought I was jamming.
2271,-0.375,"Haha, luckily it doesn’t happen often as I work nights. But when I do have to get up early, I get paranoid and set a few alarms lol","Haha, luckily it doesn't happen often as I work nights. But when I do have to get up early, I get paranoid and set a few alarms lol","Haha, luckily it does not happen often as I work nights. But when I do have to get up early, I get paranoid and set a few alarms [laughing out loud]"
1945,-0.312,When you’re trying to play pinball on the computer and you’re about to hit your high score and someone calls the house phone.,When you're trying to play pinball on the computer and you're about to hit your high score and someone calls the house phone.,When you are trying to play pinball on the computer and you are about to hit your high score and someone calls the house phone.
2068,-0.298,The 'I am always right' customers,The 'I am always right' customers,The 'I am always right' customers
4805,-0.271,You know the world is a strange place when the West is calling for more censorship and surveillance and China is calling for less of it.,You know the world is a strange place when the West is calling for more censorship and surveillance and China is calling for less of it.,You know the world is a strange place when the West is calling for more censorship and surveillance and China is calling for less of it.
4802,-0.208,"This is most certainly true. But my reasoning is China might jump at this chance with whatever it can. But it is still only a might.\n\nIf the USA can turn itself around and get voters out there to actually vote, and start some major economic reforms in the next election. Then certainly it will keep it's status. \n\nBut as it stands now the ball is in Chinas court, and it is taking advantage while it can. So for now it's definitely a wait and see, and the next presidential election will be critical","This is most certainly true. But my reasoning is China might jump at this chance with whatever it can. But it is still only a might. If the USA can turn itself around and get voters out there to actually vote, and start some major economic reforms in the next election. Then certainly it will keep it's status. But as it stands now the ball is in Chinas court, and it is taking advantage while it can. So for now it's definitely a wait and see, and the next presidential election will be critical","This is most certainly true. But my reasoning is China might jump at this chance with whatever it can. But it is still only a might. If the USA can turn itself around and get voters out there to actually vote, and start some major economic reforms in the next election. Then certainly it will keep it is status. But as it stands now the ball is in Chinas court, and it is taking advantage while it can. So for now it is definitely a wait and see, and the next presidential election will be critical"
3864,-0.167,"Regardless of whether Trump wins, I make a vow that, if Mr. Gu should contact me, I will personally fund his research to as much as he needs.","Regardless of whether Trump wins, I make a vow that, if Mr. Gu should contact me, I will personally fund his research to as much as he needs.","Regardless of whether Trump wins, I make a vow that, if Mr. Gu should contact me, I will personally fund his research to as much as he needs."
4584,-0.156,Is a room full of nurses really the place this message needs to be spread?,Is a room full of nurses really the place this message needs to be spread?,Is a room full of nurses really the place this message needs to be spread?
140,-0.146,"You can, so long as those people are not a member of a protected class. Religion and race are both protected classes, so you can not discriminate based on those facts.","You can, so long as those people are not a member of a protected class. Religion and race are both protected classes, so you can not discriminate based on those facts.","You can, so long as those people are not a member of a protected class. Religion and race are both protected classes, so you can not discriminate based on those facts."
2398,-0.083,"Once I was the one to tell the taxi driver to follow a car, My bff took my car to come pick me up (from work, my car was left at her place last time don't ask why) but didn't see me where I sayed I'd be because I was late by literly 3 minutes, and drove off. I saw her so I hailed a taxi and got them to follow the car. She went back to my house, and called me. I explained to the taxi driver and they got a real kick out of it! That's why you don't leave your car at a friend's place. Ever.","Once I was the one to tell the taxi driver to follow a car, My bff took my car to come pick me up (from work, my car was left at her place last time don't ask why) but didn't see me where I sayed I'd be because I was late by literly 3 minutes, and drove off. I saw her so I hailed a taxi and got them to follow the car. She went back to my house, and called me. I explained to the taxi driver and they got a real kick out of it! That's why you don't leave your car at a friend's place. Ever.","Once I was the one to tell the taxi driver to follow a car, My [best friends forever] took my car to come pick me up (from work, my car was left at her place last time do not ask why) but did not see me where I sayed I would be because I was late by literly 3 minutes, and drove off. I saw her so I hailed a taxi and got them to follow the car. She went back to my house, and called me. I explained to the taxi driver and they got a real kick out of it! that is why you do not leave your car at a friend's place. Ever."


# Review data

In [11]:
# Add a constant `worker` column (all zeros, int8), then inspect the
# columns that will be written out.
col = "worker"
df[col] = np.zeros(len(df), dtype=np.int8)
cols = ["label", "bws", "worker", "text", "text1", "text2"]
df[cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5710 entries, 0 to 5709
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   label   5710 non-null   int32  
 1   bws     5710 non-null   float32
 2   worker  5710 non-null   int8   
 3   text    5710 non-null   object 
 4   text1   5710 non-null   object 
 5   text2   5710 non-null   object 
dtypes: float32(1), int32(1), int8(1), object(3)
memory usage: 184.1+ KB


In [12]:
%%time
# Create the target directory first: the parquet writer does not create
# missing parent directories, so the write would fail on a fresh checkout.
os.makedirs("output", exist_ok=True)
# Persist the selected columns; the index is not written.
df[cols].to_parquet("output/pre_ruddit.parquet", index=False)

Wall time: 39 ms
