In [1]:
import os
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from scipy.stats import rankdata
from tqdm import tqdm
from typing import Dict
import scml

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

Using device: cuda
NVIDIA GeForce GTX 1060 6GB
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [3]:
percentiles=[.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]
os.environ["TOKENIZERS_PARALLELISM"] = "false"
pd.set_option("use_inf_as_na", True)
pd.set_option("max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option('max_colwidth', 9999)
tqdm.pandas()

In [4]:
score_map: Dict[str, float] = {}
df = pd.read_csv("input/ruddit/Ruddit.csv", engine="c", low_memory=False)
for t in df.itertuples():
    k = getattr(t, "post_id") + "_" + getattr(t, "comment_id")
    score_map[k] = getattr(t, "offensiveness_score")

In [5]:
%%time
df = pd.read_csv("input/ruddit/ruddit_with_text.csv", engine="c", low_memory=False)
rows = []
for t in df.itertuples():
    k = getattr(t, "post_id") + "_" + getattr(t, "comment_id")
    rows.append({"bws": score_map[k], "text": getattr(t, "txt")})
df = pd.DataFrame.from_records(rows)
df["bws"] = df["bws"].astype(np.float32) 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5838 entries, 0 to 5837
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   bws     5838 non-null   float32
 1   text    5838 non-null   object 
dtypes: float32(1), object(1)
memory usage: 68.5+ KB
Wall time: 46 ms


In [6]:
scml.find_missing_values(df)

Unnamed: 0,Total,Percent,Type
bws,0,0.0,float32
text,0,0.0,object


In [7]:
df.sample(40).sort_values("bws").head(40)

Unnamed: 0,bws,text
2656,-0.596,What's the delay over?
1641,-0.435,It took some time but I'm no longer bitter toward her.
4669,-0.404,There's a term for it: Crank Magnetism.
3116,-0.333,As far as I can tell he just fell asleep in physics when they were explaining vectors and scalars.
751,-0.319,"A large reason as to why spiderbros are spiderbros is because a lot of them hang around where humans are and typically stay in just one spot where they chill and hunt for pray.\n\nAs far dragonflies, I am not so sure. AFAIK, they typically live in places with stagnant water, like a pond or a lake, so any place that's dry is out of their protection. Also, they buzz around from place to place, which can be annoying while you're just resting and watching a movie. So as to net benefit to humans, I think the dragonflies superiority to spiders is disputable, but as for abilities and beauty, I'd have to agree with you."
3065,-0.25,"A number of people have suggested boot media and a refresh.\n\nIf the boot media isn't handy, or you want to give yourself a bit of an extra challenge, edit your grub prompt to boot to `init=/bin/bash`. This will, rather than starting systemd to bring you your system, start bash. This has the effect of pretty much only requiring a working kernel, but also requires you to bring up anything you need (read-write access to your disks, any partitions other than /, networking) manually, yourself.\n\nAlso, ctl-C doesn't work, and there are no extra terminals (unless you make them), so don't run anything that might not terminate on its own."
1285,-0.208,"I don't have car. And I walked as much in Seoul as I do regularly in Montreal. I thought it might be all the walking but most of the time, I was sitting and stuffing my face with food and booze."
2931,-0.208,Thompson got a bunch of money to start and then a ton in the last few days! So our guy has plenty of money to spend. He needs phone banking!
614,-0.188,I used to do $40/hr contracts at prisons. Now those contracts go straight to the trash. Prisons are incredibly dangerous even with my personal escorts.
4048,-0.167,There's a lot of women working construction now compared to previous years. And there's no reason why not. There's nothing a woman can't do in construction that a man can't. I think alot of people in construction are shooting themselves in foot by not hiring women. They cut themselves of from half the talent pool.


# Severity rating label

In [8]:
# Force unique ranks
col = "label"
df[col] = rankdata(df["bws"], method='ordinal')
df[col] = df[col].astype(np.int32)
df[col].describe(percentiles=percentiles)

count    5838.000000
mean     2919.500000
std      1685.429767
min         1.000000
1%         59.370000
5%        292.850000
10%       584.700000
20%      1168.400000
30%      1752.100000
40%      2335.800000
50%      2919.500000
60%      3503.200000
70%      4086.900000
80%      4670.600000
90%      5254.300000
95%      5546.150000
99%      5779.630000
max      5838.000000
Name: label, dtype: float64

# Preprocess Text

In [9]:
def preprocess(row) -> str:
    res = row["text"]
    res = " ".join(res.split())
    return res

In [10]:
col = "text"
df[col] = df.progress_apply(preprocess, axis=1)

100%|████████████████████████████████████████| 5838/5838 [00:00<00:00, 84575.08it/s]


# Embeddings

In [11]:
model = SentenceTransformer("pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2", device=device)
model.max_seq_length = 128
sentences = list(df["text"])

In [12]:
%%time
em = model.encode(sentences=sentences, batch_size=1000, show_progress_bar=True, convert_to_numpy=True)
print(f"em.shape={em.shape}")

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

em.shape=(5838, 384)
Wall time: 10.7 s


In [13]:
%%time
em_size = em.shape[1]
em_cols = [f"zz{i:04d}" for i in range(em_size)]
df[em_cols] = em
df[em_cols] = df[em_cols].astype(np.float32)
# avoid fragmented frame
df = df.copy()
del sentences

  self[col] = igetitem(value, i)


Wall time: 264 ms


In [14]:
cols = ["label", "bws"]
cols += em_cols
df[cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5838 entries, 0 to 5837
Data columns (total 386 columns):
 #    Column  Non-Null Count  Dtype  
---   ------  --------------  -----  
 0    label   5838 non-null   int32  
 1    bws     5838 non-null   float32
 2    zz0000  5838 non-null   float32
 3    zz0001  5838 non-null   float32
 4    zz0002  5838 non-null   float32
 5    zz0003  5838 non-null   float32
 6    zz0004  5838 non-null   float32
 7    zz0005  5838 non-null   float32
 8    zz0006  5838 non-null   float32
 9    zz0007  5838 non-null   float32
 10   zz0008  5838 non-null   float32
 11   zz0009  5838 non-null   float32
 12   zz0010  5838 non-null   float32
 13   zz0011  5838 non-null   float32
 14   zz0012  5838 non-null   float32
 15   zz0013  5838 non-null   float32
 16   zz0014  5838 non-null   float32
 17   zz0015  5838 non-null   float32
 18   zz0016  5838 non-null   float32
 19   zz0017  5838 non-null   float32
 20   zz0018  5838 non-null   float32
 21   zz0019  5838

In [15]:
%%time
df[cols].to_parquet("output/ruddit.parquet", index=False)

Wall time: 330 ms
