In [1]:
import os
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from scipy.stats import rankdata
from tqdm import tqdm
from typing import Dict
import scml
import mylib

In [2]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
print('Using device:', device)
if device.type == 'cuda':
    # Report the name and current memory figures (in GB, one decimal) of GPU 0.
    allocated_gb = round(torch.cuda.memory_allocated(0)/1024**3,1)
    reserved_gb = round(torch.cuda.memory_reserved(0)/1024**3,1)
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', allocated_gb, 'GB')
    print('Cached:   ', reserved_gb, 'GB')

Using device: cuda
NVIDIA GeForce GTX 1060 6GB
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [3]:
# Percentile cut-offs passed to ``describe`` further down.
percentiles = [.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# NOTE(review): "use_inf_as_na" is deprecated in recent pandas releases — confirm the pinned version.
pd.set_option("use_inf_as_na", True)
# Lift the display/info limits so wide frames and long text are shown in full.
for option_name in ("max_info_columns", "display.max_columns", "display.max_rows", 'max_colwidth'):
    pd.set_option(option_name, 9999)
# Registers ``progress_apply`` on pandas objects.
tqdm.pandas()

In [4]:
# Build a lookup from "<post_id>_<comment_id>" to the row's offensiveness score.
df = pd.read_csv("input/ruddit/Ruddit.csv", engine="c", low_memory=False)
score_map: Dict[str, float] = {}
for rec in df.itertuples():
    score_map[rec.post_id + "_" + rec.comment_id] = rec.offensiveness_score

In [5]:
%%time
# Load the comment bodies, drop unusable rows, and pair each remaining text
# with its score looked up in ``score_map`` (keyed "<post_id>_<comment_id>").
df = pd.read_csv("input/ruddit/ruddit_with_text.csv", engine="c", low_memory=False)
blacklist = {"[deleted]", "[removed]"}
rows = []
for t in df.itertuples():
    text = t.txt
    # read_csv turns empty cells into NaN (a float), which has no .strip();
    # treat any non-string value the same as an empty comment.
    if not isinstance(text, str):
        continue
    s = text.strip().lower()
    # Skip whitespace-only comments and placeholder bodies.
    if not s or s in blacklist:
        continue
    k = t.post_id + "_" + t.comment_id
    # The original (un-normalised) text is kept; ``s`` is only used for filtering.
    rows.append({"bws": score_map[k], "text": text})
df = pd.DataFrame.from_records(rows)
df["bws"] = df["bws"].astype(np.float32)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5710 entries, 0 to 5709
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   bws     5710 non-null   float32
 1   text    5710 non-null   object 
dtypes: float32(1), object(1)
memory usage: 67.0+ KB
Wall time: 46.8 ms


In [6]:
# Per-column missing-value report; the rendered output lists Total, Percent and Type for each column.
scml.find_missing_values(df)

Unnamed: 0,Total,Percent,Type
bws,0,0.0,float32
text,0,0.0,object


# Severity rating label

In [7]:
# method='ordinal' assigns a distinct rank to every row, so tied scores still get unique labels.
col = "label"
df[col] = rankdata(df["bws"], method='ordinal').astype(np.int32)

# Preprocess text

In [8]:
def preprocess(row) -> str:
    """Pass the row's ``text`` value through ``mylib.preprocess`` and return the result."""
    raw_text = row["text"]
    return mylib.preprocess(raw_text)


col = "text"
# Apply on the text column itself: a row-wise apply (axis=1) builds a Series for
# every row only to read this one field back out of it.
df[col] = df[col].progress_apply(mylib.preprocess)

100%|█████████████████████████████████████████| 5710/5710 [00:02<00:00, 2478.87it/s]


In [9]:
cols = ["bws", "text"]
# Spot-check up to 40 random rows ordered by score. The fixed seed keeps the
# displayed rows stable across runs, and min() avoids a ValueError on frames
# shorter than 40 rows.
df[cols].sample(min(40, len(df)), random_state=42).sort_values("bws")

Unnamed: 0,bws,text
3268,-0.646,"It will be another weekend in no time, buddy. Time flies fast, so fill it with something you like doing."
4646,-0.532,Same happened to r/VaccinesCause
113,-0.5,"Same goes for humans. Also, we can climb trees"
4583,-0.438,Nurse here. Brilliant. My oncology patients are always at risk
4602,-0.438,"wait, you guys are getting paid?"
3766,-0.417,"You could delegate to a property manager, have nothing to worry about, collect a decent monthly income and down the road sell it."
3986,-0.417,"Refreshing seeing Nova Scotia on the front page for a good thing, last time was when Pizza Corner dissolved (Less we forget.)"
3142,-0.271,Let me know when k0nfig is IGLing.
1640,-0.229,Exhibiting lesser integrity or a lack of loyalty to important people in their lives.
1165,-0.208,This makes sense. However I would not want it to happen because the queue at the women's toilet is way longer than the line at the men's one.


# Character-level features

In [10]:
%%time
col = "length"
# Character count of each text, stored as int16.
df[col] = df["text"].str.len().astype(np.int16)

Wall time: 3 ms


In [11]:
def digit_frac(row) -> float:
    """Return ``mylib.digit_frac`` evaluated on the row's ``text`` value."""
    value = row["text"]
    return mylib.digit_frac(value)


def letter_frac(row) -> float:
    """Return ``mylib.letter_frac`` evaluated on the row's ``text`` value."""
    value = row["text"]
    return mylib.letter_frac(value)


def space_frac(row) -> float:
    """Return ``mylib.space_frac`` evaluated on the row's ``text`` value."""
    value = row["text"]
    return mylib.space_frac(value)


def punc_frac(row) -> float:
    """Return ``mylib.punc_frac`` evaluated on the row's ``text`` value."""
    value = row["text"]
    return mylib.punc_frac(value)


def upper_frac(row) -> float:
    """Return ``mylib.upper_frac`` evaluated on the row's ``text`` value."""
    value = row["text"]
    return mylib.upper_frac(value)

In [12]:
col = "digit_frac"
# Apply on the text column directly instead of row-wise (axis=1), which would
# build a Series per row just to read one field; store the result as float32.
df[col] = df["text"].progress_apply(mylib.digit_frac).astype(np.float32)

100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 40342.69it/s]


In [13]:
col = "letter_frac"
# Apply on the text column directly instead of row-wise (axis=1), which would
# build a Series per row just to read one field; store the result as float32.
df[col] = df["text"].progress_apply(mylib.letter_frac).astype(np.float32)

100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 39068.34it/s]


In [14]:
col = "space_frac"
# Apply on the text column directly instead of row-wise (axis=1), which would
# build a Series per row just to read one field; store the result as float32.
df[col] = df["text"].progress_apply(mylib.space_frac).astype(np.float32)

100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 40978.50it/s]


In [15]:
col = "punc_frac"
# Apply on the text column directly instead of row-wise (axis=1), which would
# build a Series per row just to read one field; store the result as float32.
df[col] = df["text"].progress_apply(mylib.punc_frac).astype(np.float32)

100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 35645.20it/s]


In [16]:
col = "upper_frac"
# Apply on the text column directly instead of row-wise (axis=1), which would
# build a Series per row just to read one field; store the result as float32.
df[col] = df["text"].progress_apply(mylib.upper_frac).astype(np.float32)

100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 41667.13it/s]


# Embeddings

In [17]:
# Load the local MiniLM checkpoint onto the selected device and set its max_seq_length to 128.
model = SentenceTransformer(
    "pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2",
    device=device,
)
model.max_seq_length = 128
# Plain Python list of the preprocessed texts, in frame order.
sentences = df["text"].tolist()

In [18]:
%%time
# Encode every text in batches of 1000 and get the result back as a NumPy array.
em = model.encode(
    sentences=sentences,
    batch_size=1000,
    show_progress_bar=True,
    convert_to_numpy=True,
)
print(f"em.shape={em.shape}")

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

em.shape=(5710, 384)
Wall time: 7.84 s


In [19]:
%%time
em_size = em.shape[1]
# One zero-padded column name per embedding dimension: zz0000, zz0001, ...
em_cols = [f"zz{i:04d}" for i in range(em_size)]
# Build the embedding block as a single float32 frame and attach it with one
# concat. Assigning hundreds of columns via df[em_cols] = em inserts them one
# at a time, which fragments the frame and raises a PerformanceWarning.
em_df = pd.DataFrame(em, columns=em_cols, index=df.index).astype(np.float32)
# Drop any embedding columns left over from a previous run of this cell so
# re-running replaces them instead of duplicating them.
df = pd.concat([df.drop(columns=em_cols, errors="ignore"), em_df], axis=1)
del sentences

Wall time: 234 ms


  self[col] = igetitem(value, i)


# Review data

In [20]:
col = "worker"
# Constant int8 column of zeros.
df[col] = np.zeros(len(df), dtype=np.int8)
char_fs = ["length", "digit_frac", "letter_frac", "space_frac", "punc_frac", "upper_frac"]
# Target/meta columns first, then the character-level features.
cols = ["label", "bws", "worker"] + char_fs
df[cols].describe(percentiles=percentiles)

Unnamed: 0,label,bws,worker,length,digit_frac,letter_frac,space_frac,punc_frac,upper_frac
count,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0
mean,2855.5,-0.027706,0.0,197.564098,0.003542,0.78883,0.177722,0.029905,0.030517
std,1648.47935,0.334195,0.0,172.016744,0.012983,0.034318,0.021473,0.02407,0.049035
min,1.0,-0.889,0.0,15.0,0.0,0.4,0.040541,0.0,0.0
1%,58.09,-0.667,0.0,24.0,0.0,0.672759,0.117647,0.0,0.0
5%,286.45,-0.521,0.0,33.0,0.0,0.733333,0.141414,0.0,0.005062
10%,571.9,-0.426,0.0,42.0,0.0,0.75,0.151515,0.010417,0.008333
20%,1142.8,-0.312,0.0,60.0,0.0,0.769841,0.161966,0.015385,0.011758
30%,1713.7,-0.213,0.0,82.0,0.0,0.779661,0.168627,0.018692,0.014599
40%,2284.6,-0.146,0.0,106.0,0.0,0.786537,0.173913,0.021652,0.017857


In [21]:
# Append the embedding columns to the export list. The order-preserving
# de-duplication keeps the cell idempotent: an in-place `cols += em_cols`
# would add the names again on every re-run and yield duplicate columns.
cols = list(dict.fromkeys(cols + em_cols))
df[cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5710 entries, 0 to 5709
Data columns (total 393 columns):
 #    Column       Non-Null Count  Dtype  
---   ------       --------------  -----  
 0    label        5710 non-null   int32  
 1    bws          5710 non-null   float32
 2    worker       5710 non-null   int8   
 3    length       5710 non-null   int16  
 4    digit_frac   5710 non-null   float32
 5    letter_frac  5710 non-null   float32
 6    space_frac   5710 non-null   float32
 7    punc_frac    5710 non-null   float32
 8    upper_frac   5710 non-null   float32
 9    zz0000       5710 non-null   float32
 10   zz0001       5710 non-null   float32
 11   zz0002       5710 non-null   float32
 12   zz0003       5710 non-null   float32
 13   zz0004       5710 non-null   float32
 14   zz0005       5710 non-null   float32
 15   zz0006       5710 non-null   float32
 16   zz0007       5710 non-null   float32
 17   zz0008       5710 non-null   float32
 18   zz0009       5710 non-null

In [22]:
%%time
# Make sure the target directory exists; the write fails if "output/" is missing.
os.makedirs("output", exist_ok=True)
# Persist the selected columns without the index.
df[cols].to_parquet("output/ruddit.parquet", index=False)

Wall time: 206 ms
