In [1]:
import os
import gc
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from scipy.stats import rankdata
import textstat
from tqdm import tqdm
from typing import Dict
import scml
import mylib

In [2]:
# Prefer the first CUDA GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)
if device.type == 'cuda':
    # Report the GPU name and the allocator statistics for device 0,
    # converted from bytes to units of 1024**3 bytes and rounded to 0.1.
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    for caption, n_bytes in (
        ('Allocated:', torch.cuda.memory_allocated(0)),
        ('Cached:   ', torch.cuda.memory_reserved(0)),
    ):
        print(caption, round(n_bytes/1024**3,1), 'GB')

Using device: cuda
NVIDIA GeForce GTX 1060 6GB
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [3]:
# Percentile grid passed to DataFrame.describe() in the review section.
percentiles = [
    .01, .05, .1, .2, .3, .4, .5,
    .6, .7, .8, .9, .95, .99,
]
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# NOTE(review): "use_inf_as_na" is deprecated in recent pandas releases --
# confirm the pinned pandas version still accepts it.
pd.set_option("use_inf_as_na", True)
# Lift the display limits so wide/long frames are shown in full.
for option in ("max_info_columns", "display.max_columns", "display.max_rows", "max_colwidth"):
    pd.set_option(option, 9999)
# Registers the progress_apply methods used by the feature cells.
tqdm.pandas()

In [4]:
# Lookup table "<post_id>_<comment_id>" -> offensiveness_score, built from the
# score file.
df = pd.read_csv("input/ruddit/Ruddit.csv", engine="c", low_memory=False)
score_map: Dict[str, float] = {
    post_id + "_" + comment_id: score
    for post_id, comment_id, score in zip(
        df["post_id"], df["comment_id"], df["offensiveness_score"]
    )
}

In [5]:
%%time
# Join each comment's text with its offensiveness score ("bws"), dropping
# comments that have no usable text.
df = pd.read_csv("input/ruddit/ruddit_with_text.csv", engine="c", low_memory=False)
# Placeholder bodies that stand in for missing comment text.
blacklist = {"[deleted]", "[removed]"}
rows = []
for t in df.itertuples():
    text = t.txt
    # read_csv parses empty (and NA-like) fields as float NaN, which has no
    # .strip(); treat such rows as having no text instead of crashing.
    if not isinstance(text, str):
        continue
    s = text.strip().lower()
    if not s or s in blacklist:
        continue
    k = t.post_id + "_" + t.comment_id
    # A key missing from score_map still raises KeyError on purpose: a text
    # without a score indicates mismatched input files.
    rows.append({"bws": score_map[k], "text": text})
# Explicit columns keep the frame well-formed even if every row was filtered.
df = pd.DataFrame(rows, columns=["bws", "text"])
df["bws"] = df["bws"].astype(np.float32)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5710 entries, 0 to 5709
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   bws     5710 non-null   float32
 1   text    5710 non-null   object 
dtypes: float32(1), object(1)
memory usage: 67.0+ KB
Wall time: 45.7 ms


In [6]:
# Missing-value report from the scml helper; the rendered table lists
# Total / Percent / Type per column.
scml.find_missing_values(df)

Unnamed: 0,Total,Percent,Type
bws,0,0.0,float32
text,0,0.0,object


# Severity rating label

In [7]:
# "ordinal" ranking hands out a distinct rank to every row, so tied bws
# values still receive unique labels; the ranks are stored as int32.
col = "label"
df[col] = rankdata(df["bws"], method="ordinal").astype(np.int32)

# Preprocess Text

In [8]:
def preprocess(row) -> str:
    """Return ``mylib.preprocess`` applied to the row's ``"text"`` value."""
    raw_text = row["text"]
    return mylib.preprocess(raw_text)


# Overwrite the "text" column with the preprocessed text, row by row
# (progress_apply shows a tqdm progress bar while applying).
col = "text"
df[col] = df.progress_apply(preprocess, axis=1)

100%|█████████████████████████████████████████| 5710/5710 [00:02<00:00, 2434.99it/s]


In [9]:
# Spot-check up to 40 random rows, ordered by score. The fixed random_state
# makes the displayed sample reproducible across runs, and capping n at
# len(df) keeps the cell working on frames smaller than 40 rows.
cols = ["bws", "text"]
df[cols].sample(n=min(40, len(df)), random_state=0).sort_values("bws")

Unnamed: 0,bws,text
5241,-0.646,I promise not to annoy you anymore.
1849,-0.562,"Yes, it is one the highest paying jobs per hour. People need to pay bills, eat, etcetera."
3541,-0.542,Expect Ceeday to come and ask for this
4058,-0.417,A lighter? May I ask why?
3447,-0.396,"I was joking. The loop is subtle, but the numbers make it obvious."
2763,-0.354,"Never felt those personally, but those are three out of a thousand"
1972,-0.333,Way too soon. At 14. In november I am turning 27 and my grandfather is not as bald as i am.
4542,-0.333,it seems they did not even bother to read the article
3226,-0.333,"Back when CollegeHumor was decent. Also, with the current trend in making movies out of board games, I am surprised this is not a real thing. Yet."
3265,-0.292,"Di tempatku lineage2 sekitar 2,05gb. Awal download kemarin sih sekitar 1,8."


# Character level features

In [10]:
%%time
# Character count of each text, stored as int16.
col = "length"
df[col] = df["text"].str.len().astype(np.int16)

Wall time: 4.03 ms


In [11]:
def digit_frac(row) -> float:
    """Return ``mylib.digit_frac`` evaluated on the row's ``"text"`` value."""
    text = row["text"]
    return mylib.digit_frac(text)


def letter_frac(row) -> float:
    """Return ``mylib.letter_frac`` evaluated on the row's ``"text"`` value."""
    text = row["text"]
    return mylib.letter_frac(text)


def space_frac(row) -> float:
    """Return ``mylib.space_frac`` evaluated on the row's ``"text"`` value."""
    text = row["text"]
    return mylib.space_frac(text)


def punc_frac(row) -> float:
    """Return ``mylib.punc_frac`` evaluated on the row's ``"text"`` value."""
    text = row["text"]
    return mylib.punc_frac(text)


def upper_frac(row) -> float:
    """Return ``mylib.upper_frac`` evaluated on the row's ``"text"`` value."""
    text = row["text"]
    return mylib.upper_frac(text)

In [12]:
# Row-wise digit_frac feature, stored as float32.
col = "digit_frac"
df[col] = df.progress_apply(digit_frac, axis=1).astype(np.float32)

100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 37986.88it/s]


In [13]:
# Row-wise letter_frac feature, stored as float32.
col = "letter_frac"
df[col] = df.progress_apply(letter_frac, axis=1).astype(np.float32)

100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 38257.03it/s]


In [14]:
# Row-wise space_frac feature, stored as float32.
col = "space_frac"
df[col] = df.progress_apply(space_frac, axis=1).astype(np.float32)

100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 38971.83it/s]


In [15]:
# Row-wise punc_frac feature, stored as float32.
col = "punc_frac"
df[col] = df.progress_apply(punc_frac, axis=1).astype(np.float32)

100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 37253.13it/s]


In [16]:
# Row-wise upper_frac feature, stored as float32.
col = "upper_frac"
df[col] = df.progress_apply(upper_frac, axis=1).astype(np.float32)

100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 38834.83it/s]


# Textstat features

In [17]:
def syllable_count(row) -> int:
    """Return ``textstat.syllable_count`` for the row's ``"text"`` value."""
    text = row["text"]
    return textstat.syllable_count(text)


def lexicon_count(row) -> int:
    """Return ``textstat.lexicon_count`` for the row's ``"text"`` value."""
    text = row["text"]
    return textstat.lexicon_count(text)


def sentence_count(row) -> int:
    """Return ``textstat.sentence_count`` for the row's ``"text"`` value."""
    text = row["text"]
    return textstat.sentence_count(text)


def flesch_reading_ease(row) -> float:
    """Return ``textstat.flesch_reading_ease`` for the row's ``"text"`` value."""
    text = row["text"]
    return textstat.flesch_reading_ease(text)


def flesch_kincaid_grade(row) -> float:
    """Return ``textstat.flesch_kincaid_grade`` for the row's ``"text"`` value."""
    text = row["text"]
    return textstat.flesch_kincaid_grade(text)


def gunning_fog(row) -> float:
    """Return ``textstat.gunning_fog`` for the row's ``"text"`` value."""
    text = row["text"]
    return textstat.gunning_fog(text)


def smog_index(row) -> float:
    """Return ``textstat.smog_index`` for the row's ``"text"`` value."""
    text = row["text"]
    return textstat.smog_index(text)


def automated_readability_index(row) -> float:
    """Return ``textstat.automated_readability_index`` for the row's ``"text"`` value."""
    text = row["text"]
    return textstat.automated_readability_index(text)


def coleman_liau_index(row) -> float:
    """Return ``textstat.coleman_liau_index`` for the row's ``"text"`` value."""
    text = row["text"]
    return textstat.coleman_liau_index(text)


def linsear_write_formula(row) -> float:
    """Return ``textstat.linsear_write_formula`` for the row's ``"text"`` value."""
    text = row["text"]
    return textstat.linsear_write_formula(text)


def dale_chall_readability_score(row) -> float:
    """Return ``textstat.dale_chall_readability_score`` for the row's ``"text"`` value."""
    text = row["text"]
    return textstat.dale_chall_readability_score(text)

In [18]:
# Row-wise flesch_reading_ease feature, stored as float32.
col = "flesch_reading_ease"
df[col] = df.progress_apply(flesch_reading_ease, axis=1).astype(np.float32)

100%|█████████████████████████████████████████| 5710/5710 [00:00<00:00, 5752.24it/s]


In [19]:
# Row-wise flesch_kincaid_grade feature, stored as float32.
col = "flesch_kincaid_grade"
df[col] = df.progress_apply(flesch_kincaid_grade, axis=1).astype(np.float32)

100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 12404.20it/s]


In [20]:
# Row-wise syllable_count feature, stored as int16.
col = "syllable_count"
df[col] = df.progress_apply(syllable_count, axis=1).astype(np.int16)

100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 18968.68it/s]


In [21]:
# Row-wise lexicon_count feature, stored as int16.
col = "lexicon_count"
df[col] = df.progress_apply(lexicon_count, axis=1).astype(np.int16)

100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 70244.60it/s]


In [22]:
# Row-wise sentence_count feature, stored as int16.
col = "sentence_count"
df[col] = df.progress_apply(sentence_count, axis=1).astype(np.int16)

100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 40792.98it/s]


In [23]:
# Row-wise gunning_fog feature, stored as float32.
col = "gunning_fog"
df[col] = df.progress_apply(gunning_fog, axis=1).astype(np.float32)

100%|█████████████████████████████████████████| 5710/5710 [00:00<00:00, 9989.67it/s]


In [24]:
# Row-wise smog_index feature, stored as float32.
col = "smog_index"
df[col] = df.progress_apply(smog_index, axis=1).astype(np.float32)

100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 16324.35it/s]


In [25]:
# Row-wise automated_readability_index feature, stored as float32.
col = "automated_readability_index"
df[col] = df.progress_apply(automated_readability_index, axis=1).astype(np.float32)

100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 27368.39it/s]


In [26]:
# Row-wise coleman_liau_index feature, stored as float32.
col = "coleman_liau_index"
df[col] = df.progress_apply(coleman_liau_index, axis=1).astype(np.float32)

100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 24217.12it/s]


In [27]:
# Row-wise linsear_write_formula feature, stored as float32.
col = "linsear_write_formula"
df[col] = df.progress_apply(linsear_write_formula, axis=1).astype(np.float32)

100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 11915.70it/s]


In [28]:
# Row-wise dale_chall_readability_score feature, stored as float32.
col = "dale_chall_readability_score"
df[col] = df.progress_apply(dale_chall_readability_score, axis=1).astype(np.float32)

100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 10275.83it/s]


# Detoxify labels

In [29]:
# Shared inputs for the Detoxify cells below.
batch_size = 256
# Names of the Detoxify feature columns, filled in as each model is run.
dtfy_fs = []
# Plain Python list of the (preprocessed) texts, in row order.
sentences = df["text"].tolist()

In [30]:
%%time
# Run mylib.detoxify_labels with the "toxic_original" checkpoint and store
# each returned label as a float32 column named "dto_<label>".
# NOTE(review): assumes every value in `res` is aligned with the rows of df
# -- confirm against mylib.detoxify_labels.
prefix = "dto_"
res = mylib.detoxify_labels(
    sentences,
    checkpoint="pretrained/unitaryai/detoxify/toxic_original-c1212f89.ckpt",
    device=device,
    batch_size=batch_size,
)
for k, v in res.items():
    col = prefix + k
    df[col] = v
    df[col] = df[col].astype(np.float32)
    # Register each column only once, so re-running this cell does not leave
    # duplicate names in dtfy_fs (and in any column selection built from it).
    if col not in dtfy_fs:
        dtfy_fs.append(col)
gc.collect()

Wall time: 1min 25s


4

In [31]:
%%time
# Run mylib.detoxify_labels with the "toxic_debiased" checkpoint and store
# each returned label as a float32 column named "dtu_<label>".
# NOTE(review): assumes every value in `res` is aligned with the rows of df
# -- confirm against mylib.detoxify_labels.
prefix = "dtu_"
res = mylib.detoxify_labels(
    sentences,
    checkpoint="pretrained/unitaryai/detoxify/toxic_debiased-c7548aa0.ckpt",
    device=device,
    batch_size=batch_size,
)
for k, v in res.items():
    col = prefix + k
    df[col] = v
    df[col] = df[col].astype(np.float32)
    # Register each column only once, so re-running this cell does not leave
    # duplicate names in dtfy_fs (and in any column selection built from it).
    if col not in dtfy_fs:
        dtfy_fs.append(col)
gc.collect()

Wall time: 1min 20s


4

In [32]:
%%time
# Run mylib.detoxify_labels with the "multilingual_debiased" checkpoint and
# store each returned label as a float32 column named "dtm_<label>".
# NOTE(review): assumes every value in `res` is aligned with the rows of df
# -- confirm against mylib.detoxify_labels.
prefix = "dtm_"
res = mylib.detoxify_labels(
    sentences,
    checkpoint="pretrained/unitaryai/detoxify/multilingual_debiased-0b549669.ckpt",
    device=device,
    batch_size=batch_size,
)
for k, v in res.items():
    col = prefix + k
    df[col] = v
    df[col] = df[col].astype(np.float32)
    # Register each column only once, so re-running this cell does not leave
    # duplicate names in dtfy_fs (and in any column selection built from it).
    if col not in dtfy_fs:
        dtfy_fs.append(col)
gc.collect()

Wall time: 1min 29s


0

In [33]:
# Show the Detoxify feature column names collected by the three cells above.
print(dtfy_fs)

['dto_toxicity', 'dto_severe_toxicity', 'dto_obscene', 'dto_threat', 'dto_insult', 'dto_identity_attack', 'dtu_toxicity', 'dtu_severe_toxicity', 'dtu_obscene', 'dtu_identity_attack', 'dtu_insult', 'dtu_threat', 'dtu_sexual_explicit', 'dtm_toxicity', 'dtm_severe_toxicity', 'dtm_obscene', 'dtm_identity_attack', 'dtm_insult', 'dtm_threat', 'dtm_sexual_explicit']


# Embeddings

In [34]:
# Load the sentence-embedding model from a local path onto the selected device.
model = SentenceTransformer("pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2", device=device)
# Presumably caps the number of tokens the model reads per input -- TODO
# confirm against the sentence-transformers documentation.
model.max_seq_length = 128

In [35]:
%%time
# Encode all sentences in batches of 1000; convert_to_numpy=True requests a
# NumPy result, whose shape is printed for inspection.
em = model.encode(sentences=sentences, batch_size=1000, show_progress_bar=True, convert_to_numpy=True)
print(f"em.shape={em.shape}")

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

em.shape=(5710, 384)
Wall time: 6.4 s


In [36]:
%%time
# One float32 column per embedding dimension, named zz0000, zz0001, ...
em_size = em.shape[1]
em_cols = [f"zz{i:04d}" for i in range(em_size)]
# Attach all embedding columns in a single concat. Assigning them with
# df[em_cols] = em inserts the columns one at a time, which fragments the
# frame and triggers pandas' PerformanceWarning. Dropping any existing
# embedding columns first keeps the cell safe to re-run.
df = pd.concat(
    [
        df.drop(columns=em_cols, errors="ignore"),
        pd.DataFrame(em, columns=em_cols, index=df.index).astype(np.float32),
    ],
    axis=1,
)
# The raw sentence list is no longer needed once the embeddings are stored.
del sentences

  self[col] = igetitem(value, i)


Wall time: 234 ms


# Review data

In [37]:
# Constant "worker" column: all zeros, stored as int8.
col = "worker"
df[col] = np.zeros(len(df), dtype=np.int8)

# Feature groups, in the order they appear in the summary table.
char_fs = [
    "length",
    "digit_frac",
    "letter_frac",
    "space_frac",
    "punc_frac",
    "upper_frac",
]
textstat_fs = [
    "syllable_count",
    "lexicon_count",
    "sentence_count",
    "flesch_reading_ease",
    "flesch_kincaid_grade",
    "gunning_fog",
    "smog_index",
    "automated_readability_index",
    "coleman_liau_index",
    "linsear_write_formula",
    "dale_chall_readability_score",
]
# Targets/metadata first, then every non-embedding feature.
cols = ["label", "bws", "worker", *char_fs, *textstat_fs, *dtfy_fs]
df[cols].describe(percentiles=percentiles)

Unnamed: 0,label,bws,worker,length,digit_frac,letter_frac,space_frac,punc_frac,upper_frac,syllable_count,lexicon_count,sentence_count,flesch_reading_ease,flesch_kincaid_grade,gunning_fog,smog_index,automated_readability_index,coleman_liau_index,linsear_write_formula,dale_chall_readability_score,dto_toxicity,dto_severe_toxicity,dto_obscene,dto_threat,dto_insult,dto_identity_attack,dtu_toxicity,dtu_severe_toxicity,dtu_obscene,dtu_identity_attack,dtu_insult,dtu_threat,dtu_sexual_explicit,dtm_toxicity,dtm_severe_toxicity,dtm_obscene,dtm_identity_attack,dtm_insult,dtm_threat,dtm_sexual_explicit
count,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0
mean,2855.5,-0.027706,0.0,197.564098,0.003542,0.78883,0.177722,0.029905,0.030517,50.067426,36.371278,2.31979,75.253105,6.685254,8.998004,3.062995,7.482505,6.631683,8.562484,8.299163,0.177856,0.01308,0.113673,0.008411,0.059683,0.011454,0.195873,0.004698273,0.113027,0.015234,0.081492,0.01101,0.040167,0.203075,0.005742,0.105158,0.013474,0.083563,0.014735,0.044618
std,1648.47935,0.334195,0.0,172.016744,0.012983,0.034318,0.021473,0.02407,0.049035,44.035705,31.19425,1.722369,19.903004,4.430519,4.64589,4.695458,5.593331,3.883116,5.90646,2.270676,0.326334,0.052992,0.282462,0.056317,0.175381,0.064708,0.341383,0.02272458,0.285412,0.074354,0.20778,0.069685,0.149433,0.345334,0.027226,0.272925,0.075226,0.214901,0.078828,0.155314
min,1.0,-0.889,0.0,15.0,0.0,0.4,0.040541,0.0,0.0,3.0,2.0,1.0,-48.98,-2.5,0.8,0.0,-8.7,-10.16,0.0,0.2,0.000506,8e-05,0.000141,8.6e-05,0.000164,0.000121,0.000286,9.52907e-07,1.7e-05,5.2e-05,6.1e-05,1.2e-05,9e-06,0.000178,9e-06,6.3e-05,4.5e-05,9.5e-05,1.4e-05,1.2e-05
1%,58.09,-0.667,0.0,24.0,0.0,0.672759,0.117647,0.0,0.0,6.0,5.0,1.0,19.430301,-1.5,2.0,0.0,-3.0,-3.0037,1.5,0.35,0.000556,8.7e-05,0.000158,9.6e-05,0.000171,0.000134,0.00036,1.087282e-06,2.1e-05,6.3e-05,9.2e-05,1.5e-05,1.1e-05,0.000281,1.3e-05,9.5e-05,6.6e-05,0.000149,2.1e-05,1.7e-05
5%,286.45,-0.521,0.0,33.0,0.0,0.733333,0.141414,0.0,0.005062,8.0,6.0,1.0,41.869999,0.5,2.4,0.0,-0.2,0.32,2.0,5.62,0.000614,9.3e-05,0.000165,0.000103,0.000175,0.000138,0.00041,1.200621e-06,2.4e-05,7e-05,0.000103,1.7e-05,1.2e-05,0.000357,1.6e-05,0.000115,7.8e-05,0.000177,2.7e-05,2e-05
10%,571.9,-0.426,0.0,42.0,0.0,0.75,0.151515,0.010417,0.008333,10.0,8.0,1.0,50.669998,1.7,3.2,0.0,1.29,1.82,2.741667,6.24,0.000668,9.7e-05,0.00017,0.000108,0.000177,0.00014,0.000454,1.271093e-06,2.6e-05,7.6e-05,0.000111,1.9e-05,1.3e-05,0.000417,1.9e-05,0.000129,8.6e-05,0.0002,3.1e-05,2.2e-05
20%,1142.8,-0.312,0.0,60.0,0.0,0.769841,0.161966,0.015385,0.011758,15.0,11.0,1.0,59.98,3.1,4.87,0.0,3.2,3.698,4.0,6.93,0.000797,0.000103,0.000176,0.000115,0.000181,0.000145,0.000572,1.407266e-06,3.1e-05,8.9e-05,0.000133,2.3e-05,1.6e-05,0.000578,2.3e-05,0.000159,9.9e-05,0.000247,3.7e-05,2.6e-05
30%,1713.7,-0.213,0.0,82.0,0.0,0.779661,0.168627,0.018692,0.014599,20.0,15.0,1.0,66.739998,4.4,6.364,0.0,4.6,4.93,5.0,7.38,0.001006,0.000108,0.000183,0.000121,0.000188,0.000151,0.00085,1.645227e-06,4.1e-05,0.000113,0.000177,3e-05,2.1e-05,0.000857,2.7e-05,0.000201,0.000118,0.000327,4.6e-05,3.2e-05
40%,2284.6,-0.146,0.0,106.0,0.0,0.786537,0.173913,0.021652,0.017857,27.0,20.0,1.0,72.144003,5.3,8.0,0.0,5.8,5.9,6.0,7.82,0.001533,0.000114,0.000196,0.000127,0.000207,0.000165,0.001636,2.303226e-06,6.7e-05,0.000175,0.000311,4.8e-05,3.3e-05,0.001655,3.3e-05,0.000272,0.000152,0.000511,6.5e-05,4.5e-05


In [38]:
# Extend the column selection with the embedding columns. Only names not
# already present are added, so re-running this cell cannot put duplicate
# labels into df[cols] (duplicates would also break the parquet export).
cols += [c for c in em_cols if c not in cols]
df[cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5710 entries, 0 to 5709
Data columns (total 424 columns):
 #    Column                        Non-Null Count  Dtype  
---   ------                        --------------  -----  
 0    label                         5710 non-null   int32  
 1    bws                           5710 non-null   float32
 2    worker                        5710 non-null   int8   
 3    length                        5710 non-null   int16  
 4    digit_frac                    5710 non-null   float32
 5    letter_frac                   5710 non-null   float32
 6    space_frac                    5710 non-null   float32
 7    punc_frac                     5710 non-null   float32
 8    upper_frac                    5710 non-null   float32
 9    syllable_count                5710 non-null   int16  
 10   lexicon_count                 5710 non-null   int16  
 11   sentence_count                5710 non-null   int16  
 12   flesch_reading_ease           5710 non-null   

In [39]:
%%time
# Write the selected columns (cols) to parquet, omitting the index.
df[cols].to_parquet("output/ruddit.parquet", index=False)

Wall time: 223 ms
