In [1]:
import os
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from scipy.stats import rankdata
from tqdm import tqdm
from typing import Dict
import scml
import mylib

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

Using device: cuda
NVIDIA GeForce GTX 1060 6GB
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [3]:
percentiles=[.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]
os.environ["TOKENIZERS_PARALLELISM"] = "false"
pd.set_option("use_inf_as_na", True)
pd.set_option("max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option('max_colwidth', 9999)
tqdm.pandas()

In [4]:
score_map: Dict[str, float] = {}
df = pd.read_csv("input/ruddit/Ruddit.csv", engine="c", low_memory=False)
for t in df.itertuples():
    k = getattr(t, "post_id") + "_" + getattr(t, "comment_id")
    score_map[k] = getattr(t, "offensiveness_score")

In [5]:
%%time
df = pd.read_csv("input/ruddit/ruddit_with_text.csv", engine="c", low_memory=False)
blacklist = {"[deleted]", "[removed]"}
rows = []
for t in df.itertuples():
    text = getattr(t, "txt")
    s = text.strip().lower()
    if len(s)==0 or s in blacklist:
        continue
    k = getattr(t, "post_id") + "_" + getattr(t, "comment_id")
    rows.append({"bws": score_map[k], "text": text})
df = pd.DataFrame.from_records(rows)
df["bws"] = df["bws"].astype(np.float32) 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5710 entries, 0 to 5709
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   bws     5710 non-null   float32
 1   text    5710 non-null   object 
dtypes: float32(1), object(1)
memory usage: 67.0+ KB
Wall time: 62.4 ms


In [6]:
scml.find_missing_values(df)

Unnamed: 0,Total,Percent,Type
bws,0,0.0,float32
text,0,0.0,object


In [7]:
df.sample(40).sort_values("bws").head(40)

Unnamed: 0,bws,text
3544,-0.875,So so awesome. Really want to see more.
1801,-0.771,I'd almost forgotten that one. What a gem!
3881,-0.688,It's just good politics
3857,-0.646,"I've no idea what's happening and what a ""girl scout"" is, but this is nice."
1568,-0.438,None of my friends throw a New Year's eve party. You're lucky!!!
718,-0.417,Lol not gonna try and change your view on this one. You hit the nail on the head. Spot on.
168,-0.413,I was talking about travel. If you don't leave your home you can't travel.
3069,-0.396,"Not only that, but he's clearly stifling laughter."
3818,-0.354,I was born 2 weeks late. Another 622 weeks and they say I would have been born a teen.
3673,-0.34,"The strider distance is based on turns of the wheel, so on benchmark days move your gears to the low end of your normal range to get your best distance. HTH!"


# Severity rating label

In [8]:
# Force unique ranks
col = "label"
df[col] = rankdata(df["bws"], method='ordinal')
df[col] = df[col].astype(np.int32)
df[col].describe(percentiles=percentiles)

count    5710.00000
mean     2855.50000
std      1648.47935
min         1.00000
1%         58.09000
5%        286.45000
10%       571.90000
20%      1142.80000
30%      1713.70000
40%      2284.60000
50%      2855.50000
60%      3426.40000
70%      3997.30000
80%      4568.20000
90%      5139.10000
95%      5424.55000
99%      5652.91000
max      5710.00000
Name: label, dtype: float64

# Preprocess Text

In [9]:
def preprocess(row) -> str:
    return mylib.preprocess(row["text"])


col = "text"
df[col] = df.progress_apply(preprocess, axis=1)

100%|█████████████████████████████████████████| 5710/5710 [00:02<00:00, 2417.76it/s]


# Embeddings

In [10]:
model = SentenceTransformer("pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2", device=device)
model.max_seq_length = 128
sentences = list(df["text"])

In [11]:
%%time
em = model.encode(sentences=sentences, batch_size=1000, show_progress_bar=True, convert_to_numpy=True)
print(f"em.shape={em.shape}")

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

em.shape=(5710, 384)
Wall time: 10.7 s


In [12]:
%%time
em_size = em.shape[1]
em_cols = [f"zz{i:04d}" for i in range(em_size)]
df[em_cols] = em
df[em_cols] = df[em_cols].astype(np.float32)
# avoid fragmented frame
df = df.copy()
del sentences

Wall time: 244 ms


  self[col] = igetitem(value, i)


In [13]:
col = "worker"
df[col] = 0
df[col] = df[col].astype(np.int8)
cols = ["label", "bws", "worker"]
cols += em_cols
df[cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5710 entries, 0 to 5709
Data columns (total 387 columns):
 #    Column  Non-Null Count  Dtype  
---   ------  --------------  -----  
 0    label   5710 non-null   int32  
 1    bws     5710 non-null   float32
 2    worker  5710 non-null   int8   
 3    zz0000  5710 non-null   float32
 4    zz0001  5710 non-null   float32
 5    zz0002  5710 non-null   float32
 6    zz0003  5710 non-null   float32
 7    zz0004  5710 non-null   float32
 8    zz0005  5710 non-null   float32
 9    zz0006  5710 non-null   float32
 10   zz0007  5710 non-null   float32
 11   zz0008  5710 non-null   float32
 12   zz0009  5710 non-null   float32
 13   zz0010  5710 non-null   float32
 14   zz0011  5710 non-null   float32
 15   zz0012  5710 non-null   float32
 16   zz0013  5710 non-null   float32
 17   zz0014  5710 non-null   float32
 18   zz0015  5710 non-null   float32
 19   zz0016  5710 non-null   float32
 20   zz0017  5710 non-null   float32
 21   zz0018  5710

In [14]:
%%time
df[cols].to_parquet("output/ruddit.parquet", index=False)

Wall time: 293 ms
