In [1]:
import os
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from scipy.stats import rankdata
from tqdm import tqdm
import scml

In [2]:
# Pick the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Using device:', device)
if device.type == 'cuda':
    # Report which card is used and how much of its memory torch currently holds.
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    for caption, n_bytes in (
        ('Allocated:', torch.cuda.memory_allocated(0)),
        ('Cached:   ', torch.cuda.memory_reserved(0)),
    ):
        print(caption, round(n_bytes/1024**3,1), 'GB')

Using device: cuda
NVIDIA GeForce GTX 1060 6GB
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [3]:
# Percentile grid reused by every describe() call in this notebook.
percentiles = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]
# Silence the HuggingFace tokenizers fork/parallelism warning.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Treat +/-inf as missing so null checks catch them too.
pd.options.mode.use_inf_as_na = True
# Effectively lift the display limits so wide frames and long comments are shown in full.
pd.options.display.max_info_columns = 9999
pd.options.display.max_columns = 9999
pd.options.display.max_rows = 9999
pd.options.display.max_colwidth = 9999
# Register DataFrame.progress_apply / Series.progress_apply.
tqdm.pandas()

In [4]:
%%time
df = pd.read_csv("input/js18/train.csv", engine="c", low_memory=False)
df.drop(columns=["id"], inplace=True)
cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
df[cols] = df[cols].astype(np.int8) 
df["label_sum"] = df["toxic"] + df["severe_toxic"] + df["obscene"] + df["threat"] + df["insult"] + df["identity_hate"]
df.drop(index=df[df.label_sum == 0].index, inplace=True)
df.drop(columns=["label_sum"], inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16225 entries, 6 to 159554
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   comment_text   16225 non-null  object
 1   toxic          16225 non-null  int8  
 2   severe_toxic   16225 non-null  int8  
 3   obscene        16225 non-null  int8  
 4   threat         16225 non-null  int8  
 5   insult         16225 non-null  int8  
 6   identity_hate  16225 non-null  int8  
dtypes: int8(6), object(1)
memory usage: 348.6+ KB
Wall time: 967 ms


In [5]:
# Per-column missing-value report (the rendered table lists Total, Percent and Type).
scml.find_missing_values(df)

Unnamed: 0,Total,Percent,Type
comment_text,0,0.0,object
toxic,0,0.0,int8
severe_toxic,0,0.0,int8
obscene,0,0.0,int8
threat,0,0.0,int8
insult,0,0.0,int8
identity_hate,0,0.0,int8


In [6]:
# Preview the first rows of the filtered frame.
df.head()

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
6,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
12,"Hey... what is it..\n@ | talk .\nWhat is it... an exclusive group of some WP TALIBANS...who are good at destroying, self-appointed purist who GANG UP any one who asks them questions abt their ANTI-SOCIAL and DESTRUCTIVE (non)-contribution at WP?\n\nAsk Sityush to clean up his behavior than issue me nonsensical warnings...",1,0,0,0,0,0
16,"Bye! \n\nDon't look, come or think of comming back! Tosser.",1,0,0,0,0,0
42,"You are gay or antisemmitian? \n\nArchangel WHite Tiger\n\nMeow! Greetingshhh!\n\nUh, there are two ways, why you do erased my comment about WW2, that holocaust was brutally slaying of Jews and not gays/Gypsys/Slavs/anyone...\n\n1 - If you are anti-semitian, than shave your head bald and go to the skinhead meetings!\n\n2 - If you doubt words of the Bible, that homosexuality is a deadly sin, make a pentagram tatoo on your forehead go to the satanistic masses with your gay pals!\n\n3 - First and last warning, you fucking gay - I won't appreciate if any more nazi shwain would write in my page! I don't wish to talk to you anymore!\n\nBeware of the Dark Side!",1,0,1,0,1,1
43,"FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!",1,0,1,0,1,0


# Severity rating label

In [7]:
# Relative frequency of each combination of the six label flags among the retained rows.
df[cols].value_counts(normalize=True)

toxic  severe_toxic  obscene  threat  insult  identity_hate
1      0             0        0       0       0                0.349214
                     1        0       1       0                0.234206
                                      0       0                0.108351
                     0        0       1       0                0.074884
       1             1        0       1       0                0.060955
       0             1        0       1       1                0.038089
0      0             1        0       0       0                0.019538
                     0        0       1       0                0.018552
1      1             1        0       1       1                0.016333
0      0             1        0       1       0                0.011156
1      1             1        0       0       0                0.009738
       0             0        0       0       1                0.008382
                                      1       1                0.008259
    

In [8]:
def severity(row) -> float:
    """Return the weighted severity score of one labelled comment.

    ``row`` is any mapping-like object (e.g. a DataFrame row) exposing the six
    label flags by name. Each flag is multiplied by its weight and the products
    are added up; the weights total 1.0, so a row with every flag set scores 1.0.
    """
    weights = (
        ("toxic", 2/10),
        ("severe_toxic", 4/10),
        ("obscene", 1/10),
        ("threat", 1/10),
        ("insult", 1/10),
        ("identity_hate", 1/10),
    )
    score = 0
    # Accumulate in a fixed order so the floating-point result is reproducible.
    for label, weight in weights:
        score += weight * row[label]
    return score

In [9]:
# Score every row with severity() and store the result as a float32 "label" column.
col = "label"
df[col] = df.progress_apply(severity, axis=1).astype(np.float32)

100%|██████████████████████████████████████| 16225/16225 [00:00<00:00, 46221.80it/s]


In [10]:
# Distribution of the severity score at the configured percentiles.
df[col].describe(percentiles=percentiles)

count    16225.000000
mean         0.340074
std          0.184916
min          0.100000
1%           0.100000
5%           0.200000
10%          0.200000
20%          0.200000
30%          0.200000
40%          0.200000
50%          0.300000
60%          0.300000
70%          0.400000
80%          0.400000
90%          0.600000
95%          0.800000
99%          0.900000
max          1.000000
Name: label, dtype: float64

In [11]:
# Force unique ranks: replace each severity score by its ordinal rank, so every row
# gets a distinct integer label (tied scores are ranked in order of appearance).
df[col] = rankdata(df[col], method='ordinal').astype(np.int32)
df[col].describe(percentiles=percentiles)

count    16225.000000
mean      8113.000000
std       4683.898394
min          1.000000
1%         163.240000
5%         812.200000
10%       1623.400000
20%       3245.800000
30%       4868.200000
40%       6490.600000
50%       8113.000000
60%       9735.400000
70%      11357.800000
80%      12980.200000
90%      14602.600000
95%      15413.800000
99%      16062.760000
max      16225.000000
Name: label, dtype: float64

# Preprocess Text

In [12]:
def preprocess(row) -> str:
    """Return the row's comment text with whitespace normalised.

    Every run of whitespace (spaces, tabs, newlines) becomes a single space and
    leading/trailing whitespace is removed.
    """
    return " ".join(row["comment_text"].split())

In [13]:
# Overwrite each comment with its whitespace-normalised form (row-wise, with a progress bar).
col = "comment_text"
df[col] = df.progress_apply(preprocess, axis=1)

100%|██████████████████████████████████████| 16225/16225 [00:00<00:00, 74770.98it/s]


# Embeddings

In [14]:
# Load the locally stored sentence-embedding model onto the selected device.
model = SentenceTransformer(
    "pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2",
    device=device,
)
# Upper bound on the sequence length the model processes per comment.
model.max_seq_length = 128
# Plain Python list of the cleaned comments, in frame order.
sentences = df["comment_text"].tolist()

In [15]:
%%time
# Embed all comments, 256 per batch; convert_to_numpy=True returns a NumPy array
# with one row per comment (its shape is printed below).
em = model.encode(sentences=sentences, batch_size=256, show_progress_bar=True, convert_to_numpy=True)
print(f"em.shape={em.shape}")

Batches:   0%|          | 0/64 [00:00<?, ?it/s]

em.shape=(16225, 384)
Wall time: 21 s


In [16]:
%%time
em_size = em.shape[1]
em_cols = [f"e{i}" for i in range(em_size)]
df[em_cols] = em
df[em_cols] = df[em_cols].astype(np.float32)
# avoid fragmented frame
df = df.copy()
del sentences

  self[col] = igetitem(value, i)


Wall time: 299 ms


In [17]:
# Columns to export: the rank label followed by every embedding dimension.
cols = ["label", *em_cols]
df[cols].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16225 entries, 6 to 159554
Data columns (total 385 columns):
 #    Column  Non-Null Count  Dtype  
---   ------  --------------  -----  
 0    label   16225 non-null  int32  
 1    e0      16225 non-null  float32
 2    e1      16225 non-null  float32
 3    e2      16225 non-null  float32
 4    e3      16225 non-null  float32
 5    e4      16225 non-null  float32
 6    e5      16225 non-null  float32
 7    e6      16225 non-null  float32
 8    e7      16225 non-null  float32
 9    e8      16225 non-null  float32
 10   e9      16225 non-null  float32
 11   e10     16225 non-null  float32
 12   e11     16225 non-null  float32
 13   e12     16225 non-null  float32
 14   e13     16225 non-null  float32
 15   e14     16225 non-null  float32
 16   e15     16225 non-null  float32
 17   e16     16225 non-null  float32
 18   e17     16225 non-null  float32
 19   e18     16225 non-null  float32
 20   e19     16225 non-null  float32
 21   e20     1

In [18]:
%%time
# Persist only the columns selected in `cols`; index=False leaves the row index out of the file.
df[cols].to_parquet("output/js18.parquet", index=False)

Wall time: 469 ms
