In [7]:
import os
import gc
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.stats import rankdata
import textstat
from tqdm import tqdm
from typing import Dict, NamedTuple, Callable
import scml
import mylib

In [8]:
class Conf(NamedTuple):
    """Immutable notebook configuration: device, model locations and batch sizes.

    Notes:
        * The f-strings below are evaluated once, while the class body is
          executed, so they always use the literal ``pretrained_dir`` default
          ("pretrained/"). Passing a different ``pretrained_dir`` to the
          constructor does NOT rewrite the model paths.
        * The dict defaults are created once and shared by every instance;
          treat them as read-only.
    """

    # Device used for every model in the notebook: CUDA when available, else CPU.
    device: torch.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Root directory of the locally stored pretrained weights (ends with "/").
    pretrained_dir: str = "pretrained/"

    # --- Detoxify ---
    dtfy_model_max_length: int = 512
    dtfy_batch_size: int = 64
    # Column-name prefix -> Detoxify checkpoint file.
    dtfy_models: Dict[str, str] = {
        "dto_": f"{pretrained_dir}unitaryai/detoxify/toxic_original-c1212f89.ckpt",
        "dtu_": f"{pretrained_dir}unitaryai/detoxify/toxic_debiased-c7548aa0.ckpt",
        "dtm_": f"{pretrained_dir}unitaryai/detoxify/multilingual_debiased-0b549669.ckpt"
    }
    # Column-name prefix -> directory passed as ``config_dir`` for that checkpoint.
    dtfy_configs: Dict[str, str] = {
        "dto_": f"{pretrained_dir}bert-base-uncased",
        "dtu_": f"{pretrained_dir}roberta-base",
        "dtm_": f"{pretrained_dir}xlm-roberta-base"
    }

    # --- TweetEval ---
    tweeteval_model_max_length: int = 512
    tweeteval_batch_size: int = 64
    # Output column name -> model directory.
    tweeteval_models: Dict[str, str] = {
        "te_roberta_off": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-offensive",
        "te_roberta_emo_anger": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-emotion",
        "te_roberta_snt_neg": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-sentiment",
        "te_roberta_iro": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-irony",
        "te_xlm_roberta_snt_neg": f"{pretrained_dir}cardiffnlp/twitter-xlm-roberta-base-sentiment",
    }
    # Output column name -> index of the class probability stored in that column.
    tweeteval_label_index: Dict[str, int] = {
        "te_roberta_off": 1,
        "te_roberta_emo_anger": 0,
        "te_roberta_snt_neg": 0,
        "te_roberta_iro": 1,
        "te_xlm_roberta_snt_neg": 0,
    }

    # --- HateBERT ---
    hatebert_model_max_length: int = 512
    hatebert_batch_size: int = 128
    # Output column name -> model directory.
    # ``pretrained_dir`` already ends with "/", so no extra separator is added
    # (the previous "{pretrained_dir}/hatebert/..." produced "pretrained//hatebert/...").
    hatebert_models: Dict[str, str] = {
        "hb_bert_off": f"{pretrained_dir}hatebert/bert-offenseval",
        "hb_bert_abu": f"{pretrained_dir}hatebert/bert-abuseval",
        "hb_hatebert_off": f"{pretrained_dir}hatebert/hatebert-offenseval",
        "hb_hatebert_abu": f"{pretrained_dir}hatebert/hatebert-abuseval",
    }

    # --- Sentence embeddings ---
    em_max_seq_length: int = 128
    em_batch_size: int = 1000
    # Model name -> model directory.
    em_models: Dict[str, str] = {
        "paraphrase-MiniLM-L6-v2": f"{pretrained_dir}sentence-transformers/paraphrase-MiniLM-L6-v2"
    }
        
        
# Instantiate the configuration and echo it so the run records its settings.
conf = Conf()
print(conf)

# When running on CUDA, also report the GPU model and its current memory footprint.
if conf.device.type == 'cuda':
    gpu_index = 0
    bytes_per_gb = 1024**3
    allocated_gb = round(torch.cuda.memory_allocated(gpu_index)/bytes_per_gb,1)
    reserved_gb = round(torch.cuda.memory_reserved(gpu_index)/bytes_per_gb,1)
    print(torch.cuda.get_device_name(gpu_index))
    print('Memory Usage:')
    print('Allocated:', allocated_gb, 'GB')
    print('Cached:   ', reserved_gb, 'GB')

Conf(device=device(type='cuda'), pretrained_dir='pretrained/', dtfy_model_max_length=512, dtfy_batch_size=64, dtfy_models={'dto_': 'pretrained/unitaryai/detoxify/toxic_original-c1212f89.ckpt', 'dtu_': 'pretrained/unitaryai/detoxify/toxic_debiased-c7548aa0.ckpt', 'dtm_': 'pretrained/unitaryai/detoxify/multilingual_debiased-0b549669.ckpt'}, dtfy_configs={'dto_': 'pretrained/bert-base-uncased', 'dtu_': 'pretrained/roberta-base', 'dtm_': 'pretrained/xlm-roberta-base'}, tweeteval_model_max_length=512, tweeteval_batch_size=64, tweeteval_models={'te_roberta_off': 'pretrained/cardiffnlp/twitter-roberta-base-offensive', 'te_roberta_emo_anger': 'pretrained/cardiffnlp/twitter-roberta-base-emotion', 'te_roberta_snt_neg': 'pretrained/cardiffnlp/twitter-roberta-base-sentiment', 'te_roberta_iro': 'pretrained/cardiffnlp/twitter-roberta-base-irony', 'te_xlm_roberta_snt_neg': 'pretrained/cardiffnlp/twitter-xlm-roberta-base-sentiment'}, tweeteval_label_index={'te_roberta_off': 1, 'te_roberta_emo_anger': 

In [9]:
# Percentiles reported by DataFrame.describe() in the review section.
percentiles = [.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]

# Turn off parallelism inside the tokenizers library.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# pandas options, applied in the same order as before.
for option_name, option_value in (
    ("use_inf_as_na", True),
    ("max_info_columns", 9999),
    ("display.max_columns", 9999),
    ("display.max_rows", 9999),
    ("max_colwidth", 9999),
):
    pd.set_option(option_name, option_value)

# Register DataFrame.progress_apply and friends.
tqdm.pandas()

In [10]:
%%time
df = pd.read_csv("input/validation_data.csv", engine="c", low_memory=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30108 entries, 0 to 30107
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   worker      30108 non-null  int64 
 1   less_toxic  30108 non-null  object
 2   more_toxic  30108 non-null  object
dtypes: int64(1), object(2)
memory usage: 705.8+ KB
Wall time: 213 ms


In [11]:
# Collect every distinct comment that appears on either side of a pair.
# De-duplicate in first-appearance order instead of going through set():
# set iteration order for strings changes between interpreter runs (hash
# randomization), which made the row order of the output file non-reproducible.
texts = (
    pd.concat([df["less_toxic"], df["more_toxic"]], ignore_index=True)
    .drop_duplicates()
    .tolist()
)
# From here on ``df`` holds one row per unique text (the pairwise frame is replaced).
df = pd.DataFrame(data={"text": texts})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14251 entries, 0 to 14250
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    14251 non-null  object
dtypes: object(1)
memory usage: 111.5+ KB


# Preprocess Text
Note: preprocessing throughput dropped from about 1200 it/s to about 1000 it/s.

In [12]:
def preprocess(row) -> str:
    """Run ``mylib.pre1`` on the ``text`` field of one DataFrame row."""
    raw_text = row["text"]
    return mylib.pre1(raw_text)


# Store the preprocessed text next to the raw text; later features read "ptext".
col = "ptext"
df[col] = df.progress_apply(preprocess, axis=1)

100%|████████████████████████████████████████| 14251/14251 [00:41<00:00, 345.33it/s]


# Character level features

In [13]:
%%time
col = "length"
df[col] = df["ptext"].str.len()
df[col] = df[col].astype(np.int16)

Wall time: 7.03 ms


In [14]:
def digit_frac(row) -> float:
    """Apply ``mylib.digit_frac`` to the row's ``ptext`` value."""
    ptext = row["ptext"]
    return mylib.digit_frac(ptext)


def letter_frac(row) -> float:
    """Apply ``mylib.letter_frac`` to the row's ``ptext`` value."""
    ptext = row["ptext"]
    return mylib.letter_frac(ptext)


def space_frac(row) -> float:
    """Apply ``mylib.space_frac`` to the row's ``ptext`` value."""
    ptext = row["ptext"]
    return mylib.space_frac(ptext)


def punc_frac(row) -> float:
    """Apply ``mylib.punc_frac`` to the row's ``ptext`` value."""
    ptext = row["ptext"]
    return mylib.punc_frac(ptext)


def upper_frac(row) -> float:
    """Apply ``mylib.upper_frac`` to the row's ``ptext`` value."""
    ptext = row["ptext"]
    return mylib.upper_frac(ptext)


# Output column name -> row-wise feature function (insertion order = column order).
char_fns: Dict[str, Callable] = dict(
    digit_frac=digit_frac,
    letter_frac=letter_frac,
    space_frac=space_frac,
    punc_frac=punc_frac,
    upper_frac=upper_frac,
)

In [15]:
# Compute each character-level feature row by row and store it as float32.
for col, fn in char_fns.items():
    print(col)
    df[col] = df.progress_apply(fn, axis=1).astype(np.float32)

digit_frac


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 23324.20it/s]


letter_frac


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 21623.01it/s]


space_frac


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 22818.92it/s]


punc_frac


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 20817.23it/s]


upper_frac


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 23318.15it/s]


# Textstat features

In [15]:
def syllable_count(row) -> int:
    """Return ``textstat.syllable_count`` for the row's ``ptext`` value."""
    ptext = row["ptext"]
    return textstat.syllable_count(ptext)


def lexicon_count(row) -> int:
    """Return ``textstat.lexicon_count`` for the row's ``ptext`` value."""
    ptext = row["ptext"]
    return textstat.lexicon_count(ptext)


def sentence_count(row) -> int:
    """Return ``textstat.sentence_count`` for the row's ``ptext`` value."""
    ptext = row["ptext"]
    return textstat.sentence_count(ptext)


def syllables_per_word(row) -> float:
    """Syllable count divided by (word count + 1); the +1 keeps the divisor non-zero."""
    n_syllables = row["syllable_count"]
    n_words = row["lexicon_count"]
    return n_syllables / (n_words + 1)


def syllables_per_sent(row) -> float:
    """Syllable count divided by (sentence count + 1); the +1 keeps the divisor non-zero."""
    n_syllables = row["syllable_count"]
    n_sentences = row["sentence_count"]
    return n_syllables / (n_sentences + 1)


def words_per_sent(row) -> float:
    """Word count divided by (sentence count + 1); the +1 keeps the divisor non-zero."""
    n_words = row["lexicon_count"]
    n_sentences = row["sentence_count"]
    return n_words / (n_sentences + 1)


def flesch_reading_ease(row) -> float:
    """Return ``textstat.flesch_reading_ease`` for the row's ``ptext`` value."""
    ptext = row["ptext"]
    return textstat.flesch_reading_ease(ptext)


def flesch_kincaid_grade(row) -> float:
    """Return ``textstat.flesch_kincaid_grade`` for the row's ``ptext`` value."""
    ptext = row["ptext"]
    return textstat.flesch_kincaid_grade(ptext)


def gunning_fog(row) -> float:
    """Return ``textstat.gunning_fog`` for the row's ``ptext`` value."""
    ptext = row["ptext"]
    return textstat.gunning_fog(ptext)


def smog_index(row) -> float:
    """Return ``textstat.smog_index`` for the row's ``ptext`` value."""
    ptext = row["ptext"]
    return textstat.smog_index(ptext)


def automated_readability_index(row) -> float:
    """Return ``textstat.automated_readability_index`` for the row's ``ptext`` value."""
    ptext = row["ptext"]
    return textstat.automated_readability_index(ptext)


def coleman_liau_index(row) -> float:
    """Return ``textstat.coleman_liau_index`` for the row's ``ptext`` value."""
    ptext = row["ptext"]
    return textstat.coleman_liau_index(ptext)


def linsear_write_formula(row) -> float:
    """Return ``textstat.linsear_write_formula`` for the row's ``ptext`` value."""
    ptext = row["ptext"]
    return textstat.linsear_write_formula(ptext)


def dale_chall_readability_score(row) -> float:
    """Return ``textstat.dale_chall_readability_score`` for the row's ``ptext`` value."""
    ptext = row["ptext"]
    return textstat.dale_chall_readability_score(ptext)


# Output column name -> row-wise feature function (insertion order = column order).
# The three ratio features read the syllable/lexicon/sentence count columns,
# so those columns must exist before this mapping is applied.
textstat_fns: Dict[str, Callable] = dict(
    syllables_per_word=syllables_per_word,
    syllables_per_sent=syllables_per_sent,
    words_per_sent=words_per_sent,
    flesch_reading_ease=flesch_reading_ease,
    flesch_kincaid_grade=flesch_kincaid_grade,
    gunning_fog=gunning_fog,
    smog_index=smog_index,
    automated_readability_index=automated_readability_index,
    coleman_liau_index=coleman_liau_index,
    linsear_write_formula=linsear_write_formula,
    dale_chall_readability_score=dale_chall_readability_score,
)

In [16]:
# Per-text syllable count, stored as int32 (input to the per-word/per-sentence ratios).
col = "syllable_count"
df[col] = df.progress_apply(syllable_count, axis=1).astype(np.int32)

100%|███████████████████████████████████████| 14251/14251 [00:03<00:00, 4684.12it/s]


In [17]:
# Per-text lexicon (word) count, stored as int32.
col = "lexicon_count"
df[col] = df.progress_apply(lexicon_count, axis=1).astype(np.int32)

100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 46267.35it/s]


In [18]:
# Per-text sentence count, stored as int32.
col = "sentence_count"
df[col] = df.progress_apply(sentence_count, axis=1).astype(np.int32)

100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 26869.62it/s]


In [19]:
# Compute each textstat-based feature row by row and store it as float32.
for col, fn in textstat_fns.items():
    print(col)
    df[col] = df.progress_apply(fn, axis=1).astype(np.float32)

syllables_per_word


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 81423.33it/s]


syllables_per_sent


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 89525.21it/s]


words_per_sent


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 88716.79it/s]


flesch_reading_ease


100%|███████████████████████████████████████| 14251/14251 [00:01<00:00, 7145.71it/s]


flesch_kincaid_grade


100%|███████████████████████████████████████| 14251/14251 [00:01<00:00, 7189.52it/s]


gunning_fog


100%|███████████████████████████████████████| 14251/14251 [00:02<00:00, 5619.09it/s]


smog_index


100%|███████████████████████████████████████| 14251/14251 [00:01<00:00, 8099.73it/s]


automated_readability_index


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 18246.60it/s]


coleman_liau_index


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 16321.24it/s]


linsear_write_formula


100%|███████████████████████████████████████| 14251/14251 [00:01<00:00, 9291.80it/s]


dale_chall_readability_score


100%|███████████████████████████████████████| 14251/14251 [00:02<00:00, 6103.60it/s]


# TweetEval labels

In [20]:
# Score every preprocessed text with each TweetEval classifier and keep the
# probability of the class selected in conf.tweeteval_label_index.
sentences = list(df["ptext"])
for col, model_dir in conf.tweeteval_models.items():
    # Each TweetEval model is loaded with the tokenizer from its own directory.
    tokenizer = AutoTokenizer.from_pretrained(
        model_dir, 
        model_max_length=conf.tweeteval_model_max_length
    )
    # Every sentence is truncated/padded to the tokenizer's model_max_length.
    x = tokenizer(sentences, truncation=True, padding="max_length")
    # shuffle=False keeps the predictions aligned with the rows of df.
    batches = torch.utils.data.DataLoader(mylib.Dataset(x), batch_size=conf.tweeteval_batch_size, shuffle=False)
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    model.eval()
    model.to(conf.device)
    # Collect per-batch logits and concatenate once after the loop; calling
    # torch.cat inside the loop re-copies the accumulated tensor on every batch.
    batch_logits = []
    with torch.no_grad():
        for batch in tqdm(batches):
            for k, v in batch.items():
                batch[k] = v.to(conf.device)
            outputs = model(**batch)
            batch_logits.append(outputs.logits.detach().cpu())
    # After the softmax, ``logits`` holds class probabilities (name kept for the log line).
    logits = torch.nn.functional.softmax(torch.cat(batch_logits, 0), dim=1)
    print(f"{col} {logits.size()}\nlogits[:10]={logits[:10]}")
    # Hand pandas a NumPy array rather than a torch tensor.
    df[col] = logits[:,conf.tweeteval_label_index[col]].numpy()
    df[col] = df[col].astype(np.float32)
    # Release the tokenizer and model before loading the next one.
    del tokenizer, model
    gc.collect()

100%|█████████████████████████████████████████████| 223/223 [10:14<00:00,  2.75s/it]


te_roberta_off torch.Size([14251, 2])
logits[:10]=tensor([[0.0947, 0.9053],
        [0.5495, 0.4505],
        [0.8659, 0.1341],
        [0.1781, 0.8219],
        [0.5549, 0.4451],
        [0.8255, 0.1745],
        [0.2228, 0.7772],
        [0.4856, 0.5144],
        [0.4603, 0.5397],
        [0.3329, 0.6671]])


100%|█████████████████████████████████████████████| 223/223 [10:13<00:00,  2.75s/it]


te_roberta_emo_anger torch.Size([14251, 4])
logits[:10]=tensor([[0.9764, 0.0057, 0.0108, 0.0071],
        [0.9322, 0.0086, 0.0245, 0.0346],
        [0.9679, 0.0029, 0.0163, 0.0129],
        [0.9761, 0.0121, 0.0060, 0.0058],
        [0.8135, 0.0130, 0.0427, 0.1308],
        [0.8213, 0.0241, 0.0632, 0.0914],
        [0.9570, 0.0043, 0.0145, 0.0241],
        [0.9758, 0.0044, 0.0094, 0.0104],
        [0.9809, 0.0034, 0.0073, 0.0084],
        [0.9181, 0.0060, 0.0555, 0.0204]])


100%|█████████████████████████████████████████████| 223/223 [10:19<00:00,  2.78s/it]


te_roberta_snt_neg torch.Size([14251, 3])
logits[:10]=tensor([[0.9744, 0.0232, 0.0024],
        [0.6614, 0.3068, 0.0318],
        [0.6567, 0.3218, 0.0215],
        [0.5749, 0.2724, 0.1527],
        [0.6408, 0.2849, 0.0743],
        [0.2887, 0.6962, 0.0151],
        [0.7549, 0.2277, 0.0173],
        [0.8203, 0.1585, 0.0212],
        [0.8335, 0.1561, 0.0104],
        [0.8384, 0.1358, 0.0259]])


100%|█████████████████████████████████████████████| 223/223 [10:28<00:00,  2.82s/it]


te_roberta_iro torch.Size([14251, 2])
logits[:10]=tensor([[0.7821, 0.2179],
        [0.6187, 0.3813],
        [0.9035, 0.0965],
        [0.6949, 0.3051],
        [0.9410, 0.0590],
        [0.9679, 0.0321],
        [0.9385, 0.0615],
        [0.7492, 0.2508],
        [0.8881, 0.1119],
        [0.9203, 0.0797]])


100%|█████████████████████████████████████████████| 223/223 [10:22<00:00,  2.79s/it]


te_xlm_roberta_snt_neg torch.Size([14251, 3])
logits[:10]=tensor([[0.9396, 0.0460, 0.0144],
        [0.8887, 0.0832, 0.0281],
        [0.8222, 0.1263, 0.0515],
        [0.7152, 0.1767, 0.1081],
        [0.6470, 0.2220, 0.1309],
        [0.1476, 0.8229, 0.0295],
        [0.9044, 0.0806, 0.0150],
        [0.8245, 0.1383, 0.0372],
        [0.6230, 0.2731, 0.1040],
        [0.9424, 0.0392, 0.0184]])


# HateBert labels

In [21]:
# One tokenizer is loaded (from the hatebert-offenseval directory) and reused
# for every HateBERT model, on the stated assumption that they all share it.
hatebert_tokenizer_dir = conf.hatebert_models["hb_hatebert_off"]
tokenizer = AutoTokenizer.from_pretrained(
    hatebert_tokenizer_dir,
    model_max_length=conf.hatebert_model_max_length,
)
print(f"{repr(tokenizer)}\nmodel_input_names={tokenizer.model_input_names}")

PreTrainedTokenizerFast(name_or_path='pretrained//hatebert/hatebert-offenseval', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
model_input_names=['input_ids', 'token_type_ids', 'attention_mask']


In [22]:
%%time
x = tokenizer(sentences, truncation=True, padding="max_length")
print(f"{repr(x.keys())}\nlen={len(x['input_ids'])}")

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
len=14251
Wall time: 4.33 s


In [23]:
# Score every text with each HateBERT-family classifier and keep the
# probability of class index 1. shuffle=False keeps predictions aligned with df.
batches = torch.utils.data.DataLoader(mylib.Dataset(x), batch_size=conf.hatebert_batch_size, shuffle=False)
for col, model_dir in conf.hatebert_models.items():    
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    model.eval()
    model.to(conf.device)
    # Collect per-batch logits and concatenate once after the loop; calling
    # torch.cat inside the loop re-copies the accumulated tensor on every batch.
    batch_logits = []
    with torch.no_grad():
        for batch in tqdm(batches):
            for k, v in batch.items():
                batch[k] = v.to(conf.device)
            outputs = model(**batch)
            batch_logits.append(outputs.logits.detach().cpu())
    # After the softmax, ``logits`` holds class probabilities (name kept for the log line).
    logits = torch.nn.functional.softmax(torch.cat(batch_logits, 0), dim=1)
    print(f"{col} {logits.size()}\nlogits[:10]={logits[:10]}")
    # Hand pandas a NumPy array rather than a torch tensor.
    df[col] = logits[:,1].numpy()
    df[col] = df[col].astype(np.float32)
    # Release the model before loading the next one, as the TweetEval cell does.
    del model
    gc.collect()

100%|█████████████████████████████████████████████| 112/112 [10:31<00:00,  5.64s/it]


hb_bert_off torch.Size([14251, 2])
logits[:10]=tensor([[0.0264, 0.9736],
        [0.4376, 0.5624],
        [0.9560, 0.0440],
        [0.0355, 0.9645],
        [0.0438, 0.9562],
        [0.8932, 0.1068],
        [0.0405, 0.9595],
        [0.2133, 0.7867],
        [0.2547, 0.7453],
        [0.1310, 0.8690]])


100%|█████████████████████████████████████████████| 112/112 [10:32<00:00,  5.65s/it]


hb_bert_abu torch.Size([14251, 2])
logits[:10]=tensor([[0.0220, 0.9780],
        [0.5668, 0.4332],
        [0.9904, 0.0096],
        [0.0692, 0.9308],
        [0.9385, 0.0615],
        [0.9801, 0.0199],
        [0.0708, 0.9292],
        [0.0684, 0.9316],
        [0.8903, 0.1097],
        [0.2068, 0.7932]])


100%|█████████████████████████████████████████████| 112/112 [10:33<00:00,  5.66s/it]


hb_hatebert_off torch.Size([14251, 2])
logits[:10]=tensor([[0.0347, 0.9653],
        [0.6652, 0.3348],
        [0.9384, 0.0616],
        [0.0696, 0.9304],
        [0.9631, 0.0369],
        [0.9538, 0.0462],
        [0.1437, 0.8563],
        [0.1497, 0.8503],
        [0.3690, 0.6310],
        [0.9222, 0.0778]])


100%|█████████████████████████████████████████████| 112/112 [10:30<00:00,  5.63s/it]

hb_hatebert_abu torch.Size([14251, 2])
logits[:10]=tensor([[0.0222, 0.9778],
        [0.8616, 0.1384],
        [0.9726, 0.0274],
        [0.5340, 0.4660],
        [0.9764, 0.0236],
        [0.9816, 0.0184],
        [0.1430, 0.8570],
        [0.1326, 0.8674],
        [0.3410, 0.6590],
        [0.9003, 0.0997]])





# Detoxify labels

In [24]:
gc.collect()
# Names of all Detoxify feature columns, in creation order (used in the review section).
dtfy_fs = []
for prefix, checkpoint in tqdm(conf.dtfy_models.items()):
    detoxify_kwargs = dict(
        checkpoint=checkpoint,
        config_dir=conf.dtfy_configs[prefix],
        model_max_length=conf.dtfy_model_max_length,
        device=conf.device,
        batch_size=conf.dtfy_batch_size,
    )
    label_scores = mylib.detoxify_labels(sentences, **detoxify_kwargs)
    # One float32 column per returned label, named "<prefix><label>".
    for label, scores in label_scores.items():
        col = prefix + label
        df[col] = scores
        df[col] = df[col].astype(np.float32)
        dtfy_fs.append(col)
    gc.collect()

100%|████████████████████████████████████████████████| 3/3 [30:52<00:00, 617.40s/it]


In [25]:
# Show the Detoxify feature column names collected in dtfy_fs.
print(dtfy_fs)

['dto_toxicity', 'dto_severe_toxicity', 'dto_obscene', 'dto_threat', 'dto_insult', 'dto_identity_attack', 'dtu_toxicity', 'dtu_severe_toxicity', 'dtu_obscene', 'dtu_identity_attack', 'dtu_insult', 'dtu_threat', 'dtu_sexual_explicit', 'dtm_toxicity', 'dtm_severe_toxicity', 'dtm_obscene', 'dtm_identity_attack', 'dtm_insult', 'dtm_threat', 'dtm_sexual_explicit']


# Embeddings

In [26]:
# Encode every sentence with the sentence-transformers model named in conf.em_models.
model = SentenceTransformer(
    conf.em_models["paraphrase-MiniLM-L6-v2"],
    device=conf.device,
)
model.max_seq_length = conf.em_max_seq_length
em = model.encode(
    sentences=sentences,
    batch_size=conf.em_batch_size,
    show_progress_bar=True,
    convert_to_numpy=True,
)
print(f"em.shape={em.shape}")

[INFO|SentenceTransformer.py:60] 2021-12-27 10:40:48,447 >> Load pretrained SentenceTransformer: pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2


Batches:   0%|          | 0/15 [00:00<?, ?it/s]

em.shape=(14251, 384)


In [27]:
%%time
em_size = em.shape[1]
em_cols = [f"zz{i:04d}" for i in range(em_size)]
df[em_cols] = em
df[em_cols] = df[em_cols].astype(np.float32)
del sentences

  self[col] = igetitem(value, i)


Wall time: 284 ms


# Review data

In [28]:
# Feature columns to review: length, character fractions, textstat features,
# Detoxify, HateBERT and TweetEval scores (embedding columns are not included here).
cols = [
    "length",
    *char_fns,
    *textstat_fns,
    *dtfy_fs,
    *conf.hatebert_models,
    *conf.tweeteval_models,
]
df[cols].describe(percentiles=percentiles)

Unnamed: 0,length,digit_frac,letter_frac,space_frac,punc_frac,upper_frac,dto_toxicity,dto_severe_toxicity,dto_obscene,dto_threat,dto_insult,dto_identity_attack,dtu_toxicity,dtu_severe_toxicity,dtu_obscene,dtu_identity_attack,dtu_insult,dtu_threat,dtu_sexual_explicit,dtm_toxicity,dtm_severe_toxicity,dtm_obscene,dtm_identity_attack,dtm_insult,dtm_threat,dtm_sexual_explicit,syllables_per_word,syllables_per_sent,words_per_sent,flesch_reading_ease,flesch_kincaid_grade,gunning_fog,smog_index,automated_readability_index,coleman_liau_index,linsear_write_formula,dale_chall_readability_score,hb_bert_off,hb_bert_abu,hb_hatebert_off,hb_hatebert_abu,te_roberta_off,te_roberta_emo_anger,te_roberta_snt_neg,te_roberta_iro,te_xlm_roberta_snt_neg
count,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0
mean,408.685215,0.009753,0.772897,0.174,0.04335,0.086133,0.457963,0.067594,0.294827,0.028735,0.277558,0.061917,0.53337,0.046157,0.297042,0.06187,0.30528,0.01802,0.09885,0.509926,0.062105,0.259724,0.056295,0.269493,0.026781,0.135095,1.359795,19.295958,13.647533,63.885544,9.638151,11.242299,4.308736,13.048552,9.084304,9.928492,9.515097,0.635858,0.497906,0.6027,0.457732,0.544533,0.812228,0.716471,0.253569,0.741067
std,689.012211,0.032186,0.056108,0.024862,0.043432,0.176843,0.427487,0.155722,0.393257,0.123379,0.36876,0.177829,0.398233,0.120635,0.390625,0.161759,0.360576,0.09552,0.217878,0.413616,0.158131,0.371579,0.16241,0.351503,0.111057,0.273038,2.770947,39.943142,27.374922,315.730133,48.16795,21.530439,5.288795,90.731934,97.974464,9.322924,3.392001,0.346745,0.399941,0.331597,0.380082,0.269306,0.273816,0.260878,0.223407,0.225986
min,8.0,0.0,0.004427,0.000403,0.0,0.0,0.00053,7.9e-05,0.000149,8.5e-05,0.000164,0.000126,0.000352,1e-06,1.7e-05,6e-05,6.1e-05,1.5e-05,1e-05,0.000126,1.1e-05,5.2e-05,7.1e-05,0.000128,1.7e-05,1.3e-05,0.666667,1.0,1.0,-36681.820312,-3.1,0.8,0.0,-9.3,-14.15,0.0,0.1,0.009207,0.002739,0.008755,0.005502,0.026456,0.00561,0.001072,0.012607,0.009896
1%,22.0,0.0,0.547782,0.095238,0.0,0.0,0.000661,8.7e-05,0.000165,9.9e-05,0.000175,0.000136,0.001285,3e-06,7.1e-05,0.000135,0.000104,3.3e-05,3.6e-05,0.000404,1.6e-05,8.5e-05,0.000109,0.000182,2.7e-05,1.8e-05,0.857143,2.5,2.0,-70.965,-1.9,1.6,0.0,-2.9,-2.95,1.0,1.105,0.018364,0.004951,0.025188,0.010856,0.08333,0.027914,0.011866,0.030347,0.060248
5%,31.0,0.0,0.68337,0.131579,0.0,0.0,0.000958,9.3e-05,0.000179,0.000109,0.000186,0.000145,0.005716,7e-06,0.000269,0.000346,0.000297,6.7e-05,9.9e-05,0.00107,2.4e-05,0.00015,0.000175,0.000375,3.8e-05,2.5e-05,1.0,3.5,3.0,30.200001,0.5,2.4,0.0,0.3,0.45,2.0,6.34,0.042358,0.009112,0.056846,0.017702,0.137973,0.103575,0.132166,0.046025,0.235738
10%,44.0,0.0,0.723404,0.145161,0.012552,0.008547,0.00185,0.0001,0.000211,0.000119,0.000219,0.000164,0.016219,1.3e-05,0.000535,0.000584,0.000896,9.9e-05,0.000185,0.00329,3.6e-05,0.000332,0.000294,0.000901,5.9e-05,4e-05,1.071429,5.0,3.5,43.400002,1.9,3.2,0.0,2.0,2.37,2.5,6.92,0.081867,0.015292,0.098485,0.025749,0.183188,0.278021,0.303,0.057584,0.391091
20%,69.0,0.0,0.75,0.15873,0.020173,0.016461,0.009198,0.000114,0.000473,0.000164,0.000519,0.00026,0.06009,3e-05,0.001509,0.001149,0.00382,0.000176,0.000441,0.022068,0.000106,0.001394,0.000728,0.003948,0.000168,0.000138,1.164384,7.0,5.333333,55.580002,3.5,5.2,0.0,4.1,4.28,4.0,7.63,0.204469,0.038299,0.212857,0.048906,0.261511,0.699697,0.503324,0.078534,0.56359
30%,99.0,0.0,0.765027,0.166205,0.025,0.021622,0.029902,0.000154,0.001081,0.000285,0.001396,0.000481,0.154298,6.3e-05,0.004123,0.002022,0.012448,0.000273,0.000933,0.090868,0.000283,0.003941,0.001466,0.010724,0.000392,0.000393,1.222222,9.0,6.666667,62.509998,4.6,6.74,0.0,5.6,5.5,5.0,8.14,0.397119,0.091536,0.356134,0.094477,0.341236,0.871067,0.628663,0.101488,0.682353
40%,138.0,0.0,0.775087,0.171429,0.029598,0.026316,0.092392,0.000304,0.003347,0.000498,0.004704,0.001104,0.326191,0.000128,0.011421,0.003398,0.036528,0.000409,0.001894,0.255474,0.000665,0.010032,0.00255,0.024568,0.000749,0.000917,1.277778,11.25,8.333333,68.059998,5.7,8.04,0.0,6.9,6.49,6.25,8.57,0.611989,0.227071,0.530644,0.187363,0.431392,0.928468,0.727975,0.130173,0.770484


In [29]:
# Final column order: raw text, reviewed features, then embedding columns.
# ``cols`` is rebuilt from itself, so de-duplicate (order-preserving) to keep
# the cell safe to re-run; otherwise a second run would repeat "text" and the
# embedding column names.
cols = list(dict.fromkeys(["text", *cols, *em_cols]))
df[cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14251 entries, 0 to 14250
Data columns (total 431 columns):
 #    Column                        Non-Null Count  Dtype  
---   ------                        --------------  -----  
 0    text                          14251 non-null  object 
 1    length                        14251 non-null  int16  
 2    digit_frac                    14251 non-null  float32
 3    letter_frac                   14251 non-null  float32
 4    space_frac                    14251 non-null  float32
 5    punc_frac                     14251 non-null  float32
 6    upper_frac                    14251 non-null  float32
 7    dto_toxicity                  14251 non-null  float32
 8    dto_severe_toxicity           14251 non-null  float32
 9    dto_obscene                   14251 non-null  float32
 10   dto_threat                    14251 non-null  float32
 11   dto_insult                    14251 non-null  float32
 12   dto_identity_attack           14251 non-null

In [30]:
%%time
# Write the selected columns (df[cols]) to Parquet, without the index.
df[cols].to_parquet("output/val.parquet", index=False)

Wall time: 511 ms
