In [1]:
import os
import gc
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from scipy.stats import rankdata
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import textstat
from tqdm import tqdm
from typing import Dict, List, Tuple, NamedTuple, Callable
import scml
import mylib

In [2]:
class Conf(NamedTuple):
    """Run configuration for the feature-extraction notebook.

    Groups, per model family, the local checkpoint paths, the tokenizer length
    limit and the inference batch size that the cells below read.

    NOTE: the path dictionaries are built from ``pretrained_dir`` while the class
    body executes, so passing a different ``pretrained_dir`` to ``Conf(...)`` does
    NOT change the model paths. The dict defaults are also shared by all instances.
    """
    # Inference device: CUDA when torch reports it as available, otherwise CPU.
    device: torch.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Root folder (with trailing slash) that prefixes every model path below.
    pretrained_dir: str = "pretrained/"
    # --- Detoxify: column prefix -> checkpoint file ---
    dtfy_model_max_length: int = 512
    dtfy_batch_size: int = 256
    dtfy_models: Dict[str, str] = {
        "dto_": f"{pretrained_dir}unitaryai/detoxify/toxic_original-c1212f89.ckpt",
        "dtu_": f"{pretrained_dir}unitaryai/detoxify/toxic_debiased-c7548aa0.ckpt",
        "dtm_": f"{pretrained_dir}unitaryai/detoxify/multilingual_debiased-0b549669.ckpt"
    }
    # Passed as `config_dir` to mylib.detoxify_labels; keys must match dtfy_models.
    dtfy_configs: Dict[str, str] = {
        "dto_": f"{pretrained_dir}bert-base-uncased",
        "dtu_": f"{pretrained_dir}roberta-base",
        "dtm_": f"{pretrained_dir}xlm-roberta-base"
    }
    # --- TweetEval: output column name -> model directory ---
    tweeteval_model_max_length: int = 512
    tweeteval_batch_size: int = 64
    tweeteval_models: Dict[str, str] = {
        "te_roberta_off": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-offensive",
        "te_roberta_emo_anger": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-emotion",
        "te_roberta_snt_neg": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-sentiment",
        "te_roberta_iro": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-irony",
        "te_xlm_roberta_snt_neg": f"{pretrained_dir}cardiffnlp/twitter-xlm-roberta-base-sentiment",
    }
    # Index of the softmax column kept as the feature for each TweetEval model.
    # NOTE(review): presumably the offensive / anger / negative / irony class of
    # each checkpoint -- confirm against the checkpoints' label mappings.
    tweeteval_label_index: Dict[str, int] = {
        "te_roberta_off": 1,
        "te_roberta_emo_anger": 0,
        "te_roberta_snt_neg": 0,
        "te_roberta_iro": 1,
        "te_xlm_roberta_snt_neg": 0,
    }
    # --- HateBERT: output column name -> model directory ---
    hatebert_model_max_length: int = 512
    hatebert_batch_size: int = 128
    hatebert_models: Dict[str, str] = {
        "hb_bert_off": f"{pretrained_dir}hatebert/bert-offenseval",
        "hb_bert_abu" : f"{pretrained_dir}hatebert/bert-abuseval",
        "hb_hatebert_off": f"{pretrained_dir}hatebert/hatebert-offenseval",
        "hb_hatebert_abu" : f"{pretrained_dir}hatebert/hatebert-abuseval",
    }
    # --- Sentence embeddings: model name -> model directory ---
    em_max_seq_length: int = 128
    em_batch_size: int = 1000
    em_models: Dict[str, str] = {
        "paraphrase-MiniLM-L6-v2": f"{pretrained_dir}sentence-transformers/paraphrase-MiniLM-L6-v2"
    }
        
        
# Instantiate the configuration, echo it, and report per-GPU memory when CUDA is in use.
conf = Conf()
print(conf)
if conf.device.type == 'cuda':
    bytes_per_gib = 1024**3
    for gpu_idx in range(torch.cuda.device_count()):
        gpu_name = torch.cuda.get_device_name(gpu_idx)
        print(f"device={gpu_idx}, {gpu_name}")
        allocated_gib = round(torch.cuda.memory_allocated(gpu_idx)/bytes_per_gib,1)
        print('Mem Allocated:', allocated_gib, 'GB')
        # "Cached" is what torch.cuda.memory_reserved reports.
        reserved_gib = round(torch.cuda.memory_reserved(gpu_idx)/bytes_per_gib,1)
        print('Mem Cached:   ', reserved_gib, 'GB')

Conf(device=device(type='cuda'), pretrained_dir='pretrained/', dtfy_model_max_length=512, dtfy_batch_size=256, dtfy_models={'dto_': 'pretrained/unitaryai/detoxify/toxic_original-c1212f89.ckpt', 'dtu_': 'pretrained/unitaryai/detoxify/toxic_debiased-c7548aa0.ckpt', 'dtm_': 'pretrained/unitaryai/detoxify/multilingual_debiased-0b549669.ckpt'}, dtfy_configs={'dto_': 'pretrained/bert-base-uncased', 'dtu_': 'pretrained/roberta-base', 'dtm_': 'pretrained/xlm-roberta-base'}, tweeteval_model_max_length=512, tweeteval_batch_size=64, tweeteval_models={'te_roberta_off': 'pretrained/cardiffnlp/twitter-roberta-base-offensive', 'te_roberta_emo_anger': 'pretrained/cardiffnlp/twitter-roberta-base-emotion', 'te_roberta_snt_neg': 'pretrained/cardiffnlp/twitter-roberta-base-sentiment', 'te_roberta_iro': 'pretrained/cardiffnlp/twitter-roberta-base-irony', 'te_xlm_roberta_snt_neg': 'pretrained/cardiffnlp/twitter-xlm-roberta-base-sentiment'}, tweeteval_label_index={'te_roberta_off': 1, 'te_roberta_emo_anger':

In [3]:
# Percentiles reported by the describe() call in the review section.
percentiles = [.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]
# Set to "false" before any tokenizer is created.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Fully qualified option names: pandas resolves abbreviated keys by pattern
# matching, which can break when a new option with a similar name appears.
# NOTE(review): mode.use_inf_as_na is deprecated in recent pandas releases --
# confirm it still exists in the pinned pandas version.
pd.set_option("mode.use_inf_as_na", True)
pd.set_option("display.max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option("display.max_colwidth", 9999)
# Registers DataFrame.progress_apply, used by the feature cells below.
tqdm.pandas()

In [4]:
# Load the preprocessed frame; every feature computed below is added to `df` as a new column.
df = pd.read_parquet("input/pre_ruddit.parquet")
# Print column names, dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5710 entries, 0 to 5709
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   label   5710 non-null   int32  
 1   bws     5710 non-null   float32
 2   worker  5710 non-null   int8   
 3   text    5710 non-null   object 
 4   text1   5710 non-null   object 
 5   text2   5710 non-null   object 
dtypes: float32(1), int32(1), int8(1), object(3)
memory usage: 184.1+ KB


# Character-level features

In [5]:
%%time
# Character length of text1, stored as int16.
col = "length"
df[col] = df["text1"].str.len().astype(np.int16)

Wall time: 4.03 ms


In [6]:
def digit_frac(row) -> float:
    """Apply ``mylib.digit_frac`` to the row's ``text1`` value."""
    text = row["text1"]
    return mylib.digit_frac(text)


def letter_frac(row) -> float:
    """Apply ``mylib.letter_frac`` to the row's ``text1`` value."""
    text = row["text1"]
    return mylib.letter_frac(text)


def space_frac(row) -> float:
    """Apply ``mylib.space_frac`` to the row's ``text1`` value."""
    text = row["text1"]
    return mylib.space_frac(text)


def punc_frac(row) -> float:
    """Apply ``mylib.punc_frac`` to the row's ``text1`` value."""
    text = row["text1"]
    return mylib.punc_frac(text)


def upper_frac(row) -> float:
    """Apply ``mylib.upper_frac`` to the row's ``text1`` value."""
    text = row["text1"]
    return mylib.upper_frac(text)


# Output column name -> row function. Each column is named after its function,
# and the insertion order fixes the column order.
char_fns: Dict[str, Callable] = {
    fn.__name__: fn
    for fn in (digit_frac, letter_frac, space_frac, punc_frac, upper_frac)
}

In [7]:
# One float32 column per character-level function, computed row by row.
for col, fn in char_fns.items():
    print(col)
    df[col] = df.progress_apply(fn, axis=1).astype(np.float32)

digit_frac


100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 39102.53it/s]


letter_frac


100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 37615.13it/s]


space_frac


100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 38556.48it/s]


punc_frac


100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 35456.70it/s]


upper_frac


100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 39109.61it/s]


# Textstat features

In [8]:
def syllable_count(row) -> int:
    """Return ``textstat.syllable_count`` of the row's ``text1`` value."""
    text = row["text1"]
    return textstat.syllable_count(text)


def lexicon_count(row) -> int:
    """Return ``textstat.lexicon_count`` of the row's ``text1`` value."""
    text = row["text1"]
    return textstat.lexicon_count(text)


def sentence_count(row) -> int:
    """Return ``textstat.sentence_count`` of the row's ``text1`` value."""
    text = row["text1"]
    return textstat.sentence_count(text)


def syllables_per_word(row) -> float:
    """Ratio of the row's ``syllable_count`` to ``lexicon_count + 1``.

    Reads the precomputed count columns; the ``+ 1`` keeps the denominator
    non-zero when the word count is 0.
    """
    n_syllables = row["syllable_count"]
    denominator = row["lexicon_count"] + 1
    return n_syllables / denominator


def syllables_per_sent(row) -> float:
    """Ratio of the row's ``syllable_count`` to ``sentence_count + 1``.

    Reads the precomputed count columns; the ``+ 1`` keeps the denominator
    non-zero when the sentence count is 0.
    """
    n_syllables = row["syllable_count"]
    denominator = row["sentence_count"] + 1
    return n_syllables / denominator


def words_per_sent(row) -> float:
    """Ratio of the row's ``lexicon_count`` to ``sentence_count + 1``.

    Reads the precomputed count columns; the ``+ 1`` keeps the denominator
    non-zero when the sentence count is 0.
    """
    n_words = row["lexicon_count"]
    denominator = row["sentence_count"] + 1
    return n_words / denominator


def flesch_reading_ease(row) -> float:
    """Return ``textstat.flesch_reading_ease`` of the row's ``text1`` value."""
    text = row["text1"]
    return textstat.flesch_reading_ease(text)


def flesch_kincaid_grade(row) -> float:
    """Return ``textstat.flesch_kincaid_grade`` of the row's ``text1`` value."""
    text = row["text1"]
    return textstat.flesch_kincaid_grade(text)


def gunning_fog(row) -> float:
    """Return ``textstat.gunning_fog`` of the row's ``text1`` value."""
    text = row["text1"]
    return textstat.gunning_fog(text)


def smog_index(row) -> float:
    """Return ``textstat.smog_index`` of the row's ``text1`` value."""
    text = row["text1"]
    return textstat.smog_index(text)


def automated_readability_index(row) -> float:
    """Return ``textstat.automated_readability_index`` of the row's ``text1`` value."""
    text = row["text1"]
    return textstat.automated_readability_index(text)


def coleman_liau_index(row) -> float:
    """Return ``textstat.coleman_liau_index`` of the row's ``text1`` value."""
    text = row["text1"]
    return textstat.coleman_liau_index(text)


def linsear_write_formula(row) -> float:
    """Return ``textstat.linsear_write_formula`` of the row's ``text1`` value."""
    text = row["text1"]
    return textstat.linsear_write_formula(text)


def dale_chall_readability_score(row) -> float:
    """Return ``textstat.dale_chall_readability_score`` of the row's ``text1`` value."""
    text = row["text1"]
    return textstat.dale_chall_readability_score(text)


# Output column name -> row function. Each column is named after its function,
# and the insertion order fixes the column order. The three ratio functions
# read the syllable_count / lexicon_count / sentence_count columns, so those
# columns must exist before these functions are applied.
textstat_fns: Dict[str, Callable] = {
    fn.__name__: fn
    for fn in (
        syllables_per_word,
        syllables_per_sent,
        words_per_sent,
        flesch_reading_ease,
        flesch_kincaid_grade,
        gunning_fog,
        smog_index,
        automated_readability_index,
        coleman_liau_index,
        linsear_write_formula,
        dale_chall_readability_score,
    )
}

In [9]:
# Intermediate count column (int32) read by the syllables_per_* ratio features.
col = "syllable_count"
df[col] = df.progress_apply(syllable_count, axis=1).astype(np.int32)

100%|█████████████████████████████████████████| 5710/5710 [00:00<00:00, 7886.19it/s]


In [10]:
# Intermediate count column (int32) read by syllables_per_word and words_per_sent.
col = "lexicon_count"
df[col] = df.progress_apply(lexicon_count, axis=1).astype(np.int32)

100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 65567.04it/s]


In [11]:
# Intermediate count column (int32) read by syllables_per_sent and words_per_sent.
col = "sentence_count"
df[col] = df.progress_apply(sentence_count, axis=1).astype(np.int32)

100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 37439.02it/s]


In [12]:
# One float32 column per textstat function. The ratio functions read the
# count columns created in the three preceding cells.
for col, fn in textstat_fns.items():
    print(col)
    df[col] = df.progress_apply(fn, axis=1).astype(np.float32)

syllables_per_word


100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 88439.39it/s]


syllables_per_sent


100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 89440.14it/s]


words_per_sent


100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 86514.23it/s]


flesch_reading_ease


100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 12159.90it/s]


flesch_kincaid_grade


100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 12638.88it/s]


gunning_fog


100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 10196.46it/s]


smog_index


100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 15861.13it/s]


automated_readability_index


100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 27317.32it/s]


coleman_liau_index


100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 23891.35it/s]


linsear_write_formula


100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 12021.06it/s]


dale_chall_readability_score


100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 10437.77it/s]


# TweetEval labels

In [13]:
# Score every text2 entry with each TweetEval classifier and keep, per model, the
# softmax probability of the class selected in conf.tweeteval_label_index.
# `sentences` is reused by the HateBERT, Detoxify and embedding cells below.
sentences = list(df["text2"])
for col, model_dir in conf.tweeteval_models.items():
    # The tokenizer is loaded from each model's own directory, so the text is
    # re-encoded for every model.
    tokenizer = AutoTokenizer.from_pretrained(
        model_dir,
        model_max_length=conf.tweeteval_model_max_length
    )
    x = tokenizer(sentences, truncation=True, padding="max_length")
    # shuffle=False keeps predictions aligned with the row order of df.
    batches = torch.utils.data.DataLoader(mylib.Dataset(x), batch_size=conf.tweeteval_batch_size, shuffle=False)
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    model.eval()
    model.to(conf.device)
    # Collect per-batch logits and concatenate once; calling torch.cat inside
    # the loop re-copies everything accumulated so far on every batch.
    logit_chunks = []
    with torch.no_grad():
        for batch in tqdm(batches):
            batch = {k: v.to(conf.device) for k, v in batch.items()}
            outputs = model(**batch)
            logit_chunks.append(outputs.logits.detach().cpu())
    logits = torch.cat(logit_chunks, 0)
    # Class probabilities per row (softmax over the class dimension).
    probs = torch.nn.functional.softmax(logits, dim=1)
    print(f"{col} {probs.size()}\nlogits[:10]={probs[:10]}")
    # Hand pandas a NumPy array rather than a torch tensor.
    df[col] = probs[:, conf.tweeteval_label_index[col]].numpy().astype(np.float32)
    # Drop the references so the model can be freed before the next one loads.
    del tokenizer, model
    gc.collect()

100%|███████████████████████████████████████████████| 90/90 [04:11<00:00,  2.79s/it]


te_roberta_off torch.Size([5710, 2])
logits[:10]=tensor([[0.9025, 0.0975],
        [0.5804, 0.4196],
        [0.6679, 0.3321],
        [0.8045, 0.1955],
        [0.7880, 0.2120],
        [0.7823, 0.2177],
        [0.8272, 0.1728],
        [0.6837, 0.3163],
        [0.7598, 0.2402],
        [0.9035, 0.0965]])


100%|███████████████████████████████████████████████| 90/90 [04:06<00:00,  2.74s/it]


te_roberta_emo_anger torch.Size([5710, 4])
logits[:10]=tensor([[0.0376, 0.0565, 0.8628, 0.0431],
        [0.1369, 0.1012, 0.2558, 0.5061],
        [0.8166, 0.0105, 0.0391, 0.1338],
        [0.9419, 0.0041, 0.0231, 0.0308],
        [0.6684, 0.0184, 0.1338, 0.1794],
        [0.8703, 0.0090, 0.0602, 0.0605],
        [0.8556, 0.0071, 0.0948, 0.0425],
        [0.8475, 0.0155, 0.0260, 0.1111],
        [0.8054, 0.0086, 0.0704, 0.1156],
        [0.1417, 0.3192, 0.3703, 0.1687]])


100%|███████████████████████████████████████████████| 90/90 [04:05<00:00,  2.73s/it]


te_roberta_snt_neg torch.Size([5710, 3])
logits[:10]=tensor([[0.0407, 0.7561, 0.2032],
        [0.4268, 0.5488, 0.0244],
        [0.7662, 0.2259, 0.0080],
        [0.6066, 0.3729, 0.0206],
        [0.5111, 0.4282, 0.0607],
        [0.5525, 0.4095, 0.0379],
        [0.3226, 0.6394, 0.0380],
        [0.7787, 0.2147, 0.0066],
        [0.7998, 0.1863, 0.0139],
        [0.1055, 0.6091, 0.2854]])


100%|███████████████████████████████████████████████| 90/90 [04:05<00:00,  2.72s/it]


te_roberta_iro torch.Size([5710, 2])
logits[:10]=tensor([[0.7999, 0.2001],
        [0.4219, 0.5781],
        [0.3392, 0.6608],
        [0.8240, 0.1760],
        [0.8902, 0.1098],
        [0.6376, 0.3624],
        [0.7453, 0.2547],
        [0.0874, 0.9126],
        [0.8681, 0.1319],
        [0.6555, 0.3445]])


100%|███████████████████████████████████████████████| 90/90 [04:04<00:00,  2.72s/it]


te_xlm_roberta_snt_neg torch.Size([5710, 3])
logits[:10]=tensor([[0.1049, 0.7652, 0.1299],
        [0.1865, 0.7562, 0.0573],
        [0.4178, 0.4738, 0.1084],
        [0.7074, 0.2630, 0.0296],
        [0.7878, 0.1841, 0.0281],
        [0.5432, 0.4069, 0.0500],
        [0.5321, 0.4271, 0.0408],
        [0.5248, 0.4357, 0.0396],
        [0.9013, 0.0858, 0.0129],
        [0.3988, 0.4830, 0.1182]])


# HateBERT labels

In [14]:
# All HateBERT models use the same tokenizer, so it is loaded once, from the
# "hb_hatebert_off" model directory.
hatebert_tokenizer_dir = conf.hatebert_models["hb_hatebert_off"]
tokenizer = AutoTokenizer.from_pretrained(
    hatebert_tokenizer_dir,
    model_max_length=conf.hatebert_model_max_length,
)
print(f"{repr(tokenizer)}\nmodel_input_names={tokenizer.model_input_names}")

PreTrainedTokenizerFast(name_or_path='pretrained/hatebert/hatebert-offenseval', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
model_input_names=['input_ids', 'token_type_ids', 'attention_mask']


In [15]:
%%time
# Encode the sentences once; the result is reused for every HateBERT model.
tokenize_kwargs = dict(truncation=True, padding="max_length")
x = tokenizer(sentences, **tokenize_kwargs)
print(f"{repr(x.keys())}\nlen={len(x['input_ids'])}")

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
len=5710
Wall time: 1.01 s


In [16]:
# Score the already-tokenized sentences (`x`) with each HateBERT model and keep
# the softmax probability of class index 1 as the feature.
# shuffle=False keeps predictions aligned with the row order of df.
batches = torch.utils.data.DataLoader(mylib.Dataset(x), batch_size=conf.hatebert_batch_size, shuffle=False)
for col, model_dir in conf.hatebert_models.items():
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    model.eval()
    model.to(conf.device)
    # Collect per-batch logits and concatenate once; calling torch.cat inside
    # the loop re-copies everything accumulated so far on every batch.
    logit_chunks = []
    with torch.no_grad():
        for batch in tqdm(batches):
            batch = {k: v.to(conf.device) for k, v in batch.items()}
            outputs = model(**batch)
            logit_chunks.append(outputs.logits.detach().cpu())
    logits = torch.cat(logit_chunks, 0)
    # Class probabilities per row (softmax over the class dimension).
    probs = torch.nn.functional.softmax(logits, dim=1)
    print(f"{col} {probs.size()}\nlogits[:10]={probs[:10]}")
    # NOTE(review): index 1 is presumably the offensive/abusive class -- confirm
    # against the checkpoints' label mapping.
    # Hand pandas a NumPy array rather than a torch tensor.
    df[col] = probs[:, 1].numpy().astype(np.float32)
    # Release the model as the TweetEval loop does; otherwise the last HateBERT
    # model stays referenced while the Detoxify cell runs.
    del model
    gc.collect()

100%|███████████████████████████████████████████████| 45/45 [04:08<00:00,  5.53s/it]


hb_bert_off torch.Size([5710, 2])
logits[:10]=tensor([[0.9598, 0.0402],
        [0.7718, 0.2282],
        [0.7567, 0.2433],
        [0.9064, 0.0936],
        [0.8318, 0.1682],
        [0.7811, 0.2189],
        [0.5952, 0.4048],
        [0.8502, 0.1498],
        [0.7770, 0.2230],
        [0.9258, 0.0742]])


100%|███████████████████████████████████████████████| 45/45 [04:10<00:00,  5.56s/it]


hb_bert_abu torch.Size([5710, 2])
logits[:10]=tensor([[0.9910, 0.0090],
        [0.9717, 0.0283],
        [0.9545, 0.0455],
        [0.9631, 0.0369],
        [0.9714, 0.0286],
        [0.8686, 0.1314],
        [0.9001, 0.0999],
        [0.8544, 0.1456],
        [0.8708, 0.1292],
        [0.9881, 0.0119]])


100%|███████████████████████████████████████████████| 45/45 [04:11<00:00,  5.58s/it]


hb_hatebert_off torch.Size([5710, 2])
logits[:10]=tensor([[0.9377, 0.0623],
        [0.5526, 0.4474],
        [0.7553, 0.2447],
        [0.7828, 0.2172],
        [0.8787, 0.1213],
        [0.9267, 0.0733],
        [0.8304, 0.1696],
        [0.7886, 0.2114],
        [0.8038, 0.1962],
        [0.8965, 0.1035]])


100%|███████████████████████████████████████████████| 45/45 [04:09<00:00,  5.55s/it]

hb_hatebert_abu torch.Size([5710, 2])
logits[:10]=tensor([[0.9846, 0.0154],
        [0.9717, 0.0283],
        [0.9570, 0.0430],
        [0.9699, 0.0301],
        [0.9698, 0.0302],
        [0.9780, 0.0220],
        [0.9800, 0.0200],
        [0.9733, 0.0267],
        [0.9407, 0.0593],
        [0.9795, 0.0205]])





# Detoxify labels

In [17]:
# Run each Detoxify checkpoint over the sentences and store every returned
# label as a float32 column named "<prefix><label>". dtfy_fs records the
# created column names in creation order.
gc.collect()
dtfy_fs = []
for prefix, checkpoint in tqdm(conf.dtfy_models.items()):
    detoxify_kwargs = dict(
        checkpoint=checkpoint,
        config_dir=conf.dtfy_configs[prefix],
        model_max_length=conf.dtfy_model_max_length,
        device=conf.device,
        batch_size=conf.dtfy_batch_size,
    )
    label_scores = mylib.detoxify_labels(sentences, **detoxify_kwargs)
    for label, scores in label_scores.items():
        col = prefix + label
        df[col] = scores
        df[col] = df[col].astype(np.float32)
        dtfy_fs.append(col)
    gc.collect()

100%|█████████████████████████████████████████████████| 3/3 [04:01<00:00, 80.48s/it]


In [18]:
# Show the Detoxify feature columns created by the previous cell, in creation order.
print(dtfy_fs)

['dto_toxicity', 'dto_severe_toxicity', 'dto_obscene', 'dto_threat', 'dto_insult', 'dto_identity_attack', 'dtu_toxicity', 'dtu_severe_toxicity', 'dtu_obscene', 'dtu_identity_attack', 'dtu_insult', 'dtu_threat', 'dtu_sexual_explicit', 'dtm_toxicity', 'dtm_severe_toxicity', 'dtm_obscene', 'dtm_identity_attack', 'dtm_insult', 'dtm_threat', 'dtm_sexual_explicit']


# Embeddings

In [19]:
# Encode every sentence with the paraphrase-MiniLM-L6-v2 sentence transformer;
# `em` is a NumPy array with one row per sentence.
em_model_dir = conf.em_models["paraphrase-MiniLM-L6-v2"]
model = SentenceTransformer(em_model_dir, device=conf.device)
model.max_seq_length = conf.em_max_seq_length
encode_kwargs = dict(
    batch_size=conf.em_batch_size,
    show_progress_bar=True,
    convert_to_numpy=True,
)
em = model.encode(sentences=sentences, **encode_kwargs)
print(f"em.shape={em.shape}")

[INFO|SentenceTransformer.py:60] 2022-01-30 11:13:09,835 >> Load pretrained SentenceTransformer: pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2
[INFO|SentenceTransformer.py:60] 2022-01-30 11:13:09,835 >> Load pretrained SentenceTransformer: pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2
[INFO|SentenceTransformer.py:60] 2022-01-30 11:13:09,835 >> Load pretrained SentenceTransformer: pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2
[INFO|SentenceTransformer.py:60] 2022-01-30 11:13:09,835 >> Load pretrained SentenceTransformer: pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2


Batches:   0%|          | 0/6 [00:00<?, ?it/s]

em.shape=(5710, 384)


In [20]:
%%time
# Attach the sentence embedding to df as float32 columns zz0000, zz0001, ...
# (one column per embedding dimension).
em_size = em.shape[1]
em_cols = [f"zz{i:04d}" for i in range(em_size)]
em_df = pd.DataFrame(em, index=df.index, columns=em_cols).astype(np.float32)
# `df[em_cols] = em` inserts the columns one at a time, which fragments the
# frame and makes pandas emit a PerformanceWarning; a single concat avoids it.
# Existing embedding columns are dropped first so a repeated run replaces them
# instead of duplicating them.
df = pd.concat([df.drop(columns=em_cols, errors="ignore"), em_df], axis=1)
# The raw sentence list is not used after this point.
del sentences

Wall time: 245 ms


  self[col] = igetitem(value, i)


# Review data

In [21]:
# Columns to review and export: targets/metadata first, then each feature
# family in the order it was computed. Unpacking a dict yields its keys.
cols = [
    "label", "bws", "worker", "length",
    *char_fns,
    *textstat_fns,
    *dtfy_fs,
    *conf.hatebert_models,
    *conf.tweeteval_models,
]
df[cols].describe(percentiles=percentiles)

Unnamed: 0,label,bws,worker,length,digit_frac,letter_frac,space_frac,punc_frac,upper_frac,syllables_per_word,syllables_per_sent,words_per_sent,flesch_reading_ease,flesch_kincaid_grade,gunning_fog,smog_index,automated_readability_index,coleman_liau_index,linsear_write_formula,dale_chall_readability_score,dto_toxicity,dto_severe_toxicity,dto_obscene,dto_threat,dto_insult,dto_identity_attack,dtu_toxicity,dtu_severe_toxicity,dtu_obscene,dtu_identity_attack,dtu_insult,dtu_threat,dtu_sexual_explicit,dtm_toxicity,dtm_severe_toxicity,dtm_obscene,dtm_identity_attack,dtm_insult,dtm_threat,dtm_sexual_explicit,hb_bert_off,hb_bert_abu,hb_hatebert_off,hb_hatebert_abu,te_roberta_off,te_roberta_emo_anger,te_roberta_snt_neg,te_roberta_iro,te_xlm_roberta_snt_neg
count,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0
mean,2855.5,-0.027706,0.0,196.915587,0.00349,0.787317,0.173301,0.035892,0.03172,1.30703,13.23911,9.545128,74.9795,6.491471,8.701601,3.175884,7.604343,7.027691,8.01527,8.399787,0.177269,0.012918,0.116036,0.008211,0.05858,0.01103,0.19788,0.004827425,0.116517,0.015205,0.082358,0.01082,0.041887,0.204815,0.006113,0.109037,0.013888,0.083703,0.014809,0.046989,0.334841,0.166219,0.328586,0.141803,0.346535,0.503257,0.504242,0.303546,0.579387
std,1648.47935,0.334195,0.0,171.109023,0.01254,0.034357,0.02064,0.02553,0.048117,0.199335,7.826234,5.379773,19.854204,4.23719,4.464397,4.698273,5.243802,3.855352,5.516964,2.254551,0.327141,0.052013,0.285578,0.055493,0.17248,0.06257,0.344887,0.02284678,0.290073,0.074058,0.209154,0.068646,0.154251,0.348054,0.028899,0.278013,0.076619,0.215425,0.078792,0.161162,0.33387,0.279103,0.284754,0.219147,0.250738,0.358158,0.318861,0.271424,0.299713
min,1.0,-0.889,0.0,14.0,0.0,0.298246,0.040541,0.0,0.0,0.8,2.0,1.5,-48.98,-2.5,1.2,0.0,-6.8,-10.16,0.5,0.2,0.000508,8e-05,0.000141,8.6e-05,0.000164,0.000121,0.000286,9.469297e-07,1.7e-05,5.2e-05,6.1e-05,1.2e-05,9e-06,0.000186,9e-06,6.3e-05,4.5e-05,9.5e-05,1.4e-05,1.2e-05,0.008981,0.002733,0.006939,0.00705,0.023256,0.004901,0.000799,0.014366,0.009455
1%,58.09,-0.667,0.0,24.0,0.0,0.670577,0.1113,0.0,0.0,0.857143,2.666667,2.5,19.439399,-1.5,1.7272,0.0,-2.8,-2.55,1.25,0.35,0.000554,8.7e-05,0.000158,9.6e-05,0.000171,0.000134,0.00036,1.08221e-06,2.1e-05,6.3e-05,9.2e-05,1.5e-05,1.1e-05,0.000278,1.3e-05,9.4e-05,6.6e-05,0.000149,2.1e-05,1.7e-05,0.013535,0.003709,0.021296,0.010223,0.05036,0.011169,0.001821,0.022659,0.019474
5%,286.45,-0.521,0.0,33.0,0.0,0.7298,0.137931,0.008264,0.007047,1.0,4.0,3.0,41.441999,0.5,2.4,0.0,0.4,0.76,2.0,5.689,0.000613,9.3e-05,0.000165,0.000104,0.000175,0.000138,0.00041,1.20137e-06,2.4e-05,7e-05,0.000103,1.7e-05,1.2e-05,0.000351,1.6e-05,0.000114,7.8e-05,0.000178,2.7e-05,2e-05,0.020864,0.005157,0.036119,0.013418,0.073817,0.022917,0.00892,0.034882,0.054882
10%,571.9,-0.426,0.0,42.0,0.0,0.747899,0.148148,0.013889,0.009524,1.0625,5.0,3.5,50.5,1.7,3.2,0.0,1.7,2.23,2.5,6.27,0.000663,9.7e-05,0.00017,0.000108,0.000177,0.00014,0.000454,1.270248e-06,2.6e-05,7.6e-05,0.000112,1.9e-05,1.3e-05,0.000412,1.8e-05,0.000129,8.5e-05,0.000199,3e-05,2.2e-05,0.028689,0.006384,0.049691,0.015893,0.091589,0.038438,0.032618,0.046842,0.114343
20%,1142.8,-0.312,0.0,61.0,0.0,0.767677,0.158416,0.019417,0.012903,1.142857,6.5,5.0,60.139999,3.1,4.8,0.0,3.6,4.14,3.75,7.01,0.000786,0.000103,0.000176,0.000115,0.000181,0.000144,0.000573,1.407515e-06,3.1e-05,8.9e-05,0.000133,2.3e-05,1.6e-05,0.00057,2.2e-05,0.000158,9.9e-05,0.000246,3.7e-05,2.6e-05,0.045573,0.009076,0.077647,0.020805,0.123787,0.087268,0.141338,0.070889,0.246842
30%,1713.7,-0.213,0.0,82.0,0.0,0.777778,0.164983,0.023392,0.01585,1.206897,8.19,6.0,66.407003,4.2,6.14,0.0,5.0,5.297,4.666667,7.49,0.000983,0.000108,0.000182,0.000121,0.000187,0.00015,0.000852,1.642735e-06,4.1e-05,0.000113,0.000178,3e-05,2.1e-05,0.00085,2.7e-05,0.000199,0.000117,0.000324,4.5e-05,3.2e-05,0.071853,0.012562,0.112063,0.026856,0.160974,0.173743,0.285689,0.098981,0.390838
40%,2284.6,-0.146,0.0,106.0,0.0,0.784656,0.169903,0.027027,0.018968,1.255814,10.0,7.333333,71.650002,5.2,7.612,0.0,6.1,6.25,5.666667,7.92,0.001464,0.000114,0.000194,0.000127,0.000204,0.000163,0.001596,2.273925e-06,6.7e-05,0.000175,0.000311,4.8e-05,3.3e-05,0.001615,3.3e-05,0.00027,0.000151,0.00051,6.4e-05,4.4e-05,0.109913,0.017949,0.156008,0.034969,0.204146,0.315728,0.410188,0.140475,0.520841


In [22]:
# Append the embedding columns to the export list. Names already present are
# removed first, so re-running this cell does not produce duplicate column
# labels (which would make the parquet export fail).
em_col_set = set(em_cols)
cols = [c for c in cols if c not in em_col_set] + em_cols
df[cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5710 entries, 0 to 5709
Data columns (total 433 columns):
 #    Column                        Non-Null Count  Dtype  
---   ------                        --------------  -----  
 0    label                         5710 non-null   int32  
 1    bws                           5710 non-null   float32
 2    worker                        5710 non-null   int8   
 3    length                        5710 non-null   int16  
 4    digit_frac                    5710 non-null   float32
 5    letter_frac                   5710 non-null   float32
 6    space_frac                    5710 non-null   float32
 7    punc_frac                     5710 non-null   float32
 8    upper_frac                    5710 non-null   float32
 9    syllables_per_word            5710 non-null   float32
 10   syllables_per_sent            5710 non-null   float32
 11   words_per_sent                5710 non-null   float32
 12   flesch_reading_ease           5710 non-null   

In [23]:
%%time
# Persist the selected columns (targets, features, embeddings) without the index.
# NOTE(review): assumes the "output/" directory already exists -- confirm.
df[cols].to_parquet("output/tra.parquet", index=False)

Wall time: 219 ms
