In [1]:
import os
import gc
import json
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import scipy
from sklearn.feature_extraction.text import TfidfVectorizer
import textstat
from tqdm import tqdm
from typing import List, Set, Dict, Tuple, NamedTuple, Callable, Any
import scml
import mylib

In [2]:
class Conf(NamedTuple):
    """Immutable settings for this notebook: device, directories, model locations and batch sizes.

    Every model path is built by prefixing ``pretrained_dir``, which already ends
    with a slash, so the relative parts must not start with one.
    NOTE: the dict defaults are single shared objects; treat them as read-only.
    """
    # Use the GPU when one is available, otherwise fall back to the CPU.
    device: torch.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    pretrained_dir: str = "pretrained/"
    data_dir: str = "data/"
    # Detoxify: column prefix -> checkpoint file, and column prefix -> config directory.
    dtfy_model_max_length: int = 512
    dtfy_batch_size: int = 64
    dtfy_models: Dict[str, str] = {
        "dto_": f"{pretrained_dir}unitaryai/detoxify/toxic_original-c1212f89.ckpt",
        "dtu_": f"{pretrained_dir}unitaryai/detoxify/toxic_debiased-c7548aa0.ckpt",
        "dtm_": f"{pretrained_dir}unitaryai/detoxify/multilingual_debiased-0b549669.ckpt"
    }
    dtfy_configs: Dict[str, str] = {
        "dto_": f"{pretrained_dir}bert-base-uncased",
        "dtu_": f"{pretrained_dir}roberta-base",
        "dtm_": f"{pretrained_dir}xlm-roberta-base"
    }
    # TweetEval: feature column -> model directory.
    tweeteval_model_max_length: int = 512
    tweeteval_batch_size: int = 64
    tweeteval_models: Dict[str, str] = {
        "te_roberta_off": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-offensive",
        "te_roberta_emo_anger": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-emotion",
        "te_roberta_snt_neg": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-sentiment",
        "te_roberta_iro": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-irony",
        "te_xlm_roberta_snt_neg": f"{pretrained_dir}cardiffnlp/twitter-xlm-roberta-base-sentiment",
    }
    # TweetEval: feature column -> index of the softmax output column kept as the feature.
    tweeteval_label_index: Dict[str, int] = {
        "te_roberta_off": 1,
        "te_roberta_emo_anger": 0,
        "te_roberta_snt_neg": 0,
        "te_roberta_iro": 1,
        "te_xlm_roberta_snt_neg": 0,
    }
    # HateBert: feature column -> model directory.
    # No leading "/" here: pretrained_dir already supplies the separator.
    hatebert_model_max_length: int = 512
    hatebert_batch_size: int = 128
    hatebert_models: Dict[str, str] = {
        "hb_bert_off": f"{pretrained_dir}hatebert/bert-offenseval",
        "hb_bert_abu" : f"{pretrained_dir}hatebert/bert-abuseval",
        "hb_hatebert_off": f"{pretrained_dir}hatebert/hatebert-offenseval",
        "hb_hatebert_abu" : f"{pretrained_dir}hatebert/hatebert-abuseval",
    }
    # Sentence embeddings: model name -> model directory.
    em_max_seq_length: int = 128
    em_batch_size: int = 1000
    em_models: Dict[str, str] = {
        "paraphrase-MiniLM-L6-v2": f"{pretrained_dir}sentence-transformers/paraphrase-MiniLM-L6-v2"
    }
    # JSON file read later for the TF-IDF vocabulary.
    vocab_file: str = f"{data_dir}vocab.json"
        
        
conf = Conf()
print(conf)
# On a CUDA run, list each visible device with its allocated and reserved memory.
if conf.device.type == 'cuda':
    bytes_per_gib = 1024**3
    for device_idx in range(torch.cuda.device_count()):
        print(f"device={device_idx}, {torch.cuda.get_device_name(device_idx)}")
        allocated_gib = round(torch.cuda.memory_allocated(device_idx) / bytes_per_gib, 1)
        print('Mem Allocated:', allocated_gib, 'GB')
        reserved_gib = round(torch.cuda.memory_reserved(device_idx) / bytes_per_gib, 1)
        print('Mem Cached:   ', reserved_gib, 'GB')

Conf(device=device(type='cuda'), pretrained_dir='pretrained/', data_dir='data/', dtfy_model_max_length=512, dtfy_batch_size=64, dtfy_models={'dto_': 'pretrained/unitaryai/detoxify/toxic_original-c1212f89.ckpt', 'dtu_': 'pretrained/unitaryai/detoxify/toxic_debiased-c7548aa0.ckpt', 'dtm_': 'pretrained/unitaryai/detoxify/multilingual_debiased-0b549669.ckpt'}, dtfy_configs={'dto_': 'pretrained/bert-base-uncased', 'dtu_': 'pretrained/roberta-base', 'dtm_': 'pretrained/xlm-roberta-base'}, tweeteval_model_max_length=512, tweeteval_batch_size=64, tweeteval_models={'te_roberta_off': 'pretrained/cardiffnlp/twitter-roberta-base-offensive', 'te_roberta_emo_anger': 'pretrained/cardiffnlp/twitter-roberta-base-emotion', 'te_roberta_snt_neg': 'pretrained/cardiffnlp/twitter-roberta-base-sentiment', 'te_roberta_iro': 'pretrained/cardiffnlp/twitter-roberta-base-irony', 'te_xlm_roberta_snt_neg': 'pretrained/cardiffnlp/twitter-xlm-roberta-base-sentiment'}, tweeteval_label_index={'te_roberta_off': 1, 'te_ro

In [3]:
# Quantiles passed to describe() in the review cell.
percentiles = [.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# pandas options: infinities count as missing values, and display limits are lifted.
for option_name, option_value in (
    ("use_inf_as_na", True),
    ("max_info_columns", 9999),
    ("display.max_columns", 9999),
    ("display.max_rows", 9999),
    ('max_colwidth', 9999),
):
    pd.set_option(option_name, option_value)
# Registers the progress_apply method used by the feature cells.
tqdm.pandas()

In [4]:
%%time
df = pd.read_parquet("input/pre_val.parquet")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14251 entries, 0 to 14250
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    14251 non-null  object
 1   text1   14251 non-null  object
 2   text2   14251 non-null  object
 3   text3   14251 non-null  object
dtypes: object(4)
memory usage: 445.5+ KB
Wall time: 164 ms


# Character-level features

In [5]:
%%time
col = "length"
df[col] = df["text1"].str.len()
df[col] = df[col].astype(np.int16)

Wall time: 7.34 ms


In [6]:
def digit_frac(row) -> float:
    """Return ``mylib.digit_frac`` evaluated on the row's ``text1`` value."""
    text = row["text1"]
    return mylib.digit_frac(text)


def letter_frac(row) -> float:
    """Return ``mylib.letter_frac`` evaluated on the row's ``text1`` value."""
    text = row["text1"]
    return mylib.letter_frac(text)


def space_frac(row) -> float:
    """Return ``mylib.space_frac`` evaluated on the row's ``text1`` value."""
    text = row["text1"]
    return mylib.space_frac(text)


def punc_frac(row) -> float:
    """Return ``mylib.punc_frac`` evaluated on the row's ``text1`` value."""
    text = row["text1"]
    return mylib.punc_frac(text)


def upper_frac(row) -> float:
    """Return ``mylib.upper_frac`` evaluated on the row's ``text1`` value."""
    text = row["text1"]
    return mylib.upper_frac(text)


def repeat_char_frac(row) -> float:
    """Return ``mylib.repeat_char_frac`` evaluated on the row's ``text1`` value."""
    text = row["text1"]
    return mylib.repeat_char_frac(text)


def repeat_substring_frac(row) -> float:
    """Return ``mylib.repeat_substring_frac`` evaluated on the row's ``text1`` value."""
    text = row["text1"]
    return mylib.repeat_substring_frac(text)


# Feature column name -> row function; each column is named after its function.
char_fns: Dict[str, Callable] = {
    fn.__name__: fn
    for fn in (
        digit_frac,
        letter_frac,
        space_frac,
        punc_frac,
        upper_frac,
        repeat_char_frac,
        repeat_substring_frac,
    )
}

In [7]:
# Compute each character-level feature row by row and keep it as float32.
for col, fn in char_fns.items():
    print(col)
    feature = df.progress_apply(fn, axis=1)
    df[col] = feature.astype(np.float32)

digit_frac


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 23548.75it/s]


letter_frac


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 22741.23it/s]


space_frac


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 24108.60it/s]


punc_frac


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 22388.54it/s]


upper_frac


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 24893.33it/s]


repeat_char_frac


100%|██████████████████████████████████████| 14251/14251 [00:01<00:00, 12727.99it/s]


repeat_substring_frac


100%|████████████████████████████████████████| 14251/14251 [02:04<00:00, 114.89it/s]


# Textstat features

In [8]:
def syllable_count(row) -> int:
    """Return ``textstat.syllable_count`` for the row's ``text1`` value."""
    text = row["text1"]
    return textstat.syllable_count(text)


def lexicon_count(row) -> int:
    """Return ``textstat.lexicon_count`` for the row's ``text1`` value."""
    text = row["text1"]
    return textstat.lexicon_count(text)


def sentence_count(row) -> int:
    """Return ``textstat.sentence_count`` for the row's ``text1`` value."""
    text = row["text1"]
    return textstat.sentence_count(text)


def syllables_per_word(row) -> float:
    """Divide the row's ``syllable_count`` by its ``lexicon_count`` plus one."""
    syllables = row["syllable_count"]
    # The +1 keeps the denominator non-zero when the word count is 0.
    denominator = row["lexicon_count"] + 1
    return syllables / denominator


def syllables_per_sent(row) -> float:
    """Divide the row's ``syllable_count`` by its ``sentence_count`` plus one."""
    syllables = row["syllable_count"]
    # The +1 keeps the denominator non-zero when the sentence count is 0.
    denominator = row["sentence_count"] + 1
    return syllables / denominator


def words_per_sent(row) -> float:
    """Divide the row's ``lexicon_count`` by its ``sentence_count`` plus one."""
    words = row["lexicon_count"]
    # The +1 keeps the denominator non-zero when the sentence count is 0.
    denominator = row["sentence_count"] + 1
    return words / denominator


def flesch_reading_ease(row) -> float:
    """Return ``textstat.flesch_reading_ease`` for the row's ``text1`` value."""
    text = row["text1"]
    return textstat.flesch_reading_ease(text)


def flesch_kincaid_grade(row) -> float:
    """Return ``textstat.flesch_kincaid_grade`` for the row's ``text1`` value."""
    text = row["text1"]
    return textstat.flesch_kincaid_grade(text)


def gunning_fog(row) -> float:
    """Return ``textstat.gunning_fog`` for the row's ``text1`` value."""
    text = row["text1"]
    return textstat.gunning_fog(text)


def smog_index(row) -> float:
    """Return ``textstat.smog_index`` for the row's ``text1`` value."""
    text = row["text1"]
    return textstat.smog_index(text)


def automated_readability_index(row) -> float:
    """Return ``textstat.automated_readability_index`` for the row's ``text1`` value."""
    text = row["text1"]
    return textstat.automated_readability_index(text)


def coleman_liau_index(row) -> float:
    """Return ``textstat.coleman_liau_index`` for the row's ``text1`` value."""
    text = row["text1"]
    return textstat.coleman_liau_index(text)


def linsear_write_formula(row) -> float:
    """Return ``textstat.linsear_write_formula`` for the row's ``text1`` value."""
    text = row["text1"]
    return textstat.linsear_write_formula(text)


def dale_chall_readability_score(row) -> float:
    """Return ``textstat.dale_chall_readability_score`` for the row's ``text1`` value."""
    text = row["text1"]
    return textstat.dale_chall_readability_score(text)


# Each entry is (column name, row function, dtype); columns are named after their function.
# Raw counts, stored as int32. The three ratio features below read these columns.
preprocess_fns: List[Tuple[str, Callable, Any]] = [
    (fn.__name__, fn, np.int32)
    for fn in (syllable_count, lexicon_count, sentence_count)
]
# Ratio and readability features, stored as float32.
textstat_fns: List[Tuple[str, Callable, Any]] = [
    (fn.__name__, fn, np.float32)
    for fn in (
        syllables_per_word,
        syllables_per_sent,
        words_per_sent,
        flesch_reading_ease,
        flesch_kincaid_grade,
        gunning_fog,
        smog_index,
        automated_readability_index,
        coleman_liau_index,
        linsear_write_formula,
        dale_chall_readability_score,
    )
]

In [9]:
# The count columns come first because the ratio features in textstat_fns
# read them from each row.
for col, fn, dtype in preprocess_fns + textstat_fns:
    print(col)
    feature = df.progress_apply(fn, axis=1)
    df[col] = feature.astype(dtype)

syllable_count


100%|███████████████████████████████████████| 14251/14251 [00:02<00:00, 5245.14it/s]


lexicon_count


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 51724.71it/s]


sentence_count


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 27252.11it/s]


syllables_per_word


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 92062.04it/s]


syllables_per_sent


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 90137.99it/s]


words_per_sent


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 98583.95it/s]


flesch_reading_ease


100%|███████████████████████████████████████| 14251/14251 [00:01<00:00, 7560.36it/s]


flesch_kincaid_grade


100%|███████████████████████████████████████| 14251/14251 [00:01<00:00, 7607.65it/s]


gunning_fog


100%|███████████████████████████████████████| 14251/14251 [00:02<00:00, 5763.94it/s]


smog_index


100%|███████████████████████████████████████| 14251/14251 [00:01<00:00, 8233.28it/s]


automated_readability_index


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 18318.26it/s]


coleman_liau_index


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 16546.59it/s]


linsear_write_formula


100%|███████████████████████████████████████| 14251/14251 [00:01<00:00, 9281.54it/s]


dale_chall_readability_score


100%|███████████████████████████████████████| 14251/14251 [00:02<00:00, 5917.57it/s]


# TF-IDF features

In [10]:
# Load the fixed TF-IDF vocabulary ("term") and its IDF weights ("idf").
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding.
with open(conf.vocab_file, encoding="utf-8") as f:
    vocab_data = json.load(f)
vocabulary = vocab_data["term"]
idf = vocab_data["idf"]
# Check that the two lists line up before anything is displayed or used.
assert len(vocabulary) == len(idf), "vocab.json: 'term' and 'idf' differ in length"
print(f"len(vocab)={len(vocabulary)}\n{vocabulary}")

len(vocab)=550
['african', 'african american', 'alabama hot pocket', 'alaskan pipeline', 'american', 'anal', 'analplug', 'analsex', 'anilingus', 'anus', 'apeshit', 'arse', 'arsehole', 'asian', 'ass', 'assassin', 'asshole', 'assmunch', 'atheist', 'auto erotic', 'autoerotic', 'babeland', 'baby batter', 'baby juice', 'ball', 'ball gag', 'ball gravy', 'ball kicking', 'ball lick', 'ball sack', 'ball suck', 'bangbro', 'bangbus', 'bareback', 'barely legal', 'barenaked', 'bastard', 'bastardo', 'bastinado', 'bbw', 'bdsm', 'beaner', 'beastiality', 'beaver cleaver', 'beaver lip', 'bestiality', 'bewb', 'big black', 'big breast', 'big knocker', 'big tit', 'bimbo', 'birdlock', 'bisexual', 'bitch', 'black', 'black cock', 'blind', 'blonde action', 'bloody', 'bloodyhell', 'blow', 'blow job', 'blow your load', 'blowjob', 'blue waffle', 'blumpkin', 'bollock', 'bondage', 'boner', 'boob', 'booby', 'booty call', 'boy', 'brown shower', 'brunette action', 'buddhist', 'bugger', 'bukkake', 'bulldyke', 'bullet v

In [11]:
%%time
vec = TfidfVectorizer(vocabulary=vocabulary, ngram_range=(1, 3), analyzer="word")
vec.idf_ = idf
x = vec.transform(df["text3"])
print(f"x.shape={x.shape}\n{x[0]}")
ti_features = [f"ti{i:04d}" for i in range(x.shape[1])]
df[ti_features] = scipy.sparse.csr_matrix(x).todense()
df[ti_features] = df[ti_features].astype(np.float32)
df = df.copy()  # defragment

x.shape=(14251, 550)
  (0, 244)	0.6119909996152153
  (0, 36)	0.790864726985576


  self[col] = igetitem(value, i)


Wall time: 6.53 s


# TweetEval labels

In [12]:
# Score every "text2" sentence with each TweetEval classifier and keep the
# probability of the label index configured for that model as a float32 feature.
sentences = list(df["text2"])
for col, model_dir in conf.tweeteval_models.items():
    # Each TweetEval model directory supplies its own tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(
        model_dir,
        model_max_length=conf.tweeteval_model_max_length
    )
    x = tokenizer(sentences, truncation=True, padding="max_length")
    batches = torch.utils.data.DataLoader(mylib.Dataset(x), batch_size=conf.tweeteval_batch_size, shuffle=False)
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    model.eval()
    model.to(conf.device)
    # Collect the per-batch logits on the CPU and concatenate once at the end,
    # instead of re-copying the growing tensor on every batch.
    batch_logits = []
    with torch.no_grad():
        for batch in tqdm(batches):
            batch = {k: v.to(conf.device) for k, v in batch.items()}
            outputs = model(**batch)
            batch_logits.append(outputs.logits.detach().cpu())
    logits = torch.cat(batch_logits, 0)
    probs = torch.nn.functional.softmax(logits, dim=1)
    # The printed values are softmax probabilities; the label text is unchanged.
    print(f"{col} {probs.size()}\nlogits[:10]={probs[:10]}")
    # Convert to a NumPy array before storing it in the DataFrame.
    df[col] = probs[:, conf.tweeteval_label_index[col]].numpy().astype(np.float32)
    # Release this model before the next one is loaded.
    del tokenizer, model
    gc.collect()

100%|█████████████████████████████████████████████| 223/223 [10:14<00:00,  2.76s/it]


te_roberta_off torch.Size([14251, 2])
logits[:10]=tensor([[0.0897, 0.9103],
        [0.4368, 0.5632],
        [0.4233, 0.5767],
        [0.6812, 0.3188],
        [0.5885, 0.4115],
        [0.1978, 0.8022],
        [0.5222, 0.4778],
        [0.5216, 0.4784],
        [0.3207, 0.6793],
        [0.2788, 0.7212]])


100%|█████████████████████████████████████████████| 223/223 [10:11<00:00,  2.74s/it]


te_roberta_emo_anger torch.Size([14251, 4])
logits[:10]=tensor([[0.9486, 0.0136, 0.0146, 0.0232],
        [0.8915, 0.0076, 0.0796, 0.0213],
        [0.9645, 0.0048, 0.0144, 0.0163],
        [0.8196, 0.0089, 0.0224, 0.1491],
        [0.5778, 0.2960, 0.0831, 0.0431],
        [0.9695, 0.0071, 0.0167, 0.0068],
        [0.8930, 0.0253, 0.0510, 0.0307],
        [0.8952, 0.0110, 0.0314, 0.0624],
        [0.9316, 0.0094, 0.0303, 0.0286],
        [0.3738, 0.1225, 0.0421, 0.4617]])


100%|█████████████████████████████████████████████| 223/223 [10:10<00:00,  2.74s/it]


te_roberta_snt_neg torch.Size([14251, 3])
logits[:10]=tensor([[0.9661, 0.0292, 0.0048],
        [0.6812, 0.2726, 0.0463],
        [0.8914, 0.0960, 0.0126],
        [0.7866, 0.1905, 0.0230],
        [0.3057, 0.3798, 0.3145],
        [0.6099, 0.2633, 0.1268],
        [0.5471, 0.3905, 0.0624],
        [0.9348, 0.0600, 0.0052],
        [0.9410, 0.0568, 0.0022],
        [0.9013, 0.0844, 0.0143]])


100%|█████████████████████████████████████████████| 223/223 [10:11<00:00,  2.74s/it]


te_roberta_iro torch.Size([14251, 2])
logits[:10]=tensor([[0.1865, 0.8135],
        [0.8514, 0.1486],
        [0.9185, 0.0815],
        [0.9658, 0.0342],
        [0.3756, 0.6244],
        [0.7501, 0.2499],
        [0.2220, 0.7780],
        [0.4478, 0.5522],
        [0.3767, 0.6233],
        [0.6186, 0.3814]])


100%|█████████████████████████████████████████████| 223/223 [10:17<00:00,  2.77s/it]


te_xlm_roberta_snt_neg torch.Size([14251, 3])
logits[:10]=tensor([[0.9357, 0.0423, 0.0220],
        [0.8736, 0.0934, 0.0330],
        [0.8425, 0.1144, 0.0430],
        [0.6363, 0.2525, 0.1112],
        [0.4790, 0.2407, 0.2803],
        [0.3000, 0.2898, 0.4102],
        [0.8004, 0.1693, 0.0304],
        [0.9360, 0.0464, 0.0176],
        [0.9145, 0.0657, 0.0198],
        [0.7871, 0.1474, 0.0655]])


# HateBERT labels

In [13]:
# One tokenizer, loaded from the hatebert-offenseval directory, is reused for every HateBert model.
hatebert_tokenizer_dir = conf.hatebert_models["hb_hatebert_off"]
tokenizer = AutoTokenizer.from_pretrained(
    hatebert_tokenizer_dir,
    model_max_length=conf.hatebert_model_max_length,
)
print(f"{repr(tokenizer)}\nmodel_input_names={tokenizer.model_input_names}")

PreTrainedTokenizerFast(name_or_path='pretrained//hatebert/hatebert-offenseval', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
model_input_names=['input_ids', 'token_type_ids', 'attention_mask']


In [14]:
%%time
x = tokenizer(sentences, truncation=True, padding="max_length")
print(f"{repr(x.keys())}\nlen={len(x['input_ids'])}")

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
len=14251
Wall time: 4.27 s


In [15]:
# Run every HateBert classifier over the same pre-tokenized batches and store
# softmax column 1 as a float32 feature.
batches = torch.utils.data.DataLoader(mylib.Dataset(x), batch_size=conf.hatebert_batch_size, shuffle=False)
for col, model_dir in conf.hatebert_models.items():
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    model.eval()
    model.to(conf.device)
    # Collect the per-batch logits on the CPU and concatenate once at the end,
    # instead of re-copying the growing tensor on every batch.
    batch_logits = []
    with torch.no_grad():
        for batch in tqdm(batches):
            batch = {k: v.to(conf.device) for k, v in batch.items()}
            outputs = model(**batch)
            batch_logits.append(outputs.logits.detach().cpu())
    logits = torch.cat(batch_logits, 0)
    probs = torch.nn.functional.softmax(logits, dim=1)
    # The printed values are softmax probabilities; the label text is unchanged.
    print(f"{col} {probs.size()}\nlogits[:10]={probs[:10]}")
    # NOTE(review): column 1 is presumably the offensive/abusive class - confirm against the model configs.
    df[col] = probs[:, 1].numpy().astype(np.float32)
    # Release the model so the last one does not stay resident after the loop
    # (the TweetEval loop already does this).
    del model
    gc.collect()

100%|█████████████████████████████████████████████| 112/112 [10:30<00:00,  5.63s/it]


hb_bert_off torch.Size([14251, 2])
logits[:10]=tensor([[0.0297, 0.9703],
        [0.9436, 0.0564],
        [0.8174, 0.1826],
        [0.7689, 0.2311],
        [0.4410, 0.5590],
        [0.0291, 0.9709],
        [0.3843, 0.6157],
        [0.3467, 0.6533],
        [0.0597, 0.9403],
        [0.1014, 0.8986]])


100%|█████████████████████████████████████████████| 112/112 [10:38<00:00,  5.70s/it]


hb_bert_abu torch.Size([14251, 2])
logits[:10]=tensor([[0.0325, 0.9675],
        [0.9869, 0.0131],
        [0.4871, 0.5129],
        [0.9427, 0.0573],
        [0.8460, 0.1540],
        [0.0346, 0.9654],
        [0.7741, 0.2259],
        [0.3103, 0.6897],
        [0.0607, 0.9393],
        [0.4142, 0.5858]])


100%|█████████████████████████████████████████████| 112/112 [10:31<00:00,  5.64s/it]


hb_hatebert_off torch.Size([14251, 2])
logits[:10]=tensor([[0.0373, 0.9627],
        [0.3147, 0.6853],
        [0.1748, 0.8252],
        [0.5645, 0.4355],
        [0.3441, 0.6559],
        [0.0668, 0.9332],
        [0.7968, 0.2032],
        [0.2147, 0.7853],
        [0.1666, 0.8334],
        [0.1239, 0.8761]])


100%|█████████████████████████████████████████████| 112/112 [10:24<00:00,  5.58s/it]

hb_hatebert_abu torch.Size([14251, 2])
logits[:10]=tensor([[0.1288, 0.8712],
        [0.9283, 0.0717],
        [0.2834, 0.7166],
        [0.7350, 0.2650],
        [0.4876, 0.5124],
        [0.2505, 0.7495],
        [0.9280, 0.0720],
        [0.6010, 0.3990],
        [0.2362, 0.7638],
        [0.2507, 0.7493]])





# Detoxify labels

In [16]:
gc.collect()
# Names of every Detoxify feature column, in the order they are added.
dtfy_fs = []
for prefix, checkpoint in tqdm(conf.dtfy_models.items()):
    res = mylib.detoxify_labels(
        sentences,
        checkpoint=checkpoint,
        config_dir=conf.dtfy_configs[prefix],
        model_max_length=conf.dtfy_model_max_length,
        device=conf.device,
        batch_size=conf.dtfy_batch_size,
    )
    # Each returned key becomes one float32 column named <prefix><key>.
    for label, scores in res.items():
        col = f"{prefix}{label}"
        df[col] = scores
        df[col] = df[col].astype(np.float32)
        dtfy_fs.append(col)
    gc.collect()

100%|████████████████████████████████████████████████| 3/3 [30:36<00:00, 612.02s/it]


In [17]:
# Show the Detoxify feature column names collected in dtfy_fs.
print(dtfy_fs)

['dto_toxicity', 'dto_severe_toxicity', 'dto_obscene', 'dto_threat', 'dto_insult', 'dto_identity_attack', 'dtu_toxicity', 'dtu_severe_toxicity', 'dtu_obscene', 'dtu_identity_attack', 'dtu_insult', 'dtu_threat', 'dtu_sexual_explicit', 'dtm_toxicity', 'dtm_severe_toxicity', 'dtm_obscene', 'dtm_identity_attack', 'dtm_insult', 'dtm_threat', 'dtm_sexual_explicit']


# Embeddings

In [18]:
# Encode every sentence with the configured sentence-embedding model.
em_model_dir = conf.em_models["paraphrase-MiniLM-L6-v2"]
model = SentenceTransformer(em_model_dir, device=conf.device)
model.max_seq_length = conf.em_max_seq_length
em = model.encode(
    sentences=sentences,
    batch_size=conf.em_batch_size,
    show_progress_bar=True,
    convert_to_numpy=True,
)
print(f"em.shape={em.shape}")

[INFO|SentenceTransformer.py:60] 2022-02-05 15:47:59,183 >> Load pretrained SentenceTransformer: pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2
[INFO|SentenceTransformer.py:60] 2022-02-05 15:47:59,183 >> Load pretrained SentenceTransformer: pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2
[INFO|SentenceTransformer.py:60] 2022-02-05 15:47:59,183 >> Load pretrained SentenceTransformer: pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2
[INFO|SentenceTransformer.py:60] 2022-02-05 15:47:59,183 >> Load pretrained SentenceTransformer: pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2


Batches:   0%|          | 0/15 [00:00<?, ?it/s]

em.shape=(14251, 384)


In [19]:
%%time
em_size = em.shape[1]
em_cols = [f"zz{i:04d}" for i in range(em_size)]
df[em_cols] = em
df[em_cols] = df[em_cols].astype(np.float32)
del sentences

Wall time: 358 ms


# Review data

In [20]:
# Summarise the engineered feature columns (the embedding columns are not included).
cols = [
    "length",
    *char_fns.keys(),
    *(name for name, _fn, _dtype in textstat_fns),
    *dtfy_fs,
    *conf.hatebert_models.keys(),
    *conf.tweeteval_models.keys(),
    *ti_features,
]
df[cols].describe(percentiles=percentiles)

Unnamed: 0,length,digit_frac,letter_frac,space_frac,punc_frac,upper_frac,repeat_char_frac,repeat_substring_frac,syllables_per_word,syllables_per_sent,words_per_sent,flesch_reading_ease,flesch_kincaid_grade,gunning_fog,smog_index,automated_readability_index,coleman_liau_index,linsear_write_formula,dale_chall_readability_score,dto_toxicity,dto_severe_toxicity,dto_obscene,dto_threat,dto_insult,dto_identity_attack,dtu_toxicity,dtu_severe_toxicity,dtu_obscene,dtu_identity_attack,dtu_insult,dtu_threat,dtu_sexual_explicit,dtm_toxicity,dtm_severe_toxicity,dtm_obscene,dtm_identity_attack,dtm_insult,dtm_threat,dtm_sexual_explicit,hb_bert_off,hb_bert_abu,hb_hatebert_off,hb_hatebert_abu,te_roberta_off,te_roberta_emo_anger,te_roberta_snt_neg,te_roberta_iro,te_xlm_roberta_snt_neg,ti0000,ti0001,ti0002,ti0003,ti0004,ti0005,ti0006,ti0007,ti0008,ti0009,ti0010,ti0011,ti0012,ti0013,ti0014,ti0015,ti0016,ti0017,ti0018,ti0019,ti0020,ti0021,ti0022,ti0023,ti0024,ti0025,ti0026,ti0027,ti0028,ti0029,ti0030,ti0031,ti0032,ti0033,ti0034,ti0035,ti0036,ti0037,ti0038,ti0039,ti0040,ti0041,ti0042,ti0043,ti0044,ti0045,ti0046,ti0047,ti0048,ti0049,ti0050,ti0051,ti0052,ti0053,ti0054,ti0055,ti0056,ti0057,ti0058,ti0059,ti0060,ti0061,ti0062,ti0063,ti0064,ti0065,ti0066,ti0067,ti0068,ti0069,ti0070,ti0071,ti0072,ti0073,ti0074,ti0075,ti0076,ti0077,ti0078,ti0079,ti0080,ti0081,ti0082,ti0083,ti0084,ti0085,ti0086,ti0087,ti0088,ti0089,ti0090,ti0091,ti0092,ti0093,ti0094,ti0095,ti0096,ti0097,ti0098,ti0099,ti0100,ti0101,ti0102,ti0103,ti0104,ti0105,ti0106,ti0107,ti0108,ti0109,ti0110,ti0111,ti0112,ti0113,ti0114,ti0115,ti0116,ti0117,ti0118,ti0119,ti0120,ti0121,ti0122,ti0123,ti0124,ti0125,ti0126,ti0127,ti0128,ti0129,ti0130,ti0131,ti0132,ti0133,ti0134,ti0135,ti0136,ti0137,ti0138,ti0139,ti0140,ti0141,ti0142,ti0143,ti0144,ti0145,ti0146,ti0147,ti0148,ti0149,ti0150,ti0151,ti0152,ti0153,ti0154,ti0155,ti0156,ti0157,ti0158,ti0159,ti0160,ti0161,ti0162,ti0163,ti0164,ti0165,ti0166,ti0167,ti0168,ti0169,ti0170,ti0171,ti0172,ti0173,ti017
4,ti0175,ti0176,ti0177,ti0178,ti0179,ti0180,ti0181,ti0182,ti0183,ti0184,ti0185,ti0186,ti0187,ti0188,ti0189,ti0190,ti0191,ti0192,ti0193,ti0194,ti0195,ti0196,ti0197,ti0198,ti0199,ti0200,ti0201,ti0202,ti0203,ti0204,ti0205,ti0206,ti0207,ti0208,ti0209,ti0210,ti0211,ti0212,ti0213,ti0214,ti0215,ti0216,ti0217,ti0218,ti0219,ti0220,ti0221,ti0222,ti0223,ti0224,ti0225,ti0226,ti0227,ti0228,ti0229,ti0230,ti0231,ti0232,ti0233,ti0234,ti0235,ti0236,ti0237,ti0238,ti0239,ti0240,ti0241,ti0242,ti0243,ti0244,ti0245,ti0246,ti0247,ti0248,ti0249,ti0250,ti0251,ti0252,ti0253,ti0254,ti0255,ti0256,ti0257,ti0258,ti0259,ti0260,ti0261,ti0262,ti0263,ti0264,ti0265,ti0266,ti0267,ti0268,ti0269,ti0270,ti0271,ti0272,ti0273,ti0274,ti0275,ti0276,ti0277,ti0278,ti0279,ti0280,ti0281,ti0282,ti0283,ti0284,ti0285,ti0286,ti0287,ti0288,ti0289,ti0290,ti0291,ti0292,ti0293,ti0294,ti0295,ti0296,ti0297,ti0298,ti0299,ti0300,ti0301,ti0302,ti0303,ti0304,ti0305,ti0306,ti0307,ti0308,ti0309,ti0310,ti0311,ti0312,ti0313,ti0314,ti0315,ti0316,ti0317,ti0318,ti0319,ti0320,ti0321,ti0322,ti0323,ti0324,ti0325,ti0326,ti0327,ti0328,ti0329,ti0330,ti0331,ti0332,ti0333,ti0334,ti0335,ti0336,ti0337,ti0338,ti0339,ti0340,ti0341,ti0342,ti0343,ti0344,ti0345,ti0346,ti0347,ti0348,ti0349,ti0350,ti0351,ti0352,ti0353,ti0354,ti0355,ti0356,ti0357,ti0358,ti0359,ti0360,ti0361,ti0362,ti0363,ti0364,ti0365,ti0366,ti0367,ti0368,ti0369,ti0370,ti0371,ti0372,ti0373,ti0374,ti0375,ti0376,ti0377,ti0378,ti0379,ti0380,ti0381,ti0382,ti0383,ti0384,ti0385,ti0386,ti0387,ti0388,ti0389,ti0390,ti0391,ti0392,ti0393,ti0394,ti0395,ti0396,ti0397,ti0398,ti0399,ti0400,ti0401,ti0402,ti0403,ti0404,ti0405,ti0406,ti0407,ti0408,ti0409,ti0410,ti0411,ti0412,ti0413,ti0414,ti0415,ti0416,ti0417,ti0418,ti0419,ti0420,ti0421,ti0422,ti0423,ti0424,ti0425,ti0426,ti0427,ti0428,ti0429,ti0430,ti0431,ti0432,ti0433,ti0434,ti0435,ti0436,ti0437,ti0438,ti0439,ti0440,ti0441,ti0442,ti0443,ti0444,ti0445,ti0446,ti0447,ti0448,ti0449,ti0450,ti0451,ti0452,ti0453,ti0454,ti0455,ti0456,ti0457,ti0458,ti0459,ti0
460,ti0461,ti0462,ti0463,ti0464,ti0465,ti0466,ti0467,ti0468,ti0469,ti0470,ti0471,ti0472,ti0473,ti0474,ti0475,ti0476,ti0477,ti0478,ti0479,ti0480,ti0481,ti0482,ti0483,ti0484,ti0485,ti0486,ti0487,ti0488,ti0489,ti0490,ti0491,ti0492,ti0493,ti0494,ti0495,ti0496,ti0497,ti0498,ti0499,ti0500,ti0501,ti0502,ti0503,ti0504,ti0505,ti0506,ti0507,ti0508,ti0509,ti0510,ti0511,ti0512,ti0513,ti0514,ti0515,ti0516,ti0517,ti0518,ti0519,ti0520,ti0521,ti0522,ti0523,ti0524,ti0525,ti0526,ti0527,ti0528,ti0529,ti0530,ti0531,ti0532,ti0533,ti0534,ti0535,ti0536,ti0537,ti0538,ti0539,ti0540,ti0541,ti0542,ti0543,ti0544,ti0545,ti0546,ti0547,ti0548,ti0549
count,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14
251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14
251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0
mean,404.495825,0.003891,0.778295,0.172929,0.044886,0.088107,0.027886,0.018796,1.360076,18.846472,13.300896,64.286728,9.450614,11.062636,4.342566,12.748144,8.927725,9.590233,9.313502,0.456116,0.065378,0.297211,0.027498,0.274525,0.063306,0.534237,0.046076,0.298713,0.065511,0.312356,0.017787,0.109875,0.504705,0.060798,0.262797,0.061645,0.274358,0.027242,0.141008,0.643162,0.50132,0.603338,0.460689,0.543847,0.810982,0.714188,0.250652,0.744577,0.002692,0.000905,0.0,0.0,0.010127,0.002799,0.0,0.0,2.1e-05,0.001411,0.0,0.001971,0.000548,0.002048,0.022252,0.0,0.014348,6.3e-05,0.001334,0.0,0.0,0.0,0.0,0.0,0.004975,0.0,0.0,0.0,0.0,5.5e-05,3.9e-05,0.0,0.0,0.0,3.1e-05,0.0,0.006216,4.6e-05,0.0,4.1e-05,3.6e-05,0.000115,0.0,0.0,0.0,7e-05,0.0,0.00016,4.7e-05,0.0,0.0,1.8e-05,0.0,0.000835,0.01722,0.013245,5.6e-05,0.002478,0.0,0.003561,0.0,0.005856,5.6e-05,4.7e-05,0.000261,0.0,3.6e-05,0.000638,0.000172,0.000183,0.000511,0.000134,0.0,0.008103,0.0,0.0,0.000367,0.000529,4.3e-05,0.0,0.0,0.008697,0.0,4.6e-05,0.0,0.003945,0.0,0.0,4.7e-05,0.0,0.0,0.0001,0.001339,0.0,0.0,0.002612,0.0,0.004088,0.000349,0.0,0.0,0.005492,0.000117,7e-05,0.0,0.0001,0.000127,0.0,8.9e-05,0.006573,0.001579,0.000559,0.0,0.0,0.0,0.0,0.0,0.017233,0.0,0.001341,8.5e-05,7e-05,7.3e-05,0.007808,7e-05,0.000398,0.011566,0.0,1.2e-05,0.0,0.000395,0.0,0.0,0.0,0.013006,0.001701,0.000772,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.6e-05,0.0,0.000523,0.0,0.0,0.000313,0.000159,0.0,0.0,0.002984,0.0,0.0,0.0,0.000366,0.0,0.0,0.00017,0.000214,0.000251,0.0,0.000268,6.3e-05,0.00295,0.0,0.004443,0.014655,8.2e-05,0.001091,7e-05,6.8e-05,0.0,0.0,0.00031,0.000291,0.0,0.000244,0.0,0.003459,0.0,0.0,0.000463,6.4e-05,0.001377,0.0,7e-05,3.1e-05,0.0,0.000461,0.0,0.056089,0.0,0.004217,0.000141,0.00332,0.027648,0.0,0.000415,0.0,6.2e-05,0.0,0.000581,0.0,0.0,0.000134,0.0,0.016735,0.000259,0.000458,0.000142,4.6e-05,0.005561,0.0,5e-06,0.0,3.1e-05,0.0,0.0,7e-05,0.0,0.001277,0.0,0.0,0.0,0.000126,0.001435,0.0,7e-05,0.0,0.0,0.000151,0.0,0.00011,2.2e-05,6.5e-05,0.0,5.5e-05
,4.8e-05,0.000279,0.000287,0.021419,0.000212,0.000472,0.000902,0.004618,0.001883,0.000232,0.005576,4.7e-05,0.000263,6.4e-05,0.000368,0.0,7e-05,0.0,0.0,5.6e-05,0.000283,0.000586,0.003826,0.000551,0.005385,0.000418,0.0,0.0,0.001981,0.0,0.00049,0.0,0.008199,3.2e-05,0.0,0.0,0.0,0.000131,0.0,0.0,0.0,0.000416,0.015627,0.001701,0.0,0.0,0.0,0.000202,0.000332,5.9e-05,2.3e-05,1.9e-05,0.000581,0.0,0.0,0.0,0.0,0.001916,0.0,0.000433,0.0,0.0,0.0,0.0,0.0,0.003899,0.0,0.024055,0.000991,0.002641,0.0,0.001399,0.000146,0.00017,4.7e-05,0.0,4.3e-05,0.000713,7e-05,0.009917,5.7e-05,0.00285,0.0,4.9e-05,0.0,0.0,0.0,0.0,0.0045,0.001423,0.005581,0.000458,0.0,0.009902,0.001049,0.0,0.0,0.001494,0.0,0.005423,0.0,0.000287,0.0,9.4e-05,3.2e-05,0.0,0.000416,7.2e-05,5e-05,0.0,0.0,0.0,0.0,0.01531,0.0,0.000283,0.000375,4.3e-05,0.00068,0.00024,0.000338,0.0,0.0,5.3e-05,0.000401,0.0,0.000159,0.002473,0.000277,6.6e-05,0.005988,0.0,0.0,0.004753,0.003913,0.0,0.000383,0.007551,0.0,0.0,0.000122,0.0,5.2e-05,0.0,0.000223,0.000157,0.0,0.001626,0.0,0.0,0.002441,0.000242,0.003552,0.0041,3.9e-05,0.0,0.002023,0.000469,0.0,0.0,0.0,0.0,0.003349,0.0,4.5e-05,0.002355,0.0,0.0,6e-05,0.006862,0.001491,7e-05,0.0003,0.0,0.000158,0.0,0.0,0.0,0.0,0.000215,7e-05,0.0,0.0,7.3e-05,0.004077,0.000209,0.000817,0.006448,0.0,0.0,0.004475,0.001287,0.002285,0.001387,9.5e-05,0.0,0.0,7e-05,0.0,0.029205,0.0,0.000487,0.001112,5.8e-05,0.0,0.0,0.000584,4.2e-05,9.8e-05,0.0,0.000923,0.0,5.2e-05,6.5e-05,0.015466,7.9e-05,0.000107,0.000316,0.0,0.000237,0.000185,0.000152,0.0,0.0,0.0,0.0,0.0,9.7e-05,0.003473,5.9e-05,0.0,0.0,0.0,0.000236,0.0,0.020574,0.0,0.0,0.000568,0.0,0.000227,0.0,0.0,0.000114,0.0,0.0,0.000352,0.004734,0.000308,6.1e-05,0.0,0.000202,2.4e-05,0.000753,0.000251,0.0,0.0,0.0,0.000612,0.0,0.000148,0.000267,0.00088,0.0,0.0,5.5e-05,0.000814,0.0,0.002241,0.000118,1.6e-05,0.0,0.0,0.0,0.0,9.1e-05,0.009873,0.000317,0.0,8.3e-05,0.000106,0.0,0.0,0.0,0.0,0.0,0.000192,0.000652,0.001828,0.000471,9.8e-05,0.0,0.010083,0.000112,0.0,0.002689,5e-05,0.0145
28,0.0,0.0,0.0,0.004564,0.000241,0.000416,4.8e-05,0.0,0.0,0.003488,6.2e-05
std,686.044494,0.016081,0.047163,0.023382,0.043165,0.179211,0.045813,0.104194,2.745461,40.391994,27.804605,314.100159,48.095737,21.882164,5.233645,90.17897,97.308975,9.253856,3.154639,0.426943,0.149971,0.394502,0.120155,0.366261,0.178526,0.401407,0.118385,0.39407,0.167979,0.364134,0.094171,0.23795,0.414346,0.154813,0.375653,0.173048,0.354962,0.111715,0.284307,0.344624,0.399918,0.33225,0.382369,0.269436,0.275125,0.261056,0.218183,0.22312,0.042294,0.020973,0.0,0.0,0.086632,0.047013,0.0,0.0,0.002451,0.034548,0.0,0.039287,0.020795,0.039326,0.126246,0.0,0.106193,0.007467,0.033299,0.0,0.0,0.0,0.0,0.0,0.061209,0.0,0.0,0.0,0.0,0.006621,0.004633,0.0,0.0,0.0,0.003684,0.0,0.068749,0.00554,0.0,0.004919,0.004249,0.009738,0.0,0.0,0.0,0.008377,0.0,0.010319,0.005634,0.0,0.0,0.002109,0.0,0.022065,0.110774,0.100053,0.006669,0.044237,0.0,0.054921,0.0,0.067387,0.006684,0.005622,0.015717,0.0,0.004288,0.024346,0.010954,0.011279,0.019204,0.01136,0.0,0.077043,0.0,0.0,0.0168,0.022605,0.005116,0.0,0.0,0.084978,0.0,0.005548,0.0,0.058486,0.0,0.0,0.005566,0.0,0.0,0.007486,0.032016,0.0,0.0,0.04758,0.0,0.057933,0.015165,0.0,0.0,0.065087,0.009915,0.008377,0.0,0.008632,0.009697,0.0,0.007913,0.067697,0.034997,0.021336,0.0,0.0,0.0,0.0,0.0,0.122398,0.0,0.032889,0.00793,0.008377,0.006131,0.066053,0.008377,0.018549,0.097435,0.0,0.001487,0.0,0.018826,0.0,0.0,0.0,0.097041,0.038151,0.025007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006729,0.0,0.020003,0.0,0.0,0.016778,0.011231,0.0,0.0,0.051281,0.0,0.0,0.0,0.015814,0.0,0.0,0.010473,0.012948,0.013861,0.0,0.014602,0.007488,0.047072,0.0,0.052894,0.101431,0.007,0.02941,0.008377,0.005782,0.0,0.0,0.016257,0.016393,0.0,0.013805,0.0,0.044603,0.0,0.0,0.017595,0.007654,0.03355,0.0,0.008377,0.00368,0.0,0.019433,0.0,0.196345,0.0,0.055979,0.009812,0.047759,0.136422,0.0,0.017504,0.0,0.007411,0.0,0.02096,0.0,0.0,0.011287,0.0,0.113116,0.013487,0.016563,0.009099,0.005517,0.06402,0.0,0.000611,0.0,0.00374,0.0,0.0,0.008377,0.0,0.028914,0.0,0.0,0.0,0.01071,0.032112,0.0,0.008377,0.0,0.0
,0.011302,0.0,0.009521,0.002582,0.007781,0.0,0.00651,0.005753,0.015388,0.01507,0.131028,0.01318,0.017044,0.026648,0.06112,0.034536,0.014138,0.062394,0.005636,0.013725,0.00597,0.015839,0.0,0.005698,0.0,0.0,0.006644,0.014468,0.021642,0.056534,0.019289,0.066928,0.0174,0.0,0.0,0.03978,0.0,0.020134,0.0,0.083016,0.003784,0.0,0.0,0.0,0.009149,0.0,0.0,0.0,0.01685,0.109328,0.036725,0.0,0.0,0.0,0.013015,0.017766,0.007063,0.002705,0.002323,0.023004,0.0,0.0,0.0,0.0,0.036639,0.0,0.012965,0.0,0.0,0.0,0.0,0.0,0.050272,0.0,0.130371,0.028305,0.041297,0.0,0.034208,0.010151,0.011492,0.005642,0.0,0.005116,0.023577,0.008377,0.092279,0.00518,0.045702,0.0,0.005796,0.0,0.0,0.0,0.0,0.058165,0.033753,0.067122,0.017968,0.0,0.090395,0.029997,0.0,0.0,0.031325,0.0,0.065695,0.0,0.015864,0.0,0.007947,0.002742,0.0,0.019208,0.004321,0.005911,0.0,0.0,0.0,0.0,0.10588,0.0,0.013296,0.017775,0.005077,0.022876,0.012873,0.01694,0.0,0.0,0.004668,0.017621,0.0,0.011322,0.044717,0.015161,0.007926,0.06843,0.0,0.0,0.054732,0.056301,0.0,0.017373,0.078484,0.0,0.0,0.009697,0.0,0.006261,0.0,0.013751,0.010832,0.0,0.038347,0.0,0.0,0.038128,0.013024,0.04689,0.057316,0.004681,0.0,0.038638,0.018566,0.0,0.0,0.0,0.0,0.049367,0.0,0.00543,0.040499,0.0,0.0,0.007162,0.070553,0.03355,0.008377,0.015469,0.0,0.011219,0.0,0.0,0.0,0.0,0.013291,0.008377,0.0,0.0,0.007429,0.058666,0.012949,0.022648,0.068266,0.0,0.0,0.055498,0.031035,0.037741,0.034184,0.008034,0.0,0.0,0.005947,0.0,0.140667,0.0,0.020038,0.027728,0.006982,0.0,0.0,0.022576,0.005008,0.0086,0.0,0.025263,0.0,0.004279,0.007727,0.120333,0.006729,0.009481,0.016048,0.0,0.014423,0.012809,0.01091,0.0,0.0,0.0,0.0,0.0,0.008214,0.051438,0.007068,0.0,0.0,0.0,0.012632,0.0,0.122096,0.0,0.0,0.019866,0.0,0.012953,0.0,0.0,0.009748,0.0,0.0,0.017016,0.062161,0.015608,0.007288,0.0,0.012497,0.002818,0.022454,0.013899,0.0,0.0,0.0,0.023613,0.0,0.010351,0.013221,0.02185,0.0,0.0,0.006552,0.024416,0.0,0.03908,0.010118,0.001936,0.0,0.0,0.0,0.0,0.00794,0.073068,0.014536,0.0,0.00706,0.008978,0.0,0.0,0.
0,0.0,0.0,0.011673,0.021113,0.034405,0.019704,0.008997,0.0,0.085378,0.008114,0.0,0.042701,0.006014,0.085263,0.0,0.0,0.0,0.058669,0.014609,0.017662,0.005749,0.0,0.0,0.047589,0.007398
min,8.0,0.0,0.004427,0.000403,0.0,0.0,0.0,0.0,0.666667,1.0,1.0,-36681.820312,-3.1,0.8,0.0,-9.3,-14.15,0.0,0.1,0.00053,7.9e-05,0.000152,8.9e-05,0.000164,0.000124,0.000344,1e-06,1.7e-05,6e-05,6.1e-05,1.5e-05,1e-05,0.000142,1.1e-05,5.6e-05,6.9e-05,0.000128,1.6e-05,1.3e-05,0.009355,0.002732,0.008755,0.006236,0.026456,0.005595,0.001072,0.018925,0.009896,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1%,21.0,0.0,0.616026,0.1,0.0,0.0,0.0,0.0,0.857143,2.5,2.0,-62.339998,-1.9,1.6,0.0,-2.8,-3.06,1.0,0.6,0.000639,8.7e-05,0.000166,9.9e-05,0.000175,0.000136,0.001143,2e-06,6.7e-05,0.000125,9.8e-05,3.1e-05,3.2e-05,0.000382,1.6e-05,8.5e-05,0.000108,0.000181,2.7e-05,1.8e-05,0.018978,0.00501,0.025172,0.010577,0.080779,0.02411,0.01176,0.032187,0.061176,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5%,30.0,0.0,0.708976,0.134146,0.0,0.0,0.0,0.0,1.0,3.5,2.5,31.219999,0.5,2.4,0.0,0.4,0.45,2.0,6.33,0.000919,9.4e-05,0.000178,0.000109,0.000184,0.000143,0.004908,6e-06,0.000234,0.000321,0.000272,6.1e-05,9e-05,0.00099,2.4e-05,0.000149,0.000171,0.000361,3.8e-05,2.4e-05,0.042928,0.00918,0.056714,0.017258,0.137683,0.100285,0.135028,0.047034,0.242673,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10%,43.0,0.0,0.735294,0.146497,0.013605,0.009153,0.0,0.0,1.071429,4.666667,3.5,44.240002,1.8,3.2,0.0,2.1,2.41,2.5,6.92,0.001692,0.0001,0.000205,0.000118,0.000212,0.00016,0.014497,1.2e-05,0.000478,0.000549,0.000847,9.4e-05,0.000167,0.003102,3.7e-05,0.000326,0.000279,0.000897,5.8e-05,3.9e-05,0.083281,0.015616,0.098657,0.025333,0.181829,0.276281,0.304556,0.058789,0.408806,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20%,67.0,0.0,0.75641,0.15873,0.021739,0.017241,0.010753,0.0,1.166667,7.0,5.0,55.740002,3.4,5.0,0.0,4.2,4.35,3.8,7.6,0.008785,0.000114,0.000459,0.000159,0.000499,0.000253,0.057587,2.8e-05,0.001396,0.001137,0.003741,0.000167,0.000406,0.020477,0.000101,0.001297,0.000679,0.003822,0.000161,0.000133,0.215689,0.039328,0.209737,0.04849,0.255771,0.694808,0.499447,0.07969,0.571056,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30%,97.0,0.0,0.768421,0.16527,0.026846,0.022472,0.014545,0.0,1.230769,9.0,6.5,62.68,4.5,6.55,0.0,5.6,5.63,4.9,8.08,0.029115,0.000151,0.001053,0.000277,0.001357,0.000471,0.149158,5.9e-05,0.003715,0.001965,0.012264,0.000265,0.000873,0.084494,0.000256,0.003586,0.001366,0.010728,0.000375,0.000367,0.416713,0.093963,0.355629,0.092794,0.338882,0.869272,0.62351,0.103245,0.687952,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40%,136.0,0.0,0.777202,0.170213,0.031746,0.027174,0.017544,0.0,1.285714,11.0,8.0,68.059998,5.6,8.01,0.0,6.8,6.56,6.0,8.5,0.090577,0.000303,0.003312,0.000487,0.004756,0.001095,0.31845,0.000124,0.010268,0.003318,0.038015,0.000402,0.001805,0.241696,0.000606,0.009033,0.002405,0.024957,0.000713,0.000869,0.630792,0.235449,0.533436,0.185548,0.434913,0.927488,0.723819,0.131635,0.773001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Build the export column order: "text" first, then the names already held in
# `cols`, then the names in `em_cols`.
# `cols` is rebound in terms of itself, so a plain concatenation would stack
# "text" and `em_cols` onto the list again every time this cell is re-run,
# producing duplicate column labels (which the parquet export rejects).
# dict.fromkeys keeps the first occurrence of each name in order, so the
# first run yields the same list as before and re-runs are a no-op.
cols = list(dict.fromkeys(["text"] + cols + em_cols))
df[cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14251 entries, 0 to 14250
Data columns (total 983 columns):
 #    Column                        Non-Null Count  Dtype  
---   ------                        --------------  -----  
 0    text                          14251 non-null  object 
 1    length                        14251 non-null  int16  
 2    digit_frac                    14251 non-null  float32
 3    letter_frac                   14251 non-null  float32
 4    space_frac                    14251 non-null  float32
 5    punc_frac                     14251 non-null  float32
 6    upper_frac                    14251 non-null  float32
 7    repeat_char_frac              14251 non-null  float32
 8    repeat_substring_frac         14251 non-null  float32
 9    syllables_per_word            14251 non-null  float32
 10   syllables_per_sent            14251 non-null  float32
 11   words_per_sent                14251 non-null  float32
 12   flesch_reading_ease           14251 non-null

In [22]:
%%time
df[cols].to_parquet("output/val.parquet", index=False)

Wall time: 794 ms
