In [1]:
import os
import gc
import json
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import scipy
from sklearn.feature_extraction.text import TfidfVectorizer
import textstat
from tqdm import tqdm
from typing import List, Set, Dict, Tuple, NamedTuple, Callable, Any
import scml
import mylib

In [2]:
class Conf(NamedTuple):
    """Immutable notebook configuration: device, directories and per-model settings.

    Every field has a default, so ``Conf()`` yields the complete configuration.
    The dict defaults are built once, when the class body runs, and are shared
    by every instance; treat them as read-only.
    """
    # Compute device: CUDA when it is available, otherwise CPU.
    device: torch.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Root directories. Both end with "/", so the paths below are built by plain
    # concatenation and must NOT start with another "/".
    pretrained_dir: str = "pretrained/"
    data_dir: str = "data/"
    # Detoxify: checkpoint file and base-model config directory per column prefix.
    dtfy_model_max_length: int = 512
    dtfy_batch_size: int = 64
    dtfy_models: Dict[str, str] = {
        "dto_": f"{pretrained_dir}unitaryai/detoxify/toxic_original-c1212f89.ckpt",
        "dtu_": f"{pretrained_dir}unitaryai/detoxify/toxic_debiased-c7548aa0.ckpt",
        "dtm_": f"{pretrained_dir}unitaryai/detoxify/multilingual_debiased-0b549669.ckpt"
    }
    dtfy_configs: Dict[str, str] = {
        "dto_": f"{pretrained_dir}bert-base-uncased",
        "dtu_": f"{pretrained_dir}roberta-base",
        "dtm_": f"{pretrained_dir}xlm-roberta-base"
    }
    # TweetEval: model directory per output column.
    tweeteval_model_max_length: int = 512
    tweeteval_batch_size: int = 64
    tweeteval_models: Dict[str, str] = {
        "te_roberta_off": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-offensive",
        "te_roberta_emo_anger": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-emotion",
        "te_roberta_snt_neg": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-sentiment",
        "te_roberta_iro": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-irony",
        "te_xlm_roberta_snt_neg": f"{pretrained_dir}cardiffnlp/twitter-xlm-roberta-base-sentiment",
    }
    # Index of the softmax output column that is stored as the feature for each
    # TweetEval model (presumably the offensive / anger / negative / irony class
    # -- confirm against each model's label mapping).
    tweeteval_label_index: Dict[str, int] = {
        "te_roberta_off": 1,
        "te_roberta_emo_anger": 0,
        "te_roberta_snt_neg": 0,
        "te_roberta_iro": 1,
        "te_xlm_roberta_snt_neg": 0,
    }
    # HateBERT: model directory per output column.
    hatebert_model_max_length: int = 512
    hatebert_batch_size: int = 128
    hatebert_models: Dict[str, str] = {
        "hb_bert_off": f"{pretrained_dir}hatebert/bert-offenseval",
        "hb_bert_abu": f"{pretrained_dir}hatebert/bert-abuseval",
        "hb_hatebert_off": f"{pretrained_dir}hatebert/hatebert-offenseval",
        "hb_hatebert_abu": f"{pretrained_dir}hatebert/hatebert-abuseval",
    }
    # Sentence-embedding model settings.
    em_max_seq_length: int = 128
    em_batch_size: int = 1000
    em_models: Dict[str, str] = {
        "paraphrase-MiniLM-L6-v2": f"{pretrained_dir}sentence-transformers/paraphrase-MiniLM-L6-v2"
    }
    # JSON file holding the TF-IDF terms and their idf weights.
    vocab_file: str = f"{data_dir}vocab.json"
        
        
# Build the configuration, echo it, and report per-GPU memory use.
conf = Conf()
print(conf)
# CUDA is only queried when it is the selected device; otherwise the loop is empty.
gpu_ids = range(torch.cuda.device_count()) if conf.device.type == 'cuda' else range(0)
for i in gpu_ids:
    print(f"device={i}, {torch.cuda.get_device_name(i)}")
    allocated_gb = round(torch.cuda.memory_allocated(i)/1024**3,1)
    print('Mem Allocated:', allocated_gb, 'GB')
    reserved_gb = round(torch.cuda.memory_reserved(i)/1024**3,1)
    print('Mem Cached:   ', reserved_gb, 'GB')

Conf(device=device(type='cuda'), pretrained_dir='pretrained/', data_dir='data/', dtfy_model_max_length=512, dtfy_batch_size=64, dtfy_models={'dto_': 'pretrained/unitaryai/detoxify/toxic_original-c1212f89.ckpt', 'dtu_': 'pretrained/unitaryai/detoxify/toxic_debiased-c7548aa0.ckpt', 'dtm_': 'pretrained/unitaryai/detoxify/multilingual_debiased-0b549669.ckpt'}, dtfy_configs={'dto_': 'pretrained/bert-base-uncased', 'dtu_': 'pretrained/roberta-base', 'dtm_': 'pretrained/xlm-roberta-base'}, tweeteval_model_max_length=512, tweeteval_batch_size=64, tweeteval_models={'te_roberta_off': 'pretrained/cardiffnlp/twitter-roberta-base-offensive', 'te_roberta_emo_anger': 'pretrained/cardiffnlp/twitter-roberta-base-emotion', 'te_roberta_snt_neg': 'pretrained/cardiffnlp/twitter-roberta-base-sentiment', 'te_roberta_iro': 'pretrained/cardiffnlp/twitter-roberta-base-irony', 'te_xlm_roberta_snt_neg': 'pretrained/cardiffnlp/twitter-xlm-roberta-base-sentiment'}, tweeteval_label_index={'te_roberta_off': 1, 'te_ro

In [3]:
# Quantiles passed to DataFrame.describe() when the features are reviewed.
percentiles = [.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]
# Turn off tokenizer-level parallelism (presumably to avoid the Hugging Face
# tokenizers fork warning -- confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Fully-qualified option names are used on purpose: pandas resolves short names
# by pattern matching, which can break when similarly named options are added.
# NOTE(review): "mode.use_inf_as_na" is deprecated since pandas 2.1; replace it
# with explicit inf -> NaN handling before upgrading pandas.
pd.set_option("mode.use_inf_as_na", True)
pd.set_option("display.max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option("display.max_colwidth", 9999)
# Registers DataFrame.progress_apply, used by the feature cells below.
tqdm.pandas()

In [4]:
%%time
# Read the input texts and print the column / dtype summary.
input_path = "input/pre_val.parquet"
df = pd.read_parquet(input_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14251 entries, 0 to 14250
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    14251 non-null  object
 1   text1   14251 non-null  object
 2   text2   14251 non-null  object
 3   text3   14251 non-null  object
dtypes: object(4)
memory usage: 445.5+ KB
Wall time: 88 ms


# Character level features

In [5]:
%%time
# Character count of "text1", stored as int16.
# NOTE(review): int16 tops out at 32767; longer texts would wrap silently.
col = "length"
df[col] = df["text1"].str.len().astype(np.int16)

Wall time: 14 ms


In [6]:
def digit_frac(row) -> float:
    """Return ``mylib.digit_frac`` evaluated on the row's ``text1`` value."""
    text = row["text1"]
    return mylib.digit_frac(text)


def letter_frac(row) -> float:
    """Return ``mylib.letter_frac`` evaluated on the row's ``text1`` value."""
    text = row["text1"]
    return mylib.letter_frac(text)


def space_frac(row) -> float:
    """Return ``mylib.space_frac`` evaluated on the row's ``text1`` value."""
    text = row["text1"]
    return mylib.space_frac(text)


def punc_frac(row) -> float:
    """Return ``mylib.punc_frac`` evaluated on the row's ``text1`` value."""
    text = row["text1"]
    return mylib.punc_frac(text)


def upper_frac(row) -> float:
    """Return ``mylib.upper_frac`` evaluated on the row's ``text1`` value."""
    text = row["text1"]
    return mylib.upper_frac(text)


def repeat_char_frac(row) -> float:
    """Return ``mylib.repeat_char_frac`` evaluated on the row's ``text1`` value."""
    text = row["text1"]
    return mylib.repeat_char_frac(text)


def repeat_substring_frac(row) -> float:
    """Return ``mylib.repeat_substring_frac`` evaluated on the row's ``text1`` value."""
    text = row["text1"]
    return mylib.repeat_substring_frac(text)


# Output column name -> row-wise feature function. Each key is exactly the
# function's own name, and insertion order is the order the features are computed.
char_fns: Dict[str, Callable] = {
    fn.__name__: fn
    for fn in (
        digit_frac,
        letter_frac,
        space_frac,
        punc_frac,
        upper_frac,
        repeat_char_frac,
        repeat_substring_frac,
    )
}

In [7]:
# Compute every character-level feature row by row and store it as float32.
for col, fn in char_fns.items():
    print(col)
    df[col] = df.progress_apply(fn, axis=1).astype(np.float32)

digit_frac


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 22155.77it/s]


letter_frac


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 21808.61it/s]


space_frac


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 22980.39it/s]


punc_frac


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 20955.69it/s]


upper_frac


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 23554.71it/s]


repeat_char_frac


100%|██████████████████████████████████████| 14251/14251 [00:01<00:00, 11457.82it/s]


repeat_substring_frac


100%|████████████████████████████████████████| 14251/14251 [02:05<00:00, 113.26it/s]


# Textstat features

In [8]:
def syllable_count(row) -> int:
    """Return ``textstat.syllable_count`` for the row's ``text1`` value."""
    text = row["text1"]
    return textstat.syllable_count(text)


def lexicon_count(row) -> int:
    """Return ``textstat.lexicon_count`` for the row's ``text1`` value."""
    text = row["text1"]
    return textstat.lexicon_count(text)


def sentence_count(row) -> int:
    """Return ``textstat.sentence_count`` for the row's ``text1`` value."""
    text = row["text1"]
    return textstat.sentence_count(text)


def syllables_per_word(row) -> float:
    """Return ``syllable_count / (lexicon_count + 1)`` for the row.

    The +1 in the denominator keeps the ratio defined when the word count is 0.
    """
    syllables = row["syllable_count"]
    words_plus_one = row["lexicon_count"] + 1
    return syllables / words_plus_one


def syllables_per_sent(row) -> float:
    """Return ``syllable_count / (sentence_count + 1)`` for the row.

    The +1 in the denominator keeps the ratio defined when the sentence count is 0.
    """
    syllables = row["syllable_count"]
    sentences_plus_one = row["sentence_count"] + 1
    return syllables / sentences_plus_one


def words_per_sent(row) -> float:
    """Return ``lexicon_count / (sentence_count + 1)`` for the row.

    The +1 in the denominator keeps the ratio defined when the sentence count is 0.
    """
    words = row["lexicon_count"]
    sentences_plus_one = row["sentence_count"] + 1
    return words / sentences_plus_one


def flesch_reading_ease(row) -> float:
    """Return ``textstat.flesch_reading_ease`` for the row's ``text1`` value."""
    text = row["text1"]
    return textstat.flesch_reading_ease(text)


def flesch_kincaid_grade(row) -> float:
    """Return ``textstat.flesch_kincaid_grade`` for the row's ``text1`` value."""
    text = row["text1"]
    return textstat.flesch_kincaid_grade(text)


def gunning_fog(row) -> float:
    """Return ``textstat.gunning_fog`` for the row's ``text1`` value."""
    text = row["text1"]
    return textstat.gunning_fog(text)


def smog_index(row) -> float:
    """Return ``textstat.smog_index`` for the row's ``text1`` value."""
    text = row["text1"]
    return textstat.smog_index(text)


def automated_readability_index(row) -> float:
    """Return ``textstat.automated_readability_index`` for the row's ``text1`` value."""
    text = row["text1"]
    return textstat.automated_readability_index(text)


def coleman_liau_index(row) -> float:
    """Return ``textstat.coleman_liau_index`` for the row's ``text1`` value."""
    text = row["text1"]
    return textstat.coleman_liau_index(text)


def linsear_write_formula(row) -> float:
    """Return ``textstat.linsear_write_formula`` for the row's ``text1`` value."""
    text = row["text1"]
    return textstat.linsear_write_formula(text)


def dale_chall_readability_score(row) -> float:
    """Return ``textstat.dale_chall_readability_score`` for the row's ``text1`` value."""
    text = row["text1"]
    return textstat.dale_chall_readability_score(text)


# (column name, row function, dtype) triples; each column name is exactly the
# function's own name.
# The raw counts come first because the ratio features read those columns.
preprocess_fns: List[Tuple[str, Callable, Any]] = [
    (fn.__name__, fn, np.int32)
    for fn in (syllable_count, lexicon_count, sentence_count)
]
# Ratio and readability features, all stored as float32.
textstat_fns: List[Tuple[str, Callable, Any]] = [
    (fn.__name__, fn, np.float32)
    for fn in (
        syllables_per_word,
        syllables_per_sent,
        words_per_sent,
        flesch_reading_ease,
        flesch_kincaid_grade,
        gunning_fog,
        smog_index,
        automated_readability_index,
        coleman_liau_index,
        linsear_write_formula,
        dale_chall_readability_score,
    )
]

In [9]:
# Run the count features first, then the ratio/readability features: the ratio
# functions read the count columns created by the earlier entries.
for col, fn, dtype in [*preprocess_fns, *textstat_fns]:
    print(col)
    df[col] = df.progress_apply(fn, axis=1).astype(dtype)

syllable_count


100%|███████████████████████████████████████| 14251/14251 [00:02<00:00, 4985.29it/s]


lexicon_count


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 49482.62it/s]


sentence_count


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 26975.83it/s]


syllables_per_word


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 91073.82it/s]


syllables_per_sent


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 90196.62it/s]


words_per_sent


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 91314.80it/s]


flesch_reading_ease


100%|███████████████████████████████████████| 14251/14251 [00:02<00:00, 6955.76it/s]


flesch_kincaid_grade


100%|███████████████████████████████████████| 14251/14251 [00:01<00:00, 7190.13it/s]


gunning_fog


100%|███████████████████████████████████████| 14251/14251 [00:02<00:00, 5678.83it/s]


smog_index


100%|███████████████████████████████████████| 14251/14251 [00:01<00:00, 7678.70it/s]


automated_readability_index


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 18130.84it/s]


coleman_liau_index


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 15672.61it/s]


linsear_write_formula


100%|███████████████████████████████████████| 14251/14251 [00:01<00:00, 8802.19it/s]


dale_chall_readability_score


100%|███████████████████████████████████████| 14251/14251 [00:02<00:00, 5789.09it/s]


# TF-IDF features

In [10]:
# Load the fixed TF-IDF vocabulary and its idf weights.
# The encoding is explicit so the read does not depend on the platform default.
with open(conf.vocab_file, encoding="utf-8") as f:
    vocab_payload = json.load(f)
vocabulary = vocab_payload["term"]
idf = vocab_payload["idf"]
# Check the pairing before printing, so a mismatch is not buried under the output.
assert len(vocabulary) == len(idf), (
    f"vocabulary/idf length mismatch: {len(vocabulary)} != {len(idf)}"
)
print(f"len(vocab)={len(vocabulary)}\n{vocabulary}")

len(vocab)=576
['african', 'african american', 'alabama hot pocket', 'alaskan pipeline', 'american', 'anal', 'analplug', 'analsex', 'anilingus', 'anus', 'apeshit', 'arse', 'arsehole', 'asian', 'ass', 'assassin', 'asshole', 'assmunch', 'atheist', 'auto erotic', 'autoerotic', 'babeland', 'baby batter', 'baby juice', 'ball gag', 'ball gravy', 'ball kicking', 'ball licking', 'ball sack', 'ball sucking', 'balls', 'bangbros', 'bangbus', 'bareback', 'barely legal', 'barenaked', 'bastard', 'bastardo', 'bastinado', 'bbw', 'bdsm', 'beaner', 'beaners', 'beastiality', 'beaver cleaver', 'beaver lips', 'bestiality', 'bewb', 'big black', 'big breasts', 'big knockers', 'big tits', 'bimbo', 'bimbos', 'birdlock', 'bisexual', 'bitch', 'bitches', 'black', 'black cock', 'blind', 'blonde action', 'blonde on blonde action', 'bloody', 'bloodyhell', 'blow', 'blow job', 'blow your load', 'blowjob', 'blue waffle', 'blumpkin', 'bollocks', 'bondage', 'boner', 'boob', 'boobies', 'boobs', 'booty call', 'boy', 'brown

In [11]:
%%time
# The vectorizer is not fitted here: the vocabulary and idf weights both come
# from the vocab file loaded above.
vec = TfidfVectorizer(vocabulary=vocabulary, ngram_range=(1, 3), analyzer="word")
vec.idf_ = idf
x = vec.transform(df["text3"])
print(f"x.shape={x.shape}\n{x[0]}")
ti_features = [f"ti{i:04d}" for i in range(x.shape[1])]
# Build all TF-IDF columns as one float32 block and attach them with a single
# concat. Inserting hundreds of columns one at a time fragments the frame and
# triggers pandas' PerformanceWarning.
ti_frame = pd.DataFrame(
    x.astype(np.float32).toarray(),
    index=df.index,
    columns=ti_features,
)
# Dropping any existing ti columns first keeps the cell safe to re-run.
df = pd.concat([df.drop(columns=ti_features, errors="ignore"), ti_frame], axis=1)

x.shape=(14251, 576)



  self[col] = igetitem(value, i)


Wall time: 7.18 s


# TweetEval labels

In [12]:
# Texts scored by every transformer model from here on.
sentences = list(df["text2"])
for col, model_dir in conf.tweeteval_models.items():
    # Each TweetEval checkpoint is loaded with its own tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(
        model_dir,
        model_max_length=conf.tweeteval_model_max_length
    )
    x = tokenizer(sentences, truncation=True, padding="max_length")
    # shuffle=False keeps predictions in the same row order as df.
    batches = torch.utils.data.DataLoader(mylib.Dataset(x), batch_size=conf.tweeteval_batch_size, shuffle=False)
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    model.eval()
    model.to(conf.device)
    # Collect per-batch logits and concatenate once at the end; concatenating
    # inside the loop re-copies everything accumulated so far on every batch.
    batch_logits = []
    with torch.no_grad():
        for batch in tqdm(batches):
            for k, v in batch.items():
                batch[k] = v.to(conf.device)
            outputs = model(**batch)
            batch_logits.append(outputs.logits.detach().cpu())
    # After the softmax, `logits` holds class probabilities (the name is kept).
    logits = torch.nn.functional.softmax(torch.cat(batch_logits, 0), dim=1)
    print(f"{col} {logits.size()}\nlogits[:10]={logits[:10]}")
    # Keep only the probability of the configured class as the feature.
    df[col] = logits[:,conf.tweeteval_label_index[col]].numpy().astype(np.float32)
    # Release the model before the next checkpoint is loaded.
    del tokenizer, model
    gc.collect()

100%|█████████████████████████████████████████████| 223/223 [10:30<00:00,  2.83s/it]


te_roberta_off torch.Size([14251, 2])
logits[:10]=tensor([[0.7251, 0.2749],
        [0.2105, 0.7895],
        [0.2070, 0.7930],
        [0.0724, 0.9276],
        [0.7127, 0.2873],
        [0.1416, 0.8584],
        [0.4590, 0.5410],
        [0.5828, 0.4172],
        [0.9216, 0.0784],
        [0.5497, 0.4503]])


100%|█████████████████████████████████████████████| 223/223 [10:41<00:00,  2.88s/it]


te_roberta_emo_anger torch.Size([14251, 4])
logits[:10]=tensor([[0.2103, 0.1683, 0.5975, 0.0238],
        [0.9517, 0.0076, 0.0105, 0.0301],
        [0.9804, 0.0055, 0.0078, 0.0063],
        [0.9796, 0.0079, 0.0061, 0.0064],
        [0.8439, 0.0097, 0.1081, 0.0384],
        [0.9812, 0.0051, 0.0069, 0.0069],
        [0.2046, 0.4324, 0.2127, 0.1503],
        [0.8628, 0.0068, 0.0643, 0.0661],
        [0.2001, 0.2043, 0.3457, 0.2499],
        [0.9386, 0.0064, 0.0093, 0.0457]])


100%|█████████████████████████████████████████████| 223/223 [11:10<00:00,  3.01s/it]


te_roberta_snt_neg torch.Size([14251, 3])
logits[:10]=tensor([[0.1966, 0.4262, 0.3772],
        [0.9633, 0.0322, 0.0045],
        [0.9426, 0.0517, 0.0057],
        [0.9685, 0.0249, 0.0066],
        [0.4167, 0.5050, 0.0782],
        [0.9531, 0.0404, 0.0065],
        [0.3691, 0.5979, 0.0330],
        [0.5026, 0.4448, 0.0526],
        [0.1364, 0.8089, 0.0547],
        [0.9701, 0.0258, 0.0041]])


100%|█████████████████████████████████████████████| 223/223 [10:24<00:00,  2.80s/it]


te_roberta_iro torch.Size([14251, 2])
logits[:10]=tensor([[0.6977, 0.3023],
        [0.8382, 0.1618],
        [0.2674, 0.7326],
        [0.9500, 0.0500],
        [0.8954, 0.1046],
        [0.9335, 0.0665],
        [0.9446, 0.0554],
        [0.9208, 0.0792],
        [0.5923, 0.4077],
        [0.9177, 0.0823]])


100%|█████████████████████████████████████████████| 223/223 [10:21<00:00,  2.79s/it]


te_xlm_roberta_snt_neg torch.Size([14251, 3])
logits[:10]=tensor([[0.2148, 0.1927, 0.5925],
        [0.9559, 0.0302, 0.0139],
        [0.8060, 0.0965, 0.0974],
        [0.9347, 0.0414, 0.0239],
        [0.7078, 0.2347, 0.0575],
        [0.9491, 0.0382, 0.0127],
        [0.2283, 0.6574, 0.1143],
        [0.5970, 0.2949, 0.1081],
        [0.2419, 0.6809, 0.0771],
        [0.9267, 0.0536, 0.0197]])


# HateBert labels

In [13]:
# All HateBERT models use the same tokenizer, so it is loaded once from one of
# the model directories.
hatebert_tokenizer_dir = conf.hatebert_models["hb_hatebert_off"]
tokenizer = AutoTokenizer.from_pretrained(
    hatebert_tokenizer_dir,
    model_max_length=conf.hatebert_model_max_length,
)
print(f"{repr(tokenizer)}\nmodel_input_names={tokenizer.model_input_names}")

PreTrainedTokenizerFast(name_or_path='pretrained//hatebert/hatebert-offenseval', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
model_input_names=['input_ids', 'token_type_ids', 'attention_mask']


In [14]:
%%time
# Tokenise all texts once; the same encoding is reused for every HateBERT model.
tokenize_options = dict(truncation=True, padding="max_length")
x = tokenizer(sentences, **tokenize_options)
print(f"{repr(x.keys())}\nlen={len(x['input_ids'])}")

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
len=14251
Wall time: 4.14 s


In [15]:
# One DataLoader serves all HateBERT models because they share the tokenised
# input; shuffle=False keeps predictions in the same row order as df.
batches = torch.utils.data.DataLoader(mylib.Dataset(x), batch_size=conf.hatebert_batch_size, shuffle=False)
for col, model_dir in conf.hatebert_models.items():
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    model.eval()
    model.to(conf.device)
    # Collect per-batch logits and concatenate once at the end; concatenating
    # inside the loop re-copies everything accumulated so far on every batch.
    batch_logits = []
    with torch.no_grad():
        for batch in tqdm(batches):
            for k, v in batch.items():
                batch[k] = v.to(conf.device)
            outputs = model(**batch)
            batch_logits.append(outputs.logits.detach().cpu())
    # After the softmax, `logits` holds class probabilities (the name is kept).
    logits = torch.nn.functional.softmax(torch.cat(batch_logits, 0), dim=1)
    print(f"{col} {logits.size()}\nlogits[:10]={logits[:10]}")
    # Column 1 of the two-class output is stored as the feature.
    df[col] = logits[:,1].numpy().astype(np.float32)
    # Release the model before the next one is loaded, as the TweetEval cell
    # does. Without this the last model stays on the device for later cells.
    del model
    gc.collect()

100%|█████████████████████████████████████████████| 112/112 [10:28<00:00,  5.61s/it]


hb_bert_off torch.Size([14251, 2])
logits[:10]=tensor([[0.9485, 0.0515],
        [0.0466, 0.9534],
        [0.0291, 0.9709],
        [0.0282, 0.9718],
        [0.1630, 0.8370],
        [0.0310, 0.9690],
        [0.8395, 0.1605],
        [0.7020, 0.2980],
        [0.9744, 0.0256],
        [0.6909, 0.3091]])


100%|█████████████████████████████████████████████| 112/112 [10:21<00:00,  5.55s/it]


hb_bert_abu torch.Size([14251, 2])
logits[:10]=tensor([[0.9914, 0.0086],
        [0.0324, 0.9676],
        [0.0391, 0.9609],
        [0.0298, 0.9702],
        [0.3622, 0.6378],
        [0.0367, 0.9633],
        [0.9820, 0.0180],
        [0.9447, 0.0553],
        [0.9918, 0.0082],
        [0.8776, 0.1224]])


100%|█████████████████████████████████████████████| 112/112 [10:29<00:00,  5.62s/it]


hb_hatebert_off torch.Size([14251, 2])
logits[:10]=tensor([[0.7466, 0.2534],
        [0.0398, 0.9602],
        [0.0402, 0.9598],
        [0.0401, 0.9599],
        [0.4747, 0.5253],
        [0.0709, 0.9291],
        [0.5831, 0.4169],
        [0.8962, 0.1038],
        [0.7860, 0.2140],
        [0.6886, 0.3114]])


100%|█████████████████████████████████████████████| 112/112 [10:32<00:00,  5.65s/it]

hb_hatebert_abu torch.Size([14251, 2])
logits[:10]=tensor([[0.9604, 0.0396],
        [0.0293, 0.9707],
        [0.0343, 0.9657],
        [0.0236, 0.9764],
        [0.9090, 0.0910],
        [0.1022, 0.8978],
        [0.9436, 0.0564],
        [0.9612, 0.0388],
        [0.9552, 0.0448],
        [0.9603, 0.0397]])





# Detoxify labels

In [16]:
gc.collect()
# Names of all Detoxify feature columns, in the order they are created.
dtfy_fs = []
# Settings that are identical for every Detoxify checkpoint.
dtfy_shared_kwargs = dict(
    model_max_length=conf.dtfy_model_max_length,
    device=conf.device,
    batch_size=conf.dtfy_batch_size,
)
for prefix, checkpoint in tqdm(conf.dtfy_models.items()):
    scores_by_label = mylib.detoxify_labels(
        sentences,
        checkpoint=checkpoint,
        config_dir=conf.dtfy_configs[prefix],
        **dtfy_shared_kwargs,
    )
    for label, scores in scores_by_label.items():
        # Column name = model prefix + label name returned by the helper.
        col = prefix + label
        df[col] = scores
        df[col] = df[col].astype(np.float32)
        dtfy_fs.append(col)
    gc.collect()

100%|████████████████████████████████████████████████| 3/3 [30:10<00:00, 603.49s/it]


In [17]:
# Show the Detoxify feature column names collected in the previous cell.
print(dtfy_fs)

['dto_toxicity', 'dto_severe_toxicity', 'dto_obscene', 'dto_threat', 'dto_insult', 'dto_identity_attack', 'dtu_toxicity', 'dtu_severe_toxicity', 'dtu_obscene', 'dtu_identity_attack', 'dtu_insult', 'dtu_threat', 'dtu_sexual_explicit', 'dtm_toxicity', 'dtm_severe_toxicity', 'dtm_obscene', 'dtm_identity_attack', 'dtm_insult', 'dtm_threat', 'dtm_sexual_explicit']


# Embeddings

In [18]:
# Encode every text into a sentence embedding.
model = SentenceTransformer(conf.em_models["paraphrase-MiniLM-L6-v2"], device=conf.device)
model.max_seq_length = conf.em_max_seq_length
encode_options = dict(
    batch_size=conf.em_batch_size,
    show_progress_bar=True,
    convert_to_numpy=True,
)
em = model.encode(sentences=sentences, **encode_options)
print(f"em.shape={em.shape}")

[INFO|SentenceTransformer.py:60] 2022-02-04 15:41:35,973 >> Load pretrained SentenceTransformer: pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2
[INFO|SentenceTransformer.py:60] 2022-02-04 15:41:35,973 >> Load pretrained SentenceTransformer: pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2
[INFO|SentenceTransformer.py:60] 2022-02-04 15:41:35,973 >> Load pretrained SentenceTransformer: pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2
[INFO|SentenceTransformer.py:60] 2022-02-04 15:41:35,973 >> Load pretrained SentenceTransformer: pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2


Batches:   0%|          | 0/15 [00:00<?, ?it/s]

em.shape=(14251, 384)


In [19]:
%%time
em_size = em.shape[1]
em_cols = [f"zz{i:04d}" for i in range(em_size)]
# Attach all embedding columns with one concat. Inserting hundreds of columns
# one at a time fragments the frame and triggers pandas' PerformanceWarning.
em_frame = pd.DataFrame(em, index=df.index, columns=em_cols).astype(np.float32)
# Dropping any existing zz columns first keeps the column set unique on re-run.
df = pd.concat([df.drop(columns=em_cols, errors="ignore"), em_frame], axis=1)
# The raw texts are no longer needed in this notebook section.
del sentences

Wall time: 339 ms


# Review data

In [20]:
# Columns to review. The embedding columns and the raw count columns from
# preprocess_fns are left out.
cols = [
    "length",
    *char_fns.keys(),
    *(name for name, _fn, _dtype in textstat_fns),
    *dtfy_fs,
    *conf.hatebert_models.keys(),
    *conf.tweeteval_models.keys(),
    *ti_features,
]
df[cols].describe(percentiles=percentiles)

Unnamed: 0,length,digit_frac,letter_frac,space_frac,punc_frac,upper_frac,repeat_char_frac,repeat_substring_frac,syllables_per_word,syllables_per_sent,words_per_sent,flesch_reading_ease,flesch_kincaid_grade,gunning_fog,smog_index,automated_readability_index,coleman_liau_index,linsear_write_formula,dale_chall_readability_score,dto_toxicity,dto_severe_toxicity,dto_obscene,dto_threat,dto_insult,dto_identity_attack,dtu_toxicity,dtu_severe_toxicity,dtu_obscene,dtu_identity_attack,dtu_insult,dtu_threat,dtu_sexual_explicit,dtm_toxicity,dtm_severe_toxicity,dtm_obscene,dtm_identity_attack,dtm_insult,dtm_threat,dtm_sexual_explicit,hb_bert_off,hb_bert_abu,hb_hatebert_off,hb_hatebert_abu,te_roberta_off,te_roberta_emo_anger,te_roberta_snt_neg,te_roberta_iro,te_xlm_roberta_snt_neg,ti0000,ti0001,ti0002,ti0003,ti0004,ti0005,ti0006,ti0007,ti0008,ti0009,ti0010,ti0011,ti0012,ti0013,ti0014,ti0015,ti0016,ti0017,ti0018,ti0019,ti0020,ti0021,ti0022,ti0023,ti0024,ti0025,ti0026,ti0027,ti0028,ti0029,ti0030,ti0031,ti0032,ti0033,ti0034,ti0035,ti0036,ti0037,ti0038,ti0039,ti0040,ti0041,ti0042,ti0043,ti0044,ti0045,ti0046,ti0047,ti0048,ti0049,ti0050,ti0051,ti0052,ti0053,ti0054,ti0055,ti0056,ti0057,ti0058,ti0059,ti0060,ti0061,ti0062,ti0063,ti0064,ti0065,ti0066,ti0067,ti0068,ti0069,ti0070,ti0071,ti0072,ti0073,ti0074,ti0075,ti0076,ti0077,ti0078,ti0079,ti0080,ti0081,ti0082,ti0083,ti0084,ti0085,ti0086,ti0087,ti0088,ti0089,ti0090,ti0091,ti0092,ti0093,ti0094,ti0095,ti0096,ti0097,ti0098,ti0099,ti0100,ti0101,ti0102,ti0103,ti0104,ti0105,ti0106,ti0107,ti0108,ti0109,ti0110,ti0111,ti0112,ti0113,ti0114,ti0115,ti0116,ti0117,ti0118,ti0119,ti0120,ti0121,ti0122,ti0123,ti0124,ti0125,ti0126,ti0127,ti0128,ti0129,ti0130,ti0131,ti0132,ti0133,ti0134,ti0135,ti0136,ti0137,ti0138,ti0139,ti0140,ti0141,ti0142,ti0143,ti0144,ti0145,ti0146,ti0147,ti0148,ti0149,ti0150,ti0151,ti0152,ti0153,ti0154,ti0155,ti0156,ti0157,ti0158,ti0159,ti0160,ti0161,ti0162,ti0163,ti0164,ti0165,ti0166,ti0167,ti0168,ti0169,ti0170,ti0171,ti0172,ti0173,ti017
4,ti0175,ti0176,ti0177,ti0178,ti0179,ti0180,ti0181,ti0182,ti0183,ti0184,ti0185,ti0186,ti0187,ti0188,ti0189,ti0190,ti0191,ti0192,ti0193,ti0194,ti0195,ti0196,ti0197,ti0198,ti0199,ti0200,ti0201,ti0202,ti0203,ti0204,ti0205,ti0206,ti0207,ti0208,ti0209,ti0210,ti0211,ti0212,ti0213,ti0214,ti0215,ti0216,ti0217,ti0218,ti0219,ti0220,ti0221,ti0222,ti0223,ti0224,ti0225,ti0226,ti0227,ti0228,ti0229,ti0230,ti0231,ti0232,ti0233,ti0234,ti0235,ti0236,ti0237,ti0238,ti0239,ti0240,ti0241,ti0242,ti0243,ti0244,ti0245,ti0246,ti0247,ti0248,ti0249,ti0250,ti0251,ti0252,ti0253,ti0254,ti0255,ti0256,ti0257,ti0258,ti0259,ti0260,ti0261,ti0262,ti0263,ti0264,ti0265,ti0266,ti0267,ti0268,ti0269,ti0270,ti0271,ti0272,ti0273,ti0274,ti0275,ti0276,ti0277,ti0278,ti0279,ti0280,ti0281,ti0282,ti0283,ti0284,ti0285,ti0286,ti0287,ti0288,ti0289,ti0290,ti0291,ti0292,ti0293,ti0294,ti0295,ti0296,ti0297,ti0298,ti0299,ti0300,ti0301,ti0302,ti0303,ti0304,ti0305,ti0306,ti0307,ti0308,ti0309,ti0310,ti0311,ti0312,ti0313,ti0314,ti0315,ti0316,ti0317,ti0318,ti0319,ti0320,ti0321,ti0322,ti0323,ti0324,ti0325,ti0326,ti0327,ti0328,ti0329,ti0330,ti0331,ti0332,ti0333,ti0334,ti0335,ti0336,ti0337,ti0338,ti0339,ti0340,ti0341,ti0342,ti0343,ti0344,ti0345,ti0346,ti0347,ti0348,ti0349,ti0350,ti0351,ti0352,ti0353,ti0354,ti0355,ti0356,ti0357,ti0358,ti0359,ti0360,ti0361,ti0362,ti0363,ti0364,ti0365,ti0366,ti0367,ti0368,ti0369,ti0370,ti0371,ti0372,ti0373,ti0374,ti0375,ti0376,ti0377,ti0378,ti0379,ti0380,ti0381,ti0382,ti0383,ti0384,ti0385,ti0386,ti0387,ti0388,ti0389,ti0390,ti0391,ti0392,ti0393,ti0394,ti0395,ti0396,ti0397,ti0398,ti0399,ti0400,ti0401,ti0402,ti0403,ti0404,ti0405,ti0406,ti0407,ti0408,ti0409,ti0410,ti0411,ti0412,ti0413,ti0414,ti0415,ti0416,ti0417,ti0418,ti0419,ti0420,ti0421,ti0422,ti0423,ti0424,ti0425,ti0426,ti0427,ti0428,ti0429,ti0430,ti0431,ti0432,ti0433,ti0434,ti0435,ti0436,ti0437,ti0438,ti0439,ti0440,ti0441,ti0442,ti0443,ti0444,ti0445,ti0446,ti0447,ti0448,ti0449,ti0450,ti0451,ti0452,ti0453,ti0454,ti0455,ti0456,ti0457,ti0458,ti0459,ti0
460,ti0461,ti0462,ti0463,ti0464,ti0465,ti0466,ti0467,ti0468,ti0469,ti0470,ti0471,ti0472,ti0473,ti0474,ti0475,ti0476,ti0477,ti0478,ti0479,ti0480,ti0481,ti0482,ti0483,ti0484,ti0485,ti0486,ti0487,ti0488,ti0489,ti0490,ti0491,ti0492,ti0493,ti0494,ti0495,ti0496,ti0497,ti0498,ti0499,ti0500,ti0501,ti0502,ti0503,ti0504,ti0505,ti0506,ti0507,ti0508,ti0509,ti0510,ti0511,ti0512,ti0513,ti0514,ti0515,ti0516,ti0517,ti0518,ti0519,ti0520,ti0521,ti0522,ti0523,ti0524,ti0525,ti0526,ti0527,ti0528,ti0529,ti0530,ti0531,ti0532,ti0533,ti0534,ti0535,ti0536,ti0537,ti0538,ti0539,ti0540,ti0541,ti0542,ti0543,ti0544,ti0545,ti0546,ti0547,ti0548,ti0549,ti0550,ti0551,ti0552,ti0553,ti0554,ti0555,ti0556,ti0557,ti0558,ti0559,ti0560,ti0561,ti0562,ti0563,ti0564,ti0565,ti0566,ti0567,ti0568,ti0569,ti0570,ti0571,ti0572,ti0573,ti0574,ti0575
count,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14
251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14
251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0
mean,404.495825,0.003891,0.778295,0.172929,0.044886,0.088107,0.027886,0.018796,1.360076,18.846472,13.300895,64.286728,9.450614,11.062636,4.342566,12.748144,8.927725,9.590234,9.313502,0.453456,0.064607,0.294084,0.027572,0.271963,0.062425,0.529311,0.044723,0.292764,0.064443,0.308312,0.017978,0.107717,0.501251,0.058705,0.258191,0.058524,0.270354,0.027115,0.138774,0.6396,0.497501,0.601631,0.457546,0.540523,0.810584,0.713279,0.253815,0.74576,0.002708,0.000923,0.0,0.0,0.010235,0.002795,0.0,0.0,2.1e-05,0.00145,0.0,0.00203,0.000551,0.002133,0.022813,0.0,0.014466,6.3e-05,0.001335,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.3e-05,0.0,0.000185,0.0,0.0,0.0,3.1e-05,0.0,0.006323,5.6e-05,0.0,0.0,0.0,5.8e-05,0.0,0.0,0.0,0.0,7e-05,0.0,0.000158,0.0,0.0,0.0,1.8e-05,0.0,0.0,0.000874,0.017535,0.00031,0.013839,5.6e-05,0.002505,0.0,0.0,0.00356,0.0,0.005945,5.6e-05,4.7e-05,0.000267,0.0,4.1e-05,0.000321,0.000187,0.0,0.0,5.3e-05,0.0,0.0,0.008385,0.0,0.0,0.000367,0.000211,4.3e-05,0.0,0.0,0.008783,0.0,5.7e-05,0.0,0.003976,0.0,0.0,6.5e-05,0.0,0.0,0.0001,0.001337,0.0,0.0,0.002639,0.0,0.004222,0.0,0.0,0.0,0.005538,0.000117,7e-05,0.0,0.0001,0.000127,0.0,8.9e-05,0.006745,0.0,0.00161,0.000562,0.0,0.0,0.0,0.0,0.0,0.0,0.017377,0.0,0.001366,5.5e-05,7e-05,0.0,7.2e-05,0.00108,7e-05,0.000398,0.011788,0.0,1.2e-05,0.0,0.000404,0.0,0.0,0.0,0.013349,0.001725,0.000783,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.6e-05,0.0,0.000535,0.0,0.0,0.000315,0.000164,0.0,0.0,0.000248,0.0,0.0,0.0,0.000375,0.0,0.0,0.000169,0.000214,0.000254,0.0,0.000272,6.3e-05,0.002925,0.0,0.0,0.016258,0.0,8.1e-05,0.001212,7e-05,7.8e-05,0.0,0.0,0.0,0.000291,0.0,0.000244,0.0,0.003645,0.0,0.0,0.000462,6.4e-05,0.0,0.000145,7e-05,3.1e-05,0.0,0.000467,0.0,0.059656,0.0,0.004266,0.000141,0.003405,0.028264,0.0,6.3e-05,0.0,6.2e-05,0.0,0.0,0.0,0.0,0.000134,0.0,0.017099,0.00026,0.000466,0.000142,0.0,4.6e-05,0.005629,5e-06,0.0,0.0,3.1e-05,0.0,0.0,7e-05,0.0,0.001406,0.0,0.0,0.0,0.000126,0.001454,0.0,7e-05,0.0,0.0,2.2e-05,0.0,0.000113,2.2e-05,6.5e-05,0.0,5.5e-05,4.8e-0
5,0.00028,0.000293,0.021671,0.000224,0.000479,0.000902,0.004655,0.00022,0.000232,0.006125,0.0,0.000108,0.0,0.0,0.0,7.8e-05,0.0,0.0,5.6e-05,0.0,0.000592,0.003834,0.000599,0.005397,0.000432,0.0,0.0,0.002009,0.0,0.000493,0.0,0.008268,4.1e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015638,0.001708,0.0,0.001315,0.0,0.0,0.000202,0.0,5.9e-05,2.4e-05,1.9e-05,0.000581,0.0,0.0,0.0,0.0,0.001951,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003997,0.0,0.024549,0.001024,0.0,0.003173,0.0,0.001409,0.000146,0.00017,0.0,0.0,4.3e-05,0.000741,0.0,0.009999,5.7e-05,0.002885,0.0,0.0,0.0,0.0,0.0,0.0,0.00447,0.001423,0.005614,0.000462,0.0,0.009976,0.001056,0.0,0.0,0.0,0.0,0.005539,0.0,0.000295,7e-05,0.0,0.000111,0.0,0.0,0.000424,8.3e-05,5e-05,0.0,0.0,0.0,0.0,0.015546,0.0,0.0,0.0,0.0,0.000288,0.000382,0.0,0.000722,0.0,0.0,0.0,0.0,0.0,0.000257,0.0,0.000172,0.002484,0.0,6.6e-05,0.006071,0.0,0.0,0.004774,0.00395,0.0,0.000352,0.005428,0.0,0.0,0.0,0.000122,0.0,5e-05,0.0,0.000223,0.0,0.0,0.0,0.0,0.0,0.0,6e-05,0.004493,0.00414,3.9e-05,0.0,0.002072,0.000473,0.0,0.0,0.0,0.0,0.003439,0.0,0.0,0.002441,0.0,6e-05,0.0,0.006896,0.0,0.0,0.000184,0.001502,7e-05,0.000302,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000224,7e-05,0.0,0.0,0.0,0.004121,0.000209,0.00089,0.006524,0.0,0.0,0.00463,0.001288,0.002632,0.001411,0.0,0.0,0.0,7.5e-05,0.0,0.030677,0.0,0.0,0.0,0.0,0.0,0.0,0.000593,4.1e-05,0.000106,0.0,0.000676,0.0,5.8e-05,6.5e-05,0.015525,8e-05,0.000107,0.000324,0.0,0.000239,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003554,5.9e-05,0.0,0.0,0.0,0.000245,0.0,0.016776,0.0,0.0,0.0,0.000569,0.0,0.000227,0.0,0.0,0.000114,0.0,0.0,0.000352,0.004792,0.000319,0.0,0.0,0.0,3.2e-05,0.000353,0.0,0.0,0.000179,0.0,0.0,0.0,0.000101,0.0,0.0,0.000127,0.000998,0.0,0.0,5.5e-05,4.4e-05,0.0,0.000528,7e-05,1.6e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.011867,0.000323,0.0,8.3e-05,0.000106,0.0,0.0,0.0,0.0,0.0,0.000192,0.000362,0.000386,0.000475,9.9e-05,0.0,0.010192,0.000131,0.0,0.002771,0.0,0.016592,0.0,0.0,0.0,0.0,0.000246,0.0,0.0,0.0,0.0,0.003578,0.0,6.2e-05
std,686.044494,0.016081,0.047163,0.023382,0.043165,0.179211,0.045813,0.104194,2.745461,40.391994,27.804605,314.100159,48.095737,21.882164,5.233645,90.17897,97.308975,9.253856,3.154639,0.427223,0.148911,0.393717,0.120505,0.365127,0.176418,0.402319,0.116356,0.392156,0.165872,0.363142,0.094901,0.236483,0.414678,0.151742,0.372959,0.167616,0.353014,0.111837,0.282632,0.345512,0.39975,0.332228,0.381498,0.270357,0.275462,0.261218,0.220133,0.222764,0.042473,0.021404,0.0,0.0,0.087443,0.046822,0.0,0.0,0.002451,0.035431,0.0,0.040108,0.020871,0.040672,0.128602,0.0,0.106702,0.007467,0.033325,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.007508,0.0,0.012971,0.0,0.0,0.0,0.003684,0.0,0.0697,0.00667,0.0,0.0,0.0,0.006934,0.0,0.0,0.0,0.0,0.008377,0.0,0.009907,0.0,0.0,0.0,0.00212,0.0,0.0,0.022803,0.111947,0.014262,0.10381,0.006669,0.044597,0.0,0.0,0.0549,0.0,0.068249,0.006684,0.005623,0.015991,0.0,0.004923,0.017426,0.011729,0.0,0.0,0.006337,0.0,0.0,0.079375,0.0,0.0,0.0168,0.014508,0.005116,0.0,0.0,0.085605,0.0,0.006817,0.0,0.058728,0.0,0.0,0.00771,0.0,0.0,0.007486,0.031847,0.0,0.0,0.047938,0.0,0.059388,0.0,0.0,0.0,0.065395,0.009915,0.008377,0.0,0.008633,0.009697,0.0,0.007913,0.069259,0.0,0.035532,0.021403,0.0,0.0,0.0,0.0,0.0,0.0,0.123242,0.0,0.033265,0.006533,0.008377,0.0,0.006107,0.028882,0.008377,0.018549,0.098691,0.0,0.001487,0.0,0.019102,0.0,0.0,0.0,0.099049,0.038547,0.025124,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006729,0.0,0.02028,0.0,0.0,0.016878,0.011473,0.0,0.0,0.014887,0.0,0.0,0.0,0.016103,0.0,0.0,0.010465,0.012948,0.013943,0.0,0.014801,0.007488,0.046758,0.0,0.0,0.111583,0.0,0.006901,0.032201,0.008377,0.006603,0.0,0.0,0.0,0.016393,0.0,0.013784,0.0,0.046331,0.0,0.0,0.017566,0.007654,0.0,0.011334,0.008377,0.00368,0.0,0.019574,0.0,0.20617,0.0,0.056433,0.009812,0.048681,0.13841,0.0,0.007559,0.0,0.007411,0.0,0.0,0.0,0.0,0.011287,0.0,0.114663,0.013492,0.016869,0.009098,0.0,0.005517,0.064482,0.000617,0.0,0.0,0.00374,0.0,0.0,0.008377,0.0,0.030675,0.0,0.0,0.0,0.01071,0.032362,0.0,0.008377,0.0,0.0,0
.002648,0.0,0.00984,0.002645,0.007781,0.0,0.00651,0.005753,0.015422,0.015261,0.132152,0.013852,0.01725,0.026648,0.061524,0.013199,0.014138,0.068013,0.0,0.009498,0.0,0.0,0.0,0.00653,0.0,0.0,0.006644,0.0,0.021904,0.056584,0.02085,0.067066,0.017793,0.0,0.0,0.040143,0.0,0.02018,0.0,0.083457,0.004941,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1095,0.036831,0.0,0.030994,0.0,0.0,0.013015,0.0,0.007063,0.00281,0.002323,0.023004,0.0,0.0,0.0,0.0,0.037019,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.051502,0.0,0.132288,0.029074,0.0,0.049746,0.0,0.034362,0.010151,0.011504,0.0,0.0,0.005116,0.024415,0.0,0.092773,0.005186,0.046132,0.0,0.0,0.0,0.0,0.0,0.0,0.057704,0.033736,0.067482,0.018062,0.0,0.090968,0.03011,0.0,0.0,0.0,0.0,0.066512,0.0,0.016158,0.008377,0.0,0.009391,0.0,0.0,0.019315,0.004983,0.005911,0.0,0.0,0.0,0.0,0.106897,0.0,0.0,0.0,0.0,0.013481,0.017984,0.0,0.023913,0.0,0.0,0.0,0.0,0.0,0.015415,0.0,0.012077,0.044884,0.0,0.007926,0.06908,0.0,0.0,0.054911,0.056591,0.0,0.017505,0.067291,0.0,0.0,0.0,0.009697,0.0,0.005953,0.0,0.013752,0.0,0.0,0.0,0.0,0.0,0.0,0.007194,0.059827,0.057643,0.004681,0.0,0.039421,0.018698,0.0,0.0,0.0,0.0,0.050369,0.0,0.0,0.041623,0.0,0.007162,0.0,0.070839,0.0,0.0,0.01156,0.033727,0.008377,0.015499,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.013684,0.008377,0.0,0.0,0.0,0.059104,0.012948,0.024719,0.068668,0.0,0.0,0.056857,0.031044,0.042951,0.034658,0.0,0.0,0.0,0.006476,0.0,0.146677,0.0,0.0,0.0,0.0,0.0,0.0,0.02285,0.004919,0.009119,0.0,0.018595,0.0,0.004482,0.007727,0.120639,0.006795,0.009481,0.016248,0.0,0.01446,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052492,0.007068,0.0,0.0,0.0,0.013061,0.0,0.110439,0.0,0.0,0.0,0.019871,0.0,0.012953,0.0,0.0,0.009728,0.0,0.0,0.017023,0.062556,0.016069,0.0,0.0,0.0,0.003809,0.01661,0.0,0.0,0.012532,0.0,0.0,0.0,0.00875,0.0,0.0,0.008607,0.025004,0.0,0.0,0.006552,0.00372,0.0,0.021621,0.008377,0.001936,0.0,0.0,0.0,0.0,0.0,0.0,0.087707,0.014669,0.0,0.00706,0.008978,0.0,0.0,0.0,0.0,0.0,0.011669,0.016977,0.017089,0.019801,0.009036,0.0,0.085978,0.009489,0.0,0.043729,0.0,0.0
96767,0.0,0.0,0.0,0.0,0.014826,0.0,0.0,0.0,0.0,0.048621,0.0,0.007398
min,8.0,0.0,0.004427,0.000403,0.0,0.0,0.0,0.0,0.666667,1.0,1.0,-36681.820312,-3.1,0.8,0.0,-9.3,-14.15,0.0,0.1,0.00053,7.9e-05,0.000152,8.9e-05,0.000164,0.000124,0.000344,1e-06,1.7e-05,5.7e-05,6.1e-05,1.5e-05,1e-05,0.000142,1.1e-05,5.6e-05,6.9e-05,0.000128,1.6e-05,1.3e-05,0.009355,0.002915,0.008755,0.006311,0.026456,0.005595,0.001072,0.018925,0.009896,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1%,21.0,0.0,0.616026,0.1,0.0,0.0,0.0,0.0,0.857143,2.5,2.0,-62.339998,-1.9,1.6,0.0,-2.8,-3.06,1.0,0.6,0.000638,8.7e-05,0.000166,9.9e-05,0.000175,0.000136,0.001118,2e-06,6.1e-05,0.000121,9.7e-05,3.1e-05,3e-05,0.000381,1.6e-05,8.4e-05,0.000108,0.00018,2.7e-05,1.8e-05,0.018738,0.004983,0.024701,0.01065,0.08032,0.024533,0.011482,0.032187,0.061426,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5%,30.0,0.0,0.708976,0.134146,0.0,0.0,0.0,0.0,1.0,3.5,2.5,31.219999,0.5,2.4,0.0,0.4,0.45,2.0,6.33,0.000902,9.3e-05,0.000178,0.000109,0.000184,0.000143,0.004588,6e-06,0.000225,0.000304,0.00026,6e-05,8.8e-05,0.000962,2.4e-05,0.000146,0.000169,0.000354,3.8e-05,2.4e-05,0.042534,0.009093,0.057148,0.017225,0.135846,0.099382,0.135426,0.047117,0.24346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10%,43.0,0.0,0.735294,0.146497,0.013605,0.009153,0.0,0.0,1.071429,4.666667,3.5,44.240002,1.8,3.2,0.0,2.1,2.41,2.5,6.92,0.001632,0.0001,0.000202,0.000118,0.000211,0.000159,0.013299,1.1e-05,0.000457,0.000525,0.000784,9.2e-05,0.000161,0.003006,3.5e-05,0.000311,0.000272,0.000855,5.6e-05,3.7e-05,0.082899,0.015376,0.098087,0.025102,0.179488,0.275754,0.299614,0.059038,0.409993,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20%,67.0,0.0,0.75641,0.15873,0.021739,0.017241,0.010753,0.0,1.166667,7.0,5.0,55.740002,3.4,5.0,0.0,4.2,4.35,3.8,7.6,0.008378,0.000114,0.000445,0.000157,0.000488,0.000249,0.053169,2.6e-05,0.001316,0.001083,0.003497,0.000163,0.000388,0.019163,9.6e-05,0.001246,0.000661,0.003701,0.000154,0.000128,0.210326,0.038514,0.208715,0.047678,0.252734,0.69218,0.498913,0.08016,0.573763,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30%,97.0,0.0,0.768421,0.16527,0.026846,0.022472,0.014545,0.0,1.230769,9.0,6.5,62.68,4.5,6.55,0.0,5.6,5.63,4.9,8.08,0.028229,0.000148,0.001001,0.000272,0.001302,0.00046,0.141701,5.6e-05,0.003462,0.001895,0.011569,0.000259,0.000824,0.079479,0.000244,0.003418,0.001322,0.010421,0.000365,0.000353,0.406858,0.090723,0.352795,0.091763,0.333905,0.8693,0.622235,0.103981,0.690566,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40%,136.0,0.0,0.777202,0.170213,0.031746,0.027174,0.017544,0.0,1.285714,11.0,8.0,68.059998,5.6,8.01,0.0,6.8,6.56,6.0,8.5,0.086931,0.000292,0.00309,0.00048,0.004482,0.001061,0.306635,0.000118,0.009481,0.003202,0.035604,0.000395,0.001711,0.232045,0.000585,0.008615,0.002349,0.024282,0.000693,0.000819,0.619393,0.226476,0.530648,0.180844,0.427558,0.927613,0.723054,0.13303,0.775033,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Final export schema: raw text first, then the engineered feature columns
# (`cols`), then the embedding columns (`em_cols`), both defined in earlier cells.
# `cols` is rebound from its own previous value, so a plain concatenation would
# prepend "text" and append `em_cols` again on every re-run of this cell.
# dict.fromkeys() drops repeated labels while keeping first-seen order, which
# makes the cell idempotent without changing the first-run result.
cols = list(dict.fromkeys(["text"] + cols + em_cols))
# Sanity check of the selected frame: column count, non-null counts and dtypes.
df[cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14251 entries, 0 to 14250
Data columns (total 1009 columns):
 #     Column                        Non-Null Count  Dtype  
---    ------                        --------------  -----  
 0     text                          14251 non-null  object 
 1     length                        14251 non-null  int16  
 2     digit_frac                    14251 non-null  float32
 3     letter_frac                   14251 non-null  float32
 4     space_frac                    14251 non-null  float32
 5     punc_frac                     14251 non-null  float32
 6     upper_frac                    14251 non-null  float32
 7     repeat_char_frac              14251 non-null  float32
 8     repeat_substring_frac         14251 non-null  float32
 9     syllables_per_word            14251 non-null  float32
 10    syllables_per_sent            14251 non-null  float32
 11    words_per_sent                14251 non-null  float32
 12    flesch_reading_ease         

In [22]:
%%time
df[cols].to_parquet("output/val.parquet", index=False)

Wall time: 805 ms
