In [1]:
import os
import gc
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.stats import rankdata
import textstat
from tqdm import tqdm
from typing import Dict, NamedTuple, Callable
import scml
import mylib

In [2]:
class Conf(NamedTuple):
    device: torch.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    pretrained_dir: str = "pretrained/"
    dtfy_model_max_length: int = 512
    dtfy_batch_size: int = 64
    dtfy_models: Dict[str, str] = {
        "dto_": f"{pretrained_dir}unitaryai/detoxify/toxic_original-c1212f89.ckpt",
        "dtu_": f"{pretrained_dir}unitaryai/detoxify/toxic_debiased-c7548aa0.ckpt",
        "dtm_": f"{pretrained_dir}unitaryai/detoxify/multilingual_debiased-0b549669.ckpt"
    }
    dtfy_configs: Dict[str, str] = {
        "dto_": f"{pretrained_dir}bert-base-uncased",
        "dtu_": f"{pretrained_dir}roberta-base",
        "dtm_": f"{pretrained_dir}xlm-roberta-base"
    }
    tweeteval_model_max_length: int = 512
    tweeteval_batch_size: int = 64
    tweeteval_models: Dict[str, str] = {
        "te_roberta_off": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-offensive",
        "te_roberta_emo_anger": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-emotion",
        "te_roberta_snt_neg": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-sentiment",
        "te_roberta_iro": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-irony",
        "te_xlm_roberta_snt_neg": f"{pretrained_dir}cardiffnlp/twitter-xlm-roberta-base-sentiment",
    }
    tweeteval_label_index: Dict[str, int] = {
        "te_roberta_off": 1,
        "te_roberta_emo_anger": 0,
        "te_roberta_snt_neg": 0,
        "te_roberta_iro": 1,
        "te_xlm_roberta_snt_neg": 0,
    }
    hatebert_model_max_length: int = 512
    hatebert_batch_size: int = 128
    hatebert_models: Dict[str, str] = {
        "hb_bert_off": f"{pretrained_dir}/hatebert/bert-offenseval",
        "hb_bert_abu" : f"{pretrained_dir}/hatebert/bert-abuseval",
        "hb_hatebert_off": f"{pretrained_dir}/hatebert/hatebert-offenseval",
        "hb_hatebert_abu" : f"{pretrained_dir}/hatebert/hatebert-abuseval",
    }
    em_max_seq_length: int = 128
    em_batch_size: int = 1000
    em_models: Dict[str, str] = {
        "paraphrase-MiniLM-L6-v2": f"{pretrained_dir}sentence-transformers/paraphrase-MiniLM-L6-v2"
    }
        
        
conf = Conf()
print(conf)
if conf.device.type == 'cuda':
    for i in range(torch.cuda.device_count()):
        print(f"device={i}, {torch.cuda.get_device_name(i)}")
        print('Mem Allocated:', round(torch.cuda.memory_allocated(i)/1024**3,1), 'GB')
        print('Mem Cached:   ', round(torch.cuda.memory_reserved(i)/1024**3,1), 'GB')

Conf(device=device(type='cuda'), pretrained_dir='pretrained/', dtfy_model_max_length=512, dtfy_batch_size=64, dtfy_models={'dto_': 'pretrained/unitaryai/detoxify/toxic_original-c1212f89.ckpt', 'dtu_': 'pretrained/unitaryai/detoxify/toxic_debiased-c7548aa0.ckpt', 'dtm_': 'pretrained/unitaryai/detoxify/multilingual_debiased-0b549669.ckpt'}, dtfy_configs={'dto_': 'pretrained/bert-base-uncased', 'dtu_': 'pretrained/roberta-base', 'dtm_': 'pretrained/xlm-roberta-base'}, tweeteval_model_max_length=512, tweeteval_batch_size=64, tweeteval_models={'te_roberta_off': 'pretrained/cardiffnlp/twitter-roberta-base-offensive', 'te_roberta_emo_anger': 'pretrained/cardiffnlp/twitter-roberta-base-emotion', 'te_roberta_snt_neg': 'pretrained/cardiffnlp/twitter-roberta-base-sentiment', 'te_roberta_iro': 'pretrained/cardiffnlp/twitter-roberta-base-irony', 'te_xlm_roberta_snt_neg': 'pretrained/cardiffnlp/twitter-xlm-roberta-base-sentiment'}, tweeteval_label_index={'te_roberta_off': 1, 'te_roberta_emo_anger': 

In [3]:
percentiles=[.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]
os.environ["TOKENIZERS_PARALLELISM"] = "false"
pd.set_option("use_inf_as_na", True)
pd.set_option("max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option('max_colwidth', 9999)
tqdm.pandas()

In [4]:
%%time
df = pd.read_parquet("input/pre_val.parquet")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14251 entries, 0 to 14250
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    14251 non-null  object
 1   text1   14251 non-null  object
 2   text2   14251 non-null  object
dtypes: object(3)
memory usage: 334.1+ KB
Wall time: 66.8 ms


# Character level features

In [5]:
%%time
col = "length"
df[col] = df["text1"].str.len()
df[col] = df[col].astype(np.int16)

Wall time: 14 ms


In [6]:
def digit_frac(row) -> float:
    return mylib.digit_frac(row["text1"])


def letter_frac(row) -> float:
    return mylib.letter_frac(row["text1"])


def space_frac(row) -> float:
    return mylib.space_frac(row["text1"])


def punc_frac(row) -> float:
    return mylib.punc_frac(row["text1"])


def upper_frac(row) -> float:
    return mylib.upper_frac(row["text1"])


char_fns: Dict[str, Callable] = {
    "digit_frac": digit_frac,
    "letter_frac": letter_frac,
    "space_frac": space_frac,
    "punc_frac": punc_frac,
    "upper_frac": upper_frac,
}

In [7]:
for col, fn in char_fns.items():
    print(col)
    df[col] = df.progress_apply(fn, axis=1)
    df[col] = df[col].astype(np.float32)

digit_frac


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 23482.40it/s]


letter_frac


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 21936.52it/s]


space_frac


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 23635.88it/s]


punc_frac


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 20827.24it/s]


upper_frac


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 24095.82it/s]


# Textstat features

In [8]:
def syllable_count(row) -> int:
    return textstat.syllable_count(row["text1"])


def lexicon_count(row) -> int:
    return textstat.lexicon_count(row["text1"])


def sentence_count(row) -> int:
    return textstat.sentence_count(row["text1"])


def syllables_per_word(row) -> float:
    return row["syllable_count"] / (row["lexicon_count"] + 1)


def syllables_per_sent(row) -> float:
    return row["syllable_count"] / (row["sentence_count"] + 1)


def words_per_sent(row) -> float:
    return row["lexicon_count"] / (row["sentence_count"] + 1)


def flesch_reading_ease(row) -> float:
    return textstat.flesch_reading_ease(row["text1"])


def flesch_kincaid_grade(row) -> float:
    return textstat.flesch_kincaid_grade(row["text1"])


def gunning_fog(row) -> float:
    return textstat.gunning_fog(row["text1"])


def smog_index(row) -> float:
    return textstat.smog_index(row["text1"])


def automated_readability_index(row) -> float:
    return textstat.automated_readability_index(row["text1"])


def coleman_liau_index(row) -> float:
    return textstat.coleman_liau_index(row["text1"])


def linsear_write_formula(row) -> float:
    return textstat.linsear_write_formula(row["text1"])


def dale_chall_readability_score(row) -> float:
    return textstat.dale_chall_readability_score(row["text1"])


textstat_fns: Dict[str, Callable] = {
    "syllables_per_word": syllables_per_word,
    "syllables_per_sent": syllables_per_sent,
    "words_per_sent": words_per_sent,
    "flesch_reading_ease": flesch_reading_ease,
    "flesch_kincaid_grade": flesch_kincaid_grade,
    "gunning_fog": gunning_fog,
    "smog_index": smog_index,
    "automated_readability_index": automated_readability_index,
    "coleman_liau_index": coleman_liau_index,
    "linsear_write_formula": linsear_write_formula,
    "dale_chall_readability_score": dale_chall_readability_score,
}

In [9]:
col = "syllable_count"
df[col] = df.progress_apply(syllable_count, axis=1)
df[col] = df[col].astype(np.int32)

100%|███████████████████████████████████████| 14251/14251 [00:02<00:00, 4950.12it/s]


In [10]:
col = "lexicon_count"
df[col] = df.progress_apply(lexicon_count, axis=1)
df[col] = df[col].astype(np.int32)

100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 49207.53it/s]


In [11]:
col = "sentence_count"
df[col] = df.progress_apply(sentence_count, axis=1)
df[col] = df[col].astype(np.int32)

100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 27295.25it/s]


In [12]:
for col, fn in textstat_fns.items():
    print(col)
    df[col] = df.progress_apply(fn, axis=1)
    df[col] = df[col].astype(np.float32)

syllables_per_word


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 89231.72it/s]


syllables_per_sent


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 92705.13it/s]


words_per_sent


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 91943.23it/s]


flesch_reading_ease


100%|███████████████████████████████████████| 14251/14251 [00:02<00:00, 7016.83it/s]


flesch_kincaid_grade


100%|███████████████████████████████████████| 14251/14251 [00:01<00:00, 7500.52it/s]


gunning_fog


100%|███████████████████████████████████████| 14251/14251 [00:02<00:00, 5655.30it/s]


smog_index


100%|███████████████████████████████████████| 14251/14251 [00:01<00:00, 7830.22it/s]


automated_readability_index


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 18313.29it/s]


coleman_liau_index


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 15850.78it/s]


linsear_write_formula


100%|███████████████████████████████████████| 14251/14251 [00:01<00:00, 8976.44it/s]


dale_chall_readability_score


100%|███████████████████████████████████████| 14251/14251 [00:02<00:00, 5986.71it/s]


# TweetEval labels

In [13]:
sentences = list(df["text2"])
for col, model_dir in conf.tweeteval_models.items():
    tokenizer = AutoTokenizer.from_pretrained(
        model_dir, 
        model_max_length=conf.tweeteval_model_max_length
    )
    #print(f"{repr(tokenizer)}\nmodel_input_names={tokenizer.model_input_names}")
    x = tokenizer(sentences, truncation=True, padding="max_length")
    batches = torch.utils.data.DataLoader(mylib.Dataset(x), batch_size=conf.tweeteval_batch_size, shuffle=False)
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    model.eval()
    model.to(conf.device)
    logits = None
    with torch.no_grad():
        for batch in tqdm(batches):
            for k, v in batch.items():
                batch[k] = v.to(conf.device)
            outputs = model(**batch)
            tmp = outputs.logits.detach().cpu()
            if logits is None:
                logits = tmp
            else:
                logits = torch.cat((logits, tmp), 0)
    logits = torch.nn.functional.softmax(logits, dim=1)
    print(f"{col} {logits.size()}\nlogits[:10]={logits[:10]}")
    df[col] = logits[:,conf.tweeteval_label_index[col]]
    df[col] = df[col].astype(np.float32)
    del tokenizer, model
    gc.collect()

100%|█████████████████████████████████████████████| 223/223 [10:13<00:00,  2.75s/it]


te_roberta_off torch.Size([14251, 2])
logits[:10]=tensor([[0.8250, 0.1750],
        [0.5796, 0.4204],
        [0.4716, 0.5284],
        [0.7322, 0.2678],
        [0.4915, 0.5085],
        [0.7496, 0.2504],
        [0.8696, 0.1304],
        [0.8217, 0.1783],
        [0.3459, 0.6541],
        [0.1238, 0.8762]])


100%|█████████████████████████████████████████████| 223/223 [10:12<00:00,  2.75s/it]


te_roberta_emo_anger torch.Size([14251, 4])
logits[:10]=tensor([[0.3877, 0.1589, 0.2741, 0.1793],
        [0.5259, 0.2425, 0.0625, 0.1691],
        [0.9694, 0.0042, 0.0112, 0.0152],
        [0.5685, 0.0124, 0.0357, 0.3834],
        [0.8919, 0.0098, 0.0405, 0.0579],
        [0.9556, 0.0049, 0.0253, 0.0142],
        [0.4261, 0.0168, 0.0754, 0.4817],
        [0.3232, 0.0613, 0.3565, 0.2590],
        [0.9771, 0.0088, 0.0063, 0.0078],
        [0.9784, 0.0061, 0.0071, 0.0084]])


100%|█████████████████████████████████████████████| 223/223 [10:12<00:00,  2.75s/it]


te_roberta_snt_neg torch.Size([14251, 3])
logits[:10]=tensor([[0.4363, 0.4498, 0.1139],
        [0.5382, 0.4354, 0.0265],
        [0.9155, 0.0784, 0.0061],
        [0.8772, 0.1129, 0.0100],
        [0.8227, 0.1676, 0.0097],
        [0.8413, 0.1485, 0.0101],
        [0.6315, 0.3382, 0.0303],
        [0.3860, 0.5919, 0.0221],
        [0.7695, 0.2081, 0.0224],
        [0.9442, 0.0513, 0.0045]])


100%|█████████████████████████████████████████████| 223/223 [10:13<00:00,  2.75s/it]


te_roberta_iro torch.Size([14251, 2])
logits[:10]=tensor([[0.9316, 0.0684],
        [0.7094, 0.2906],
        [0.9144, 0.0856],
        [0.8095, 0.1905],
        [0.6458, 0.3542],
        [0.9463, 0.0537],
        [0.6868, 0.3132],
        [0.8540, 0.1460],
        [0.8311, 0.1689],
        [0.1765, 0.8235]])


100%|█████████████████████████████████████████████| 223/223 [10:13<00:00,  2.75s/it]


te_xlm_roberta_snt_neg torch.Size([14251, 3])
logits[:10]=tensor([[0.5440, 0.3114, 0.1446],
        [0.7966, 0.1856, 0.0178],
        [0.8592, 0.1063, 0.0346],
        [0.7715, 0.1604, 0.0681],
        [0.9179, 0.0682, 0.0139],
        [0.8190, 0.1488, 0.0323],
        [0.7290, 0.2352, 0.0358],
        [0.0568, 0.8831, 0.0601],
        [0.8640, 0.1115, 0.0244],
        [0.9174, 0.0668, 0.0158]])


# HateBert labels

In [14]:
# all Hatebert models use the same tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    conf.hatebert_models["hb_hatebert_off"], 
    model_max_length=conf.hatebert_model_max_length
)
print(f"{repr(tokenizer)}\nmodel_input_names={tokenizer.model_input_names}")

PreTrainedTokenizerFast(name_or_path='pretrained//hatebert/hatebert-offenseval', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
model_input_names=['input_ids', 'token_type_ids', 'attention_mask']


In [15]:
%%time
x = tokenizer(sentences, truncation=True, padding="max_length")
print(f"{repr(x.keys())}\nlen={len(x['input_ids'])}")

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
len=14251
Wall time: 4.37 s


In [16]:
batches = torch.utils.data.DataLoader(mylib.Dataset(x), batch_size=conf.hatebert_batch_size, shuffle=False)
for col, model_dir in conf.hatebert_models.items():    
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    model.eval()
    model.to(conf.device)
    logits = None
    with torch.no_grad():
        for batch in tqdm(batches):
            for k, v in batch.items():
                batch[k] = v.to(conf.device)
            outputs = model(**batch)
            tmp = outputs.logits.detach().cpu()
            if logits is None:
                logits = tmp
            else:
                logits = torch.cat((logits, tmp), 0)
    logits = torch.nn.functional.softmax(logits, dim=1)
    print(f"{col} {logits.size()}\nlogits[:10]={logits[:10]}")
    df[col] = logits[:,1]
    df[col] = df[col].astype(np.float32)

100%|█████████████████████████████████████████████| 112/112 [10:20<00:00,  5.54s/it]


hb_bert_off torch.Size([14251, 2])
logits[:10]=tensor([[0.7103, 0.2897],
        [0.5841, 0.4159],
        [0.2580, 0.7420],
        [0.5009, 0.4991],
        [0.4471, 0.5529],
        [0.9110, 0.0890],
        [0.9289, 0.0711],
        [0.8173, 0.1827],
        [0.0573, 0.9427],
        [0.0291, 0.9709]])


100%|█████████████████████████████████████████████| 112/112 [10:20<00:00,  5.54s/it]


hb_bert_abu torch.Size([14251, 2])
logits[:10]=tensor([[0.9040, 0.0960],
        [0.9509, 0.0491],
        [0.3422, 0.6578],
        [0.8281, 0.1719],
        [0.7559, 0.2441],
        [0.9315, 0.0685],
        [0.9834, 0.0166],
        [0.9569, 0.0431],
        [0.2401, 0.7599],
        [0.0316, 0.9684]])


100%|█████████████████████████████████████████████| 112/112 [10:20<00:00,  5.54s/it]


hb_hatebert_off torch.Size([14251, 2])
logits[:10]=tensor([[0.6722, 0.3278],
        [0.3455, 0.6545],
        [0.5956, 0.4044],
        [0.5358, 0.4642],
        [0.4315, 0.5685],
        [0.9215, 0.0785],
        [0.9674, 0.0326],
        [0.9595, 0.0405],
        [0.1594, 0.8406],
        [0.0408, 0.9592]])


100%|█████████████████████████████████████████████| 112/112 [10:20<00:00,  5.54s/it]

hb_hatebert_abu torch.Size([14251, 2])
logits[:10]=tensor([[0.7386, 0.2614],
        [0.7023, 0.2977],
        [0.6564, 0.3436],
        [0.9207, 0.0793],
        [0.8562, 0.1438],
        [0.9738, 0.0262],
        [0.9785, 0.0215],
        [0.9744, 0.0256],
        [0.2881, 0.7119],
        [0.0493, 0.9507]])





# Detoxify labels

In [17]:
gc.collect()
dtfy_fs = []
for prefix, checkpoint in tqdm(conf.dtfy_models.items()):
    res = mylib.detoxify_labels(
        sentences,
        checkpoint=checkpoint,
        config_dir=conf.dtfy_configs[prefix],
        model_max_length=conf.dtfy_model_max_length,
        device=conf.device,
        batch_size=conf.dtfy_batch_size
    )
    for k, v in res.items():
        col = prefix + k
        df[col] = v
        df[col] = df[col].astype(np.float32)
        dtfy_fs.append(col)
    gc.collect()

100%|████████████████████████████████████████████████| 3/3 [30:54<00:00, 618.25s/it]


In [18]:
print(dtfy_fs)

['dto_toxicity', 'dto_severe_toxicity', 'dto_obscene', 'dto_threat', 'dto_insult', 'dto_identity_attack', 'dtu_toxicity', 'dtu_severe_toxicity', 'dtu_obscene', 'dtu_identity_attack', 'dtu_insult', 'dtu_threat', 'dtu_sexual_explicit', 'dtm_toxicity', 'dtm_severe_toxicity', 'dtm_obscene', 'dtm_identity_attack', 'dtm_insult', 'dtm_threat', 'dtm_sexual_explicit']


# Embeddings

In [19]:
model = SentenceTransformer(conf.em_models["paraphrase-MiniLM-L6-v2"], device=conf.device)
model.max_seq_length = conf.em_max_seq_length
em = model.encode(sentences=sentences, batch_size=conf.em_batch_size, show_progress_bar=True, convert_to_numpy=True)
print(f"em.shape={em.shape}")

[INFO|SentenceTransformer.py:60] 2022-01-24 11:49:54,559 >> Load pretrained SentenceTransformer: pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2
[INFO|SentenceTransformer.py:60] 2022-01-24 11:49:54,559 >> Load pretrained SentenceTransformer: pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2
[INFO|SentenceTransformer.py:60] 2022-01-24 11:49:54,559 >> Load pretrained SentenceTransformer: pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2
[INFO|SentenceTransformer.py:60] 2022-01-24 11:49:54,559 >> Load pretrained SentenceTransformer: pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2


Batches:   0%|          | 0/15 [00:00<?, ?it/s]

em.shape=(14251, 384)


In [20]:
%%time
em_size = em.shape[1]
em_cols = [f"zz{i:04d}" for i in range(em_size)]
df[em_cols] = em
df[em_cols] = df[em_cols].astype(np.float32)
del sentences

  self[col] = igetitem(value, i)


Wall time: 291 ms


# Review data

In [21]:
cols = ["length"]
cols += list(char_fns.keys())
cols += list(textstat_fns.keys())
cols += dtfy_fs
cols += list(conf.hatebert_models.keys())
cols += list(conf.tweeteval_models.keys())
df[cols].describe(percentiles=percentiles)

Unnamed: 0,length,digit_frac,letter_frac,space_frac,punc_frac,upper_frac,syllables_per_word,syllables_per_sent,words_per_sent,flesch_reading_ease,flesch_kincaid_grade,gunning_fog,smog_index,automated_readability_index,coleman_liau_index,linsear_write_formula,dale_chall_readability_score,dto_toxicity,dto_severe_toxicity,dto_obscene,dto_threat,dto_insult,dto_identity_attack,dtu_toxicity,dtu_severe_toxicity,dtu_obscene,dtu_identity_attack,dtu_insult,dtu_threat,dtu_sexual_explicit,dtm_toxicity,dtm_severe_toxicity,dtm_obscene,dtm_identity_attack,dtm_insult,dtm_threat,dtm_sexual_explicit,hb_bert_off,hb_bert_abu,hb_hatebert_off,hb_hatebert_abu,te_roberta_off,te_roberta_emo_anger,te_roberta_snt_neg,te_roberta_iro,te_xlm_roberta_snt_neg
count,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0
mean,408.163357,0.009762,0.771957,0.171534,0.046748,0.087201,1.366291,18.951092,13.336779,63.72102,9.542881,11.092672,4.355245,13.163315,9.301051,9.627413,9.582322,0.453443,0.065879,0.293508,0.028105,0.271399,0.062513,0.527948,0.045701,0.29349,0.064029,0.302995,0.017865,0.10824,0.503412,0.060658,0.259304,0.057887,0.26646,0.02698,0.141233,0.637673,0.493144,0.598483,0.453358,0.538395,0.809964,0.71277,0.244864,0.740785
std,690.033702,0.032195,0.056141,0.024243,0.043803,0.177849,2.770914,40.468548,27.80942,315.881012,48.319965,21.886118,5.248013,90.88707,97.967644,9.27658,3.426401,0.42742,0.152208,0.393592,0.121598,0.365032,0.176504,0.401281,0.119724,0.391343,0.164516,0.359976,0.094369,0.236823,0.414562,0.155446,0.372454,0.165578,0.350691,0.110573,0.284004,0.344879,0.399083,0.331899,0.380141,0.269461,0.276167,0.261402,0.215286,0.223971
min,8.0,0.0,0.004427,0.000403,0.0,0.0,0.666667,1.0,1.0,-36681.820312,-3.1,0.8,0.0,-9.3,-14.15,0.0,0.1,0.00053,7.9e-05,0.000152,8.9e-05,0.000164,0.000127,0.000352,1e-06,1.7e-05,6e-05,6.1e-05,1.5e-05,1e-05,0.000126,1.1e-05,5.2e-05,6.8e-05,0.000128,1.7e-05,1.3e-05,0.009207,0.002739,0.008755,0.005975,0.026456,0.00561,0.001072,0.012607,0.009896
1%,22.0,0.0,0.546537,0.095238,0.0,0.0,0.857143,2.5,2.0,-69.965,-1.9,1.6,0.0,-2.75,-2.895,1.0,1.065,0.000656,8.7e-05,0.000165,9.9e-05,0.000175,0.000136,0.001169,2e-06,6.8e-05,0.000125,0.000101,3.3e-05,3.5e-05,0.0004,1.6e-05,8.5e-05,0.000109,0.000182,2.7e-05,1.8e-05,0.018584,0.004951,0.024931,0.010681,0.079432,0.024941,0.011482,0.031449,0.062903
5%,31.0,0.0,0.681818,0.130435,0.0,0.0,1.0,3.5,2.666667,30.200001,0.5,2.4,0.0,0.6,0.65,2.0,6.41,0.000928,9.3e-05,0.000178,0.000109,0.000185,0.000144,0.005086,7e-06,0.000246,0.000318,0.000272,6.2e-05,9.2e-05,0.001032,2.4e-05,0.000146,0.000172,0.000357,3.7e-05,2.4e-05,0.04247,0.009006,0.056709,0.01709,0.134635,0.097473,0.130911,0.046584,0.24336
10%,44.0,0.0,0.721893,0.142857,0.014085,0.009174,1.071429,4.666667,3.5,43.43,1.8,3.2,0.0,2.3,2.6,2.5,7.01,0.001699,0.0001,0.000205,0.000118,0.000212,0.00016,0.01442,1.2e-05,0.000495,0.000547,0.000815,9.4e-05,0.000169,0.003116,3.5e-05,0.000309,0.000279,0.000856,5.6e-05,3.8e-05,0.084651,0.015021,0.096487,0.024892,0.177724,0.270146,0.296957,0.058163,0.399844
20%,69.0,0.0,0.75,0.156734,0.022472,0.017167,1.166667,7.0,5.0,55.41,3.4,5.02,0.0,4.4,4.53,3.833333,7.71,0.008414,0.000113,0.000447,0.000158,0.000485,0.000249,0.05483,2.7e-05,0.001371,0.001093,0.003582,0.000165,0.000404,0.019576,9.9e-05,0.001274,0.000692,0.003668,0.000154,0.000128,0.209011,0.037272,0.208388,0.047384,0.253679,0.69285,0.498913,0.07881,0.563033
30%,99.0,0.0,0.763636,0.163714,0.027778,0.022272,1.230769,9.0,6.5,62.380001,4.5,6.56,0.0,5.8,5.76,5.0,8.19,0.028025,0.000147,0.000996,0.000273,0.001305,0.000458,0.14226,5.7e-05,0.003603,0.0019,0.011528,0.00026,0.00085,0.082063,0.000259,0.00359,0.001384,0.010308,0.000369,0.000364,0.406535,0.089673,0.350696,0.090451,0.333587,0.868509,0.623632,0.100949,0.680337
40%,137.0,0.0,0.773987,0.169054,0.032967,0.027027,1.285714,11.0,8.0,67.790001,5.6,8.02,0.0,7.0,6.73,6.0,8.62,0.086309,0.00029,0.003066,0.000485,0.004415,0.001054,0.305504,0.000121,0.009863,0.003266,0.034951,0.000394,0.001747,0.238874,0.00062,0.009227,0.002452,0.023675,0.000715,0.000863,0.615935,0.221322,0.522572,0.179317,0.422991,0.927249,0.722107,0.128178,0.767684


In [22]:
cols = ["text"] + cols + em_cols
df[cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14251 entries, 0 to 14250
Data columns (total 431 columns):
 #    Column                        Non-Null Count  Dtype  
---   ------                        --------------  -----  
 0    text                          14251 non-null  object 
 1    length                        14251 non-null  int16  
 2    digit_frac                    14251 non-null  float32
 3    letter_frac                   14251 non-null  float32
 4    space_frac                    14251 non-null  float32
 5    punc_frac                     14251 non-null  float32
 6    upper_frac                    14251 non-null  float32
 7    syllables_per_word            14251 non-null  float32
 8    syllables_per_sent            14251 non-null  float32
 9    words_per_sent                14251 non-null  float32
 10   flesch_reading_ease           14251 non-null  float32
 11   flesch_kincaid_grade          14251 non-null  float32
 12   gunning_fog                   14251 non-null

In [23]:
%%time
df[cols].to_parquet("output/val.parquet", index=False)

Wall time: 509 ms
