In [1]:
import os
import gc
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.stats import rankdata
import textstat
from tqdm import tqdm
from typing import Dict, NamedTuple, Callable
import scml
import mylib

In [2]:
class Conf(NamedTuple):
    """Immutable run configuration: device, model locations and batch sizes.

    All model paths are built from ``pretrained_dir``, which already ends with
    a slash, so sub-paths must NOT start with one.
    Note: the dict defaults are created once at class-definition time and are
    shared by every instance; treat them as read-only.
    """
    # Use the GPU when one is visible, otherwise fall back to CPU.
    device: torch.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Root folder of locally stored pretrained weights (trailing slash included).
    pretrained_dir: str = "pretrained/"

    # --- Detoxify ---
    dtfy_model_max_length: int = 512
    dtfy_batch_size: int = 64
    # Column-name prefix -> checkpoint file.
    dtfy_models: Dict[str, str] = {
        "dto_": f"{pretrained_dir}unitaryai/detoxify/toxic_original-c1212f89.ckpt",
        "dtu_": f"{pretrained_dir}unitaryai/detoxify/toxic_debiased-c7548aa0.ckpt",
        "dtm_": f"{pretrained_dir}unitaryai/detoxify/multilingual_debiased-0b549669.ckpt"
    }
    # Column-name prefix -> directory passed as ``config_dir`` to mylib.detoxify_labels.
    dtfy_configs: Dict[str, str] = {
        "dto_": f"{pretrained_dir}bert-base-uncased",
        "dtu_": f"{pretrained_dir}roberta-base",
        "dtm_": f"{pretrained_dir}xlm-roberta-base"
    }

    # --- TweetEval ---
    tweeteval_model_max_length: int = 512
    tweeteval_batch_size: int = 64
    # Output column name -> model directory.
    tweeteval_models: Dict[str, str] = {
        "te_roberta_off": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-offensive",
        "te_roberta_emo_anger": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-emotion",
        "te_roberta_snt_neg": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-sentiment",
        "te_roberta_iro": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-irony",
        "te_xlm_roberta_snt_neg": f"{pretrained_dir}cardiffnlp/twitter-xlm-roberta-base-sentiment",
    }
    # Output column name -> index of the class probability kept as the feature.
    tweeteval_label_index: Dict[str, int] = {
        "te_roberta_off": 1,
        "te_roberta_emo_anger": 0,
        "te_roberta_snt_neg": 0,
        "te_roberta_iro": 1,
        "te_xlm_roberta_snt_neg": 0,
    }

    # --- HateBERT ---
    hatebert_model_max_length: int = 512
    hatebert_batch_size: int = 128
    # Output column name -> model directory.
    # Fixed: no leading slash here, otherwise the path becomes "pretrained//hatebert/...".
    hatebert_models: Dict[str, str] = {
        "hb_bert_off": f"{pretrained_dir}hatebert/bert-offenseval",
        "hb_bert_abu": f"{pretrained_dir}hatebert/bert-abuseval",
        "hb_hatebert_off": f"{pretrained_dir}hatebert/hatebert-offenseval",
        "hb_hatebert_abu": f"{pretrained_dir}hatebert/hatebert-abuseval",
    }

    # --- Sentence embeddings ---
    em_max_seq_length: int = 128
    em_batch_size: int = 1000
    # Model name -> model directory.
    em_models: Dict[str, str] = {
        "paraphrase-MiniLM-L6-v2": f"{pretrained_dir}sentence-transformers/paraphrase-MiniLM-L6-v2"
    }
        
        
# Instantiate the run configuration and echo it so the settings are recorded
# alongside the notebook output.
conf = Conf()
print(conf)

# When running on CUDA, report the name and current memory usage of every visible GPU.
if conf.device.type == 'cuda':
    bytes_per_gb = 1024**3
    for i in range(torch.cuda.device_count()):
        allocated_gb = torch.cuda.memory_allocated(i) / bytes_per_gb
        reserved_gb = torch.cuda.memory_reserved(i) / bytes_per_gb
        print(f"device={i}, {torch.cuda.get_device_name(i)}")
        print('Mem Allocated:', round(allocated_gb, 1), 'GB')
        print('Mem Cached:   ', round(reserved_gb, 1), 'GB')

Conf(device=device(type='cuda'), pretrained_dir='pretrained/', dtfy_model_max_length=512, dtfy_batch_size=64, dtfy_models={'dto_': 'pretrained/unitaryai/detoxify/toxic_original-c1212f89.ckpt', 'dtu_': 'pretrained/unitaryai/detoxify/toxic_debiased-c7548aa0.ckpt', 'dtm_': 'pretrained/unitaryai/detoxify/multilingual_debiased-0b549669.ckpt'}, dtfy_configs={'dto_': 'pretrained/bert-base-uncased', 'dtu_': 'pretrained/roberta-base', 'dtm_': 'pretrained/xlm-roberta-base'}, tweeteval_model_max_length=512, tweeteval_batch_size=64, tweeteval_models={'te_roberta_off': 'pretrained/cardiffnlp/twitter-roberta-base-offensive', 'te_roberta_emo_anger': 'pretrained/cardiffnlp/twitter-roberta-base-emotion', 'te_roberta_snt_neg': 'pretrained/cardiffnlp/twitter-roberta-base-sentiment', 'te_roberta_iro': 'pretrained/cardiffnlp/twitter-roberta-base-irony', 'te_xlm_roberta_snt_neg': 'pretrained/cardiffnlp/twitter-xlm-roberta-base-sentiment'}, tweeteval_label_index={'te_roberta_off': 1, 'te_roberta_emo_anger': 

In [3]:
# Percentiles reported by the describe() call in the review section.
percentiles = [.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]

# Set the Hugging Face tokenizers parallelism flag explicitly.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Pandas options are given by their fully qualified names: the abbreviated
# forms rely on pattern matching and can break when pandas adds new options
# with similar names.
# NOTE(review): "mode.use_inf_as_na" is deprecated in recent pandas releases —
# confirm the pinned pandas version before upgrading.
pd.set_option("mode.use_inf_as_na", True)
pd.set_option("display.max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option("display.max_colwidth", 9999)

# Register DataFrame.progress_apply and related progress-bar methods.
tqdm.pandas()

In [4]:
%%time
df = pd.read_parquet("input/pre_val.parquet")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14251 entries, 0 to 14250
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    14251 non-null  object
 1   text1   14251 non-null  object
 2   text2   14251 non-null  object
dtypes: object(3)
memory usage: 334.1+ KB
Wall time: 69 ms


# Character-level features

In [5]:
%%time
col = "length"
df[col] = df["text1"].str.len()
df[col] = df[col].astype(np.int16)

Wall time: 8 ms


In [6]:
# Row-wise adapters: each one pulls `text1` out of a DataFrame row and
# delegates to the same-named helper in `mylib`.

def digit_frac(row) -> float:
    """Return mylib.digit_frac for the row's `text1`."""
    text = row["text1"]
    return mylib.digit_frac(text)


def letter_frac(row) -> float:
    """Return mylib.letter_frac for the row's `text1`."""
    text = row["text1"]
    return mylib.letter_frac(text)


def space_frac(row) -> float:
    """Return mylib.space_frac for the row's `text1`."""
    text = row["text1"]
    return mylib.space_frac(text)


def punc_frac(row) -> float:
    """Return mylib.punc_frac for the row's `text1`."""
    text = row["text1"]
    return mylib.punc_frac(text)


def upper_frac(row) -> float:
    """Return mylib.upper_frac for the row's `text1`."""
    text = row["text1"]
    return mylib.upper_frac(text)


# Output column name -> row function, in the order the columns are created.
char_fns: Dict[str, Callable] = dict(
    digit_frac=digit_frac,
    letter_frac=letter_frac,
    space_frac=space_frac,
    punc_frac=punc_frac,
    upper_frac=upper_frac,
)

In [7]:
# Compute every character-level feature row by row and store it as float32.
for col in char_fns:
    print(col)
    row_fn = char_fns[col]
    df[col] = df.progress_apply(row_fn, axis=1)
    df[col] = df[col].astype(np.float32)

digit_frac


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 23980.24it/s]


letter_frac


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 22267.46it/s]


space_frac


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 23713.71it/s]


punc_frac


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 21659.97it/s]


upper_frac


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 24026.29it/s]


# Textstat features

In [8]:
# Row-wise adapters around `textstat`. The three count functions produce
# intermediate columns; the three ratio functions read those columns back
# from the row, so the counts must be computed first.

def syllable_count(row) -> int:
    """Return textstat.syllable_count for the row's `text1`."""
    text = row["text1"]
    return textstat.syllable_count(text)


def lexicon_count(row) -> int:
    """Return textstat.lexicon_count for the row's `text1`."""
    text = row["text1"]
    return textstat.lexicon_count(text)


def sentence_count(row) -> int:
    """Return textstat.sentence_count for the row's `text1`."""
    text = row["text1"]
    return textstat.sentence_count(text)


def syllables_per_word(row) -> float:
    """Syllable count divided by (lexicon count + 1); the +1 keeps the denominator non-zero."""
    n_syllables = row["syllable_count"]
    n_words = row["lexicon_count"]
    return n_syllables / (n_words + 1)


def syllables_per_sent(row) -> float:
    """Syllable count divided by (sentence count + 1); the +1 keeps the denominator non-zero."""
    n_syllables = row["syllable_count"]
    n_sentences = row["sentence_count"]
    return n_syllables / (n_sentences + 1)


def words_per_sent(row) -> float:
    """Lexicon count divided by (sentence count + 1); the +1 keeps the denominator non-zero."""
    n_words = row["lexicon_count"]
    n_sentences = row["sentence_count"]
    return n_words / (n_sentences + 1)


def flesch_reading_ease(row) -> float:
    """Return textstat.flesch_reading_ease for the row's `text1`."""
    text = row["text1"]
    return textstat.flesch_reading_ease(text)


def flesch_kincaid_grade(row) -> float:
    """Return textstat.flesch_kincaid_grade for the row's `text1`."""
    text = row["text1"]
    return textstat.flesch_kincaid_grade(text)


def gunning_fog(row) -> float:
    """Return textstat.gunning_fog for the row's `text1`."""
    text = row["text1"]
    return textstat.gunning_fog(text)


def smog_index(row) -> float:
    """Return textstat.smog_index for the row's `text1`."""
    text = row["text1"]
    return textstat.smog_index(text)


def automated_readability_index(row) -> float:
    """Return textstat.automated_readability_index for the row's `text1`."""
    text = row["text1"]
    return textstat.automated_readability_index(text)


def coleman_liau_index(row) -> float:
    """Return textstat.coleman_liau_index for the row's `text1`."""
    text = row["text1"]
    return textstat.coleman_liau_index(text)


def linsear_write_formula(row) -> float:
    """Return textstat.linsear_write_formula for the row's `text1`."""
    text = row["text1"]
    return textstat.linsear_write_formula(text)


def dale_chall_readability_score(row) -> float:
    """Return textstat.dale_chall_readability_score for the row's `text1`."""
    text = row["text1"]
    return textstat.dale_chall_readability_score(text)


# Output column name -> row function, in the order the columns are created.
# The raw count functions are deliberately absent: they are applied separately
# before this registry is used.
textstat_fns: Dict[str, Callable] = dict(
    syllables_per_word=syllables_per_word,
    syllables_per_sent=syllables_per_sent,
    words_per_sent=words_per_sent,
    flesch_reading_ease=flesch_reading_ease,
    flesch_kincaid_grade=flesch_kincaid_grade,
    gunning_fog=gunning_fog,
    smog_index=smog_index,
    automated_readability_index=automated_readability_index,
    coleman_liau_index=coleman_liau_index,
    linsear_write_formula=linsear_write_formula,
    dale_chall_readability_score=dale_chall_readability_score,
)

In [9]:
# Raw syllable count per row; read later by the syllables_per_* ratio features.
col = "syllable_count"
syllable_counts = df.progress_apply(syllable_count, axis=1)
df[col] = syllable_counts.astype(np.int32)

100%|███████████████████████████████████████| 14251/14251 [00:02<00:00, 5161.29it/s]


In [10]:
# Raw lexicon (word) count per row; read later by the per-word / per-sentence ratios.
col = "lexicon_count"
lexicon_counts = df.progress_apply(lexicon_count, axis=1)
df[col] = lexicon_counts.astype(np.int32)

100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 50125.01it/s]


In [11]:
# Raw sentence count per row; read later by the per-sentence ratios.
col = "sentence_count"
sentence_counts = df.progress_apply(sentence_count, axis=1)
df[col] = sentence_counts.astype(np.int32)

100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 28180.15it/s]


In [12]:
# Compute every registered textstat feature row by row and store it as float32.
# The ratio features read the syllable/lexicon/sentence count columns from the row.
for col in textstat_fns:
    print(col)
    row_fn = textstat_fns[col]
    df[col] = df.progress_apply(row_fn, axis=1)
    df[col] = df[col].astype(np.float32)

syllables_per_word


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 89562.50it/s]


syllables_per_sent


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 90337.30it/s]


words_per_sent


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 90944.54it/s]


flesch_reading_ease


100%|███████████████████████████████████████| 14251/14251 [00:01<00:00, 7222.32it/s]


flesch_kincaid_grade


100%|███████████████████████████████████████| 14251/14251 [00:01<00:00, 7583.60it/s]


gunning_fog


100%|███████████████████████████████████████| 14251/14251 [00:02<00:00, 5953.94it/s]


smog_index


100%|███████████████████████████████████████| 14251/14251 [00:01<00:00, 8034.67it/s]


automated_readability_index


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 18663.27it/s]


coleman_liau_index


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 16023.14it/s]


linsear_write_formula


100%|███████████████████████████████████████| 14251/14251 [00:01<00:00, 9119.78it/s]


dale_chall_readability_score


100%|███████████████████████████████████████| 14251/14251 [00:02<00:00, 6065.59it/s]


# TweetEval labels

In [13]:
# Score `text2` with every TweetEval classifier and keep, per model, the
# softmax probability of the class selected in conf.tweeteval_label_index.
sentences = list(df["text2"])
for col, model_dir in conf.tweeteval_models.items():
    # Each TweetEval model is loaded with its own tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(
        model_dir,
        model_max_length=conf.tweeteval_model_max_length
    )
    # Every sequence is truncated/padded to the tokenizer's max length.
    x = tokenizer(sentences, truncation=True, padding="max_length")
    batches = torch.utils.data.DataLoader(
        mylib.Dataset(x),
        batch_size=conf.tweeteval_batch_size,
        shuffle=False,  # keep row order aligned with df
    )
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    model.eval()
    model.to(conf.device)

    # Collect per-batch logits and concatenate once at the end; concatenating
    # inside the loop re-copies everything accumulated so far on every batch.
    batch_logits = []
    with torch.no_grad():
        for batch in tqdm(batches):
            batch = {k: v.to(conf.device) for k, v in batch.items()}
            outputs = model(**batch)
            batch_logits.append(outputs.logits.detach().cpu())
    logits = torch.cat(batch_logits, 0)

    # From here on `logits` holds class probabilities (name and printed label
    # are kept for continuity with earlier runs).
    logits = torch.nn.functional.softmax(logits, dim=1)
    print(f"{col} {logits.size()}\nlogits[:10]={logits[:10]}")

    # Convert explicitly to NumPy instead of relying on pandas to coerce a tensor.
    df[col] = logits[:, conf.tweeteval_label_index[col]].numpy()
    df[col] = df[col].astype(np.float32)

    # Release the model before loading the next one.
    del tokenizer, model
    gc.collect()

100%|█████████████████████████████████████████████| 223/223 [10:15<00:00,  2.76s/it]


te_roberta_off torch.Size([14251, 2])
logits[:10]=tensor([[0.7459, 0.2541],
        [0.3644, 0.6356],
        [0.7878, 0.2122],
        [0.7821, 0.2179],
        [0.5718, 0.4282],
        [0.8251, 0.1749],
        [0.1630, 0.8370],
        [0.1314, 0.8686],
        [0.1164, 0.8836],
        [0.2024, 0.7976]])


100%|█████████████████████████████████████████████| 223/223 [10:15<00:00,  2.76s/it]


te_roberta_emo_anger torch.Size([14251, 4])
logits[:10]=tensor([[0.9745, 0.0036, 0.0106, 0.0113],
        [0.0620, 0.8237, 0.0987, 0.0156],
        [0.4262, 0.1518, 0.2263, 0.1957],
        [0.1208, 0.7952, 0.0658, 0.0182],
        [0.0607, 0.7956, 0.1061, 0.0375],
        [0.9635, 0.0043, 0.0145, 0.0176],
        [0.9748, 0.0056, 0.0108, 0.0088],
        [0.9754, 0.0057, 0.0088, 0.0102],
        [0.9728, 0.0092, 0.0057, 0.0123],
        [0.9144, 0.0589, 0.0113, 0.0154]])


100%|█████████████████████████████████████████████| 223/223 [10:14<00:00,  2.76s/it]


te_roberta_snt_neg torch.Size([14251, 3])
logits[:10]=tensor([[0.8209, 0.1651, 0.0140],
        [0.3502, 0.4070, 0.2427],
        [0.3933, 0.5428, 0.0640],
        [0.0092, 0.2063, 0.7846],
        [0.7514, 0.2260, 0.0227],
        [0.6392, 0.3309, 0.0299],
        [0.9654, 0.0318, 0.0028],
        [0.9519, 0.0453, 0.0028],
        [0.9071, 0.0836, 0.0093],
        [0.8744, 0.1070, 0.0187]])


100%|█████████████████████████████████████████████| 223/223 [10:14<00:00,  2.76s/it]


te_roberta_iro torch.Size([14251, 2])
logits[:10]=tensor([[0.9367, 0.0633],
        [0.5810, 0.4190],
        [0.7466, 0.2534],
        [0.8416, 0.1584],
        [0.9071, 0.0929],
        [0.9151, 0.0849],
        [0.6966, 0.3034],
        [0.9403, 0.0597],
        [0.9043, 0.0957],
        [0.2217, 0.7783]])


100%|█████████████████████████████████████████████| 223/223 [10:13<00:00,  2.75s/it]


te_xlm_roberta_snt_neg torch.Size([14251, 3])
logits[:10]=tensor([[0.6807, 0.2219, 0.0974],
        [0.3478, 0.2891, 0.3631],
        [0.4113, 0.4212, 0.1675],
        [0.1863, 0.2813, 0.5324],
        [0.2452, 0.4993, 0.2556],
        [0.8484, 0.1261, 0.0255],
        [0.9498, 0.0352, 0.0151],
        [0.9473, 0.0387, 0.0140],
        [0.9442, 0.0415, 0.0143],
        [0.8307, 0.0996, 0.0697]])


# HateBert labels

In [14]:
# A single tokenizer is shared by all HateBERT models, so it is loaded once
# from one of the model directories.
hatebert_tokenizer_dir = conf.hatebert_models["hb_hatebert_off"]
tokenizer = AutoTokenizer.from_pretrained(
    hatebert_tokenizer_dir,
    model_max_length=conf.hatebert_model_max_length,
)
print(f"{tokenizer!r}\nmodel_input_names={tokenizer.model_input_names}")

PreTrainedTokenizerFast(name_or_path='pretrained//hatebert/hatebert-offenseval', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
model_input_names=['input_ids', 'token_type_ids', 'attention_mask']


In [15]:
%%time
x = tokenizer(sentences, truncation=True, padding="max_length")
print(f"{repr(x.keys())}\nlen={len(x['input_ids'])}")

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
len=14251
Wall time: 4.42 s


In [16]:
# Score the already-tokenized sentences with every HateBERT model. The same
# batches are reused for all models because they share one tokenizer.
batches = torch.utils.data.DataLoader(
    mylib.Dataset(x),
    batch_size=conf.hatebert_batch_size,
    shuffle=False,  # keep row order aligned with df
)
for col, model_dir in conf.hatebert_models.items():
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    model.eval()
    model.to(conf.device)

    # Collect per-batch logits and concatenate once at the end; concatenating
    # inside the loop re-copies everything accumulated so far on every batch.
    batch_logits = []
    with torch.no_grad():
        for batch in tqdm(batches):
            batch = {k: v.to(conf.device) for k, v in batch.items()}
            outputs = model(**batch)
            batch_logits.append(outputs.logits.detach().cpu())
    logits = torch.cat(batch_logits, 0)

    # From here on `logits` holds class probabilities (name and printed label
    # are kept for continuity with earlier runs).
    logits = torch.nn.functional.softmax(logits, dim=1)
    print(f"{col} {logits.size()}\nlogits[:10]={logits[:10]}")

    # Keep the probability of class index 1.
    # NOTE(review): presumably index 1 is the offensive/abusive class — confirm against the model cards.
    # Convert explicitly to NumPy instead of relying on pandas to coerce a tensor.
    df[col] = logits[:, 1].numpy()
    df[col] = df[col].astype(np.float32)

    # Release the model before loading the next one (same clean-up as the TweetEval cell).
    del model
    gc.collect()

100%|█████████████████████████████████████████████| 112/112 [10:23<00:00,  5.56s/it]


hb_bert_off torch.Size([14251, 2])
logits[:10]=tensor([[0.5346, 0.4654],
        [0.8211, 0.1789],
        [0.5113, 0.4887],
        [0.7022, 0.2978],
        [0.8697, 0.1303],
        [0.9663, 0.0337],
        [0.0330, 0.9670],
        [0.0478, 0.9522],
        [0.0334, 0.9666],
        [0.0453, 0.9547]])


100%|█████████████████████████████████████████████| 112/112 [10:22<00:00,  5.56s/it]


hb_bert_abu torch.Size([14251, 2])
logits[:10]=tensor([[0.7583, 0.2417],
        [0.5592, 0.4408],
        [0.9555, 0.0445],
        [0.9923, 0.0077],
        [0.9424, 0.0576],
        [0.9950, 0.0050],
        [0.0347, 0.9653],
        [0.0252, 0.9748],
        [0.0343, 0.9657],
        [0.0434, 0.9566]])


100%|█████████████████████████████████████████████| 112/112 [10:22<00:00,  5.56s/it]


hb_hatebert_off torch.Size([14251, 2])
logits[:10]=tensor([[0.6261, 0.3739],
        [0.2559, 0.7441],
        [0.9165, 0.0835],
        [0.8602, 0.1398],
        [0.5675, 0.4325],
        [0.7930, 0.2070],
        [0.0337, 0.9663],
        [0.1289, 0.8711],
        [0.0881, 0.9119],
        [0.0896, 0.9104]])


100%|█████████████████████████████████████████████| 112/112 [10:22<00:00,  5.56s/it]

hb_hatebert_abu torch.Size([14251, 2])
logits[:10]=tensor([[0.4914, 0.5086],
        [0.1822, 0.8178],
        [0.9700, 0.0300],
        [0.9702, 0.0298],
        [0.7350, 0.2650],
        [0.9443, 0.0557],
        [0.0686, 0.9314],
        [0.0676, 0.9324],
        [0.0419, 0.9581],
        [0.0486, 0.9514]])





# Detoxify labels

In [17]:
# Run each Detoxify checkpoint over the sentences. Every label returned by
# mylib.detoxify_labels becomes a float32 column named "<prefix><label>", and
# the created column names are recorded in dtfy_fs.
gc.collect()
dtfy_fs = []
for prefix, checkpoint in tqdm(conf.dtfy_models.items()):
    label_scores = mylib.detoxify_labels(
        sentences,
        checkpoint=checkpoint,
        config_dir=conf.dtfy_configs[prefix],
        model_max_length=conf.dtfy_model_max_length,
        device=conf.device,
        batch_size=conf.dtfy_batch_size,
    )
    for label, scores in label_scores.items():
        col = prefix + label
        df[col] = scores
        df[col] = df[col].astype(np.float32)
        dtfy_fs.append(col)
    # Collect garbage between checkpoints.
    gc.collect()

100%|████████████████████████████████████████████████| 3/3 [30:56<00:00, 618.88s/it]


In [18]:
# Show the Detoxify feature columns that were just added to df.
print(dtfy_fs)

['dto_toxicity', 'dto_severe_toxicity', 'dto_obscene', 'dto_threat', 'dto_insult', 'dto_identity_attack', 'dtu_toxicity', 'dtu_severe_toxicity', 'dtu_obscene', 'dtu_identity_attack', 'dtu_insult', 'dtu_threat', 'dtu_sexual_explicit', 'dtm_toxicity', 'dtm_severe_toxicity', 'dtm_obscene', 'dtm_identity_attack', 'dtm_insult', 'dtm_threat', 'dtm_sexual_explicit']


# Embeddings

In [19]:
# Encode every sentence with the sentence-transformers model, returning a NumPy array.
em_model_dir = conf.em_models["paraphrase-MiniLM-L6-v2"]
model = SentenceTransformer(em_model_dir, device=conf.device)
model.max_seq_length = conf.em_max_seq_length
em = model.encode(
    sentences=sentences,
    batch_size=conf.em_batch_size,
    show_progress_bar=True,
    convert_to_numpy=True,
)
print(f"em.shape={em.shape}")

[INFO|SentenceTransformer.py:60] 2022-01-26 12:44:58,502 >> Load pretrained SentenceTransformer: pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2
[INFO|SentenceTransformer.py:60] 2022-01-26 12:44:58,502 >> Load pretrained SentenceTransformer: pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2
[INFO|SentenceTransformer.py:60] 2022-01-26 12:44:58,502 >> Load pretrained SentenceTransformer: pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2
[INFO|SentenceTransformer.py:60] 2022-01-26 12:44:58,502 >> Load pretrained SentenceTransformer: pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2


Batches:   0%|          | 0/15 [00:00<?, ?it/s]

em.shape=(14251, 384)


In [20]:
%%time
em_size = em.shape[1]
em_cols = [f"zz{i:04d}" for i in range(em_size)]
df[em_cols] = em
df[em_cols] = df[em_cols].astype(np.float32)
del sentences

  self[col] = igetitem(value, i)


Wall time: 292 ms


# Review data

In [21]:
# Feature columns to review: length, character fractions, textstat features,
# Detoxify, HateBERT and TweetEval scores (embedding columns are not included here).
cols = [
    "length",
    *char_fns,
    *textstat_fns,
    *dtfy_fs,
    *conf.hatebert_models,
    *conf.tweeteval_models,
]
df[cols].describe(percentiles=percentiles)

Unnamed: 0,length,digit_frac,letter_frac,space_frac,punc_frac,upper_frac,syllables_per_word,syllables_per_sent,words_per_sent,flesch_reading_ease,flesch_kincaid_grade,gunning_fog,smog_index,automated_readability_index,coleman_liau_index,linsear_write_formula,dale_chall_readability_score,dto_toxicity,dto_severe_toxicity,dto_obscene,dto_threat,dto_insult,dto_identity_attack,dtu_toxicity,dtu_severe_toxicity,dtu_obscene,dtu_identity_attack,dtu_insult,dtu_threat,dtu_sexual_explicit,dtm_toxicity,dtm_severe_toxicity,dtm_obscene,dtm_identity_attack,dtm_insult,dtm_threat,dtm_sexual_explicit,hb_bert_off,hb_bert_abu,hb_hatebert_off,hb_hatebert_abu,te_roberta_off,te_roberta_emo_anger,te_roberta_snt_neg,te_roberta_iro,te_xlm_roberta_snt_neg
count,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0
mean,405.885412,0.009594,0.772092,0.17203,0.046284,0.087359,1.359175,18.869307,13.324901,64.374832,9.447666,11.069481,4.345113,12.925121,9.071384,9.60632,9.527949,0.453416,0.065631,0.293582,0.02782,0.271754,0.062455,0.528237,0.045657,0.293613,0.063991,0.303525,0.017889,0.107903,0.503642,0.060698,0.259749,0.057953,0.267208,0.027054,0.141256,0.639146,0.495543,0.598782,0.454914,0.539096,0.809933,0.712398,0.243872,0.741873
std,686.41008,0.032208,0.05625,0.02366,0.043608,0.178167,2.745418,40.394226,27.806982,314.099365,48.096992,21.883469,5.232967,90.172615,97.30603,9.258547,3.332032,0.427422,0.151765,0.393595,0.120994,0.365132,0.176465,0.40135,0.119581,0.391484,0.164381,0.360335,0.094395,0.236417,0.414695,0.155554,0.372859,0.16575,0.351134,0.110755,0.284043,0.344428,0.398987,0.331842,0.380188,0.269513,0.27628,0.261847,0.215403,0.224294
min,8.0,0.0,0.004427,0.000403,0.0,0.0,0.666667,1.0,1.0,-36681.820312,-3.1,0.8,0.0,-9.3,-14.15,0.0,0.1,0.00053,7.9e-05,0.000152,8.9e-05,0.000164,0.000127,0.000344,1e-06,1.7e-05,6e-05,6.1e-05,1.5e-05,1e-05,0.000126,1.1e-05,5.2e-05,7.2e-05,0.000128,1.7e-05,1.3e-05,0.009207,0.002739,0.008755,0.006117,0.026456,0.00561,0.001072,0.012607,0.009896
1%,22.0,0.0,0.546537,0.098039,0.0,0.0,0.857143,2.5,2.0,-62.339998,-1.9,1.6,0.0,-2.75,-2.91,1.0,1.005,0.000652,8.7e-05,0.000165,9.9e-05,0.000175,0.000136,0.001163,2e-06,6.7e-05,0.000125,0.000101,3.2e-05,3.5e-05,0.000399,1.6e-05,8.4e-05,0.000109,0.000181,2.7e-05,1.8e-05,0.01908,0.004968,0.025318,0.010855,0.079432,0.024452,0.011482,0.031395,0.062929
5%,31.0,0.0,0.682927,0.131579,0.0,0.0,1.0,3.5,2.666667,31.53,0.5,2.4,0.0,0.6,0.63,2.0,6.405,0.000927,9.3e-05,0.000178,0.000109,0.000185,0.000144,0.005095,7e-06,0.000244,0.000319,0.000269,6.2e-05,9.1e-05,0.00103,2.3e-05,0.000146,0.000172,0.000357,3.7e-05,2.4e-05,0.042677,0.009116,0.057021,0.017239,0.134635,0.097865,0.129091,0.04616,0.241349
10%,43.0,0.0,0.722105,0.144737,0.014085,0.009091,1.071429,4.666667,3.5,44.41,1.8,3.2,0.0,2.3,2.6,2.5,7.0,0.001682,0.0001,0.000205,0.000118,0.000212,0.00016,0.014271,1.2e-05,0.000489,0.000546,0.000819,9.4e-05,0.000169,0.003104,3.5e-05,0.000307,0.000279,0.000856,5.6e-05,3.8e-05,0.085093,0.015239,0.096997,0.025143,0.17797,0.270882,0.294951,0.057621,0.399114
20%,68.0,0.0,0.75,0.157609,0.022222,0.017123,1.166667,7.0,5.0,55.880001,3.4,5.0,0.0,4.3,4.52,3.818182,7.69,0.00843,0.000113,0.000447,0.000158,0.000488,0.000249,0.054722,2.7e-05,0.001362,0.00109,0.003542,0.000165,0.000404,0.0195,9.9e-05,0.001278,0.000691,0.003663,0.000155,0.000128,0.214019,0.038358,0.209543,0.047736,0.254646,0.69285,0.49885,0.077879,0.56673
30%,99.0,0.0,0.763889,0.164179,0.027632,0.022222,1.230769,9.0,6.5,62.720001,4.5,6.55,0.0,5.7,5.75,5.0,8.18,0.027982,0.000146,0.00099,0.000272,0.001308,0.000458,0.142656,5.7e-05,0.00357,0.001907,0.011528,0.00026,0.000849,0.08188,0.000258,0.003606,0.001384,0.01034,0.000369,0.000365,0.408991,0.092492,0.3491,0.091836,0.333917,0.868394,0.623472,0.099576,0.682353
40%,137.0,0.0,0.774194,0.169372,0.03271,0.027027,1.285714,11.0,8.0,68.099998,5.6,8.01,0.0,7.0,6.7,6.0,8.59,0.086579,0.000289,0.003025,0.000483,0.004406,0.00106,0.306075,0.00012,0.009758,0.003272,0.034941,0.000396,0.001744,0.239451,0.000619,0.009234,0.002452,0.023855,0.000717,0.000867,0.618935,0.22557,0.523626,0.181195,0.424682,0.927421,0.722277,0.126606,0.770044


In [22]:
# Final column order for the output: text, reviewed features, then embeddings.
# dict.fromkeys de-duplicates while preserving order, so re-running this cell
# does not add "text" or the embedding columns a second time.
cols = list(dict.fromkeys(["text", *cols, *em_cols]))
df[cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14251 entries, 0 to 14250
Data columns (total 431 columns):
 #    Column                        Non-Null Count  Dtype  
---   ------                        --------------  -----  
 0    text                          14251 non-null  object 
 1    length                        14251 non-null  int16  
 2    digit_frac                    14251 non-null  float32
 3    letter_frac                   14251 non-null  float32
 4    space_frac                    14251 non-null  float32
 5    punc_frac                     14251 non-null  float32
 6    upper_frac                    14251 non-null  float32
 7    syllables_per_word            14251 non-null  float32
 8    syllables_per_sent            14251 non-null  float32
 9    words_per_sent                14251 non-null  float32
 10   flesch_reading_ease           14251 non-null  float32
 11   flesch_kincaid_grade          14251 non-null  float32
 12   gunning_fog                   14251 non-null

In [23]:
%%time
df[cols].to_parquet("output/val.parquet", index=False)

Wall time: 505 ms
