In [1]:
import os
import gc
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.stats import rankdata
import textstat
from tqdm import tqdm
from typing import Dict, NamedTuple, Callable
import scml
import mylib

In [2]:
class Conf(NamedTuple):
    """Immutable run configuration for the feature-extraction notebook.

    Every model path is built by concatenating ``pretrained_dir`` (which already
    ends in a slash) with a relative path, so the relative parts must NOT start
    with a slash themselves.

    NOTE: the dict defaults are created once, at class-definition time, and are
    shared by every ``Conf`` instance; treat them as read-only.
    """

    # Inference device: CUDA when torch reports it as available, otherwise CPU.
    device: torch.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Root directory of the locally stored pretrained weights (trailing slash required).
    pretrained_dir: str = "pretrained/"

    # --- Detoxify: checkpoint file and base-model config directory per column prefix ---
    dtfy_model_max_length: int = 512
    dtfy_batch_size: int = 64
    dtfy_models: Dict[str, str] = {
        "dto_": f"{pretrained_dir}unitaryai/detoxify/toxic_original-c1212f89.ckpt",
        "dtu_": f"{pretrained_dir}unitaryai/detoxify/toxic_debiased-c7548aa0.ckpt",
        "dtm_": f"{pretrained_dir}unitaryai/detoxify/multilingual_debiased-0b549669.ckpt"
    }
    dtfy_configs: Dict[str, str] = {
        "dto_": f"{pretrained_dir}bert-base-uncased",
        "dtu_": f"{pretrained_dir}roberta-base",
        "dtm_": f"{pretrained_dir}xlm-roberta-base"
    }

    # --- TweetEval classifiers: output column name -> model directory ---
    tweeteval_model_max_length: int = 512
    tweeteval_batch_size: int = 64
    tweeteval_models: Dict[str, str] = {
        "te_roberta_off": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-offensive",
        "te_roberta_emo_anger": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-emotion",
        "te_roberta_snt_neg": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-sentiment",
        "te_roberta_iro": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-irony",
        "te_xlm_roberta_snt_neg": f"{pretrained_dir}cardiffnlp/twitter-xlm-roberta-base-sentiment",
    }
    # Index of the softmax output column that is stored as the feature for each model.
    # Presumably the offensive / anger / negative / irony class - verify against each model card.
    tweeteval_label_index: Dict[str, int] = {
        "te_roberta_off": 1,
        "te_roberta_emo_anger": 0,
        "te_roberta_snt_neg": 0,
        "te_roberta_iro": 1,
        "te_xlm_roberta_snt_neg": 0,
    }

    # --- HateBERT classifiers: output column name -> model directory ---
    hatebert_model_max_length: int = 512
    hatebert_batch_size: int = 128
    # No leading slash here: pretrained_dir already supplies it (was "pretrained//hatebert/...").
    hatebert_models: Dict[str, str] = {
        "hb_bert_off": f"{pretrained_dir}hatebert/bert-offenseval",
        "hb_bert_abu": f"{pretrained_dir}hatebert/bert-abuseval",
        "hb_hatebert_off": f"{pretrained_dir}hatebert/hatebert-offenseval",
        "hb_hatebert_abu": f"{pretrained_dir}hatebert/hatebert-abuseval",
    }

    # --- Sentence embeddings: model name -> model directory ---
    em_max_seq_length: int = 128
    em_batch_size: int = 1000
    em_models: Dict[str, str] = {
        "paraphrase-MiniLM-L6-v2": f"{pretrained_dir}sentence-transformers/paraphrase-MiniLM-L6-v2"
    }
        
        
# Instantiate the configuration and echo it so the run settings are captured in the output.
conf = Conf()
print(conf)
if conf.device.type == 'cuda':
    # Report per-GPU memory in GiB, rounded to one decimal place.
    bytes_per_gib = 1024**3
    for i in range(torch.cuda.device_count()):
        print(f"device={i}, {torch.cuda.get_device_name(i)}")
        allocated_gib = round(torch.cuda.memory_allocated(i) / bytes_per_gib, 1)
        print('Mem Allocated:', allocated_gib, 'GB')
        reserved_gib = round(torch.cuda.memory_reserved(i) / bytes_per_gib, 1)
        print('Mem Cached:   ', reserved_gib, 'GB')

Conf(device=device(type='cuda'), pretrained_dir='pretrained/', dtfy_model_max_length=512, dtfy_batch_size=64, dtfy_models={'dto_': 'pretrained/unitaryai/detoxify/toxic_original-c1212f89.ckpt', 'dtu_': 'pretrained/unitaryai/detoxify/toxic_debiased-c7548aa0.ckpt', 'dtm_': 'pretrained/unitaryai/detoxify/multilingual_debiased-0b549669.ckpt'}, dtfy_configs={'dto_': 'pretrained/bert-base-uncased', 'dtu_': 'pretrained/roberta-base', 'dtm_': 'pretrained/xlm-roberta-base'}, tweeteval_model_max_length=512, tweeteval_batch_size=64, tweeteval_models={'te_roberta_off': 'pretrained/cardiffnlp/twitter-roberta-base-offensive', 'te_roberta_emo_anger': 'pretrained/cardiffnlp/twitter-roberta-base-emotion', 'te_roberta_snt_neg': 'pretrained/cardiffnlp/twitter-roberta-base-sentiment', 'te_roberta_iro': 'pretrained/cardiffnlp/twitter-roberta-base-irony', 'te_xlm_roberta_snt_neg': 'pretrained/cardiffnlp/twitter-xlm-roberta-base-sentiment'}, tweeteval_label_index={'te_roberta_off': 1, 'te_roberta_emo_anger': 

In [3]:
# Quantile grid handed to DataFrame.describe() in the review section.
percentiles = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]
# Set before any tokenizer is created.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# pandas options: inf handling plus generous display limits for the wide feature frame.
for option_name, option_value in (
    ("use_inf_as_na", True),
    ("max_info_columns", 9999),
    ("display.max_columns", 9999),
    ("display.max_rows", 9999),
    ('max_colwidth', 9999),
):
    pd.set_option(option_name, option_value)
# Registers DataFrame.progress_apply, which the feature cells below rely on.
tqdm.pandas()

In [4]:
%%time
# Load the input frame; the cells below read its "text1" and "text2" columns for features
# and carry "text" through to the output file.
df = pd.read_parquet("input/pre_val.parquet")
# Print column names, dtypes and non-null counts as a quick schema check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14251 entries, 0 to 14250
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    14251 non-null  object
 1   text1   14251 non-null  object
 2   text2   14251 non-null  object
dtypes: object(3)
memory usage: 334.1+ KB
Wall time: 261 ms


# Character level features

In [5]:
%%time
col = "length"
df[col] = df["text1"].str.len()
df[col] = df[col].astype(np.int16)

Wall time: 1.1 ms


In [6]:
def digit_frac(row) -> float:
    """Return ``mylib.digit_frac`` evaluated on the row's ``text1`` value."""
    text = row["text1"]
    return mylib.digit_frac(text)


def letter_frac(row) -> float:
    """Return ``mylib.letter_frac`` evaluated on the row's ``text1`` value."""
    text = row["text1"]
    return mylib.letter_frac(text)


def space_frac(row) -> float:
    """Return ``mylib.space_frac`` evaluated on the row's ``text1`` value."""
    text = row["text1"]
    return mylib.space_frac(text)


def punc_frac(row) -> float:
    """Return ``mylib.punc_frac`` evaluated on the row's ``text1`` value."""
    text = row["text1"]
    return mylib.punc_frac(text)


def upper_frac(row) -> float:
    """Return ``mylib.upper_frac`` evaluated on the row's ``text1`` value."""
    text = row["text1"]
    return mylib.upper_frac(text)


# Output column name -> row-wise function producing that character-level feature.
# Insertion order is the order in which the columns are computed and later listed.
char_fns: Dict[str, Callable] = dict(
    digit_frac=digit_frac,
    letter_frac=letter_frac,
    space_frac=space_frac,
    punc_frac=punc_frac,
    upper_frac=upper_frac,
)

In [7]:
# Compute each character-level feature row by row and store it as float32.
for col, fn in char_fns.items():
    print(col)
    df[col] = df.progress_apply(fn, axis=1).astype(np.float32)

digit_frac


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 23962.62it/s]


letter_frac


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 23169.38it/s]


space_frac


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 22767.72it/s]


punc_frac


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 20962.38it/s]


upper_frac


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 23568.30it/s]


# Textstat features

In [8]:
def syllable_count(row) -> int:
    """Return ``textstat.syllable_count`` computed on the row's ``text1`` value."""
    text = row["text1"]
    return textstat.syllable_count(text)


def lexicon_count(row) -> int:
    """Return ``textstat.lexicon_count`` computed on the row's ``text1`` value."""
    text = row["text1"]
    return textstat.lexicon_count(text)


def sentence_count(row) -> int:
    """Return ``textstat.sentence_count`` computed on the row's ``text1`` value."""
    text = row["text1"]
    return textstat.sentence_count(text)


def syllables_per_word(row) -> float:
    """Ratio of ``row["syllable_count"]`` to ``row["lexicon_count"] + 1``.

    The +1 in the denominator keeps the division defined when the word count is zero.
    """
    n_syllables = row["syllable_count"]
    n_words = row["lexicon_count"]
    return n_syllables / (n_words + 1)


def syllables_per_sent(row) -> float:
    """Ratio of ``row["syllable_count"]`` to ``row["sentence_count"] + 1``.

    The +1 in the denominator keeps the division defined when the sentence count is zero.
    """
    n_syllables = row["syllable_count"]
    n_sentences = row["sentence_count"]
    return n_syllables / (n_sentences + 1)


def words_per_sent(row) -> float:
    """Ratio of ``row["lexicon_count"]`` to ``row["sentence_count"] + 1``.

    The +1 in the denominator keeps the division defined when the sentence count is zero.
    """
    n_words = row["lexicon_count"]
    n_sentences = row["sentence_count"]
    return n_words / (n_sentences + 1)


def flesch_reading_ease(row) -> float:
    """Return ``textstat.flesch_reading_ease`` computed on the row's ``text1`` value."""
    text = row["text1"]
    return textstat.flesch_reading_ease(text)


def flesch_kincaid_grade(row) -> float:
    """Return ``textstat.flesch_kincaid_grade`` computed on the row's ``text1`` value."""
    text = row["text1"]
    return textstat.flesch_kincaid_grade(text)


def gunning_fog(row) -> float:
    """Return ``textstat.gunning_fog`` computed on the row's ``text1`` value."""
    text = row["text1"]
    return textstat.gunning_fog(text)


def smog_index(row) -> float:
    """Return ``textstat.smog_index`` computed on the row's ``text1`` value."""
    text = row["text1"]
    return textstat.smog_index(text)


def automated_readability_index(row) -> float:
    """Return ``textstat.automated_readability_index`` computed on the row's ``text1`` value."""
    text = row["text1"]
    return textstat.automated_readability_index(text)


def coleman_liau_index(row) -> float:
    """Return ``textstat.coleman_liau_index`` computed on the row's ``text1`` value."""
    text = row["text1"]
    return textstat.coleman_liau_index(text)


def linsear_write_formula(row) -> float:
    """Return ``textstat.linsear_write_formula`` computed on the row's ``text1`` value."""
    text = row["text1"]
    return textstat.linsear_write_formula(text)


def dale_chall_readability_score(row) -> float:
    """Return ``textstat.dale_chall_readability_score`` computed on the row's ``text1`` value."""
    text = row["text1"]
    return textstat.dale_chall_readability_score(text)


# Output column name -> row-wise function producing that readability feature.
# The three ratio features come first; they read the syllable/lexicon/sentence count
# columns, which must already exist on the frame when this mapping is applied.
textstat_fns: Dict[str, Callable] = dict(
    syllables_per_word=syllables_per_word,
    syllables_per_sent=syllables_per_sent,
    words_per_sent=words_per_sent,
    flesch_reading_ease=flesch_reading_ease,
    flesch_kincaid_grade=flesch_kincaid_grade,
    gunning_fog=gunning_fog,
    smog_index=smog_index,
    automated_readability_index=automated_readability_index,
    coleman_liau_index=coleman_liau_index,
    linsear_write_formula=linsear_write_formula,
    dale_chall_readability_score=dale_chall_readability_score,
)

In [9]:
# Raw syllable count per row (int32); consumed by the per-word / per-sentence ratios.
col = "syllable_count"
df[col] = df.progress_apply(syllable_count, axis=1).astype(np.int32)

100%|███████████████████████████████████████| 14251/14251 [00:02<00:00, 4960.45it/s]


In [10]:
# Raw word (lexicon) count per row (int32); consumed by the ratio features.
col = "lexicon_count"
df[col] = df.progress_apply(lexicon_count, axis=1).astype(np.int32)

100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 50776.15it/s]


In [11]:
# Raw sentence count per row (int32); consumed by the per-sentence ratio features.
col = "sentence_count"
df[col] = df.progress_apply(sentence_count, axis=1).astype(np.int32)

100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 27656.00it/s]


In [12]:
# Compute each readability feature row by row and store it as float32.
for col, fn in textstat_fns.items():
    print(col)
    df[col] = df.progress_apply(fn, axis=1).astype(np.float32)

syllables_per_word


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 90493.89it/s]


syllables_per_sent


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 90354.50it/s]


words_per_sent


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 94967.81it/s]


flesch_reading_ease


100%|███████████████████████████████████████| 14251/14251 [00:01<00:00, 7135.31it/s]


flesch_kincaid_grade


100%|███████████████████████████████████████| 14251/14251 [00:01<00:00, 7309.37it/s]


gunning_fog


100%|███████████████████████████████████████| 14251/14251 [00:02<00:00, 5673.81it/s]


smog_index


100%|███████████████████████████████████████| 14251/14251 [00:01<00:00, 8058.35it/s]


automated_readability_index


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 19112.02it/s]


coleman_liau_index


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 16655.98it/s]


linsear_write_formula


100%|███████████████████████████████████████| 14251/14251 [00:01<00:00, 9391.24it/s]


dale_chall_readability_score


100%|███████████████████████████████████████| 14251/14251 [00:02<00:00, 6183.11it/s]


# TweetEval labels

In [13]:
# Score every `text2` sentence with each TweetEval classifier and keep one class
# probability per model as a float32 feature column.
sentences = list(df["text2"])
for col, model_dir in conf.tweeteval_models.items():
    # Each model directory ships its own tokenizer, so it is reloaded per model.
    tokenizer = AutoTokenizer.from_pretrained(
        model_dir,
        model_max_length=conf.tweeteval_model_max_length
    )
    # Truncate and pad every sequence to the model max length so all rows have equal size.
    x = tokenizer(sentences, truncation=True, padding="max_length")
    # shuffle=False keeps the predictions aligned with the row order of df.
    batches = torch.utils.data.DataLoader(
        mylib.Dataset(x), batch_size=conf.tweeteval_batch_size, shuffle=False
    )
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    model.eval()
    model.to(conf.device)
    # Collect per-batch logits and concatenate once at the end; concatenating inside
    # the loop re-copies the whole accumulated tensor on every batch.
    batch_logits = []
    with torch.no_grad():
        for batch in tqdm(batches):
            inputs = {k: v.to(conf.device) for k, v in batch.items()}
            outputs = model(**inputs)
            batch_logits.append(outputs.logits.detach().cpu())
    # Softmax over the class dimension turns the logits into per-class probabilities.
    probs = torch.nn.functional.softmax(torch.cat(batch_logits, 0), dim=1)
    print(f"{col} {probs.size()}\nlogits[:10]={probs[:10]}")
    # Keep only the configured class column, handed to pandas as a NumPy array.
    df[col] = probs[:, conf.tweeteval_label_index[col]].numpy()
    df[col] = df[col].astype(np.float32)
    # Release the model and tokenizer before the next checkpoint is loaded.
    del tokenizer, model
    gc.collect()

100%|█████████████████████████████████████████████| 223/223 [10:33<00:00,  2.84s/it]


te_roberta_off torch.Size([14251, 2])
logits[:10]=tensor([[0.7808, 0.2192],
        [0.7374, 0.2626],
        [0.5861, 0.4139],
        [0.6431, 0.3569],
        [0.7571, 0.2429],
        [0.8747, 0.1253],
        [0.6308, 0.3692],
        [0.6307, 0.3693],
        [0.5984, 0.4016],
        [0.1603, 0.8397]])


100%|█████████████████████████████████████████████| 223/223 [10:15<00:00,  2.76s/it]


te_roberta_emo_anger torch.Size([14251, 4])
logits[:10]=tensor([[0.6117, 0.0391, 0.0387, 0.3105],
        [0.3004, 0.2791, 0.1852, 0.2353],
        [0.9812, 0.0046, 0.0058, 0.0085],
        [0.9441, 0.0094, 0.0242, 0.0223],
        [0.6882, 0.0294, 0.1360, 0.1464],
        [0.9502, 0.0051, 0.0248, 0.0199],
        [0.9512, 0.0061, 0.0251, 0.0176],
        [0.9755, 0.0071, 0.0081, 0.0093],
        [0.9808, 0.0038, 0.0085, 0.0069],
        [0.9796, 0.0061, 0.0087, 0.0056]])


100%|█████████████████████████████████████████████| 223/223 [10:14<00:00,  2.76s/it]


te_roberta_snt_neg torch.Size([14251, 3])
logits[:10]=tensor([[0.3801, 0.5967, 0.0232],
        [0.4556, 0.4819, 0.0625],
        [0.6621, 0.2949, 0.0430],
        [0.8434, 0.1474, 0.0092],
        [0.6515, 0.3339, 0.0145],
        [0.7450, 0.2245, 0.0305],
        [0.9520, 0.0458, 0.0022],
        [0.9025, 0.0852, 0.0124],
        [0.8959, 0.0970, 0.0071],
        [0.9670, 0.0289, 0.0041]])


100%|█████████████████████████████████████████████| 223/223 [10:17<00:00,  2.77s/it]


te_roberta_iro torch.Size([14251, 2])
logits[:10]=tensor([[0.7995, 0.2005],
        [0.6654, 0.3346],
        [0.8891, 0.1109],
        [0.9110, 0.0890],
        [0.9260, 0.0740],
        [0.9104, 0.0896],
        [0.9090, 0.0910],
        [0.7519, 0.2481],
        [0.2460, 0.7540],
        [0.8470, 0.1530]])


100%|█████████████████████████████████████████████| 223/223 [10:13<00:00,  2.75s/it]


te_xlm_roberta_snt_neg torch.Size([14251, 3])
logits[:10]=tensor([[0.5742, 0.4067, 0.0191],
        [0.5552, 0.3596, 0.0851],
        [0.7190, 0.2055, 0.0755],
        [0.8879, 0.0896, 0.0226],
        [0.4847, 0.3261, 0.1892],
        [0.8670, 0.1030, 0.0300],
        [0.9304, 0.0527, 0.0169],
        [0.6692, 0.2301, 0.1007],
        [0.9190, 0.0647, 0.0163],
        [0.9436, 0.0413, 0.0151]])


# HateBert labels

In [14]:
# All HateBERT-family checkpoints share one tokenizer (per the original author's note),
# so it is loaded once from the "hb_hatebert_off" directory and reused for every model.
tokenizer = AutoTokenizer.from_pretrained(
    conf.hatebert_models["hb_hatebert_off"], 
    model_max_length=conf.hatebert_model_max_length
)
# Echo the tokenizer settings and the input names the models will receive.
print(f"{repr(tokenizer)}\nmodel_input_names={tokenizer.model_input_names}")

PreTrainedTokenizerFast(name_or_path='pretrained//hatebert/hatebert-offenseval', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
model_input_names=['input_ids', 'token_type_ids', 'attention_mask']


In [15]:
%%time
# Tokenize all sentences once; the result is reused for every HateBERT checkpoint.
# Every sequence is truncated/padded to the tokenizer's model_max_length.
x = tokenizer(sentences, truncation=True, padding="max_length")
# Sanity check: encoded field names and the number of encoded sentences.
print(f"{repr(x.keys())}\nlen={len(x['input_ids'])}")

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
len=14251
Wall time: 4.38 s


In [16]:
# Score the already-tokenized sentences with each HateBERT checkpoint and store the
# probability of class index 1 as a float32 feature column.
# shuffle=False keeps the predictions aligned with the row order of df.
batches = torch.utils.data.DataLoader(
    mylib.Dataset(x), batch_size=conf.hatebert_batch_size, shuffle=False
)
for col, model_dir in conf.hatebert_models.items():
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    model.eval()
    model.to(conf.device)
    # Collect per-batch logits and concatenate once at the end; concatenating inside
    # the loop re-copies the whole accumulated tensor on every batch.
    batch_logits = []
    with torch.no_grad():
        for batch in tqdm(batches):
            inputs = {k: v.to(conf.device) for k, v in batch.items()}
            outputs = model(**inputs)
            batch_logits.append(outputs.logits.detach().cpu())
    # Softmax over the class dimension turns the logits into per-class probabilities.
    probs = torch.nn.functional.softmax(torch.cat(batch_logits, 0), dim=1)
    print(f"{col} {probs.size()}\nlogits[:10]={probs[:10]}")
    # Column 1 is kept for every checkpoint; presumably the offensive/abusive class - TODO confirm.
    df[col] = probs[:, 1].numpy()
    df[col] = df[col].astype(np.float32)
    # Release the model before the next checkpoint is loaded (mirrors the TweetEval cell).
    del model
    gc.collect()

100%|█████████████████████████████████████████████| 112/112 [10:21<00:00,  5.55s/it]


hb_bert_off torch.Size([14251, 2])
logits[:10]=tensor([[0.9580, 0.0420],
        [0.5313, 0.4687],
        [0.3568, 0.6432],
        [0.4591, 0.5409],
        [0.2975, 0.7025],
        [0.9420, 0.0580],
        [0.1742, 0.8258],
        [0.1991, 0.8009],
        [0.2253, 0.7747],
        [0.0307, 0.9693]])


100%|█████████████████████████████████████████████| 112/112 [10:21<00:00,  5.55s/it]


hb_bert_abu torch.Size([14251, 2])
logits[:10]=tensor([[0.9930, 0.0070],
        [0.9452, 0.0548],
        [0.7233, 0.2767],
        [0.8935, 0.1065],
        [0.3779, 0.6221],
        [0.9128, 0.0872],
        [0.0970, 0.9030],
        [0.1485, 0.8515],
        [0.2820, 0.7180],
        [0.0292, 0.9708]])


100%|█████████████████████████████████████████████| 112/112 [10:20<00:00,  5.54s/it]


hb_hatebert_off torch.Size([14251, 2])
logits[:10]=tensor([[0.9157, 0.0843],
        [0.8218, 0.1782],
        [0.3219, 0.6781],
        [0.3107, 0.6893],
        [0.1201, 0.8799],
        [0.7714, 0.2286],
        [0.3239, 0.6761],
        [0.1335, 0.8665],
        [0.2956, 0.7044],
        [0.0418, 0.9582]])


100%|█████████████████████████████████████████████| 112/112 [10:20<00:00,  5.54s/it]

hb_hatebert_abu torch.Size([14251, 2])
logits[:10]=tensor([[0.9841, 0.0159],
        [0.9616, 0.0384],
        [0.3335, 0.6665],
        [0.7939, 0.2061],
        [0.7242, 0.2758],
        [0.9525, 0.0475],
        [0.1641, 0.8359],
        [0.1547, 0.8453],
        [0.3091, 0.6909],
        [0.0326, 0.9674]])





# Detoxify labels

In [17]:
# Free what the previous section left behind before loading the Detoxify checkpoints.
gc.collect()
# Names of every Detoxify feature column, in creation order (reused in the review section).
dtfy_fs = []
for prefix, checkpoint in tqdm(conf.dtfy_models.items()):
    # mylib.detoxify_labels returns a mapping that is iterated with .items() below;
    # presumably label name -> one score per sentence - TODO confirm against mylib.
    res = mylib.detoxify_labels(
        sentences,
        checkpoint=checkpoint,
        config_dir=conf.dtfy_configs[prefix],
        model_max_length=conf.dtfy_model_max_length,
        device=conf.device,
        batch_size=conf.dtfy_batch_size
    )
    for k, v in res.items():
        # The column name is the model prefix plus the label key, e.g. "dto_" + "toxicity".
        col = prefix + k
        df[col] = v
        df[col] = df[col].astype(np.float32)
        dtfy_fs.append(col)
    gc.collect()

100%|████████████████████████████████████████████████| 3/3 [30:28<00:00, 609.53s/it]


In [18]:
# List the Detoxify feature columns that were just created.
print(dtfy_fs)

['dto_toxicity', 'dto_severe_toxicity', 'dto_obscene', 'dto_threat', 'dto_insult', 'dto_identity_attack', 'dtu_toxicity', 'dtu_severe_toxicity', 'dtu_obscene', 'dtu_identity_attack', 'dtu_insult', 'dtu_threat', 'dtu_sexual_explicit', 'dtm_toxicity', 'dtm_severe_toxicity', 'dtm_obscene', 'dtm_identity_attack', 'dtm_insult', 'dtm_threat', 'dtm_sexual_explicit']


# Embeddings

In [19]:
# Encode every sentence with the sentence-transformers model configured in conf.em_models.
model = SentenceTransformer(conf.em_models["paraphrase-MiniLM-L6-v2"], device=conf.device)
# Apply the configured sequence-length limit (conf.em_max_seq_length) to the encoder.
model.max_seq_length = conf.em_max_seq_length
# convert_to_numpy=True makes encode() return a NumPy array; its shape is printed below.
em = model.encode(sentences=sentences, batch_size=conf.em_batch_size, show_progress_bar=True, convert_to_numpy=True)
print(f"em.shape={em.shape}")

[INFO|SentenceTransformer.py:60] 2022-01-23 08:16:27,802 >> Load pretrained SentenceTransformer: pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2
[INFO|SentenceTransformer.py:60] 2022-01-23 08:16:27,802 >> Load pretrained SentenceTransformer: pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2


Batches:   0%|          | 0/15 [00:00<?, ?it/s]

em.shape=(14251, 384)


In [20]:
%%time
em_size = em.shape[1]
em_cols = [f"zz{i:04d}" for i in range(em_size)]
df[em_cols] = em
df[em_cols] = df[em_cols].astype(np.float32)
del sentences

  self[col] = igetitem(value, i)


Wall time: 291 ms


# Review data

In [21]:
# Every scalar feature column, grouped by the section that produced it.
cols = [
    "length",
    *char_fns.keys(),
    *textstat_fns.keys(),
    *dtfy_fs,
    *conf.hatebert_models.keys(),
    *conf.tweeteval_models.keys(),
]
# Summary statistics at the quantile grid defined in the options cell.
df[cols].describe(percentiles=percentiles)

Unnamed: 0,length,digit_frac,letter_frac,space_frac,punc_frac,upper_frac,syllables_per_word,syllables_per_sent,words_per_sent,flesch_reading_ease,flesch_kincaid_grade,gunning_fog,smog_index,automated_readability_index,coleman_liau_index,linsear_write_formula,dale_chall_readability_score,dto_toxicity,dto_severe_toxicity,dto_obscene,dto_threat,dto_insult,dto_identity_attack,dtu_toxicity,dtu_severe_toxicity,dtu_obscene,dtu_identity_attack,dtu_insult,dtu_threat,dtu_sexual_explicit,dtm_toxicity,dtm_severe_toxicity,dtm_obscene,dtm_identity_attack,dtm_insult,dtm_threat,dtm_sexual_explicit,hb_bert_off,hb_bert_abu,hb_hatebert_off,hb_hatebert_abu,te_roberta_off,te_roberta_emo_anger,te_roberta_snt_neg,te_roberta_iro,te_xlm_roberta_snt_neg
count,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0
mean,407.412111,0.009792,0.771801,0.17156,0.046848,0.08748,1.366432,18.896488,13.30002,63.766296,9.520686,11.06833,4.353828,13.135674,9.296756,9.612117,9.58297,0.453714,0.065993,0.293632,0.028148,0.27155,0.062582,0.528137,0.045724,0.293552,0.06413,0.303063,0.01788,0.108263,0.503549,0.06073,0.259472,0.057912,0.266497,0.026968,0.141305,0.637523,0.493512,0.598392,0.453566,0.538566,0.810702,0.713403,0.245387,0.740695
std,687.677043,0.032306,0.056478,0.024278,0.044103,0.17864,2.771002,39.913151,27.424742,315.746368,48.185921,21.574528,5.246092,90.772156,97.969963,9.265683,3.409077,0.427457,0.152426,0.393655,0.121764,0.365148,0.176663,0.401287,0.119749,0.391363,0.16467,0.359989,0.094411,0.236859,0.414591,0.155515,0.372589,0.16559,0.350717,0.110534,0.284084,0.344989,0.399181,0.332022,0.380127,0.269547,0.275511,0.261104,0.215609,0.224091
min,8.0,0.0,0.004427,0.000403,0.0,0.0,0.666667,1.0,1.0,-36681.820312,-3.1,0.8,0.0,-9.3,-14.15,0.0,0.1,0.00053,7.9e-05,0.000152,8.9e-05,0.000164,0.000127,0.000352,1e-06,1.7e-05,6e-05,6.1e-05,1.5e-05,1e-05,0.000126,1.1e-05,5.2e-05,6.8e-05,0.000128,1.7e-05,1.3e-05,0.009207,0.002739,0.008755,0.005975,0.026456,0.00561,0.001072,0.012607,0.009896
1%,22.0,0.0,0.542513,0.095238,0.0,0.0,0.857143,2.5,2.0,-71.305,-1.9,1.6,0.0,-2.8,-2.91,1.0,1.065,0.000656,8.7e-05,0.000165,9.9e-05,0.000175,0.000136,0.001169,2e-06,6.8e-05,0.000125,0.000101,3.2e-05,3.5e-05,0.000399,1.6e-05,8.5e-05,0.000108,0.000182,2.7e-05,1.8e-05,0.018584,0.004951,0.025082,0.010681,0.079432,0.025406,0.011926,0.031449,0.0623
5%,31.0,0.0,0.681239,0.130435,0.0,0.0,1.0,3.5,2.666667,30.200001,0.5,2.4,0.0,0.6,0.59,2.0,6.405,0.000928,9.3e-05,0.000178,0.000109,0.000185,0.000144,0.005083,7e-06,0.000246,0.000318,0.000271,6.2e-05,9.1e-05,0.001026,2.3e-05,0.000146,0.000172,0.000357,3.7e-05,2.4e-05,0.042358,0.009029,0.056524,0.017131,0.134583,0.098254,0.130911,0.046637,0.242732
10%,43.0,0.0,0.721519,0.142857,0.014085,0.009124,1.071429,4.666667,3.5,43.43,1.8,3.2,0.0,2.3,2.6,2.5,7.0,0.001703,0.0001,0.000206,0.000118,0.000212,0.00016,0.014353,1.2e-05,0.000495,0.000546,0.000815,9.4e-05,0.000169,0.003111,3.5e-05,0.00031,0.000278,0.000856,5.5e-05,3.7e-05,0.084359,0.015033,0.095996,0.024981,0.177392,0.273595,0.297649,0.058233,0.399107
20%,69.0,0.0,0.75,0.156734,0.022453,0.017094,1.166667,7.0,5.0,55.400002,3.4,5.01,0.0,4.3,4.52,3.8,7.71,0.00843,0.000113,0.000448,0.000158,0.000485,0.000249,0.054988,2.7e-05,0.001367,0.001093,0.00358,0.000165,0.000404,0.019481,9.9e-05,0.001277,0.000691,0.003652,0.000154,0.000128,0.208568,0.037428,0.208174,0.047531,0.253881,0.694204,0.500788,0.079083,0.56313
30%,99.0,0.0,0.763589,0.163743,0.027778,0.022263,1.230769,9.0,6.5,62.48,4.5,6.56,0.0,5.8,5.76,5.0,8.19,0.028082,0.000148,0.000999,0.000274,0.001306,0.000458,0.142352,5.8e-05,0.003603,0.001903,0.011519,0.00026,0.000849,0.082063,0.00026,0.0036,0.001383,0.010308,0.000368,0.000365,0.406364,0.089922,0.349834,0.090686,0.33378,0.869531,0.624665,0.100994,0.680337
40%,137.0,0.0,0.773862,0.169056,0.032967,0.027027,1.285714,11.0,8.0,67.889999,5.6,8.01,0.0,7.0,6.72,6.0,8.62,0.086442,0.00029,0.00309,0.000485,0.004417,0.001057,0.305504,0.000121,0.009863,0.003281,0.034941,0.000394,0.001747,0.239454,0.00062,0.009246,0.002448,0.023793,0.000716,0.000862,0.615134,0.221613,0.523326,0.179076,0.423257,0.927623,0.722531,0.12866,0.767424


In [22]:
# Final column layout for the output file: text, then scalar features, then embeddings.
cols = ["text", *cols, *em_cols]
df[cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14251 entries, 0 to 14250
Data columns (total 431 columns):
 #    Column                        Non-Null Count  Dtype  
---   ------                        --------------  -----  
 0    text                          14251 non-null  object 
 1    length                        14251 non-null  int16  
 2    digit_frac                    14251 non-null  float32
 3    letter_frac                   14251 non-null  float32
 4    space_frac                    14251 non-null  float32
 5    punc_frac                     14251 non-null  float32
 6    upper_frac                    14251 non-null  float32
 7    syllables_per_word            14251 non-null  float32
 8    syllables_per_sent            14251 non-null  float32
 9    words_per_sent                14251 non-null  float32
 10   flesch_reading_ease           14251 non-null  float32
 11   flesch_kincaid_grade          14251 non-null  float32
 12   gunning_fog                   14251 non-null

In [23]:
%%time
# Persist the selected columns; index=False leaves the DataFrame index out of the file.
df[cols].to_parquet("output/val.parquet", index=False)

Wall time: 497 ms
