In [1]:
import os
import gc
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.stats import rankdata
import textstat
from tqdm import tqdm
from typing import Dict, NamedTuple, Callable
import scml
import mylib

In [2]:
class Conf(NamedTuple):
    """Run configuration: compute device, local model locations and inference settings.

    The path dictionaries are built from the class-level default of ``pretrained_dir``
    while the class body executes, so passing a different ``pretrained_dir`` to the
    constructor does NOT rewrite them; override the dictionaries explicitly if needed.
    The dictionary defaults are shared by every instance, so treat them as read-only.
    """
    # "cuda" when a GPU is visible to PyTorch, otherwise "cpu".
    device: torch.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Root folder of the locally stored weights. It must end with "/" because every
    # path below is produced by plain string concatenation.
    pretrained_dir: str = "pretrained/"

    # --- Detoxify: settings forwarded to mylib.detoxify_labels ---
    dtfy_model_max_length: int = 512
    dtfy_batch_size: int = 64
    # Checkpoint file per model; the key is also the prefix of the generated column names.
    dtfy_models: Dict[str, str] = {
        "dto_": f"{pretrained_dir}unitaryai/detoxify/toxic_original-c1212f89.ckpt",
        "dtu_": f"{pretrained_dir}unitaryai/detoxify/toxic_debiased-c7548aa0.ckpt",
        "dtm_": f"{pretrained_dir}unitaryai/detoxify/multilingual_debiased-0b549669.ckpt"
    }
    # Config directory passed as `config_dir` for the checkpoint with the same key.
    dtfy_configs: Dict[str, str] = {
        "dto_": f"{pretrained_dir}bert-base-uncased",
        "dtu_": f"{pretrained_dir}roberta-base",
        "dtm_": f"{pretrained_dir}xlm-roberta-base"
    }

    # --- TweetEval classifiers ---
    tweeteval_model_max_length: int = 512
    tweeteval_batch_size: int = 64
    # Model directory per feature; the key becomes the feature column name.
    tweeteval_models: Dict[str, str] = {
        "te_roberta_off": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-offensive",
        "te_roberta_emo_anger": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-emotion",
        "te_roberta_snt_neg": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-sentiment",
        "te_roberta_iro": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-irony",
        "te_xlm_roberta_snt_neg": f"{pretrained_dir}cardiffnlp/twitter-xlm-roberta-base-sentiment",
    }
    # Index of the softmax column that is stored as the feature for each model.
    tweeteval_label_index: Dict[str, int] = {
        "te_roberta_off": 1,
        "te_roberta_emo_anger": 0,
        "te_roberta_snt_neg": 0,
        "te_roberta_iro": 1,
        "te_xlm_roberta_snt_neg": 0,
    }

    # --- HateBERT classifiers ---
    hatebert_model_max_length: int = 512
    hatebert_batch_size: int = 128
    # Model directory per feature; the key becomes the feature column name.
    # No leading "/" after the placeholder: pretrained_dir already ends with one, and
    # the extra slash used to yield "pretrained//hatebert/...".
    hatebert_models: Dict[str, str] = {
        "hb_bert_off": f"{pretrained_dir}hatebert/bert-offenseval",
        "hb_bert_abu": f"{pretrained_dir}hatebert/bert-abuseval",
        "hb_hatebert_off": f"{pretrained_dir}hatebert/hatebert-offenseval",
        "hb_hatebert_abu": f"{pretrained_dir}hatebert/hatebert-abuseval",
    }

    # --- Sentence embeddings ---
    # Assigned to SentenceTransformer.max_seq_length before encoding.
    em_max_seq_length: int = 128
    em_batch_size: int = 1000
    em_models: Dict[str, str] = {
        "paraphrase-MiniLM-L6-v2": f"{pretrained_dir}sentence-transformers/paraphrase-MiniLM-L6-v2"
    }
        
        
# Build the run configuration and echo it so the notebook records the settings used.
conf = Conf()
print(conf)

# When running on CUDA, list each visible GPU with PyTorch's current memory figures.
if conf.device.type == 'cuda':
    bytes_per_gb = 1024**3
    for i in range(torch.cuda.device_count()):
        print(f"device={i}, {torch.cuda.get_device_name(i)}")
        allocated_gb = round(torch.cuda.memory_allocated(i) / bytes_per_gb, 1)
        print('Mem Allocated:', allocated_gb, 'GB')
        reserved_gb = round(torch.cuda.memory_reserved(i) / bytes_per_gb, 1)
        print('Mem Cached:   ', reserved_gb, 'GB')

Conf(device=device(type='cuda'), pretrained_dir='pretrained/', dtfy_model_max_length=512, dtfy_batch_size=64, dtfy_models={'dto_': 'pretrained/unitaryai/detoxify/toxic_original-c1212f89.ckpt', 'dtu_': 'pretrained/unitaryai/detoxify/toxic_debiased-c7548aa0.ckpt', 'dtm_': 'pretrained/unitaryai/detoxify/multilingual_debiased-0b549669.ckpt'}, dtfy_configs={'dto_': 'pretrained/bert-base-uncased', 'dtu_': 'pretrained/roberta-base', 'dtm_': 'pretrained/xlm-roberta-base'}, tweeteval_model_max_length=512, tweeteval_batch_size=64, tweeteval_models={'te_roberta_off': 'pretrained/cardiffnlp/twitter-roberta-base-offensive', 'te_roberta_emo_anger': 'pretrained/cardiffnlp/twitter-roberta-base-emotion', 'te_roberta_snt_neg': 'pretrained/cardiffnlp/twitter-roberta-base-sentiment', 'te_roberta_iro': 'pretrained/cardiffnlp/twitter-roberta-base-irony', 'te_xlm_roberta_snt_neg': 'pretrained/cardiffnlp/twitter-xlm-roberta-base-sentiment'}, tweeteval_label_index={'te_roberta_off': 1, 'te_roberta_emo_anger': 

In [3]:
# Percentile cut points handed to DataFrame.describe() in the review section.
percentiles = [
    .01, .05, .1, .2, .3, .4, .5,
    .6, .7, .8, .9, .95, .99,
]

# Set before any tokenizer is created in this notebook.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# pandas options: treat +/-inf as missing and lift the display limits so that wide
# feature tables are shown in full.
for option_name, option_value in (
    ("use_inf_as_na", True),
    ("max_info_columns", 9999),
    ("display.max_columns", 9999),
    ("display.max_rows", 9999),
    ('max_colwidth', 9999),
):
    pd.set_option(option_name, option_value)

# Registers DataFrame.progress_apply, which the feature cells below rely on.
tqdm.pandas()

In [4]:
%%time
# Load the pre-processed validation data. The recorded output shows three string
# columns (text, text1, text2); text1 feeds the character/readability features and
# text2 feeds the transformer models further down.
df = pd.read_parquet("input/pre_val.parquet")
# Print the schema and non-null counts as a quick sanity check of the input.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14251 entries, 0 to 14250
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    14251 non-null  object
 1   text1   14251 non-null  object
 2   text2   14251 non-null  object
dtypes: object(3)
memory usage: 334.1+ KB
Wall time: 66 ms


# Character level features

In [5]:
%%time
col = "length"
df[col] = df["text1"].str.len()
df[col] = df[col].astype(np.int16)

Wall time: 6 ms


In [6]:
def digit_frac(row) -> float:
    """Return ``mylib.digit_frac`` evaluated on the row's "text1" value."""
    text = row["text1"]
    return mylib.digit_frac(text)


def letter_frac(row) -> float:
    """Return ``mylib.letter_frac`` evaluated on the row's "text1" value."""
    text = row["text1"]
    return mylib.letter_frac(text)


def space_frac(row) -> float:
    """Return ``mylib.space_frac`` evaluated on the row's "text1" value."""
    text = row["text1"]
    return mylib.space_frac(text)


def punc_frac(row) -> float:
    """Return ``mylib.punc_frac`` evaluated on the row's "text1" value."""
    text = row["text1"]
    return mylib.punc_frac(text)


def upper_frac(row) -> float:
    """Return ``mylib.upper_frac`` evaluated on the row's "text1" value."""
    text = row["text1"]
    return mylib.upper_frac(text)


# Output column name -> row-wise feature function. Each function's own name doubles
# as its column name, and the tuple order fixes the column order.
char_fns: Dict[str, Callable] = {
    feature_fn.__name__: feature_fn
    for feature_fn in (digit_frac, letter_frac, space_frac, punc_frac, upper_frac)
}

In [7]:
# Evaluate every character-level feature row by row (with a progress bar) and store
# each result as a float32 column named after the feature.
for feature_name, feature_fn in char_fns.items():
    print(feature_name)
    df[feature_name] = df.progress_apply(feature_fn, axis=1).astype(np.float32)

digit_frac


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 23622.89it/s]


letter_frac


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 22153.58it/s]


space_frac


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 23388.05it/s]


punc_frac


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 21226.96it/s]


upper_frac


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 23726.65it/s]


# Textstat features

In [8]:
def syllable_count(row) -> int:
    """Return ``textstat.syllable_count`` of the row's "text1" value."""
    text = row["text1"]
    return textstat.syllable_count(text)


def lexicon_count(row) -> int:
    """Return ``textstat.lexicon_count`` of the row's "text1" value."""
    text = row["text1"]
    return textstat.lexicon_count(text)


def sentence_count(row) -> int:
    """Return ``textstat.sentence_count`` of the row's "text1" value."""
    text = row["text1"]
    return textstat.sentence_count(text)


def syllables_per_word(row) -> float:
    """Ratio of the "syllable_count" column to ("lexicon_count" + 1)."""
    numerator = row["syllable_count"]
    denominator = row["lexicon_count"] + 1  # the +1 keeps the denominator non-zero
    return numerator / denominator


def syllables_per_sent(row) -> float:
    """Ratio of the "syllable_count" column to ("sentence_count" + 1)."""
    numerator = row["syllable_count"]
    denominator = row["sentence_count"] + 1  # the +1 keeps the denominator non-zero
    return numerator / denominator


def words_per_sent(row) -> float:
    """Ratio of the "lexicon_count" column to ("sentence_count" + 1)."""
    numerator = row["lexicon_count"]
    denominator = row["sentence_count"] + 1  # the +1 keeps the denominator non-zero
    return numerator / denominator


def flesch_reading_ease(row) -> float:
    """Return ``textstat.flesch_reading_ease`` of the row's "text1" value."""
    text = row["text1"]
    return textstat.flesch_reading_ease(text)


def flesch_kincaid_grade(row) -> float:
    """Return ``textstat.flesch_kincaid_grade`` of the row's "text1" value."""
    text = row["text1"]
    return textstat.flesch_kincaid_grade(text)


def gunning_fog(row) -> float:
    """Return ``textstat.gunning_fog`` of the row's "text1" value."""
    text = row["text1"]
    return textstat.gunning_fog(text)


def smog_index(row) -> float:
    """Return ``textstat.smog_index`` of the row's "text1" value."""
    text = row["text1"]
    return textstat.smog_index(text)


def automated_readability_index(row) -> float:
    """Return ``textstat.automated_readability_index`` of the row's "text1" value."""
    text = row["text1"]
    return textstat.automated_readability_index(text)


def coleman_liau_index(row) -> float:
    """Return ``textstat.coleman_liau_index`` of the row's "text1" value."""
    text = row["text1"]
    return textstat.coleman_liau_index(text)


def linsear_write_formula(row) -> float:
    """Return ``textstat.linsear_write_formula`` of the row's "text1" value."""
    text = row["text1"]
    return textstat.linsear_write_formula(text)


def dale_chall_readability_score(row) -> float:
    """Return ``textstat.dale_chall_readability_score`` of the row's "text1" value."""
    text = row["text1"]
    return textstat.dale_chall_readability_score(text)


# Output column name -> row-wise feature function. The three raw counts are left out
# on purpose: they are computed in their own cells because the ratio features read
# them from the row. Each function's name doubles as its column name.
textstat_fns: Dict[str, Callable] = {
    feature_fn.__name__: feature_fn
    for feature_fn in (
        syllables_per_word,
        syllables_per_sent,
        words_per_sent,
        flesch_reading_ease,
        flesch_kincaid_grade,
        gunning_fog,
        smog_index,
        automated_readability_index,
        coleman_liau_index,
        linsear_write_formula,
        dale_chall_readability_score,
    )
}

In [9]:
# Raw syllable total per row, stored as int32; the per-word and per-sentence ratio
# features read this column.
col = "syllable_count"
df[col] = df.progress_apply(syllable_count, axis=1).astype(np.int32)

100%|███████████████████████████████████████| 14251/14251 [00:02<00:00, 5157.41it/s]


In [10]:
# Raw textstat lexicon count per row, stored as int32; the per-word and per-sentence
# ratio features read this column.
col = "lexicon_count"
df[col] = df.progress_apply(lexicon_count, axis=1).astype(np.int32)

100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 49031.62it/s]


In [11]:
# Raw textstat sentence count per row, stored as int32; the per-sentence ratio
# features read this column.
col = "sentence_count"
df[col] = df.progress_apply(sentence_count, axis=1).astype(np.int32)

100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 26907.89it/s]


In [12]:
# Evaluate every ratio/readability feature row by row (with a progress bar) and store
# each result as a float32 column named after the feature.
for feature_name, feature_fn in textstat_fns.items():
    print(feature_name)
    df[feature_name] = df.progress_apply(feature_fn, axis=1).astype(np.float32)

syllables_per_word


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 82795.35it/s]


syllables_per_sent


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 92691.05it/s]


words_per_sent


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 87525.02it/s]


flesch_reading_ease


100%|███████████████████████████████████████| 14251/14251 [00:01<00:00, 7250.34it/s]


flesch_kincaid_grade


100%|███████████████████████████████████████| 14251/14251 [00:01<00:00, 7365.88it/s]


gunning_fog


100%|███████████████████████████████████████| 14251/14251 [00:02<00:00, 5826.81it/s]


smog_index


100%|███████████████████████████████████████| 14251/14251 [00:01<00:00, 7614.62it/s]


automated_readability_index


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 18131.83it/s]


coleman_liau_index


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 15846.24it/s]


linsear_write_formula


100%|███████████████████████████████████████| 14251/14251 [00:01<00:00, 8842.24it/s]


dale_chall_readability_score


100%|███████████████████████████████████████| 14251/14251 [00:02<00:00, 5929.81it/s]


# TweetEval labels

In [13]:
# Score every `text2` entry with each TweetEval classifier and keep one softmax
# column per model (selected by conf.tweeteval_label_index) as a float32 feature.
sentences = list(df["text2"])
for col, model_dir in conf.tweeteval_models.items():
    # Each model directory ships its own tokenizer, so it is reloaded per model.
    tokenizer = AutoTokenizer.from_pretrained(
        model_dir,
        model_max_length=conf.tweeteval_model_max_length
    )
    # Truncate and pad to the maximum length so every encoded row has the same size.
    x = tokenizer(sentences, truncation=True, padding="max_length")
    # shuffle=False keeps predictions aligned with the DataFrame row order.
    batches = torch.utils.data.DataLoader(mylib.Dataset(x), batch_size=conf.tweeteval_batch_size, shuffle=False)
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    model.eval()
    model.to(conf.device)
    # Collect per-batch logits and concatenate once after the loop; concatenating
    # inside the loop re-copies everything gathered so far on every batch.
    batch_logits = []
    with torch.no_grad():
        for batch in tqdm(batches):
            batch = {k: v.to(conf.device) for k, v in batch.items()}
            batch_logits.append(model(**batch).logits.cpu())
    # NOTE: after the softmax `logits` holds class probabilities; the variable name
    # and the printed label are kept unchanged so the recorded output still matches.
    logits = torch.nn.functional.softmax(torch.cat(batch_logits, 0), dim=1)
    print(f"{col} {logits.size()}\nlogits[:10]={logits[:10]}")
    # Hand pandas a NumPy array rather than a torch tensor.
    df[col] = logits[:, conf.tweeteval_label_index[col]].numpy().astype(np.float32)
    # Drop the references before the next model is loaded.
    del tokenizer, model
    gc.collect()

100%|█████████████████████████████████████████████| 223/223 [10:14<00:00,  2.75s/it]


te_roberta_off torch.Size([14251, 2])
logits[:10]=tensor([[0.1779, 0.8221],
        [0.6278, 0.3722],
        [0.5331, 0.4669],
        [0.1771, 0.8229],
        [0.5490, 0.4510],
        [0.1296, 0.8704],
        [0.7580, 0.2420],
        [0.1206, 0.8794],
        [0.8800, 0.1200],
        [0.8732, 0.1268]])


100%|█████████████████████████████████████████████| 223/223 [10:12<00:00,  2.75s/it]


te_roberta_emo_anger torch.Size([14251, 4])
logits[:10]=tensor([[0.9716, 0.0062, 0.0137, 0.0085],
        [0.9614, 0.0053, 0.0239, 0.0095],
        [0.9339, 0.0133, 0.0228, 0.0300],
        [0.9768, 0.0053, 0.0098, 0.0081],
        [0.8097, 0.0286, 0.1144, 0.0474],
        [0.9795, 0.0090, 0.0055, 0.0060],
        [0.9220, 0.0076, 0.0385, 0.0319],
        [0.9709, 0.0047, 0.0167, 0.0078],
        [0.8384, 0.0119, 0.1101, 0.0396],
        [0.9004, 0.0064, 0.0601, 0.0331]])


100%|█████████████████████████████████████████████| 223/223 [10:12<00:00,  2.75s/it]


te_roberta_snt_neg torch.Size([14251, 3])
logits[:10]=tensor([[0.9658, 0.0312, 0.0030],
        [0.6346, 0.3339, 0.0315],
        [0.6467, 0.3247, 0.0286],
        [0.9028, 0.0923, 0.0048],
        [0.6146, 0.3350, 0.0505],
        [0.9616, 0.0306, 0.0078],
        [0.6320, 0.3449, 0.0231],
        [0.9372, 0.0572, 0.0055],
        [0.2759, 0.6508, 0.0733],
        [0.4407, 0.5047, 0.0546]])


100%|█████████████████████████████████████████████| 223/223 [10:12<00:00,  2.75s/it]


te_roberta_iro torch.Size([14251, 2])
logits[:10]=tensor([[0.8226, 0.1774],
        [0.9538, 0.0462],
        [0.8485, 0.1515],
        [0.9604, 0.0396],
        [0.1811, 0.8189],
        [0.7768, 0.2232],
        [0.6647, 0.3353],
        [0.9594, 0.0406],
        [0.8566, 0.1434],
        [0.9214, 0.0786]])


100%|█████████████████████████████████████████████| 223/223 [10:13<00:00,  2.75s/it]


te_xlm_roberta_snt_neg torch.Size([14251, 3])
logits[:10]=tensor([[0.8810, 0.0782, 0.0408],
        [0.8517, 0.0957, 0.0526],
        [0.7105, 0.2077, 0.0818],
        [0.8994, 0.0778, 0.0228],
        [0.5345, 0.3990, 0.0665],
        [0.9132, 0.0655, 0.0213],
        [0.6904, 0.2476, 0.0620],
        [0.8444, 0.1347, 0.0208],
        [0.6653, 0.2600, 0.0746],
        [0.6073, 0.3195, 0.0732]])


# HateBERT labels

In [14]:
# Per the original author's note, all HateBERT models use the same tokenizer, so it
# is loaded a single time from the "hb_hatebert_off" model directory.
hatebert_tokenizer_dir = conf.hatebert_models["hb_hatebert_off"]
tokenizer = AutoTokenizer.from_pretrained(
    hatebert_tokenizer_dir,
    model_max_length=conf.hatebert_model_max_length,
)
print(f"{repr(tokenizer)}\nmodel_input_names={tokenizer.model_input_names}")

PreTrainedTokenizerFast(name_or_path='pretrained//hatebert/hatebert-offenseval', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
model_input_names=['input_ids', 'token_type_ids', 'attention_mask']


In [15]:
%%time
# Encode all sentences once; the result is reused for every HateBERT model below.
# truncation=True with padding="max_length" gives every row the same length
# (the tokenizer was created with model_max_length=conf.hatebert_model_max_length).
x = tokenizer(sentences, truncation=True, padding="max_length")
# Sanity check: which encoder inputs were produced and how many rows were encoded.
print(f"{repr(x.keys())}\nlen={len(x['input_ids'])}")

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
len=14251
Wall time: 3.98 s


In [16]:
# One DataLoader is shared by all HateBERT models because they share the encoding `x`.
# shuffle=False keeps predictions aligned with the DataFrame row order.
batches = torch.utils.data.DataLoader(mylib.Dataset(x), batch_size=conf.hatebert_batch_size, shuffle=False)
for col, model_dir in conf.hatebert_models.items():
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    model.eval()
    model.to(conf.device)
    # Collect per-batch logits and concatenate once after the loop; concatenating
    # inside the loop re-copies everything gathered so far on every batch.
    batch_logits = []
    with torch.no_grad():
        for batch in tqdm(batches):
            batch = {k: v.to(conf.device) for k, v in batch.items()}
            batch_logits.append(model(**batch).logits.cpu())
    # NOTE: after the softmax `logits` holds class probabilities; the variable name
    # and the printed label are kept unchanged so the recorded output still matches.
    logits = torch.nn.functional.softmax(torch.cat(batch_logits, 0), dim=1)
    print(f"{col} {logits.size()}\nlogits[:10]={logits[:10]}")
    # Column 1 is stored as the feature - presumably the positive (offensive/abusive)
    # class; TODO confirm against the model cards.
    df[col] = logits[:, 1].numpy().astype(np.float32)
    # Release the model once it has been used, as the TweetEval cell does, so the
    # last model does not stay on the device while the next section runs.
    del model
    gc.collect()

100%|█████████████████████████████████████████████| 112/112 [10:21<00:00,  5.55s/it]


hb_bert_off torch.Size([14251, 2])
logits[:10]=tensor([[0.2983, 0.7017],
        [0.8147, 0.1853],
        [0.1562, 0.8438],
        [0.1588, 0.8412],
        [0.9028, 0.0972],
        [0.0285, 0.9715],
        [0.9294, 0.0706],
        [0.0343, 0.9657],
        [0.9233, 0.0767],
        [0.9483, 0.0517]])


100%|█████████████████████████████████████████████| 112/112 [10:21<00:00,  5.55s/it]


hb_bert_abu torch.Size([14251, 2])
logits[:10]=tensor([[0.1513, 0.8487],
        [0.9796, 0.0204],
        [0.0699, 0.9301],
        [0.5108, 0.4892],
        [0.9945, 0.0055],
        [0.0372, 0.9628],
        [0.9839, 0.0161],
        [0.0294, 0.9706],
        [0.9913, 0.0087],
        [0.9932, 0.0068]])


100%|█████████████████████████████████████████████| 112/112 [10:21<00:00,  5.55s/it]


hb_hatebert_off torch.Size([14251, 2])
logits[:10]=tensor([[0.0914, 0.9086],
        [0.3504, 0.6496],
        [0.1702, 0.8298],
        [0.4027, 0.5973],
        [0.8319, 0.1681],
        [0.0522, 0.9478],
        [0.8997, 0.1003],
        [0.1703, 0.8297],
        [0.6767, 0.3233],
        [0.9441, 0.0559]])


100%|█████████████████████████████████████████████| 112/112 [10:21<00:00,  5.55s/it]

hb_hatebert_abu torch.Size([14251, 2])
logits[:10]=tensor([[0.1551, 0.8449],
        [0.5136, 0.4864],
        [0.0712, 0.9288],
        [0.6917, 0.3083],
        [0.8592, 0.1408],
        [0.3414, 0.6586],
        [0.9536, 0.0464],
        [0.0703, 0.9297],
        [0.9142, 0.0858],
        [0.9844, 0.0156]])





# Detoxify labels

In [17]:
# Reclaim memory left over from the previous section before loading Detoxify models.
gc.collect()

# Names of every Detoxify feature column, in creation order (reused by the review cell).
dtfy_fs = []
for prefix, checkpoint in tqdm(conf.dtfy_models.items()):
    label_scores = mylib.detoxify_labels(
        sentences,
        checkpoint=checkpoint,
        config_dir=conf.dtfy_configs[prefix],
        model_max_length=conf.dtfy_model_max_length,
        device=conf.device,
        batch_size=conf.dtfy_batch_size
    )
    # One float32 column per returned label, named "<model prefix><label>".
    for label, scores in label_scores.items():
        feature_name = prefix + label
        df[feature_name] = scores
        df[feature_name] = df[feature_name].astype(np.float32)
        dtfy_fs.append(feature_name)
    gc.collect()

100%|████████████████████████████████████████████████| 3/3 [30:09<00:00, 603.24s/it]


In [18]:
# Show the Detoxify feature columns that were created (model prefix + label name).
print(dtfy_fs)

['dto_toxicity', 'dto_severe_toxicity', 'dto_obscene', 'dto_threat', 'dto_insult', 'dto_identity_attack', 'dtu_toxicity', 'dtu_severe_toxicity', 'dtu_obscene', 'dtu_identity_attack', 'dtu_insult', 'dtu_threat', 'dtu_sexual_explicit', 'dtm_toxicity', 'dtm_severe_toxicity', 'dtm_obscene', 'dtm_identity_attack', 'dtm_insult', 'dtm_threat', 'dtm_sexual_explicit']


# Embeddings

In [19]:
# Encode every sentence with the locally stored sentence-transformers model.
em_model_dir = conf.em_models["paraphrase-MiniLM-L6-v2"]
model = SentenceTransformer(em_model_dir, device=conf.device)
# Sequence-length limit used for encoding, taken from the configuration.
model.max_seq_length = conf.em_max_seq_length
em = model.encode(
    sentences=sentences,
    batch_size=conf.em_batch_size,
    show_progress_bar=True,
    convert_to_numpy=True,
)
print(f"em.shape={em.shape}")

[INFO|SentenceTransformer.py:60] 2022-01-30 13:23:41,499 >> Load pretrained SentenceTransformer: pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2
[INFO|SentenceTransformer.py:60] 2022-01-30 13:23:41,499 >> Load pretrained SentenceTransformer: pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2
[INFO|SentenceTransformer.py:60] 2022-01-30 13:23:41,499 >> Load pretrained SentenceTransformer: pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2
[INFO|SentenceTransformer.py:60] 2022-01-30 13:23:41,499 >> Load pretrained SentenceTransformer: pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2


Batches:   0%|          | 0/15 [00:00<?, ?it/s]

em.shape=(14251, 384)


In [20]:
%%time
em_size = em.shape[1]
em_cols = [f"zz{i:04d}" for i in range(em_size)]
df[em_cols] = em
df[em_cols] = df[em_cols].astype(np.float32)
del sentences

  self[col] = igetitem(value, i)


Wall time: 293 ms


# Review data

In [21]:
# Every tabular (non-embedding) feature, in the order used for review and export:
# length, character fractions, readability, Detoxify, HateBERT, TweetEval.
cols = [
    "length",
    *char_fns.keys(),
    *textstat_fns.keys(),
    *dtfy_fs,
    *conf.hatebert_models.keys(),
    *conf.tweeteval_models.keys(),
]
# Summary statistics at the configured percentiles (displayed as the cell output).
df[cols].describe(percentiles=percentiles)

Unnamed: 0,length,digit_frac,letter_frac,space_frac,punc_frac,upper_frac,syllables_per_word,syllables_per_sent,words_per_sent,flesch_reading_ease,flesch_kincaid_grade,gunning_fog,smog_index,automated_readability_index,coleman_liau_index,linsear_write_formula,dale_chall_readability_score,dto_toxicity,dto_severe_toxicity,dto_obscene,dto_threat,dto_insult,dto_identity_attack,dtu_toxicity,dtu_severe_toxicity,dtu_obscene,dtu_identity_attack,dtu_insult,dtu_threat,dtu_sexual_explicit,dtm_toxicity,dtm_severe_toxicity,dtm_obscene,dtm_identity_attack,dtm_insult,dtm_threat,dtm_sexual_explicit,hb_bert_off,hb_bert_abu,hb_hatebert_off,hb_hatebert_abu,te_roberta_off,te_roberta_emo_anger,te_roberta_snt_neg,te_roberta_iro,te_xlm_roberta_snt_neg
count,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0
mean,404.495825,0.003891,0.778295,0.172929,0.044886,0.088107,1.360076,18.846472,13.300896,64.28672,9.450614,11.062636,4.342566,12.748144,8.927725,9.590234,9.313502,0.453456,0.064607,0.294084,0.027572,0.271963,0.062425,0.529311,0.044723,0.292764,0.064443,0.308312,0.017978,0.107717,0.501251,0.058705,0.258191,0.058524,0.270354,0.027115,0.138774,0.6396,0.497501,0.601631,0.457546,0.540523,0.810584,0.713279,0.253815,0.74576
std,686.044494,0.016081,0.047163,0.023382,0.043165,0.179211,2.745461,40.391994,27.804605,314.100159,48.095737,21.882164,5.233645,90.17897,97.308975,9.253856,3.154639,0.427223,0.148911,0.393717,0.120505,0.365127,0.176418,0.402319,0.116356,0.392156,0.165872,0.363142,0.094901,0.236483,0.414678,0.151742,0.372959,0.167616,0.353014,0.111837,0.282632,0.345512,0.39975,0.332228,0.381498,0.270357,0.275462,0.261218,0.220133,0.222764
min,8.0,0.0,0.004427,0.000403,0.0,0.0,0.666667,1.0,1.0,-36681.820312,-3.1,0.8,0.0,-9.3,-14.15,0.0,0.1,0.00053,7.9e-05,0.000152,8.9e-05,0.000164,0.000124,0.000344,1e-06,1.7e-05,5.7e-05,6.1e-05,1.5e-05,1e-05,0.000142,1.1e-05,5.6e-05,6.9e-05,0.000128,1.6e-05,1.3e-05,0.009355,0.002915,0.008755,0.006311,0.026456,0.005595,0.001072,0.018925,0.009896
1%,21.0,0.0,0.616026,0.1,0.0,0.0,0.857143,2.5,2.0,-62.339998,-1.9,1.6,0.0,-2.8,-3.06,1.0,0.6,0.000638,8.7e-05,0.000166,9.9e-05,0.000175,0.000136,0.001118,2e-06,6.1e-05,0.000121,9.7e-05,3.1e-05,3e-05,0.000381,1.6e-05,8.4e-05,0.000108,0.00018,2.7e-05,1.8e-05,0.018738,0.004983,0.024701,0.01065,0.08032,0.024533,0.011482,0.032187,0.061426
5%,30.0,0.0,0.708976,0.134146,0.0,0.0,1.0,3.5,2.5,31.219999,0.5,2.4,0.0,0.4,0.45,2.0,6.33,0.000902,9.3e-05,0.000178,0.000109,0.000184,0.000143,0.004588,6e-06,0.000225,0.000304,0.00026,6e-05,8.8e-05,0.000962,2.4e-05,0.000146,0.000169,0.000354,3.8e-05,2.4e-05,0.042534,0.009093,0.057148,0.017225,0.135846,0.099382,0.135426,0.047117,0.24346
10%,43.0,0.0,0.735294,0.146497,0.013605,0.009153,1.071429,4.666667,3.5,44.240002,1.8,3.2,0.0,2.1,2.41,2.5,6.92,0.001632,0.0001,0.000202,0.000118,0.000211,0.000159,0.013299,1.1e-05,0.000457,0.000525,0.000784,9.2e-05,0.000161,0.003006,3.5e-05,0.000311,0.000272,0.000855,5.6e-05,3.7e-05,0.082899,0.015376,0.098087,0.025102,0.179488,0.275754,0.299614,0.059038,0.409993
20%,67.0,0.0,0.75641,0.15873,0.021739,0.017241,1.166667,7.0,5.0,55.740002,3.4,5.0,0.0,4.2,4.35,3.8,7.6,0.008378,0.000114,0.000445,0.000157,0.000488,0.000249,0.053169,2.6e-05,0.001316,0.001083,0.003497,0.000163,0.000388,0.019163,9.6e-05,0.001246,0.000661,0.003701,0.000154,0.000128,0.210326,0.038514,0.208715,0.047678,0.252734,0.69218,0.498913,0.08016,0.573763
30%,97.0,0.0,0.768421,0.16527,0.026846,0.022472,1.230769,9.0,6.5,62.68,4.5,6.55,0.0,5.6,5.63,4.9,8.08,0.028229,0.000148,0.001001,0.000272,0.001302,0.00046,0.141701,5.6e-05,0.003462,0.001895,0.011569,0.000259,0.000824,0.079479,0.000244,0.003418,0.001322,0.010421,0.000365,0.000353,0.406858,0.090723,0.352795,0.091763,0.333905,0.8693,0.622235,0.103981,0.690566
40%,136.0,0.0,0.777202,0.170213,0.031746,0.027174,1.285714,11.0,8.0,68.059998,5.6,8.01,0.0,6.8,6.56,6.0,8.5,0.086931,0.000292,0.00309,0.00048,0.004482,0.001061,0.306635,0.000118,0.009481,0.003202,0.035604,0.000395,0.001711,0.232045,0.000585,0.008615,0.002349,0.024282,0.000693,0.000819,0.619393,0.226476,0.530648,0.180844,0.427558,0.927613,0.723054,0.13303,0.775033


In [22]:
# Final export order: raw text, tabular features, then the embedding dimensions.
# Stripping "text" and the embedding names from the existing list first keeps this
# cell idempotent; without it, re-running would add "text" and every embedding
# column a second time and yield duplicate columns in the selection below.
_em_col_set = set(em_cols)
cols = ["text"] + [c for c in cols if c != "text" and c not in _em_col_set] + em_cols
df[cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14251 entries, 0 to 14250
Data columns (total 431 columns):
 #    Column                        Non-Null Count  Dtype  
---   ------                        --------------  -----  
 0    text                          14251 non-null  object 
 1    length                        14251 non-null  int16  
 2    digit_frac                    14251 non-null  float32
 3    letter_frac                   14251 non-null  float32
 4    space_frac                    14251 non-null  float32
 5    punc_frac                     14251 non-null  float32
 6    upper_frac                    14251 non-null  float32
 7    syllables_per_word            14251 non-null  float32
 8    syllables_per_sent            14251 non-null  float32
 9    words_per_sent                14251 non-null  float32
 10   flesch_reading_ease           14251 non-null  float32
 11   flesch_kincaid_grade          14251 non-null  float32
 12   gunning_fog                   14251 non-null

In [23]:
%%time
df[cols].to_parquet("output/val.parquet", index=False)

Wall time: 498 ms
