In [1]:
import os
import gc
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.stats import rankdata
import textstat
from tqdm import tqdm
from typing import Dict, NamedTuple
import scml
import mylib

In [2]:
class Conf(NamedTuple):
    device: torch.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    pretrained_dir: str = "pretrained/"
    dtfy_model_max_length: int = 512
    dtfy_batch_size: int = 64
    dtfy_models: Dict[str, str] = {
        "dto_": f"{pretrained_dir}unitaryai/detoxify/toxic_original-c1212f89.ckpt",
        "dtu_": f"{pretrained_dir}unitaryai/detoxify/toxic_debiased-c7548aa0.ckpt",
        "dtm_": f"{pretrained_dir}unitaryai/detoxify/multilingual_debiased-0b549669.ckpt"
    }
    dtfy_configs: Dict[str, str] = {
        "dto_": f"{pretrained_dir}bert-base-uncased",
        "dtu_": f"{pretrained_dir}roberta-base",
        "dtm_": f"{pretrained_dir}xlm-roberta-base"
    }
    tweeteval_model_max_length: int = 512
    tweeteval_batch_size: int = 64
    tweeteval_models: Dict[str, str] = {
        "te_roberta_off": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-offensive",
        "te_roberta_emo_anger": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-emotion",
        "te_roberta_snt_neg": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-sentiment",
        "te_roberta_iro": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-irony",
        "te_xlm_roberta_snt_neg": f"{pretrained_dir}cardiffnlp/twitter-xlm-roberta-base-sentiment",
    }
    tweeteval_label_index: Dict[str, int] = {
        "te_roberta_off": 1,
        "te_roberta_emo_anger": 0,
        "te_roberta_snt_neg": 0,
        "te_roberta_iro": 1,
        "te_xlm_roberta_snt_neg": 0,
    }
    hatebert_model_max_length: int = 512
    hatebert_batch_size: int = 128
    hatebert_models: Dict[str, str] = {
        "hb_bert_off": f"{pretrained_dir}/hatebert/bert-offenseval",
        "hb_bert_abu" : f"{pretrained_dir}/hatebert/bert-abuseval",
        "hb_hatebert_off": f"{pretrained_dir}/hatebert/hatebert-offenseval",
        "hb_hatebert_abu" : f"{pretrained_dir}/hatebert/hatebert-abuseval",
    }
    em_max_seq_length: int = 128
    em_batch_size: int = 1000
    em_models: Dict[str, str] = {
        "paraphrase-MiniLM-L6-v2": f"{pretrained_dir}sentence-transformers/paraphrase-MiniLM-L6-v2"
    }
        
        
conf = Conf()
print(conf)
if conf.device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

Conf(device=device(type='cuda'), pretrained_dir='pretrained/', dtfy_model_max_length=512, dtfy_batch_size=64, dtfy_models={'dto_': 'pretrained/unitaryai/detoxify/toxic_original-c1212f89.ckpt', 'dtu_': 'pretrained/unitaryai/detoxify/toxic_debiased-c7548aa0.ckpt', 'dtm_': 'pretrained/unitaryai/detoxify/multilingual_debiased-0b549669.ckpt'}, dtfy_configs={'dto_': 'pretrained/bert-base-uncased', 'dtu_': 'pretrained/roberta-base', 'dtm_': 'pretrained/xlm-roberta-base'}, tweeteval_model_max_length=512, tweeteval_batch_size=64, tweeteval_models={'te_roberta_off': 'pretrained/cardiffnlp/twitter-roberta-base-offensive', 'te_roberta_emo_anger': 'pretrained/cardiffnlp/twitter-roberta-base-emotion', 'te_roberta_snt_neg': 'pretrained/cardiffnlp/twitter-roberta-base-sentiment', 'te_roberta_iro': 'pretrained/cardiffnlp/twitter-roberta-base-irony', 'te_xlm_roberta_snt_neg': 'pretrained/cardiffnlp/twitter-xlm-roberta-base-sentiment'}, tweeteval_label_index={'te_roberta_off': 1, 'te_roberta_emo_anger': 

In [3]:
percentiles=[.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]
os.environ["TOKENIZERS_PARALLELISM"] = "false"
pd.set_option("use_inf_as_na", True)
pd.set_option("max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option('max_colwidth', 9999)
tqdm.pandas()

In [4]:
%%time
df = pd.read_csv("input/validation_data.csv", engine="c", low_memory=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30108 entries, 0 to 30107
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   worker      30108 non-null  int64 
 1   less_toxic  30108 non-null  object
 2   more_toxic  30108 non-null  object
dtypes: int64(1), object(2)
memory usage: 705.8+ KB
Wall time: 225 ms


In [5]:
texts = set(df["less_toxic"]) | set(df["more_toxic"])
texts = list(texts)
df = pd.DataFrame(data={"text": texts})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14251 entries, 0 to 14250
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    14251 non-null  object
dtypes: object(1)
memory usage: 111.5+ KB


# Preprocess Text

In [6]:
def preprocess(row) -> str:
    return mylib.preprocess(row["text"])


col = "ptext"
df[col] = df.progress_apply(preprocess, axis=1)

100%|███████████████████████████████████████| 14251/14251 [00:10<00:00, 1310.47it/s]


# Character level features

In [7]:
%%time
col = "length"
df[col] = df["ptext"].str.len()
df[col] = df[col].astype(np.int16)
char_fs = [col]

Wall time: 7.02 ms


In [8]:
def digit_frac(row) -> float:
    return mylib.digit_frac(row["ptext"])


def letter_frac(row) -> float:
    return mylib.letter_frac(row["ptext"])


def space_frac(row) -> float:
    return mylib.space_frac(row["ptext"])


def punc_frac(row) -> float:
    return mylib.punc_frac(row["ptext"])


def upper_frac(row) -> float:
    return mylib.upper_frac(row["ptext"])

In [9]:
col = "digit_frac"
df[col] = df.progress_apply(digit_frac, axis=1)
df[col] = df[col].astype(np.float32)
char_fs.append(col)

100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 23910.67it/s]


In [10]:
col = "letter_frac"
df[col] = df.progress_apply(letter_frac, axis=1)
df[col] = df[col].astype(np.float32)
char_fs.append(col)

100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 22254.71it/s]


In [11]:
col = "space_frac"
df[col] = df.progress_apply(space_frac, axis=1)
df[col] = df[col].astype(np.float32)
char_fs.append(col)

100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 24324.88it/s]


In [12]:
col = "punc_frac"
df[col] = df.progress_apply(punc_frac, axis=1)
df[col] = df[col].astype(np.float32)
char_fs.append(col)

100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 20583.59it/s]


In [13]:
col = "upper_frac"
df[col] = df.progress_apply(upper_frac, axis=1)
df[col] = df[col].astype(np.float32)
char_fs.append(col)

100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 23136.41it/s]


In [14]:
print(char_fs)

['length', 'digit_frac', 'letter_frac', 'space_frac', 'punc_frac', 'upper_frac']


# Textstat features

In [15]:
def syllable_count(row) -> int:
    return textstat.syllable_count(row["ptext"])


def lexicon_count(row) -> int:
    return textstat.lexicon_count(row["ptext"])


def sentence_count(row) -> int:
    return textstat.sentence_count(row["ptext"])


def flesch_reading_ease(row) -> float:
    return textstat.flesch_reading_ease(row["ptext"])


def flesch_kincaid_grade(row) -> float:
    return textstat.flesch_kincaid_grade(row["ptext"])


def gunning_fog(row) -> float:
    return textstat.gunning_fog(row["ptext"])


def smog_index(row) -> float:
    return textstat.smog_index(row["ptext"])


def automated_readability_index(row) -> float:
    return textstat.automated_readability_index(row["ptext"])


def coleman_liau_index(row) -> float:
    return textstat.coleman_liau_index(row["ptext"])


def linsear_write_formula(row) -> float:
    return textstat.linsear_write_formula(row["ptext"])


def dale_chall_readability_score(row) -> float:
    return textstat.dale_chall_readability_score(row["ptext"])

In [16]:
col = "flesch_reading_ease"
df[col] = df.progress_apply(flesch_reading_ease, axis=1)
df[col] = df[col].astype(np.float32)
textstat_fs = []
textstat_fs.append(col)

100%|███████████████████████████████████████| 14251/14251 [00:03<00:00, 3839.91it/s]


In [17]:
col = "flesch_kincaid_grade"
df[col] = df.progress_apply(flesch_kincaid_grade, axis=1)
df[col] = df[col].astype(np.float32)
textstat_fs.append(col)

100%|███████████████████████████████████████| 14251/14251 [00:01<00:00, 7399.00it/s]


In [18]:
col = "syllable_count"
df[col] = df.progress_apply(syllable_count, axis=1)
df[col] = df[col].astype(np.int16)
textstat_fs.append(col)

100%|██████████████████████████████████████| 14251/14251 [00:01<00:00, 10920.54it/s]


In [19]:
col = "lexicon_count"
df[col] = df.progress_apply(lexicon_count, axis=1)
df[col] = df[col].astype(np.int16)
textstat_fs.append(col)

100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 47650.04it/s]


In [20]:
col = "sentence_count"
df[col] = df.progress_apply(sentence_count, axis=1)
df[col] = df[col].astype(np.int16)
textstat_fs.append(col)

100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 28085.50it/s]


In [21]:
col = "gunning_fog"
df[col] = df.progress_apply(gunning_fog, axis=1)
df[col] = df[col].astype(np.float32)
textstat_fs.append(col)

100%|███████████████████████████████████████| 14251/14251 [00:02<00:00, 5908.02it/s]


In [22]:
col = "smog_index"
df[col] = df.progress_apply(smog_index, axis=1)
df[col] = df[col].astype(np.float32)
textstat_fs.append(col)

100%|███████████████████████████████████████| 14251/14251 [00:01<00:00, 8034.36it/s]


In [23]:
col = "automated_readability_index"
df[col] = df.progress_apply(automated_readability_index, axis=1)
df[col] = df[col].astype(np.float32)
textstat_fs.append(col)

100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 18206.74it/s]


In [24]:
col = "coleman_liau_index"
df[col] = df.progress_apply(coleman_liau_index, axis=1)
df[col] = df[col].astype(np.float32)
textstat_fs.append(col)

100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 15779.07it/s]


In [25]:
col = "linsear_write_formula"
df[col] = df.progress_apply(linsear_write_formula, axis=1)
df[col] = df[col].astype(np.float32)
textstat_fs.append(col)

100%|███████████████████████████████████████| 14251/14251 [00:01<00:00, 8914.59it/s]


In [26]:
col = "dale_chall_readability_score"
df[col] = df.progress_apply(dale_chall_readability_score, axis=1)
df[col] = df[col].astype(np.float32)
textstat_fs.append(col)

100%|███████████████████████████████████████| 14251/14251 [00:02<00:00, 5929.22it/s]


In [27]:
print(textstat_fs)

['flesch_reading_ease', 'flesch_kincaid_grade', 'syllable_count', 'lexicon_count', 'sentence_count', 'gunning_fog', 'smog_index', 'automated_readability_index', 'coleman_liau_index', 'linsear_write_formula', 'dale_chall_readability_score']


# TweetEval labels

In [28]:
sentences = list(df["ptext"])
for col, model_dir in conf.tweeteval_models.items():
    tokenizer = AutoTokenizer.from_pretrained(
        model_dir, 
        model_max_length=conf.tweeteval_model_max_length
    )
    #print(f"{repr(tokenizer)}\nmodel_input_names={tokenizer.model_input_names}")
    x = tokenizer(sentences, truncation=True, padding="max_length")
    batches = torch.utils.data.DataLoader(mylib.Dataset(x), batch_size=conf.tweeteval_batch_size, shuffle=False)
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    model.eval()
    model.to(conf.device)
    logits = None
    with torch.no_grad():
        for batch in tqdm(batches):
            for k, v in batch.items():
                batch[k] = v.to(conf.device)
            outputs = model(**batch)
            tmp = outputs.logits.detach().cpu()
            if logits is None:
                logits = tmp
            else:
                logits = torch.cat((logits, tmp), 0)
    logits = torch.nn.functional.softmax(logits, dim=1)
    print(f"{col} {logits.size()}\nlogits[:10]={logits[:10]}")
    df[col] = logits[:,conf.tweeteval_label_index[col]]
    df[col] = df[col].astype(np.float32)
    del tokenizer, model
    gc.collect()

100%|█████████████████████████████████████████████| 223/223 [10:21<00:00,  2.79s/it]


te_roberta_off torch.Size([14251, 2])
logits[:10]=tensor([[0.0792, 0.9208],
        [0.1202, 0.8798],
        [0.1112, 0.8888],
        [0.0748, 0.9252],
        [0.8643, 0.1357],
        [0.7094, 0.2906],
        [0.0944, 0.9056],
        [0.0591, 0.9409],
        [0.4862, 0.5138],
        [0.1527, 0.8473]])


100%|█████████████████████████████████████████████| 223/223 [10:20<00:00,  2.78s/it]


te_roberta_emo_anger torch.Size([14251, 4])
logits[:10]=tensor([[0.9744, 0.0050, 0.0135, 0.0071],
        [0.9789, 0.0045, 0.0103, 0.0062],
        [0.9646, 0.0142, 0.0070, 0.0141],
        [0.9725, 0.0079, 0.0106, 0.0090],
        [0.8288, 0.0078, 0.0271, 0.1363],
        [0.8803, 0.0076, 0.0705, 0.0416],
        [0.9792, 0.0053, 0.0082, 0.0073],
        [0.9763, 0.0074, 0.0063, 0.0100],
        [0.9685, 0.0100, 0.0089, 0.0126],
        [0.9676, 0.0133, 0.0049, 0.0142]])


100%|█████████████████████████████████████████████| 223/223 [10:21<00:00,  2.79s/it]


te_roberta_snt_neg torch.Size([14251, 3])
logits[:10]=tensor([[0.9433, 0.0500, 0.0067],
        [0.9584, 0.0378, 0.0038],
        [0.9230, 0.0609, 0.0161],
        [0.9762, 0.0204, 0.0034],
        [0.7088, 0.2823, 0.0089],
        [0.7100, 0.2748, 0.0153],
        [0.9319, 0.0607, 0.0075],
        [0.9782, 0.0186, 0.0031],
        [0.7164, 0.2475, 0.0361],
        [0.9403, 0.0545, 0.0052]])


100%|█████████████████████████████████████████████| 223/223 [10:20<00:00,  2.78s/it]


te_roberta_iro torch.Size([14251, 2])
logits[:10]=tensor([[0.8567, 0.1433],
        [0.8441, 0.1559],
        [0.5461, 0.4539],
        [0.5203, 0.4797],
        [0.7714, 0.2286],
        [0.9508, 0.0492],
        [0.8458, 0.1542],
        [0.3643, 0.6357],
        [0.8852, 0.1148],
        [0.6257, 0.3743]])


100%|█████████████████████████████████████████████| 223/223 [10:21<00:00,  2.78s/it]


te_xlm_roberta_snt_neg torch.Size([14251, 3])
logits[:10]=tensor([[0.9454, 0.0402, 0.0144],
        [0.9469, 0.0363, 0.0168],
        [0.8909, 0.0724, 0.0367],
        [0.9245, 0.0479, 0.0276],
        [0.6000, 0.3428, 0.0573],
        [0.8284, 0.1298, 0.0418],
        [0.8749, 0.0794, 0.0456],
        [0.9510, 0.0338, 0.0153],
        [0.6347, 0.2513, 0.1140],
        [0.8815, 0.0963, 0.0222]])


# HateBert labels

In [29]:
# all Hatebert models use the same tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    conf.hatebert_models["hb_hatebert_off"], 
    model_max_length=conf.hatebert_model_max_length
)
print(f"{repr(tokenizer)}\nmodel_input_names={tokenizer.model_input_names}")

PreTrainedTokenizerFast(name_or_path='pretrained//hatebert/hatebert-offenseval', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
model_input_names=['input_ids', 'token_type_ids', 'attention_mask']


In [30]:
%%time
x = tokenizer(sentences, truncation=True, padding="max_length")
print(f"{repr(x.keys())}\nlen={len(x['input_ids'])}")

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
len=14251
Wall time: 4.28 s


In [31]:
batches = torch.utils.data.DataLoader(mylib.Dataset(x), batch_size=conf.hatebert_batch_size, shuffle=False)
for col, model_dir in conf.hatebert_models.items():    
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    model.eval()
    model.to(conf.device)
    logits = None
    with torch.no_grad():
        for batch in tqdm(batches):
            for k, v in batch.items():
                batch[k] = v.to(conf.device)
            outputs = model(**batch)
            tmp = outputs.logits.detach().cpu()
            if logits is None:
                logits = tmp
            else:
                logits = torch.cat((logits, tmp), 0)
    logits = torch.nn.functional.softmax(logits, dim=1)
    print(f"{col} {logits.size()}\nlogits[:10]={logits[:10]}")
    df[col] = logits[:,1]
    df[col] = df[col].astype(np.float32)

100%|█████████████████████████████████████████████| 112/112 [10:32<00:00,  5.65s/it]


hb_bert_off torch.Size([14251, 2])
logits[:10]=tensor([[0.0313, 0.9687],
        [0.0442, 0.9558],
        [0.0347, 0.9653],
        [0.0243, 0.9757],
        [0.5669, 0.4331],
        [0.4349, 0.5651],
        [0.0257, 0.9743],
        [0.0292, 0.9708],
        [0.1020, 0.8980],
        [0.0314, 0.9686]])


100%|█████████████████████████████████████████████| 112/112 [10:33<00:00,  5.66s/it]


hb_bert_abu torch.Size([14251, 2])
logits[:10]=tensor([[0.0312, 0.9688],
        [0.0368, 0.9632],
        [0.0557, 0.9443],
        [0.0315, 0.9685],
        [0.8836, 0.1164],
        [0.8578, 0.1422],
        [0.0244, 0.9756],
        [0.0229, 0.9771],
        [0.0573, 0.9427],
        [0.0956, 0.9044]])


100%|█████████████████████████████████████████████| 112/112 [10:28<00:00,  5.61s/it]


hb_hatebert_off torch.Size([14251, 2])
logits[:10]=tensor([[0.0417, 0.9583],
        [0.0724, 0.9276],
        [0.0460, 0.9540],
        [0.0559, 0.9441],
        [0.9131, 0.0869],
        [0.5150, 0.4850],
        [0.0353, 0.9647],
        [0.0290, 0.9710],
        [0.1093, 0.8907],
        [0.0364, 0.9636]])


100%|█████████████████████████████████████████████| 112/112 [10:30<00:00,  5.63s/it]

hb_hatebert_abu torch.Size([14251, 2])
logits[:10]=tensor([[0.0870, 0.9130],
        [0.0788, 0.9212],
        [0.0359, 0.9641],
        [0.0306, 0.9694],
        [0.9777, 0.0223],
        [0.6870, 0.3130],
        [0.0262, 0.9738],
        [0.0273, 0.9727],
        [0.1770, 0.8230],
        [0.0999, 0.9001]])





# Detoxify labels

In [32]:
gc.collect()
dtfy_fs = []
for prefix, checkpoint in tqdm(conf.dtfy_models.items()):
    res = mylib.detoxify_labels(
        sentences,
        checkpoint=checkpoint,
        config_dir=conf.dtfy_configs[prefix],
        model_max_length=conf.dtfy_model_max_length,
        device=conf.device,
        batch_size=conf.dtfy_batch_size
    )
    for k, v in res.items():
        col = prefix + k
        df[col] = v
        df[col] = df[col].astype(np.float32)
        dtfy_fs.append(col)
    gc.collect()

100%|████████████████████████████████████████████████| 3/3 [31:02<00:00, 620.73s/it]


In [33]:
print(dtfy_fs)

['dto_toxicity', 'dto_severe_toxicity', 'dto_obscene', 'dto_threat', 'dto_insult', 'dto_identity_attack', 'dtu_toxicity', 'dtu_severe_toxicity', 'dtu_obscene', 'dtu_identity_attack', 'dtu_insult', 'dtu_threat', 'dtu_sexual_explicit', 'dtm_toxicity', 'dtm_severe_toxicity', 'dtm_obscene', 'dtm_identity_attack', 'dtm_insult', 'dtm_threat', 'dtm_sexual_explicit']


# Embeddings

In [34]:
model = SentenceTransformer(conf.em_models["paraphrase-MiniLM-L6-v2"], device=conf.device)
model.max_seq_length = conf.em_max_seq_length
em = model.encode(sentences=sentences, batch_size=conf.em_batch_size, show_progress_bar=True, convert_to_numpy=True)
print(f"em.shape={em.shape}")

Batches:   0%|          | 0/15 [00:00<?, ?it/s]

em.shape=(14251, 384)


In [35]:
%%time
em_size = em.shape[1]
em_cols = [f"zz{i:04d}" for i in range(em_size)]
df[em_cols] = em
df[em_cols] = df[em_cols].astype(np.float32)
del sentences

  self[col] = igetitem(value, i)


Wall time: 299 ms


# Review data

In [36]:
cols = char_fs + textstat_fs + dtfy_fs 
cols += list(conf.hatebert_models.keys())
cols += list(conf.tweeteval_models.keys())
df[cols].describe(percentiles=percentiles)

Unnamed: 0,length,digit_frac,letter_frac,space_frac,punc_frac,upper_frac,flesch_reading_ease,flesch_kincaid_grade,syllable_count,lexicon_count,sentence_count,gunning_fog,smog_index,automated_readability_index,coleman_liau_index,linsear_write_formula,dale_chall_readability_score,dto_toxicity,dto_severe_toxicity,dto_obscene,dto_threat,dto_insult,dto_identity_attack,dtu_toxicity,dtu_severe_toxicity,dtu_obscene,dtu_identity_attack,dtu_insult,dtu_threat,dtu_sexual_explicit,dtm_toxicity,dtm_severe_toxicity,dtm_obscene,dtm_identity_attack,dtm_insult,dtm_threat,dtm_sexual_explicit,hb_bert_off,hb_bert_abu,hb_hatebert_off,hb_hatebert_abu,te_roberta_off,te_roberta_emo_anger,te_roberta_snt_neg,te_roberta_iro,te_xlm_roberta_snt_neg
count,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0
mean,408.604238,0.009755,0.772961,0.173892,0.043391,0.086143,63.909294,9.625759,100.923233,71.034945,3.974388,11.228957,4.309606,13.042587,9.093494,9.9108,9.519163,0.458109,0.067606,0.294862,0.028744,0.277615,0.061937,0.533886,0.046184,0.297274,0.061978,0.30561,0.018006,0.098901,0.509862,0.062107,0.259716,0.05633,0.269497,0.026773,0.135149,0.636137,0.49808,0.603282,0.458325,0.544911,0.812406,0.716381,0.253142,0.741202
std,688.809292,0.032184,0.056109,0.0249,0.043562,0.176861,315.695831,48.136086,169.861304,117.409068,8.581545,21.454451,5.286682,90.704788,97.97451,9.30981,3.388893,0.427506,0.155756,0.39325,0.123391,0.368762,0.177876,0.398096,0.120675,0.39069,0.161861,0.360564,0.095459,0.217916,0.413721,0.158119,0.371604,0.162481,0.351557,0.111058,0.273091,0.346537,0.399806,0.331455,0.380057,0.269201,0.27383,0.261111,0.223106,0.22604
min,8.0,0.0,0.004427,0.000403,0.0,0.0,-36681.820312,-3.1,2.0,2.0,1.0,0.8,0.0,-9.3,-14.15,0.0,0.1,0.000535,7.9e-05,0.000149,8.5e-05,0.000164,0.000127,0.000352,1e-06,1.7e-05,6e-05,6.1e-05,1.6e-05,1e-05,0.000126,1.1e-05,5.2e-05,7.1e-05,0.000128,1.7e-05,1.3e-05,0.009207,0.002739,0.008755,0.005502,0.026456,0.00561,0.001072,0.012607,0.009896
1%,22.0,0.0,0.549739,0.094467,0.0,0.0,-69.965,-1.9,5.0,4.0,1.0,1.6,0.0,-2.9,-2.95,1.0,1.105,0.000662,8.7e-05,0.000165,9.9e-05,0.000175,0.000136,0.00129,3e-06,7.3e-05,0.000136,0.000104,3.4e-05,3.6e-05,0.000405,1.6e-05,8.5e-05,0.000108,0.000181,2.7e-05,1.8e-05,0.018364,0.004938,0.025118,0.010837,0.08333,0.027793,0.01157,0.030483,0.060581
5%,31.0,0.0,0.683673,0.131579,0.0,0.0,30.200001,0.5,8.0,6.0,1.0,2.4,0.0,0.3,0.45,2.0,6.34,0.000958,9.3e-05,0.000179,0.000109,0.000186,0.000145,0.005784,7e-06,0.000271,0.000345,0.000297,6.7e-05,0.0001,0.001091,2.4e-05,0.00015,0.000175,0.000374,3.8e-05,2.5e-05,0.042573,0.009142,0.056917,0.017714,0.138181,0.103325,0.131625,0.046102,0.236119
10%,44.0,0.0,0.722973,0.145038,0.012579,0.008547,43.560001,1.9,11.0,8.0,1.0,3.2,0.0,2.0,2.37,2.5,6.93,0.001849,0.0001,0.000211,0.000119,0.000219,0.000164,0.016431,1.3e-05,0.000534,0.000588,0.000892,0.0001,0.000186,0.003285,3.6e-05,0.000332,0.000293,0.000899,5.9e-05,3.9e-05,0.08274,0.015286,0.098816,0.025853,0.183667,0.280751,0.302562,0.057536,0.391538
20%,69.0,0.0,0.75,0.158537,0.02017,0.016471,55.610001,3.5,17.0,12.0,1.0,5.2,0.0,4.1,4.29,4.0,7.64,0.009245,0.000114,0.000473,0.000164,0.000519,0.00026,0.060106,3e-05,0.001509,0.001155,0.003852,0.000177,0.000442,0.021863,0.000106,0.001394,0.00072,0.00393,0.000166,0.000138,0.204838,0.038415,0.213655,0.049308,0.261686,0.699697,0.503112,0.078457,0.563538
30%,99.0,0.0,0.76506,0.166031,0.025,0.021622,62.68,4.6,24.0,18.0,1.0,6.73,0.0,5.6,5.51,5.0,8.14,0.029829,0.000154,0.001079,0.000285,0.001395,0.000481,0.154698,6.3e-05,0.00413,0.002022,0.012476,0.000273,0.000932,0.089935,0.000281,0.003921,0.001467,0.010699,0.000392,0.000394,0.397953,0.091657,0.35733,0.094714,0.341952,0.871696,0.629301,0.101488,0.682739
40%,138.0,0.0,0.77518,0.171315,0.029557,0.026316,68.099998,5.7,34.0,24.0,2.0,8.04,0.0,6.9,6.49,6.2,8.57,0.09241,0.000305,0.003357,0.000498,0.00468,0.001105,0.327154,0.00013,0.011421,0.003405,0.037481,0.00041,0.001894,0.255412,0.000662,0.009971,0.00254,0.024431,0.00075,0.000917,0.612928,0.22782,0.532059,0.187982,0.431859,0.928815,0.727812,0.129868,0.77047


In [37]:
cols = ["text"] + cols + em_cols
df[cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14251 entries, 0 to 14250
Data columns (total 431 columns):
 #    Column                        Non-Null Count  Dtype  
---   ------                        --------------  -----  
 0    text                          14251 non-null  object 
 1    length                        14251 non-null  int16  
 2    digit_frac                    14251 non-null  float32
 3    letter_frac                   14251 non-null  float32
 4    space_frac                    14251 non-null  float32
 5    punc_frac                     14251 non-null  float32
 6    upper_frac                    14251 non-null  float32
 7    flesch_reading_ease           14251 non-null  float32
 8    flesch_kincaid_grade          14251 non-null  float32
 9    syllable_count                14251 non-null  int16  
 10   lexicon_count                 14251 non-null  int16  
 11   sentence_count                14251 non-null  int16  
 12   gunning_fog                   14251 non-null

In [38]:
%%time
df[cols].to_parquet("output/val.parquet", index=False)

Wall time: 524 ms
