In [1]:
import os
import gc
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.stats import rankdata
import textstat
from tqdm import tqdm
from typing import Dict, NamedTuple, Callable
import scml
import mylib

In [2]:
class Conf(NamedTuple):
    """Immutable run configuration: compute device, checkpoint locations and batch sizes.

    Every model path is derived from ``pretrained_dir``. That prefix already ends
    with a path separator, so the suffixes appended to it must not start with one
    (the HateBERT entries previously did, producing ``pretrained//hatebert/...``).

    Note: the dict defaults are created once, when the class body is evaluated,
    and are therefore shared by every ``Conf`` instance; treat them as read-only.
    """
    # Use the GPU when torch can see one, otherwise fall back to the CPU.
    device: torch.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Root folder of the locally stored checkpoints; the trailing slash is required.
    pretrained_dir: str = "pretrained/"

    # --- Detoxify ---
    dtfy_model_max_length: int = 512
    dtfy_batch_size: int = 64
    # Column-name prefix -> Detoxify checkpoint file.
    dtfy_models: Dict[str, str] = {
        "dto_": f"{pretrained_dir}unitaryai/detoxify/toxic_original-c1212f89.ckpt",
        "dtu_": f"{pretrained_dir}unitaryai/detoxify/toxic_debiased-c7548aa0.ckpt",
        "dtm_": f"{pretrained_dir}unitaryai/detoxify/multilingual_debiased-0b549669.ckpt"
    }
    # Column-name prefix -> directory passed as `config_dir` for the matching checkpoint.
    dtfy_configs: Dict[str, str] = {
        "dto_": f"{pretrained_dir}bert-base-uncased",
        "dtu_": f"{pretrained_dir}roberta-base",
        "dtm_": f"{pretrained_dir}xlm-roberta-base"
    }

    # --- TweetEval ---
    tweeteval_model_max_length: int = 512
    tweeteval_batch_size: int = 64
    # Output column name -> model directory.
    tweeteval_models: Dict[str, str] = {
        "te_roberta_off": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-offensive",
        "te_roberta_emo_anger": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-emotion",
        "te_roberta_snt_neg": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-sentiment",
        "te_roberta_iro": f"{pretrained_dir}cardiffnlp/twitter-roberta-base-irony",
        "te_xlm_roberta_snt_neg": f"{pretrained_dir}cardiffnlp/twitter-xlm-roberta-base-sentiment",
    }
    # Output column name -> index of the softmax column kept as the feature.
    # Keys must match `tweeteval_models`.
    tweeteval_label_index: Dict[str, int] = {
        "te_roberta_off": 1,
        "te_roberta_emo_anger": 0,
        "te_roberta_snt_neg": 0,
        "te_roberta_iro": 1,
        "te_xlm_roberta_snt_neg": 0,
    }

    # --- HateBERT ---
    hatebert_model_max_length: int = 512
    hatebert_batch_size: int = 128
    # Output column name -> model directory (no leading "/" after pretrained_dir).
    hatebert_models: Dict[str, str] = {
        "hb_bert_off": f"{pretrained_dir}hatebert/bert-offenseval",
        "hb_bert_abu": f"{pretrained_dir}hatebert/bert-abuseval",
        "hb_hatebert_off": f"{pretrained_dir}hatebert/hatebert-offenseval",
        "hb_hatebert_abu": f"{pretrained_dir}hatebert/hatebert-abuseval",
    }

    # --- Sentence embeddings ---
    em_max_seq_length: int = 128
    em_batch_size: int = 1000
    # Model name -> model directory.
    em_models: Dict[str, str] = {
        "paraphrase-MiniLM-L6-v2": f"{pretrained_dir}sentence-transformers/paraphrase-MiniLM-L6-v2"
    }
        
        
conf = Conf()
print(conf)
if conf.device.type == 'cuda':
    # Report which GPU is in use and how much of its memory torch currently holds.
    bytes_per_gb = 1024**3
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    allocated_gb = round(torch.cuda.memory_allocated(0)/bytes_per_gb,1)
    print('Allocated:', allocated_gb, 'GB')
    cached_gb = round(torch.cuda.memory_reserved(0)/bytes_per_gb,1)
    print('Cached:   ', cached_gb, 'GB')

Conf(device=device(type='cuda'), pretrained_dir='pretrained/', dtfy_model_max_length=512, dtfy_batch_size=64, dtfy_models={'dto_': 'pretrained/unitaryai/detoxify/toxic_original-c1212f89.ckpt', 'dtu_': 'pretrained/unitaryai/detoxify/toxic_debiased-c7548aa0.ckpt', 'dtm_': 'pretrained/unitaryai/detoxify/multilingual_debiased-0b549669.ckpt'}, dtfy_configs={'dto_': 'pretrained/bert-base-uncased', 'dtu_': 'pretrained/roberta-base', 'dtm_': 'pretrained/xlm-roberta-base'}, tweeteval_model_max_length=512, tweeteval_batch_size=64, tweeteval_models={'te_roberta_off': 'pretrained/cardiffnlp/twitter-roberta-base-offensive', 'te_roberta_emo_anger': 'pretrained/cardiffnlp/twitter-roberta-base-emotion', 'te_roberta_snt_neg': 'pretrained/cardiffnlp/twitter-roberta-base-sentiment', 'te_roberta_iro': 'pretrained/cardiffnlp/twitter-roberta-base-irony', 'te_xlm_roberta_snt_neg': 'pretrained/cardiffnlp/twitter-xlm-roberta-base-sentiment'}, tweeteval_label_index={'te_roberta_off': 1, 'te_roberta_emo_anger': 

In [3]:
# Quantiles requested from describe() when the features are reviewed.
percentiles = [.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]
# Disable tokenizer-level parallelism through its environment variable.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# pandas options: treat +/-inf as missing and lift the display limits.
for _option, _value in (
    ("use_inf_as_na", True),
    ("max_info_columns", 9999),
    ("display.max_columns", 9999),
    ("display.max_rows", 9999),
    ('max_colwidth', 9999),
):
    pd.set_option(_option, _value)
# Register the progress_apply family on pandas objects.
tqdm.pandas()

In [4]:
%%time
# Load the validation CSV (columns: worker, less_toxic, more_toxic) with the C parser,
# reading the file in one pass (low_memory=False).
df = pd.read_csv("input/validation_data.csv", engine="c", low_memory=False)
# Print column dtypes and non-null counts as a quick sanity check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30108 entries, 0 to 30107
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   worker      30108 non-null  int64 
 1   less_toxic  30108 non-null  object
 2   more_toxic  30108 non-null  object
dtypes: int64(1), object(2)
memory usage: 705.8+ KB
Wall time: 220 ms


In [5]:
# Pool both comment columns and keep every distinct comment exactly once.
# Order of first appearance is used instead of a set: iteration order of a set
# of strings changes between interpreter runs (hash randomization), which made
# the row order of this frame, and of the feature file saved from it,
# irreproducible across runs.
all_comments = pd.concat([df["less_toxic"], df["more_toxic"]], ignore_index=True)
texts = pd.unique(all_comments).tolist()
# From here on `df` holds one row per distinct comment.
df = pd.DataFrame(data={"text": texts})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14251 entries, 0 to 14250
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    14251 non-null  object
dtypes: object(1)
memory usage: 111.5+ KB


# Preprocess Text
Note: preprocessing speed dropped from 1200 to 1000 it/s.

In [6]:
def pre1(row) -> str:
    """Apply ``mylib.pre1`` to the row's raw ``text`` value."""
    raw_text = row["text"]
    return mylib.pre1(raw_text)


def pre2(row) -> str:
    """Apply ``mylib.pre2`` to the row's ``text1`` value (the output of ``pre1``)."""
    first_pass = row["text1"]
    return mylib.pre2(first_pass)


# pre2 reads "text1", so the two passes have to run in this order.
for col, fn in (("text1", pre1), ("text2", pre2)):
    print(col)
    df[col] = df.progress_apply(fn, axis=1)

text1


100%|████████████████████████████████████████| 14251/14251 [00:30<00:00, 467.49it/s]


text2


100%|█████████████████████████████████████████| 14251/14251 [02:37<00:00, 90.76it/s]


# Character level features

In [7]:
%%time
# Character count of the first-pass text, stored as int16.
col = "length"
df[col] = df["text1"].str.len().astype(np.int16)

Wall time: 3.66 ms


In [8]:
def digit_frac(row) -> float:
    """Return ``mylib.digit_frac`` evaluated on the row's ``text1`` value."""
    text = row["text1"]
    return mylib.digit_frac(text)


def letter_frac(row) -> float:
    """Return ``mylib.letter_frac`` evaluated on the row's ``text1`` value."""
    text = row["text1"]
    return mylib.letter_frac(text)


def space_frac(row) -> float:
    """Return ``mylib.space_frac`` evaluated on the row's ``text1`` value."""
    text = row["text1"]
    return mylib.space_frac(text)


def punc_frac(row) -> float:
    """Return ``mylib.punc_frac`` evaluated on the row's ``text1`` value."""
    text = row["text1"]
    return mylib.punc_frac(text)


def upper_frac(row) -> float:
    """Return ``mylib.upper_frac`` evaluated on the row's ``text1`` value."""
    text = row["text1"]
    return mylib.upper_frac(text)


# Registry of character-level features: output column name -> row function.
# Each function is registered under its own name, in definition order.
char_fns: Dict[str, Callable] = {
    feature_fn.__name__: feature_fn
    for feature_fn in (digit_frac, letter_frac, space_frac, punc_frac, upper_frac)
}

In [9]:
# Evaluate every registered character-level feature row by row and store it as float32.
for col, fn in char_fns.items():
    print(col)
    df[col] = df.progress_apply(fn, axis=1).astype(np.float32)

digit_frac


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 23241.63it/s]


letter_frac


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 22189.25it/s]


space_frac


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 23914.66it/s]


punc_frac


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 21218.78it/s]


upper_frac


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 23842.51it/s]


# Textstat features

In [10]:
def syllable_count(row) -> int:
    """Return ``textstat.syllable_count`` for the row's ``text1`` value."""
    text = row["text1"]
    return textstat.syllable_count(text)


def lexicon_count(row) -> int:
    """Return ``textstat.lexicon_count`` for the row's ``text1`` value."""
    text = row["text1"]
    return textstat.lexicon_count(text)


def sentence_count(row) -> int:
    """Return ``textstat.sentence_count`` for the row's ``text1`` value."""
    text = row["text1"]
    return textstat.sentence_count(text)


def syllables_per_word(row) -> float:
    """Return ``syllable_count / (lexicon_count + 1)`` from the row's precomputed counts."""
    # The +1 presumably guards against a zero denominator.
    words_plus_one = row["lexicon_count"] + 1
    return row["syllable_count"] / words_plus_one


def syllables_per_sent(row) -> float:
    """Return ``syllable_count / (sentence_count + 1)`` from the row's precomputed counts."""
    sentences_plus_one = row["sentence_count"] + 1
    return row["syllable_count"] / sentences_plus_one


def words_per_sent(row) -> float:
    """Return ``lexicon_count / (sentence_count + 1)`` from the row's precomputed counts."""
    sentences_plus_one = row["sentence_count"] + 1
    return row["lexicon_count"] / sentences_plus_one


def flesch_reading_ease(row) -> float:
    """Return ``textstat.flesch_reading_ease`` for the row's ``text1`` value."""
    text = row["text1"]
    return textstat.flesch_reading_ease(text)


def flesch_kincaid_grade(row) -> float:
    """Return ``textstat.flesch_kincaid_grade`` for the row's ``text1`` value."""
    text = row["text1"]
    return textstat.flesch_kincaid_grade(text)


def gunning_fog(row) -> float:
    """Return ``textstat.gunning_fog`` for the row's ``text1`` value."""
    text = row["text1"]
    return textstat.gunning_fog(text)


def smog_index(row) -> float:
    """Return ``textstat.smog_index`` for the row's ``text1`` value."""
    text = row["text1"]
    return textstat.smog_index(text)


def automated_readability_index(row) -> float:
    """Return ``textstat.automated_readability_index`` for the row's ``text1`` value."""
    text = row["text1"]
    return textstat.automated_readability_index(text)


def coleman_liau_index(row) -> float:
    """Return ``textstat.coleman_liau_index`` for the row's ``text1`` value."""
    text = row["text1"]
    return textstat.coleman_liau_index(text)


def linsear_write_formula(row) -> float:
    """Return ``textstat.linsear_write_formula`` for the row's ``text1`` value."""
    text = row["text1"]
    return textstat.linsear_write_formula(text)


def dale_chall_readability_score(row) -> float:
    """Return ``textstat.dale_chall_readability_score`` for the row's ``text1`` value."""
    text = row["text1"]
    return textstat.dale_chall_readability_score(text)


# Registry of float-valued text statistics: output column name -> row function.
# The three raw counts are deliberately not registered; they are stored as
# integer columns elsewhere and are read by the three ratio features.
textstat_fns: Dict[str, Callable] = {
    feature_fn.__name__: feature_fn
    for feature_fn in (
        syllables_per_word,
        syllables_per_sent,
        words_per_sent,
        flesch_reading_ease,
        flesch_kincaid_grade,
        gunning_fog,
        smog_index,
        automated_readability_index,
        coleman_liau_index,
        linsear_write_formula,
        dale_chall_readability_score,
    )
}

In [11]:
# Syllable total per comment as int32; the per-word and per-sentence ratios read this column.
col = "syllable_count"
df[col] = df.progress_apply(syllable_count, axis=1).astype(np.int32)

100%|███████████████████████████████████████| 14251/14251 [00:02<00:00, 4751.79it/s]


In [12]:
# Word (lexicon) total per comment as int32; the ratio features read this column.
col = "lexicon_count"
df[col] = df.progress_apply(lexicon_count, axis=1).astype(np.int32)

100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 47342.80it/s]


In [13]:
# Sentence total per comment as int32; the per-sentence ratios read this column.
col = "sentence_count"
df[col] = df.progress_apply(sentence_count, axis=1).astype(np.int32)

100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 25958.01it/s]


In [14]:
# Evaluate every registered text statistic row by row and store it as float32.
for col, fn in textstat_fns.items():
    print(col)
    df[col] = df.progress_apply(fn, axis=1).astype(np.float32)

syllables_per_word


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 78069.87it/s]


syllables_per_sent


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 79875.65it/s]


words_per_sent


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 81114.93it/s]


flesch_reading_ease


100%|███████████████████████████████████████| 14251/14251 [00:02<00:00, 7077.13it/s]


flesch_kincaid_grade


100%|███████████████████████████████████████| 14251/14251 [00:01<00:00, 7423.82it/s]


gunning_fog


100%|███████████████████████████████████████| 14251/14251 [00:02<00:00, 5544.53it/s]


smog_index


100%|███████████████████████████████████████| 14251/14251 [00:01<00:00, 7651.14it/s]


automated_readability_index


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 17782.12it/s]


coleman_liau_index


100%|██████████████████████████████████████| 14251/14251 [00:00<00:00, 15502.04it/s]


linsear_write_formula


100%|███████████████████████████████████████| 14251/14251 [00:01<00:00, 8833.59it/s]


dale_chall_readability_score


100%|███████████████████████████████████████| 14251/14251 [00:02<00:00, 5693.49it/s]


# TweetEval labels

In [15]:
# The second-pass text is the input for every transformer model from here on.
sentences = list(df["text2"])
for col, model_dir in conf.tweeteval_models.items():
    # Each TweetEval model is loaded with the tokenizer from its own directory.
    tokenizer = AutoTokenizer.from_pretrained(
        model_dir,
        model_max_length=conf.tweeteval_model_max_length
    )
    # Every text is truncated or padded to the tokenizer's maximum length.
    x = tokenizer(sentences, truncation=True, padding="max_length")
    # shuffle=False keeps predictions aligned with the rows of df.
    batches = torch.utils.data.DataLoader(mylib.Dataset(x), batch_size=conf.tweeteval_batch_size, shuffle=False)
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    model.eval()
    model.to(conf.device)
    # Collect the per-batch logits and concatenate once at the end: calling
    # torch.cat inside the loop re-copies everything gathered so far on every batch.
    logit_chunks = []
    with torch.no_grad():
        for batch in tqdm(batches):
            batch = {k: v.to(conf.device) for k, v in batch.items()}
            outputs = model(**batch)
            logit_chunks.append(outputs.logits.detach().cpu())
    # Despite its name, `logits` holds class probabilities after the softmax.
    logits = torch.nn.functional.softmax(torch.cat(logit_chunks, 0), dim=1)
    print(f"{col} {logits.size()}\nlogits[:10]={logits[:10]}")
    # Keep only the probability of the configured class; hand pandas a NumPy
    # array rather than a torch tensor.
    df[col] = logits[:, conf.tweeteval_label_index[col]].numpy().astype(np.float32)
    # Release the model before the next one is loaded.
    del tokenizer, model
    gc.collect()

100%|█████████████████████████████████████████████| 223/223 [10:21<00:00,  2.79s/it]


te_roberta_off torch.Size([14251, 2])
logits[:10]=tensor([[0.1397, 0.8603],
        [0.8128, 0.1872],
        [0.2398, 0.7602],
        [0.2515, 0.7485],
        [0.3907, 0.6093],
        [0.0815, 0.9185],
        [0.8577, 0.1423],
        [0.5039, 0.4961],
        [0.6038, 0.3962],
        [0.3523, 0.6477]])


100%|█████████████████████████████████████████████| 223/223 [10:14<00:00,  2.76s/it]


te_roberta_emo_anger torch.Size([14251, 4])
logits[:10]=tensor([[0.9287, 0.0266, 0.0172, 0.0276],
        [0.9377, 0.0054, 0.0275, 0.0295],
        [0.9844, 0.0051, 0.0055, 0.0050],
        [0.9618, 0.0202, 0.0080, 0.0100],
        [0.9585, 0.0051, 0.0193, 0.0171],
        [0.9701, 0.0192, 0.0043, 0.0064],
        [0.0153, 0.9361, 0.0350, 0.0137],
        [0.9691, 0.0054, 0.0103, 0.0152],
        [0.9654, 0.0041, 0.0170, 0.0135],
        [0.9684, 0.0061, 0.0146, 0.0109]])


100%|█████████████████████████████████████████████| 223/223 [10:13<00:00,  2.75s/it]


te_roberta_snt_neg torch.Size([14251, 3])
logits[:10]=tensor([[0.8687, 0.1121, 0.0191],
        [0.5913, 0.3744, 0.0343],
        [0.9542, 0.0399, 0.0059],
        [0.7556, 0.2196, 0.0247],
        [0.8480, 0.1458, 0.0061],
        [0.9341, 0.0555, 0.0103],
        [0.0076, 0.2468, 0.7456],
        [0.6674, 0.2979, 0.0347],
        [0.8131, 0.1760, 0.0109],
        [0.9388, 0.0553, 0.0059]])


100%|█████████████████████████████████████████████| 223/223 [10:13<00:00,  2.75s/it]


te_roberta_iro torch.Size([14251, 2])
logits[:10]=tensor([[0.8857, 0.1143],
        [0.9194, 0.0806],
        [0.9298, 0.0702],
        [0.8662, 0.1338],
        [0.8927, 0.1073],
        [0.7058, 0.2942],
        [0.9446, 0.0554],
        [0.8868, 0.1132],
        [0.9016, 0.0984],
        [0.9524, 0.0476]])


100%|█████████████████████████████████████████████| 223/223 [10:13<00:00,  2.75s/it]


te_xlm_roberta_snt_neg torch.Size([14251, 3])
logits[:10]=tensor([[0.4701, 0.3844, 0.1455],
        [0.5847, 0.2838, 0.1315],
        [0.8293, 0.1194, 0.0513],
        [0.6865, 0.2429, 0.0707],
        [0.9290, 0.0574, 0.0135],
        [0.5421, 0.2217, 0.2362],
        [0.4395, 0.3350, 0.2255],
        [0.5461, 0.2892, 0.1648],
        [0.8612, 0.1198, 0.0190],
        [0.9224, 0.0551, 0.0225]])


# HateBert labels

In [16]:
# all Hatebert models use the same tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    conf.hatebert_models["hb_hatebert_off"], 
    model_max_length=conf.hatebert_model_max_length
)
print(f"{repr(tokenizer)}\nmodel_input_names={tokenizer.model_input_names}")

PreTrainedTokenizerFast(name_or_path='pretrained//hatebert/hatebert-offenseval', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
model_input_names=['input_ids', 'token_type_ids', 'attention_mask']


In [17]:
%%time
# Tokenize all sentences once; every text is truncated or padded to the maximum length.
x = tokenizer(sentences, truncation=True, padding="max_length")
encoded_keys = x.keys()
n_encoded = len(x['input_ids'])
print(f"{repr(encoded_keys)}\nlen={n_encoded}")

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
len=14251
Wall time: 4.31 s


In [18]:
# The tokenized inputs are shared by all HateBERT models, so the loader is built once.
# shuffle=False keeps predictions aligned with the rows of df.
batches = torch.utils.data.DataLoader(mylib.Dataset(x), batch_size=conf.hatebert_batch_size, shuffle=False)
for col, model_dir in conf.hatebert_models.items():
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    model.eval()
    model.to(conf.device)
    # Collect the per-batch logits and concatenate once at the end: calling
    # torch.cat inside the loop re-copies everything gathered so far on every batch.
    logit_chunks = []
    with torch.no_grad():
        for batch in tqdm(batches):
            batch = {k: v.to(conf.device) for k, v in batch.items()}
            outputs = model(**batch)
            logit_chunks.append(outputs.logits.detach().cpu())
    # Despite its name, `logits` holds class probabilities after the softmax.
    logits = torch.nn.functional.softmax(torch.cat(logit_chunks, 0), dim=1)
    print(f"{col} {logits.size()}\nlogits[:10]={logits[:10]}")
    # Column 1 of the two-class output is kept as the feature; hand pandas a
    # NumPy array rather than a torch tensor.
    df[col] = logits[:, 1].numpy().astype(np.float32)
    # Release the model, as the TweetEval loop does, so the last one does not
    # stay resident while the following stages run.
    del model
    gc.collect()

100%|█████████████████████████████████████████████| 112/112 [10:24<00:00,  5.57s/it]


hb_bert_off torch.Size([14251, 2])
logits[:10]=tensor([[0.0386, 0.9614],
        [0.9280, 0.0720],
        [0.0243, 0.9757],
        [0.0712, 0.9288],
        [0.3437, 0.6563],
        [0.0270, 0.9730],
        [0.9634, 0.0366],
        [0.3335, 0.6665],
        [0.5573, 0.4427],
        [0.0716, 0.9284]])


100%|█████████████████████████████████████████████| 112/112 [10:22<00:00,  5.56s/it]


hb_bert_abu torch.Size([14251, 2])
logits[:10]=tensor([[0.1901, 0.8099],
        [0.9718, 0.0282],
        [0.1022, 0.8978],
        [0.4192, 0.5808],
        [0.3764, 0.6236],
        [0.0352, 0.9648],
        [0.9888, 0.0112],
        [0.6041, 0.3959],
        [0.8751, 0.1249],
        [0.0348, 0.9652]])


100%|█████████████████████████████████████████████| 112/112 [10:22<00:00,  5.56s/it]


hb_hatebert_off torch.Size([14251, 2])
logits[:10]=tensor([[0.0329, 0.9671],
        [0.9379, 0.0621],
        [0.0430, 0.9570],
        [0.0728, 0.9272],
        [0.7843, 0.2157],
        [0.0243, 0.9757],
        [0.9516, 0.0484],
        [0.5494, 0.4506],
        [0.5759, 0.4241],
        [0.0320, 0.9680]])


100%|█████████████████████████████████████████████| 112/112 [10:22<00:00,  5.55s/it]

hb_hatebert_abu torch.Size([14251, 2])
logits[:10]=tensor([[0.0889, 0.9111],
        [0.9801, 0.0199],
        [0.0720, 0.9280],
        [0.1706, 0.8294],
        [0.7743, 0.2257],
        [0.0275, 0.9725],
        [0.9847, 0.0153],
        [0.8329, 0.1671],
        [0.9229, 0.0771],
        [0.0337, 0.9663]])





# Detoxify labels

In [19]:
gc.collect()
# Names of all Detoxify feature columns, in the order they are created.
dtfy_fs = []
for prefix, checkpoint in tqdm(conf.dtfy_models.items()):
    # Presumably returns a mapping of label name -> one score per sentence
    # (it is iterated that way below); verify against mylib.
    label_scores = mylib.detoxify_labels(
        sentences,
        checkpoint=checkpoint,
        config_dir=conf.dtfy_configs[prefix],
        model_max_length=conf.dtfy_model_max_length,
        device=conf.device,
        batch_size=conf.dtfy_batch_size
    )
    for label, scores in label_scores.items():
        # The checkpoint prefix keeps same-named labels from different models apart.
        col = prefix + label
        df[col] = scores
        df[col] = df[col].astype(np.float32)
        dtfy_fs.append(col)
    gc.collect()

100%|████████████████████████████████████████████████| 3/3 [30:37<00:00, 612.39s/it]


In [20]:
# Show the Detoxify feature column names collected in `dtfy_fs`.
print(dtfy_fs)

['dto_toxicity', 'dto_severe_toxicity', 'dto_obscene', 'dto_threat', 'dto_insult', 'dto_identity_attack', 'dtu_toxicity', 'dtu_severe_toxicity', 'dtu_obscene', 'dtu_identity_attack', 'dtu_insult', 'dtu_threat', 'dtu_sexual_explicit', 'dtm_toxicity', 'dtm_severe_toxicity', 'dtm_obscene', 'dtm_identity_attack', 'dtm_insult', 'dtm_threat', 'dtm_sexual_explicit']


# Embeddings

In [21]:
# Encode every sentence with the configured sentence-transformer model.
em_model_dir = conf.em_models["paraphrase-MiniLM-L6-v2"]
model = SentenceTransformer(em_model_dir, device=conf.device)
model.max_seq_length = conf.em_max_seq_length
em = model.encode(
    sentences=sentences,
    batch_size=conf.em_batch_size,
    show_progress_bar=True,
    convert_to_numpy=True,
)
print(f"em.shape={em.shape}")

[INFO|SentenceTransformer.py:60] 2022-01-09 14:40:11,876 >> Load pretrained SentenceTransformer: pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2
[INFO|SentenceTransformer.py:60] 2022-01-09 14:40:11,876 >> Load pretrained SentenceTransformer: pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2


Batches:   0%|          | 0/15 [00:00<?, ?it/s]

em.shape=(14251, 384)


In [22]:
%%time
em_size = em.shape[1]
# One zero-padded column name per embedding dimension: zz0000, zz0001, ...
em_cols = [f"zz{i:04d}" for i in range(em_size)]
# Build all embedding columns as one frame and attach them with a single concat.
# Assigning hundreds of new columns through df[em_cols] = ... inserts them one
# at a time, which fragments the frame and triggers pandas' PerformanceWarning.
em_df = pd.DataFrame(em, columns=em_cols, index=df.index).astype(np.float32)
# Dropping any existing embedding columns first preserves the overwrite
# semantics of the former direct assignment.
df = pd.concat([df.drop(columns=em_cols, errors="ignore"), em_df], axis=1)
del em_df
# The sentence list is not needed after this point.
del sentences

  self[col] = igetitem(value, i)


Wall time: 287 ms


# Review data

In [23]:
# Gather every non-embedding feature column, in a fixed order, for the summary table.
cols = [
    "length",
    *char_fns,
    *textstat_fns,
    *dtfy_fs,
    *conf.hatebert_models,
    *conf.tweeteval_models,
]
df[cols].describe(percentiles=percentiles)

Unnamed: 0,length,digit_frac,letter_frac,space_frac,punc_frac,upper_frac,syllables_per_word,syllables_per_sent,words_per_sent,flesch_reading_ease,flesch_kincaid_grade,gunning_fog,smog_index,automated_readability_index,coleman_liau_index,linsear_write_formula,dale_chall_readability_score,dto_toxicity,dto_severe_toxicity,dto_obscene,dto_threat,dto_insult,dto_identity_attack,dtu_toxicity,dtu_severe_toxicity,dtu_obscene,dtu_identity_attack,dtu_insult,dtu_threat,dtu_sexual_explicit,dtm_toxicity,dtm_severe_toxicity,dtm_obscene,dtm_identity_attack,dtm_insult,dtm_threat,dtm_sexual_explicit,hb_bert_off,hb_bert_abu,hb_hatebert_off,hb_hatebert_abu,te_roberta_off,te_roberta_emo_anger,te_roberta_snt_neg,te_roberta_iro,te_xlm_roberta_snt_neg
count,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0,14251.0
mean,407.412111,0.009792,0.771801,0.17156,0.046848,0.08748,1.366432,18.896488,13.30002,63.766296,9.520686,11.068329,4.353827,13.135674,9.296756,9.612116,9.58297,0.454988,0.066823,0.291729,0.028435,0.273121,0.064092,0.530462,0.045358,0.293938,0.064117,0.303564,0.017904,0.103149,0.505253,0.060342,0.257404,0.05739,0.266757,0.026837,0.13679,0.636018,0.496522,0.600032,0.456885,0.540526,0.811187,0.714823,0.248829,0.742355
std,687.677043,0.032306,0.056478,0.024278,0.044103,0.17864,2.771002,39.913151,27.424742,315.746368,48.185921,21.574528,5.246092,90.772156,97.969963,9.265683,3.409077,0.427067,0.15423,0.392477,0.122391,0.366068,0.178231,0.399053,0.119012,0.39001,0.164617,0.359798,0.094893,0.227375,0.413534,0.15524,0.371028,0.16444,0.350154,0.110971,0.277266,0.345783,0.399294,0.331816,0.379933,0.268306,0.275263,0.260813,0.218755,0.224479
min,8.0,0.0,0.004427,0.000403,0.0,0.0,0.666667,1.0,1.0,-36681.820312,-3.1,0.8,0.0,-9.3,-14.15,0.0,0.1,0.00053,7.9e-05,0.000152,8.9e-05,0.000164,0.000127,0.000352,1e-06,1.7e-05,6e-05,6.1e-05,1.5e-05,1e-05,0.000126,1.1e-05,5.2e-05,7.1e-05,0.000128,1.7e-05,1.3e-05,0.009207,0.002739,0.008755,0.006323,0.026456,0.00561,0.001072,0.012607,0.009896
1%,22.0,0.0,0.542513,0.095238,0.0,0.0,0.857143,2.5,2.0,-71.305,-1.9,1.6,0.0,-2.8,-2.91,1.0,1.065,0.000658,8.7e-05,0.000165,9.9e-05,0.000175,0.000136,0.001252,3e-06,7.1e-05,0.000129,0.000102,3.3e-05,3.6e-05,0.000399,1.6e-05,8.5e-05,0.000109,0.000183,2.7e-05,1.8e-05,0.018329,0.004951,0.025242,0.010901,0.083077,0.025285,0.012544,0.031106,0.061912
5%,31.0,0.0,0.681239,0.130435,0.0,0.0,1.0,3.5,2.666667,30.200001,0.5,2.4,0.0,0.6,0.59,2.0,6.405,0.000953,9.3e-05,0.000179,0.000109,0.000186,0.000144,0.005505,7e-06,0.000262,0.000338,0.000294,6.4e-05,9.7e-05,0.001055,2.4e-05,0.00015,0.000173,0.00037,3.8e-05,2.5e-05,0.042079,0.009088,0.056524,0.017461,0.137631,0.099159,0.130973,0.046449,0.2431
10%,43.0,0.0,0.721519,0.142857,0.014085,0.009124,1.071429,4.666667,3.5,43.43,1.8,3.2,0.0,2.3,2.6,2.5,7.0,0.001794,0.0001,0.000208,0.000118,0.000216,0.000162,0.015735,1.3e-05,0.000523,0.000572,0.000876,9.7e-05,0.00018,0.003269,3.6e-05,0.000326,0.000287,0.000891,5.8e-05,3.9e-05,0.082962,0.015266,0.0969,0.025456,0.181492,0.274409,0.299938,0.057995,0.397143
20%,69.0,0.0,0.75,0.156734,0.022453,0.017094,1.166667,7.0,5.0,55.400002,3.4,5.01,0.0,4.3,4.52,3.8,7.71,0.008914,0.000113,0.000459,0.000162,0.000501,0.000256,0.058715,2.9e-05,0.001465,0.001138,0.003734,0.000171,0.000427,0.020905,0.000104,0.001339,0.000707,0.003795,0.00016,0.000134,0.206919,0.038233,0.211099,0.049009,0.259176,0.695375,0.503196,0.079016,0.565848
30%,99.0,0.0,0.763589,0.163743,0.027778,0.022263,1.230769,9.0,6.5,62.48,4.5,6.56,0.0,5.8,5.76,5.0,8.19,0.028775,0.000151,0.001039,0.000279,0.00134,0.000469,0.149665,6.1e-05,0.003866,0.00199,0.012086,0.000269,0.000903,0.085488,0.000271,0.003783,0.001424,0.010541,0.00038,0.000378,0.402042,0.091947,0.352413,0.093682,0.336574,0.870142,0.626855,0.101765,0.684081
40%,137.0,0.0,0.773862,0.169056,0.032967,0.027027,1.285714,11.0,8.0,67.889999,5.6,8.01,0.0,7.0,6.72,6.0,8.62,0.090294,0.000296,0.003233,0.000491,0.004578,0.001083,0.316455,0.000127,0.010615,0.003361,0.035224,0.000405,0.001834,0.246869,0.000642,0.009463,0.00251,0.023865,0.00073,0.000897,0.611458,0.225968,0.526247,0.183973,0.42669,0.928209,0.725314,0.129554,0.771199


In [24]:
# Final column order: raw text, the scalar features, then the embedding columns.
# "text" and the embedding names are filtered out of the previous `cols` first,
# so re-running this cell does not append them a second time (duplicate labels
# would repeat columns in df[cols] and are expected to break the parquet export).
em_col_set = set(em_cols)
cols = ["text"] + [c for c in cols if c != "text" and c not in em_col_set] + em_cols
df[cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14251 entries, 0 to 14250
Data columns (total 431 columns):
 #    Column                        Non-Null Count  Dtype  
---   ------                        --------------  -----  
 0    text                          14251 non-null  object 
 1    length                        14251 non-null  int16  
 2    digit_frac                    14251 non-null  float32
 3    letter_frac                   14251 non-null  float32
 4    space_frac                    14251 non-null  float32
 5    punc_frac                     14251 non-null  float32
 6    upper_frac                    14251 non-null  float32
 7    syllables_per_word            14251 non-null  float32
 8    syllables_per_sent            14251 non-null  float32
 9    words_per_sent                14251 non-null  float32
 10   flesch_reading_ease           14251 non-null  float32
 11   flesch_kincaid_grade          14251 non-null  float32
 12   gunning_fog                   14251 non-null

In [25]:
%%time
# Persist the selected columns (everything listed in `cols`) to parquet; the row index is not written.
df[cols].to_parquet("output/val.parquet", index=False)

Wall time: 498 ms
