In [1]:
import os
import gc
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from scipy.stats import rankdata
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import textstat
from tqdm import tqdm
from typing import Dict, NamedTuple
import scml
import mylib

In [2]:
class Conf(NamedTuple):
    """Immutable run configuration for the HateBERT scoring step.

    Fields:
        device: torch device used for inference; "cuda" when available, else "cpu".
        hatebert_model_max_length: maximum sequence length handed to the tokenizer.
        hatebert_batch_size: number of comments per inference batch.
        hatebert_models: output column name -> local directory of the checkpoint.
    """

    # The default is evaluated once, when the class body runs.
    device: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    hatebert_model_max_length: int = 512
    hatebert_batch_size: int = 128
    hatebert_models: Dict[str, str] = dict(
        hb_bert_offenseval="pretrained/hatebert/bert-offenseval",
        hb_bert_abuseval="pretrained/hatebert/bert-abuseval",
        hb_hatebert_offenseval="pretrained/hatebert/hatebert-offenseval",
        hb_hatebert_abuseval="pretrained/hatebert/hatebert-abuseval",
    )
        
        
# Build the run configuration and echo it so the notebook records what was used.
conf = Conf()
print(conf)
if conf.device.type == "cuda":
    # Report the GPU model and its current memory footprint in GiB (1024**3 bytes).
    print(torch.cuda.get_device_name(0))
    print("Memory Usage:")
    bytes_per_gib = 1024 ** 3
    allocated_gib = round(torch.cuda.memory_allocated(0) / bytes_per_gib, 1)
    print("Allocated:", allocated_gib, "GB")
    reserved_gib = round(torch.cuda.memory_reserved(0) / bytes_per_gib, 1)
    print("Cached:   ", reserved_gib, "GB")

Conf(device=device(type='cuda'), hatebert_model_max_length=512, hatebert_batch_size=128, hatebert_models={'hb_bert_offenseval': 'pretrained/hatebert/bert-offenseval', 'hb_bert_abuseval': 'pretrained/hatebert/bert-abuseval', 'hb_hatebert_offenseval': 'pretrained/hatebert/hatebert-offenseval', 'hb_hatebert_abuseval': 'pretrained/hatebert/hatebert-abuseval'})
NVIDIA GeForce GTX 1060 6GB
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [3]:
# Quantiles passed to DataFrame.describe() in the review section.
percentiles = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# pandas options, applied in the listed order.
# NOTE(review): "use_inf_as_na" is deprecated in recent pandas releases - confirm
# against the installed version before upgrading.
pandas_options = {
    "use_inf_as_na": True,
    "max_info_columns": 9999,
    "display.max_columns": 9999,
    "display.max_rows": 9999,
    "max_colwidth": 9999,
}
for option_name, option_value in pandas_options.items():
    pd.set_option(option_name, option_value)
# Registers DataFrame.progress_apply, which the feature cells rely on.
tqdm.pandas()

In [4]:
# Map "<post_id>_<comment_id>" to the offensiveness score of that comment.
df = pd.read_csv("input/ruddit/Ruddit.csv", engine="c", low_memory=False)
score_map: Dict[str, float] = {
    row.post_id + "_" + row.comment_id: row.offensiveness_score
    for row in df.itertuples()
}

In [5]:
%%time
# Load the comment texts, drop unusable rows, and attach the score looked up in
# `score_map` under the key "<post_id>_<comment_id>".
df = pd.read_csv("input/ruddit/ruddit_with_text.csv", engine="c", low_memory=False)
blacklist = {"[deleted]", "[removed]"}
rows = []
for t in df.itertuples():
    text = t.txt
    # read_csv parses an empty cell as NaN (a float), which has no .strip();
    # treat a missing text the same way as a blank one and skip it.
    if not isinstance(text, str):
        continue
    s = text.strip().lower()
    if not s or s in blacklist:
        continue
    k = t.post_id + "_" + t.comment_id
    # A key missing from score_map raises KeyError on purpose: every kept
    # comment must have a score.
    rows.append({"bws": score_map[k], "text": text})
# Naming the columns keeps the frame well-formed even if every row was filtered out.
df = pd.DataFrame.from_records(rows, columns=["bws", "text"])
df["bws"] = df["bws"].astype(np.float32)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5710 entries, 0 to 5709
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   bws     5710 non-null   float32
 1   text    5710 non-null   object 
dtypes: float32(1), object(1)
memory usage: 67.0+ KB
Wall time: 53 ms


In [6]:
# Per-column missing-value summary from the project-local `scml` helper; the
# displayed table lists Total, Percent and Type for each column.
scml.find_missing_values(df)

Unnamed: 0,Total,Percent,Type
bws,0,0.0,float32
text,0,0.0,object


# Severity rating label

In [7]:
# Turn the scores into unique integer ranks: method="ordinal" gives tied values
# distinct ranks in their order of appearance.
col = "label"
df[col] = rankdata(df["bws"], method="ordinal").astype(np.int32)

# Preprocess Text

In [8]:
def preprocess(row) -> str:
    """Return ``mylib.preprocess`` applied to the row's ``"text"`` field."""
    raw_text = row["text"]
    return mylib.preprocess(raw_text)


# Replace the raw text with its preprocessed form, row by row (with a progress bar).
col = "text"
cleaned_text = df.progress_apply(preprocess, axis=1)
df[col] = cleaned_text

100%|█████████████████████████████████████████| 5710/5710 [00:02<00:00, 2380.36it/s]


In [9]:
# Eyeball a random handful of comments ordered by score.
# - random_state pins the draw so Restart & Run All shows the same rows.
# - min(...) keeps the cell working on frames with fewer than 40 rows.
# - the former trailing .head(40) was a no-op after sampling at most 40 rows.
cols = ["bws", "text"]
df[cols].sample(min(40, len(df)), random_state=0).sort_values("bws")

Unnamed: 0,bws,text
329,-0.63,"it is not a direct comparison, you are right."
1216,-0.604,Can you give us some examples of the food?
2610,-0.542,"Greetings r/headphones, I have the status audio OB-1 headphones (which are somic v2/mh463 rebrands, I found that out too late) and I do not like the ear pads, they are too shallow. I have no idea what good universal earpads are, can someone help me?"
4284,-0.521,didnt he actually do that though?
3217,-0.5,I never understood how to play minesweeper as a kid. Tbh I still do not understand
2040,-0.479,Bobby Ewing (from Dallas). I think I was like 5 when I watched that episode with my family.. still remember it like yesterday.
2279,-0.458,I try to have a have a treat waiting for me in the morning. Normally a breakfast food that i can look forward to
1473,-0.417,"Board games: Risk, Life, Monopoly, chess, checkers. I wish I could remember half the card games I used to play. Need a book of those for relevant occasions."
22,-0.354,OP is arguing that. Or at least that it is not a problem.
1454,-0.354,"Heck yes I would want to know. Granted I would probably ask first, it does not hurt to offer up that information."


# Character-level features

In [10]:
%%time
# Character count of each comment, stored compactly as int16.
col = "length"
df[col] = df["text"].str.len().astype(np.int16)

Wall time: 3.03 ms


In [11]:
def digit_frac(row) -> float:
    """Return ``mylib.digit_frac`` evaluated on the row's ``"text"`` field."""
    text = row["text"]
    return mylib.digit_frac(text)


def letter_frac(row) -> float:
    """Return ``mylib.letter_frac`` evaluated on the row's ``"text"`` field."""
    text = row["text"]
    return mylib.letter_frac(text)


def space_frac(row) -> float:
    """Return ``mylib.space_frac`` evaluated on the row's ``"text"`` field."""
    text = row["text"]
    return mylib.space_frac(text)


def punc_frac(row) -> float:
    """Return ``mylib.punc_frac`` evaluated on the row's ``"text"`` field."""
    text = row["text"]
    return mylib.punc_frac(text)


def upper_frac(row) -> float:
    """Return ``mylib.upper_frac`` evaluated on the row's ``"text"`` field."""
    text = row["text"]
    return mylib.upper_frac(text)

In [12]:
# Compute the `digit_frac` feature row by row and store it as float32.
col = "digit_frac"
df[col] = df.progress_apply(digit_frac, axis=1).astype(np.float32)

100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 38614.98it/s]


In [13]:
# Compute the `letter_frac` feature row by row and store it as float32.
col = "letter_frac"
df[col] = df.progress_apply(letter_frac, axis=1).astype(np.float32)

100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 38075.18it/s]


In [14]:
# Compute the `space_frac` feature row by row and store it as float32.
col = "space_frac"
df[col] = df.progress_apply(space_frac, axis=1).astype(np.float32)

100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 38320.76it/s]


In [15]:
# Compute the `punc_frac` feature row by row and store it as float32.
col = "punc_frac"
df[col] = df.progress_apply(punc_frac, axis=1).astype(np.float32)

100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 35240.91it/s]


In [16]:
# Compute the `upper_frac` feature row by row and store it as float32.
col = "upper_frac"
df[col] = df.progress_apply(upper_frac, axis=1).astype(np.float32)

100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 39101.70it/s]


# Textstat features

In [17]:
def syllable_count(row) -> int:
    """Return ``textstat.syllable_count`` for the row's ``"text"`` field."""
    text = row["text"]
    return textstat.syllable_count(text)


def lexicon_count(row) -> int:
    """Return ``textstat.lexicon_count`` for the row's ``"text"`` field."""
    text = row["text"]
    return textstat.lexicon_count(text)


def sentence_count(row) -> int:
    """Return ``textstat.sentence_count`` for the row's ``"text"`` field."""
    text = row["text"]
    return textstat.sentence_count(text)


def flesch_reading_ease(row) -> float:
    """Return ``textstat.flesch_reading_ease`` for the row's ``"text"`` field."""
    text = row["text"]
    return textstat.flesch_reading_ease(text)


def flesch_kincaid_grade(row) -> float:
    """Return ``textstat.flesch_kincaid_grade`` for the row's ``"text"`` field."""
    text = row["text"]
    return textstat.flesch_kincaid_grade(text)


def gunning_fog(row) -> float:
    """Return ``textstat.gunning_fog`` for the row's ``"text"`` field."""
    text = row["text"]
    return textstat.gunning_fog(text)


def smog_index(row) -> float:
    """Return ``textstat.smog_index`` for the row's ``"text"`` field."""
    text = row["text"]
    return textstat.smog_index(text)


def automated_readability_index(row) -> float:
    """Return ``textstat.automated_readability_index`` for the row's ``"text"`` field."""
    text = row["text"]
    return textstat.automated_readability_index(text)


def coleman_liau_index(row) -> float:
    """Return ``textstat.coleman_liau_index`` for the row's ``"text"`` field."""
    text = row["text"]
    return textstat.coleman_liau_index(text)


def linsear_write_formula(row) -> float:
    """Return ``textstat.linsear_write_formula`` for the row's ``"text"`` field."""
    text = row["text"]
    return textstat.linsear_write_formula(text)


def dale_chall_readability_score(row) -> float:
    """Return ``textstat.dale_chall_readability_score`` for the row's ``"text"`` field."""
    text = row["text"]
    return textstat.dale_chall_readability_score(text)

In [18]:
# Compute the `flesch_reading_ease` feature row by row and store it as float32.
col = "flesch_reading_ease"
df[col] = df.progress_apply(flesch_reading_ease, axis=1).astype(np.float32)

100%|█████████████████████████████████████████| 5710/5710 [00:00<00:00, 6022.72it/s]


In [19]:
# Compute the `flesch_kincaid_grade` feature row by row and store it as float32.
col = "flesch_kincaid_grade"
df[col] = df.progress_apply(flesch_kincaid_grade, axis=1).astype(np.float32)

100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 12546.11it/s]


In [20]:
# Compute the `syllable_count` feature row by row and store it as int16.
col = "syllable_count"
df[col] = df.progress_apply(syllable_count, axis=1).astype(np.int16)

100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 19285.77it/s]


In [21]:
# Compute the `lexicon_count` feature row by row and store it as int16.
col = "lexicon_count"
df[col] = df.progress_apply(lexicon_count, axis=1).astype(np.int16)

100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 63423.84it/s]


In [22]:
# Compute the `sentence_count` feature row by row and store it as int16.
col = "sentence_count"
df[col] = df.progress_apply(sentence_count, axis=1).astype(np.int16)

100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 39551.65it/s]


In [23]:
# Compute the `gunning_fog` feature row by row and store it as float32.
col = "gunning_fog"
df[col] = df.progress_apply(gunning_fog, axis=1).astype(np.float32)

100%|█████████████████████████████████████████| 5710/5710 [00:00<00:00, 9994.88it/s]


In [24]:
# Compute the `smog_index` feature row by row and store it as float32.
col = "smog_index"
df[col] = df.progress_apply(smog_index, axis=1).astype(np.float32)

100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 16130.63it/s]


In [25]:
# Compute the `automated_readability_index` feature row by row and store it as float32.
col = "automated_readability_index"
df[col] = df.progress_apply(automated_readability_index, axis=1).astype(np.float32)

100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 27058.16it/s]


In [26]:
# Compute the `coleman_liau_index` feature row by row and store it as float32.
col = "coleman_liau_index"
df[col] = df.progress_apply(coleman_liau_index, axis=1).astype(np.float32)

100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 23881.94it/s]


In [27]:
# Compute the `linsear_write_formula` feature row by row and store it as float32.
col = "linsear_write_formula"
df[col] = df.progress_apply(linsear_write_formula, axis=1).astype(np.float32)

100%|████████████████████████████████████████| 5710/5710 [00:00<00:00, 11444.78it/s]


In [28]:
# Compute the `dale_chall_readability_score` feature row by row and store it as float32.
col = "dale_chall_readability_score"
df[col] = df.progress_apply(dale_chall_readability_score, axis=1).astype(np.float32)

100%|█████████████████████████████████████████| 5710/5710 [00:00<00:00, 9930.65it/s]


# HateBERT labels

In [29]:
# Plain Python list of the comment texts; the model cells below consume it.
sentences = df["text"].tolist()
# All HateBERT models use the same tokenizer, so it is loaded from one checkpoint only.
tokenizer_dir = conf.hatebert_models["hb_hatebert_offenseval"]
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_dir,
    model_max_length=conf.hatebert_model_max_length,
)
print(f"{repr(tokenizer)}\nmodel_input_names={tokenizer.model_input_names}")

PreTrainedTokenizerFast(name_or_path='pretrained/hatebert/hatebert-offenseval', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
model_input_names=['input_ids', 'token_type_ids', 'attention_mask']


In [30]:
%%time
# Tokenize every comment once: truncate to the tokenizer's max length and pad
# each sequence up to that same length.
x = tokenizer(
    sentences,
    padding="max_length",
    truncation=True,
)
print(f"{repr(x.keys())}\nlen={len(x['input_ids'])}")

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
len=5710
Wall time: 1.07 s


In [31]:
# Score every comment with each HateBERT checkpoint listed in the config.
# For each model, the softmax probability at class index 1 is stored as a
# float32 column named after the model's key in `conf.hatebert_models`.
batches = torch.utils.data.DataLoader(mylib.Dataset(x), batch_size=conf.hatebert_batch_size, shuffle=False)
for col, model_dir in conf.hatebert_models.items():
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    model.eval()
    model.to(conf.device)
    # Gather per-batch logits on the CPU and concatenate once after the loop;
    # calling torch.cat inside the loop re-copies the growing tensor every step.
    chunks = []
    with torch.no_grad():
        for batch in tqdm(batches):
            # Build a new dict instead of mutating the batch while iterating it.
            batch = {k: v.to(conf.device) for k, v in batch.items()}
            outputs = model(**batch)
            chunks.append(outputs.logits.detach().cpu())
    # After the softmax, `logits` holds class probabilities; the name is kept
    # so the printed label below stays unchanged.
    logits = torch.nn.functional.softmax(torch.cat(chunks, 0), dim=1)
    print(f"{logits.size()}\nlogits[:10]={logits[:10]}")
    # Hand pandas a NumPy array rather than a torch tensor.
    df[col] = logits[:, 1].numpy().astype(np.float32)
    # Release this model before the next one is loaded, so two models are never
    # resident at once and none stays on the GPU after the loop.
    del model
    gc.collect()
    if conf.device.type == "cuda":
        torch.cuda.empty_cache()

100%|███████████████████████████████████████████████| 45/45 [04:12<00:00,  5.60s/it]


torch.Size([5710, 2])
logits[:10]=tensor([[0.9598, 0.0402],
        [0.7718, 0.2282],
        [0.7567, 0.2433],
        [0.9064, 0.0936],
        [0.8356, 0.1644],
        [0.7731, 0.2269],
        [0.5952, 0.4048],
        [0.8502, 0.1498],
        [0.7770, 0.2230],
        [0.9258, 0.0742]])


100%|███████████████████████████████████████████████| 45/45 [04:11<00:00,  5.59s/it]


torch.Size([5710, 2])
logits[:10]=tensor([[0.9910, 0.0090],
        [0.9717, 0.0283],
        [0.9545, 0.0455],
        [0.9631, 0.0369],
        [0.9727, 0.0273],
        [0.8744, 0.1256],
        [0.9001, 0.0999],
        [0.8544, 0.1456],
        [0.8708, 0.1292],
        [0.9881, 0.0119]])


100%|███████████████████████████████████████████████| 45/45 [04:13<00:00,  5.63s/it]


torch.Size([5710, 2])
logits[:10]=tensor([[0.9377, 0.0623],
        [0.5526, 0.4474],
        [0.7553, 0.2447],
        [0.7828, 0.2172],
        [0.8740, 0.1260],
        [0.9279, 0.0721],
        [0.8304, 0.1696],
        [0.7886, 0.2114],
        [0.8038, 0.1962],
        [0.8965, 0.1035]])


100%|███████████████████████████████████████████████| 45/45 [04:11<00:00,  5.59s/it]

torch.Size([5710, 2])
logits[:10]=tensor([[0.9846, 0.0154],
        [0.9717, 0.0283],
        [0.9570, 0.0430],
        [0.9699, 0.0301],
        [0.9700, 0.0300],
        [0.9775, 0.0225],
        [0.9800, 0.0200],
        [0.9733, 0.0267],
        [0.9407, 0.0593],
        [0.9795, 0.0205]])





# Detoxify labels

In [32]:
# Settings passed to every mylib.detoxify_labels call below.
batch_size = 256
model_max_length = 512
# Names of the Detoxify feature columns; each Detoxify cell appends to this list.
dtfy_fs = []

In [33]:
%%time
# Detoxify "toxic_original" checkpoint: each returned label becomes a float32
# column named "dto_<label>".
prefix = "dto_"
res = mylib.detoxify_labels(
    sentences,
    checkpoint="pretrained/unitaryai/detoxify/toxic_original-c1212f89.ckpt",
    config_dir="pretrained/bert-base-uncased",
    model_max_length=model_max_length,
    device=conf.device,
    batch_size=batch_size
)
for k, v in res.items():
    col = prefix + k
    df[col] = v
    df[col] = df[col].astype(np.float32)
    # Register each column once, so re-running this cell cannot put duplicate
    # names into dtfy_fs (and, later, duplicate columns into df[cols]).
    if col not in dtfy_fs:
        dtfy_fs.append(col)
gc.collect()

Wall time: 1min 18s


13

In [34]:
%%time
# Detoxify "toxic_debiased" checkpoint: each returned label becomes a float32
# column named "dtu_<label>".
prefix = "dtu_"
res = mylib.detoxify_labels(
    sentences,
    checkpoint="pretrained/unitaryai/detoxify/toxic_debiased-c7548aa0.ckpt",
    config_dir="pretrained/roberta-base",
    model_max_length=model_max_length,
    device=conf.device,
    batch_size=batch_size
)
for k, v in res.items():
    col = prefix + k
    df[col] = v
    df[col] = df[col].astype(np.float32)
    # Register each column once, so re-running this cell cannot put duplicate
    # names into dtfy_fs (and, later, duplicate columns into df[cols]).
    if col not in dtfy_fs:
        dtfy_fs.append(col)
gc.collect()

Wall time: 1min 14s


4

In [35]:
%%time
# Detoxify "multilingual_debiased" checkpoint: each returned label becomes a
# float32 column named "dtm_<label>".
prefix = "dtm_"
res = mylib.detoxify_labels(
    sentences,
    checkpoint="pretrained/unitaryai/detoxify/multilingual_debiased-0b549669.ckpt",
    config_dir="pretrained/xlm-roberta-base",
    model_max_length=model_max_length,
    device=conf.device,
    batch_size=batch_size
)
for k, v in res.items():
    col = prefix + k
    df[col] = v
    df[col] = df[col].astype(np.float32)
    # Register each column once, so re-running this cell cannot put duplicate
    # names into dtfy_fs (and, later, duplicate columns into df[cols]).
    if col not in dtfy_fs:
        dtfy_fs.append(col)
gc.collect()

Wall time: 1min 23s


0

In [36]:
# Show the Detoxify feature column names collected by the cells above.
print(dtfy_fs)

['dto_toxicity', 'dto_severe_toxicity', 'dto_obscene', 'dto_threat', 'dto_insult', 'dto_identity_attack', 'dtu_toxicity', 'dtu_severe_toxicity', 'dtu_obscene', 'dtu_identity_attack', 'dtu_insult', 'dtu_threat', 'dtu_sexual_explicit', 'dtm_toxicity', 'dtm_severe_toxicity', 'dtm_obscene', 'dtm_identity_attack', 'dtm_insult', 'dtm_threat', 'dtm_sexual_explicit']


# Embeddings

In [37]:
# Encode every comment into a sentence embedding (returned as a NumPy array).
model = SentenceTransformer(
    "pretrained/sentence-transformers/paraphrase-MiniLM-L6-v2",
    device=conf.device,
)
model.max_seq_length = 128
em = model.encode(
    sentences=sentences,
    batch_size=1000,
    show_progress_bar=True,
    convert_to_numpy=True,
)
print(f"em.shape={em.shape}")

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

em.shape=(5710, 384)


In [38]:
%%time
# Attach the embedding matrix to `df` as float32 columns zz0000, zz0001, ...
em_size = em.shape[1]
em_cols = [f"zz{i:04d}" for i in range(em_size)]
# Build all embedding columns as one block and join them in a single concat.
# Assigning hundreds of columns through df[em_cols] = ... inserts them one by
# one and triggers pandas' "highly fragmented" PerformanceWarning.
em_df = pd.DataFrame(em, index=df.index, columns=em_cols, dtype=np.float32)
# Dropping any existing zz* columns first keeps the cell from duplicating them
# if it is executed again.
df = pd.concat([df.drop(columns=em_cols, errors="ignore"), em_df], axis=1)
# The raw sentence list is no longer needed once the embeddings are stored.
del sentences

Wall time: 240 ms


  self[col] = igetitem(value, i)


# Review data

In [39]:
# Constant int8 column of zeros.
col = "worker"
df[col] = np.zeros(len(df), dtype=np.int8)
# Feature groups, in the order they should appear in the exported frame.
char_fs = [
    "length",
    "digit_frac",
    "letter_frac",
    "space_frac",
    "punc_frac",
    "upper_frac",
]
textstat_fs = [
    "syllable_count",
    "lexicon_count",
    "sentence_count",
    "flesch_reading_ease",
    "flesch_kincaid_grade",
    "gunning_fog",
    "smog_index",
    "automated_readability_index",
    "coleman_liau_index",
    "linsear_write_formula",
    "dale_chall_readability_score",
]
cols = ["label", "bws", "worker", *char_fs, *textstat_fs, *dtfy_fs, *conf.hatebert_models.keys()]
# Summary statistics at the percentiles configured near the top of the notebook.
df[cols].describe(percentiles=percentiles)

Unnamed: 0,label,bws,worker,length,digit_frac,letter_frac,space_frac,punc_frac,upper_frac,syllable_count,lexicon_count,sentence_count,flesch_reading_ease,flesch_kincaid_grade,gunning_fog,smog_index,automated_readability_index,coleman_liau_index,linsear_write_formula,dale_chall_readability_score,dto_toxicity,dto_severe_toxicity,dto_obscene,dto_threat,dto_insult,dto_identity_attack,dtu_toxicity,dtu_severe_toxicity,dtu_obscene,dtu_identity_attack,dtu_insult,dtu_threat,dtu_sexual_explicit,dtm_toxicity,dtm_severe_toxicity,dtm_obscene,dtm_identity_attack,dtm_insult,dtm_threat,dtm_sexual_explicit,hb_bert_offenseval,hb_bert_abuseval,hb_hatebert_offenseval,hb_hatebert_abuseval
count,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0,5710.0
mean,2855.5,-0.027706,0.0,197.564098,0.003542,0.78883,0.177722,0.029905,0.030517,50.067426,36.371278,2.31979,75.253105,6.685254,8.998004,3.062995,7.482505,6.631683,8.562484,8.299163,0.177856,0.01308,0.113673,0.008411,0.059683,0.011454,0.195873,0.004698273,0.113027,0.015234,0.081492,0.01101,0.040167,0.203075,0.005742,0.105158,0.013474,0.083563,0.014735,0.044618,0.332577,0.166173,0.329977,0.144314
std,1648.47935,0.334195,0.0,172.016744,0.012983,0.034318,0.021473,0.02407,0.049035,44.035705,31.19425,1.722369,19.903004,4.430519,4.64589,4.695458,5.593331,3.883116,5.90646,2.270676,0.326334,0.052992,0.282462,0.056317,0.175381,0.064708,0.341383,0.02272458,0.285412,0.074354,0.20778,0.069685,0.149433,0.345334,0.027226,0.272925,0.075226,0.214901,0.078828,0.155314,0.332031,0.278667,0.284761,0.221301
min,1.0,-0.889,0.0,15.0,0.0,0.4,0.040541,0.0,0.0,3.0,2.0,1.0,-48.98,-2.5,0.8,0.0,-8.7,-10.16,0.0,0.2,0.000506,8e-05,0.000141,8.6e-05,0.000164,0.000121,0.000286,9.52907e-07,1.7e-05,5.2e-05,6.1e-05,1.2e-05,9e-06,0.000178,9e-06,6.3e-05,4.5e-05,9.5e-05,1.4e-05,1.2e-05,0.00886,0.002739,0.007778,0.007624
1%,58.09,-0.667,0.0,24.0,0.0,0.672759,0.117647,0.0,0.0,6.0,5.0,1.0,19.430301,-1.5,2.0,0.0,-3.0,-3.0037,1.5,0.35,0.000556,8.7e-05,0.000158,9.6e-05,0.000171,0.000134,0.00036,1.087282e-06,2.1e-05,6.3e-05,9.2e-05,1.5e-05,1.1e-05,0.000281,1.3e-05,9.5e-05,6.6e-05,0.000149,2.1e-05,1.7e-05,0.013618,0.003827,0.021895,0.010283
5%,286.45,-0.521,0.0,33.0,0.0,0.733333,0.141414,0.0,0.005062,8.0,6.0,1.0,41.869999,0.5,2.4,0.0,-0.2,0.32,2.0,5.62,0.000614,9.3e-05,0.000165,0.000103,0.000175,0.000138,0.00041,1.200621e-06,2.4e-05,7e-05,0.000103,1.7e-05,1.2e-05,0.000357,1.6e-05,0.000115,7.8e-05,0.000177,2.7e-05,2e-05,0.02135,0.005177,0.036465,0.013695
10%,571.9,-0.426,0.0,42.0,0.0,0.75,0.151515,0.010417,0.008333,10.0,8.0,1.0,50.669998,1.7,3.2,0.0,1.29,1.82,2.741667,6.24,0.000668,9.7e-05,0.00017,0.000108,0.000177,0.00014,0.000454,1.271093e-06,2.6e-05,7.6e-05,0.000111,1.9e-05,1.3e-05,0.000417,1.9e-05,0.000129,8.6e-05,0.0002,3.1e-05,2.2e-05,0.029108,0.00643,0.049953,0.016057
20%,1142.8,-0.312,0.0,60.0,0.0,0.769841,0.161966,0.015385,0.011758,15.0,11.0,1.0,59.98,3.1,4.87,0.0,3.2,3.698,4.0,6.93,0.000797,0.000103,0.000176,0.000115,0.000181,0.000145,0.000572,1.407266e-06,3.1e-05,8.9e-05,0.000133,2.3e-05,1.6e-05,0.000578,2.3e-05,0.000159,9.9e-05,0.000247,3.7e-05,2.6e-05,0.045445,0.009145,0.078526,0.020993
30%,1713.7,-0.213,0.0,82.0,0.0,0.779661,0.168627,0.018692,0.014599,20.0,15.0,1.0,66.739998,4.4,6.364,0.0,4.6,4.93,5.0,7.38,0.001006,0.000108,0.000183,0.000121,0.000188,0.000151,0.00085,1.645227e-06,4.1e-05,0.000113,0.000177,3e-05,2.1e-05,0.000857,2.7e-05,0.000201,0.000118,0.000327,4.6e-05,3.2e-05,0.071743,0.012743,0.11306,0.027239
40%,2284.6,-0.146,0.0,106.0,0.0,0.786537,0.173913,0.021652,0.017857,27.0,20.0,1.0,72.144003,5.3,8.0,0.0,5.8,5.9,6.0,7.82,0.001533,0.000114,0.000196,0.000127,0.000207,0.000165,0.001636,2.303226e-06,6.7e-05,0.000175,0.000311,4.8e-05,3.3e-05,0.001655,3.3e-05,0.000272,0.000152,0.000511,6.5e-05,4.5e-05,0.108833,0.018178,0.157839,0.035502


In [40]:
# Add the embedding columns to the export list. Only names not already present
# are appended, so re-running this cell cannot create duplicate column names.
cols += [c for c in em_cols if c not in cols]
df[cols].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5710 entries, 0 to 5709
Data columns (total 428 columns):
 #    Column                        Non-Null Count  Dtype  
---   ------                        --------------  -----  
 0    label                         5710 non-null   int32  
 1    bws                           5710 non-null   float32
 2    worker                        5710 non-null   int8   
 3    length                        5710 non-null   int16  
 4    digit_frac                    5710 non-null   float32
 5    letter_frac                   5710 non-null   float32
 6    space_frac                    5710 non-null   float32
 7    punc_frac                     5710 non-null   float32
 8    upper_frac                    5710 non-null   float32
 9    syllable_count                5710 non-null   int16  
 10   lexicon_count                 5710 non-null   int16  
 11   sentence_count                5710 non-null   int16  
 12   flesch_reading_ease           5710 non-null   

In [41]:
%%time
# Create the destination folder first; the write fails if it does not exist.
os.makedirs("output", exist_ok=True)
df[cols].to_parquet("output/ruddit.parquet", index=False)

Wall time: 224 ms
