In [1]:
import os
import json
import gc
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from typing import Dict, List, Set, Tuple, NamedTuple, Callable, Any
import textstat
import scipy
import scml
from scml import pandasx as pdx
from daigt.preprocess import en as pen
from daigt.features import fit_tokenizer
from warnings import simplefilter 
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
tim = scml.Timer()
tim.start()
os.environ["TOKENIZERS_PARALLELISM"] = "false"
percentiles=[.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]
pd.set_option("use_inf_as_na", True)
pd.set_option("max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option('max_colwidth', 9999)
tqdm.pandas()
scml.seed_everything()
info = np.iinfo(np.int16)
print(f"int16, min={info.min}, max={info.max}")

int16, min=-32768, max=32767


In [2]:
df = pd.read_parquet("input/white.parquet")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43529 entries, 0 to 43528
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   essay_id        43529 non-null  int32  
 1   generated       43529 non-null  int8   
 2   source          43529 non-null  object 
 3   prompt          43529 non-null  object 
 4   text            43529 non-null  object 
 5   text_bsc        43529 non-null  object 
 6   text_bow        43529 non-null  object 
 7   text_bow_len    43529 non-null  int16  
 8   prompt_bsc      43529 non-null  object 
 9   prompt_bow      43529 non-null  object 
 10  prompt_bow_len  43529 non-null  int16  
 11  white_sim       43529 non-null  float32
dtypes: float32(1), int16(2), int32(1), int8(1), object(7)
memory usage: 2.9+ MB


# Character level features

In [3]:
%%time
text_col = "text_bsc"
col = "ch_len"
df[col] = df[text_col].str.len()
df[col] = df[col].astype(np.int32)

def digit_frac(row) -> float:
    return pen.digit_frac(row[text_col])


def letter_frac(row) -> float:
    return pen.letter_frac(row[text_col])


def space_frac(row) -> float:
    return pen.space_frac(row[text_col])


def punc_frac(row) -> float:
    return pen.punc_frac(row[text_col])


def upper_frac(row) -> float:
    return pen.upper_frac(row[text_col])


def repeat_char_frac(row) -> float:
    return pen.repeat_char_frac(row[text_col])


def repeat_substring_frac(row) -> float:
    return pen.repeat_substring_frac(row[text_col])


char_fns: Dict[str, Callable] = {
    "ch_digit_frac": digit_frac,
    "ch_letter_frac": letter_frac,
    "ch_space_frac": space_frac,
    "ch_punc_frac": punc_frac,
    "ch_upper_frac": upper_frac,
    "ch_repeat_char_frac": repeat_char_frac,
    #"ch_repeat_substring_frac": repeat_substring_frac,
}

for col, fn in char_fns.items():
    print(col)
    df[col] = df.progress_apply(fn, axis=1)
    df[col] = df[col].astype(np.float32)

ch_digit_frac


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 43529/43529 [00:01<00:00, 22383.54it/s]


ch_letter_frac


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 43529/43529 [00:02<00:00, 20250.25it/s]


ch_space_frac


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 43529/43529 [00:02<00:00, 21659.27it/s]


ch_punc_frac


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 43529/43529 [00:01<00:00, 21873.17it/s]


ch_upper_frac


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 43529/43529 [00:01<00:00, 22168.33it/s]


ch_repeat_char_frac


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 43529/43529 [00:05<00:00, 7415.04it/s]

CPU times: user 15.8 s, sys: 97.9 ms, total: 15.9 s
Wall time: 16 s





# Word and Sentence Features

In [4]:
text_col = "text_bsc"

def sent_len_mean(row) -> float:
    return pen.sentence_length_mean(row[text_col])


def sent_len_std(row) -> float:
    return pen.sentence_length_std(row[text_col])


def sent_len_delta_mean(row) -> float:
    return pen.sentence_length_delta_mean(row[text_col])


def sent_len_delta_std(row) -> float:
    return pen.sentence_length_delta_std(row[text_col])


fmap: Dict[str, Callable] = {
    "ws_sent_len_mean": sent_len_mean,
    "ws_sent_len_std": sent_len_std,
    "ws_sent_len_delta_mean": sent_len_delta_mean,
    "ws_sent_len_delta_std": sent_len_delta_std,
}
for col, fn in fmap.items():
    print(col)
    df[col] = df.progress_apply(fn, axis=1)
    df[col] = df[col].astype(np.float32)

ws_sent_len_mean


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 43529/43529 [00:03<00:00, 11969.81it/s]


ws_sent_len_std


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 43529/43529 [00:04<00:00, 10096.60it/s]


ws_sent_len_delta_mean


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 43529/43529 [00:03<00:00, 11853.25it/s]


ws_sent_len_delta_std


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 43529/43529 [00:04<00:00, 10054.72it/s]


# Textstat Features

In [5]:
text_col = "text_bsc"

def syllable_count(row) -> int:
    return textstat.syllable_count(row[text_col])


def lexicon_count(row) -> int:
    return textstat.lexicon_count(row[text_col])


def sentence_count(row) -> int:
    return textstat.sentence_count(row[text_col])


def syllables_per_word(row) -> float:
    return row["ts_syllable_count"] / (row["ts_lexicon_count"] + 1)


def syllables_per_sent(row) -> float:
    return row["ts_syllable_count"] / (row["ts_sentence_count"] + 1)


def words_per_sent(row) -> float:
    return row["ts_lexicon_count"] / (row["ts_sentence_count"] + 1)
    

def polysyllable_frac(row) -> float:
    return textstat.polysyllabcount(row[text_col]) / (row["ts_lexicon_count"] + 1)


def monosyllable_frac(row) -> float:
    return textstat.monosyllabcount(row[text_col]) / (row["ts_lexicon_count"] + 1)


def flesch_reading_ease(row) -> float:
    return textstat.flesch_reading_ease(row[text_col])


def flesch_kincaid_grade(row) -> float:
    return textstat.flesch_kincaid_grade(row[text_col])


def gunning_fog(row) -> float:
    return textstat.gunning_fog(row[text_col])


def smog_index(row) -> float:
    return textstat.smog_index(row[text_col])


def automated_readability_index(row) -> float:
    return textstat.automated_readability_index(row[text_col])


def coleman_liau_index(row) -> float:
    return textstat.coleman_liau_index(row[text_col])


def linsear_write_formula(row) -> float:
    return textstat.linsear_write_formula(row[text_col])


def dale_chall_readability_score(row) -> float:
    return textstat.dale_chall_readability_score(row[text_col])


def difficult_words(row) -> float:
    return textstat.difficult_words(row[text_col])


def spache_readability(row) -> float:
    return textstat.spache_readability(row[text_col])


def mcalpine_eflaw(row) -> float:
    return textstat.mcalpine_eflaw(row[text_col])


stage1: List[Tuple[str, Callable, Any]] = [
    ("ts_syllable_count", syllable_count, np.int32),
    ("ts_lexicon_count", lexicon_count, np.int32),
    ("ts_sentence_count", sentence_count, np.int32),
]
stage2: List[Tuple[str, Callable, Any]] = [
    ("ts_syllables_per_word", syllables_per_word, np.float32),
    ("ts_syllables_per_sent", syllables_per_sent, np.float32),
    ("ts_words_per_sent", words_per_sent, np.float32),
    ("ts_polysyllable_frac", polysyllable_frac, np.float32),
    ("ts_monosyllable_frac", monosyllable_frac, np.float32),
    ("ts_flesch_reading_ease", flesch_reading_ease, np.float32),
    ("ts_flesch_kincaid_grade", flesch_kincaid_grade, np.float32),
    ("ts_gunning_fog", gunning_fog, np.float32),
    ("ts_smog_index", smog_index, np.float32),
    ("ts_automated_readability_index", automated_readability_index, np.float32),
    ("ts_coleman_liau_index", coleman_liau_index, np.float32),
    ("ts_linsear_write_formula", linsear_write_formula, np.float32),
    ("ts_dale_chall_readability_score", dale_chall_readability_score, np.float32),
    ("ts_difficult_words", difficult_words, np.float32),
    ("ts_spache_readability", spache_readability, np.float32),
    ("ts_mcalpine_eflaw", mcalpine_eflaw, np.float32),
]
for col, fn, dtype in stage1:
    print(col)
    df[col] = df.progress_apply(fn, axis=1)
    df[col] = df[col].astype(dtype)
for col, fn, dtype in stage2:
    print(col)
    df[col] = df.progress_apply(fn, axis=1)
    df[col] = df[col].astype(dtype)

ts_syllable_count


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 43529/43529 [00:07<00:00, 5910.63it/s]


ts_lexicon_count


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 43529/43529 [00:01<00:00, 34110.35it/s]


ts_sentence_count


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 43529/43529 [00:02<00:00, 18396.44it/s]


ts_syllables_per_word


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 43529/43529 [00:00<00:00, 371543.47it/s]


ts_syllables_per_sent


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 43529/43529 [00:00<00:00, 354855.49it/s]


ts_words_per_sent


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 43529/43529 [00:00<00:00, 376480.53it/s]


ts_polysyllable_frac


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 43529/43529 [00:08<00:00, 4954.16it/s]


ts_monosyllable_frac


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 43529/43529 [00:09<00:00, 4553.86it/s]


ts_flesch_reading_ease


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 43529/43529 [00:10<00:00, 4052.05it/s]


ts_flesch_kincaid_grade


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 43529/43529 [00:10<00:00, 4163.73it/s]


ts_gunning_fog


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 43529/43529 [00:11<00:00, 3686.51it/s]


ts_smog_index


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 43529/43529 [00:11<00:00, 3664.02it/s]


ts_automated_readability_index


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 43529/43529 [00:05<00:00, 8251.76it/s]


ts_coleman_liau_index


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 43529/43529 [00:05<00:00, 7618.12it/s]


ts_linsear_write_formula


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 43529/43529 [00:03<00:00, 11729.12it/s]


ts_dale_chall_readability_score


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 43529/43529 [00:11<00:00, 3919.98it/s]


ts_difficult_words


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 43529/43529 [00:07<00:00, 5839.56it/s]


ts_spache_readability


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 43529/43529 [00:11<00:00, 3905.75it/s]


ts_mcalpine_eflaw


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 43529/43529 [00:04<00:00, 9833.46it/s]


# VAD Features

In [6]:
vdf = pd.read_csv("input/NRC-VAD-Lexicon/NRC-VAD-Lexicon.txt", header=0, names=["term", "valence", "arousal", "dominance"], 
                 sep="\t", engine="c", low_memory=False)
cols = ["valence", "arousal", "dominance"]
vdf[cols] = vdf[cols].astype(np.float32)
vdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19970 entries, 0 to 19969
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   term       19969 non-null  object 
 1   valence    19970 non-null  float32
 2   arousal    19970 non-null  float32
 3   dominance  19970 non-null  float32
dtypes: float32(3), object(1)
memory usage: 390.2+ KB


In [7]:
vad: Dict[str, Tuple] = {}
for t in tqdm(vdf.itertuples()):
    v = getattr(t, "valence")
    a = getattr(t, "arousal")
    d = getattr(t, "dominance")
    t = str(getattr(t, "term")).strip().lower()
    vad[t] = (v, a, d)

19970it [00:00, 1353285.47it/s]


In [8]:
cols = ["va_valence_mean", "va_valence_std", "va_arousal_mean", "va_arousal_std", "va_dominance_mean", "va_dominance_std"]
rows = []
for text in tqdm(df["text_bow"]):
    vas, ars, dos = [], [], []
    tokens = text.split()
    for t in tokens:
        if t in vad:
            v, a, d = vad[t]
            vas.append(v)
            ars.append(a)
            dos.append(d)
    if len(vas) == 0:
        vas = [-1]
    if len(ars) == 0:
        ars = [-1]
    if len(dos)==0:
        dos = [-1]
    rows.append([np.mean(vas), np.std(vas), np.mean(ars), np.std(ars), np.mean(dos), np.std(dos)])
df[cols] = rows
df[cols] = df[cols].astype(np.float32)
del rows, vdf, vad
gc.collect()

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 43529/43529 [00:04<00:00, 10414.95it/s]


0

# TF-IDF Features

In [9]:
%%time
corpus = df["text_bow"].tolist()
tokenizer = fit_tokenizer(corpus=corpus)
dp = "output/tokenizer"
Path(dp).mkdir(parents=True, exist_ok=True)
tokenizer.save_pretrained(dp)
print(f"{repr(tokenizer)}\nmodel_input_names={tokenizer.model_input_names}")




PreTrainedTokenizerFast(name_or_path='', vocab_size=30522, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
model_input_names=['input_ids', 'token_type_ids', 'attention_mask']
CPU times: user 31.5 s, sys: 223 ms, total: 31.

In [10]:
%%time
tokenized = [tokenizer.tokenize(x) for x in corpus]
print(tokenized[0])

['Ġcars', 'Ġcars', 'Ġhave', 'Ġbeen', 'Ġaround', 'Ġsince', 'Ġthey', 'Ġbecame', 'Ġfamous', 'Ġin', 'Ġthe', 'Ġ1900', 's', 'Ġwhen', 'Ġhenry', 'Ġford', 'Ġcreated', 'Ġand', 'Ġbuilt', 'Ġthe', 'Ġfirst', 'Ġmodelt', 'Ġcars', 'Ġhave', 'Ġplayed', 'Ġa', 'Ġmajor', 'Ġrole', 'Ġin', 'Ġour', 'Ġevery', 'Ġday', 'Ġlives', 'Ġsince', 'Ġthen', 'Ġbut', 'Ġnow', 'Ġpeople', 'Ġare', 'Ġstarting', 'Ġto', 'Ġquestion', 'Ġif', 'Ġlimiting', 'Ġcar', 'Ġusage', 'Ġwould', 'Ġbe', 'Ġa', 'Ġgood', 'Ġthing', 'Ġto', 'Ġme', 'Ġlimiting', 'Ġthe', 'Ġuse', 'Ġof', 'Ġcars', 'Ġmight', 'Ġbe', 'Ġa', 'Ġgood', 'Ġthing', 'Ġto', 'Ġdo', 'Ġin', 'Ġlike', 'Ġmatter', 'Ġof', 'Ġthis', 'Ġarticle', 'Ġin', 'Ġgerman', 'Ġsuburb', 'Ġlife', 'Ġgoes', 'Ġon', 'Ġwithout', 'Ġcars', 'Ġby', 'Ġelizabeth', 'Ġrosenthal', 'Ġstates', 'Ġhow', 'Ġautomobiles', 'Ġare', 'Ġthe', 'Ġlinchpin', 'Ġof', 'Ġsuburbs', 'Ġwhere', 'Ġmiddle', 'Ġclass', 'Ġfamilies', 'Ġfrom', 'Ġeither', 'Ġshanghai', 'Ġor', 'Ġchicago', 'Ġtend', 'Ġto', 'Ġmake', 'Ġtheir', 'Ġhomes', 'Ġexperts', 'Ġsay', 'Ġhow',

In [11]:
%%time
vectorizer = TfidfVectorizer(
    ngram_range=(1,1), 
    lowercase=False, 
    sublinear_tf=True, 
    analyzer = 'word',
    tokenizer = lambda x: x,
    preprocessor = lambda x: x,
    token_pattern =None, 
    strip_accents=None
)
x = vectorizer.fit_transform(tokenized)
print(f"x={x.shape}")

x=(43529, 29787)
CPU times: user 2.16 s, sys: 53.8 ms, total: 2.21 s
Wall time: 2.22 s


In [12]:
%%time
cols = [f"tf_{f}" for f in vectorizer.get_feature_names_out()]
df[cols] = scipy.sparse.csr_matrix(x).todense().astype(np.float32)
df = df.copy()  # defragment

CPU times: user 27.6 s, sys: 5.2 s, total: 32.8 s
Wall time: 37.6 s


In [13]:
vocab = vectorizer.vocabulary_
print(f"{len(vocab):,} vocab")
idf = vectorizer.idf_
print(f"idf={idf.shape}")
with open("output/tfidf.json", "w") as f:
    json.dump({"vocabulary": vocab, "idf": idf.tolist()}, f)

29,787 vocab
idf=(29787,)


In [14]:
del vectorizer, corpus, vocab, idf
gc.collect()

32

# Review Data

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43529 entries, 0 to 43528
Columns: 29835 entries, essay_id to tf_Ġzygomatic
dtypes: float32(29820), int16(2), int32(5), int8(1), object(7)
memory usage: 4.8+ GB


In [16]:
features = []
prefixes = ["ch_", "ws_", "ts_", "va_", "tf_Ġhe"]
for col in df.columns:
    for prefix in prefixes:
        if col.startswith(prefix):
            features.append(col)
features.sort()
df[features].describe(percentiles=percentiles)

Unnamed: 0,ch_digit_frac,ch_len,ch_letter_frac,ch_punc_frac,ch_repeat_char_frac,ch_space_frac,ch_upper_frac,tf_Ġhe,tf_Ġhead,tf_Ġheadache,tf_Ġheadaches,tf_Ġheaded,tf_Ġheading,tf_Ġheadlights,tf_Ġheadline,tf_Ġheadlines,tf_Ġheadlong,tf_Ġheadphones,tf_Ġheadqu,tf_Ġheadquarters,tf_Ġheads,tf_Ġheadset,tf_Ġheadsets,tf_Ġheadstart,tf_Ġheal,tf_Ġhealed,tf_Ġhealing,tf_Ġhealth,tf_Ġhealthcare,tf_Ġhealthe,tf_Ġhealtheier,tf_Ġhealtheir,tf_Ġhealther,tf_Ġhealthier,tf_Ġhealthiest,tf_Ġhealthily,tf_Ġhealthiness,tf_Ġhealthy,tf_Ġhealtier,tf_Ġhealty,tf_Ġheap,tf_Ġhear,tf_Ġheard,tf_Ġhearing,tf_Ġhears,tf_Ġheart,tf_Ġheartache,tf_Ġheartbeat,tf_Ġheartbreaking,tf_Ġheartbroken,tf_Ġhearted,tf_Ġheartedly,tf_Ġheartless,tf_Ġhearts,tf_Ġheartwarming,tf_Ġheat,tf_Ġheated,tf_Ġheath,tf_Ġheathier,tf_Ġheathly,tf_Ġheathy,tf_Ġheating,tf_Ġheats,tf_Ġheatwaves,tf_Ġheav,tf_Ġheave,tf_Ġheaven,tf_Ġheavier,tf_Ġheavily,tf_Ġheavy,tf_Ġheck,tf_Ġhectic,tf_Ġhed,tf_Ġheed,tf_Ġhefty,tf_Ġheid,tf_Ġheidr,tf_Ġheidren,tf_Ġheidrum,tf_Ġheidrun,tf_Ġheight,tf_Ġheighten,tf_Ġheightened,tf_Ġheightens,tf_Ġheights,tf_Ġheir,tf_Ġhel,tf_Ġheld,tf_Ġhelf,tf_Ġhelic,tf_Ġhelicopter,tf_Ġhell,tf_Ġhellish,tf_Ġhello,tf_Ġhelm,tf_Ġhelo,tf_Ġhelp,tf_Ġhelpe,tf_Ġhelped,tf_Ġhelper,tf_Ġhelpers,tf_Ġhelpes,tf_Ġhelpful,tf_Ġhelpfull,tf_Ġhelpfully,tf_Ġhelpfulness,tf_Ġhelpin,tf_Ġhelping,tf_Ġhelpless,tf_Ġhelplessness,tf_Ġhelps,tf_Ġhelthier,tf_Ġhem,tf_Ġhemat,tf_Ġhen,tf_Ġhence,tf_Ġhenceforth,tf_Ġhenry,tf_Ġhep,tf_Ġhepl,tf_Ġhepled,tf_Ġhepls,tf_Ġher,tf_Ġherd,tf_Ġherding,tf_Ġherds,tf_Ġhere,tf_Ġhereby,tf_Ġheres,tf_Ġheritage,tf_Ġhero,tf_Ġheroes,tf_Ġheroic,tf_Ġheroin,tf_Ġheros,tf_Ġhers,tf_Ġherself,tf_Ġhes,tf_Ġhese,tf_Ġheshe,tf_Ġhesitant,tf_Ġhesitate,tf_Ġhesitation,tf_Ġhet,tf_Ġhev,tf_Ġhew,tf_Ġhey,ts_automated_readability_index,ts_coleman_liau_index,ts_dale_chall_readability_score,ts_difficult_words,ts_flesch_kincaid_grade,ts_flesch_reading_ease,ts_gunning_fog,ts_lexicon_count,ts_linsear_write_formula,ts_mcalpine_eflaw,ts_monosyllable_frac,ts_polysyllable_frac,ts_sentence_count,ts_smog_index,ts_spache_readability,ts_syllable_count,ts_syllables_per_sent,ts_syllables_per_word,ts_words_per_sent,va_arousal_mean,va_arousal_std,va_dominance_mean,va_dominance_std,va_valence_mean,va_valence_std,ws_sent_len_delta_mean,ws_sent_len_delta_std,ws_sent_len_mean,ws_sent_len_std
count,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0,43529.0
mean,0.001445,2215.16667,0.801743,0.020859,0.017128,0.175953,0.015311,0.011612,0.002213,8.3e-05,0.000153,0.000263,0.000227,2.2e-05,2.8e-05,4.2e-05,1.6e-05,0.000121,1.6e-05,3.4e-05,0.000615,9e-05,7.4e-05,1.9e-05,0.000119,2.1e-05,7.3e-05,0.00562,0.000462,1.1e-05,1.6e-05,2.8e-05,2.8e-05,0.002254,6.4e-05,3.2e-05,1.6e-05,0.003335,1.7e-05,5.4e-05,3.9e-05,0.002395,0.002237,0.001209,9.8e-05,0.001421,1.9e-05,4.2e-05,4.2e-05,2.7e-05,8.3e-05,9.5e-05,2e-05,0.000191,2.1e-05,0.001153,8.3e-05,8.7e-05,4.2e-05,2.5e-05,3.5e-05,9.1e-05,2.5e-05,3e-05,2.5e-05,1.9e-05,7.1e-05,7.9e-05,0.000935,0.000656,0.000122,0.000175,2e-05,4.7e-05,6.5e-05,2.3e-05,2.4e-05,1.5e-05,0.000121,0.00089,0.00059,3.6e-05,8.9e-05,1.7e-05,0.00016,5.8e-05,9.7e-05,0.001212,1.2e-05,2e-06,5.7e-05,0.00012,3.8e-05,0.000538,1.5e-05,1.9e-05,0.022826,3e-05,0.003219,0.000108,4.6e-05,1.9e-05,0.00478,0.000153,2.9e-05,6.3e-05,2.4e-05,0.006193,8.8e-05,4.3e-05,0.007234,2.2e-05,5.5e-05,1.1e-05,4.1e-05,0.000195,2.2e-05,6e-05,0.000116,4.6e-05,1.6e-05,3.2e-05,0.00634,0.000122,7.4e-05,1.4e-05,0.005336,1.5e-05,0.000118,0.000126,0.000209,3.5e-05,1.9e-05,5e-05,1.5e-05,0.000131,0.000422,4.2e-05,2.1e-05,4.8e-05,0.000324,0.000146,6.9e-05,1.8e-05,2.2e-05,2.5e-05,0.003334,11.394773,9.308725,7.391437,48.213787,9.537492,64.520241,10.867484,388.173379,11.163964,29.714437,0.709186,0.10019,19.82414,10.94948,4.945193,552.469158,27.959562,1.423175,19.693209,0.443105,0.147989,0.565926,0.158486,0.64828,0.173153,10.801738,8.017398,21.917425,9.487012
std,0.002908,972.667736,0.016763,0.008192,0.005058,0.014197,0.015981,0.029766,0.013301,0.003342,0.004163,0.005158,0.004986,0.001781,0.002067,0.002426,0.001517,0.00403,0.001304,0.002113,0.007632,0.003245,0.00311,0.001879,0.003963,0.00169,0.002926,0.021056,0.007591,0.001266,0.001692,0.002166,0.001976,0.015189,0.003011,0.002179,0.001562,0.017723,0.001666,0.002822,0.002143,0.014829,0.013324,0.011156,0.003592,0.01101,0.001836,0.00237,0.00231,0.002144,0.003237,0.003423,0.001857,0.004554,0.002082,0.010277,0.003019,0.003416,0.00288,0.00258,0.002125,0.003236,0.001904,0.001812,0.001788,0.001667,0.002832,0.003175,0.008654,0.007507,0.004041,0.004695,0.001591,0.002678,0.002702,0.001544,0.001689,0.001281,0.003504,0.007784,0.007471,0.002157,0.00334,0.001434,0.004481,0.002787,0.003558,0.010222,0.001937,0.000503,0.003031,0.003746,0.00204,0.007591,0.001498,0.001762,0.028538,0.002592,0.015976,0.004102,0.002543,0.001647,0.01916,0.00488,0.002435,0.003373,0.001821,0.022348,0.003464,0.002666,0.022278,0.001945,0.002752,0.001571,0.002285,0.004444,0.001933,0.002685,0.003972,0.002634,0.001523,0.002089,0.023326,0.004165,0.003096,0.001291,0.018245,0.001457,0.003726,0.004195,0.005689,0.002241,0.001698,0.002582,0.001372,0.003888,0.006625,0.002317,0.0017,0.00223,0.006039,0.00393,0.002881,0.001543,0.00174,0.002137,0.01618,6.857967,2.75337,1.273843,27.11952,5.435087,18.290529,5.487936,169.51834,4.897887,19.704504,0.080358,0.053046,8.760654,2.330713,1.931602,242.159979,11.332106,0.156338,8.721013,0.029466,0.017812,0.037862,0.017603,0.038352,0.027736,10.547448,5.97628,14.220164,7.210461
min,0.0,773.0,0.593537,0.0,0.00089,0.123756,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4,1.19,0.88,2.0,0.6,-628.880005,3.3,113.0,2.6,9.8,0.367925,0.0,1.0,0.0,2.26,177.0,8.909091,1.044855,7.148936,0.284773,0.071186,0.417888,0.074543,0.468342,0.085901,0.0,0.0,4.259259,0.0
1%,0.0,853.0,0.760402,0.007393,0.007073,0.140428,0.002551,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.1,4.34,5.44,9.0,3.5,24.48,5.49,154.0,4.555555,15.3,0.494634,0.018605,5.0,6.3,3.03,212.0,13.5,1.166267,10.435544,0.363471,0.105877,0.490867,0.119532,0.555599,0.115425,3.652174,2.321767,10.723374,3.43532
5%,0.0,1010.0,0.77502,0.011016,0.009543,0.150965,0.006373,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.8,5.51,5.86,15.0,4.9,37.400002,6.64,183.0,5.75,18.4,0.561118,0.033755,8.0,7.6,3.44,250.0,16.576923,1.218374,12.534286,0.396832,0.119165,0.510062,0.131522,0.585618,0.130229,4.769231,3.263263,13.132807,4.358899
10%,0.0,1170.0,0.781438,0.013002,0.011024,0.156331,0.00748,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.7,6.15,6.08,19.0,5.7,44.07,7.33,211.0,6.5,20.0,0.596899,0.043207,10.0,8.2,3.69,290.0,18.5,1.249582,13.727273,0.408852,0.126106,0.520461,0.137485,0.600507,0.138647,5.444445,3.779323,14.533334,4.91165
20%,0.0,1431.0,0.788498,0.015286,0.012912,0.163957,0.008792,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,7.01,6.41,25.0,6.7,52.290001,8.26,255.0,7.571429,22.299999,0.643987,0.056306,13.0,9.0,4.03,355.0,21.142857,1.292971,15.263158,0.420732,0.133913,0.533104,0.14472,0.616913,0.149165,6.38371,4.509103,16.307692,5.706137
30%,0.0,1648.0,0.793193,0.016816,0.014358,0.169312,0.010044,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,7.65,6.69,31.0,7.6,58.549999,9.02,289.0,8.333333,24.0,0.674023,0.06713,15.0,9.6,4.3,408.0,23.200001,1.326316,16.4,0.428997,0.139104,0.543242,0.149768,0.628412,0.157177,7.173913,5.147587,17.61685,6.391748
40%,0.0,1846.0,0.79721,0.018164,0.015584,0.173413,0.01149,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.9,8.25,6.96,37.0,8.3,62.880001,9.69,321.0,9.0,25.5,0.697842,0.077348,17.0,10.2,4.55,459.0,25.043671,1.359259,17.423077,0.435987,0.143481,0.55282,0.153855,0.638414,0.164487,7.94623,5.78256,18.76923,7.073032


In [17]:
%%time
df.to_parquet(f"output/features.parquet", index=False)
assert df.notna().all(axis=None)

CPU times: user 12.6 s, sys: 874 ms, total: 13.5 s
Wall time: 13.7 s


In [18]:
tim.stop()
print(f"Total time taken {str(tim.elapsed)}")

Total time taken 0:04:25.911692
