In [1]:
import os
import json
import gc
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from typing import Dict, List, Set, Tuple, NamedTuple, Callable, Any
import textstat
import scipy
import scml
from scml import pandasx as pdx
from daigt.preprocess import en as pen
from daigt.features import fit_tokenizer
from warnings import simplefilter 
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
tim = scml.Timer()
tim.start()
os.environ["TOKENIZERS_PARALLELISM"] = "false"
percentiles=[.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]
pd.set_option("use_inf_as_na", True)
pd.set_option("max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option('max_colwidth', 9999)
tqdm.pandas()
scml.seed_everything()
info = np.iinfo(np.int16)
print(f"int16, min={info.min}, max={info.max}")

int16, min=-32768, max=32767


In [2]:
df = pd.read_parquet("input/white.parquet")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221612 entries, 0 to 221611
Data columns (total 13 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   row_id          221612 non-null  int32  
 1   essay_id        221612 non-null  object 
 2   generated       221612 non-null  int8   
 3   source          221612 non-null  object 
 4   prompt          221612 non-null  object 
 5   text            221612 non-null  object 
 6   text_bsc        221612 non-null  object 
 7   text_bow        221612 non-null  object 
 8   text_bow_len    221612 non-null  int16  
 9   prompt_bsc      221612 non-null  object 
 10  prompt_bow      221612 non-null  object 
 11  prompt_bow_len  221612 non-null  int16  
 12  white_sim       221612 non-null  float32
dtypes: float32(1), int16(2), int32(1), int8(1), object(8)
memory usage: 16.3+ MB


# Character level features

In [3]:
%%time
text_col = "text_bsc"
col = "ch_len"
df[col] = df[text_col].str.len()
df[col] = df[col].astype(np.int32)

def digit_frac(row) -> float:
    return pen.digit_frac(row[text_col])


def letter_frac(row) -> float:
    return pen.letter_frac(row[text_col])


def space_frac(row) -> float:
    return pen.space_frac(row[text_col])


def punc_frac(row) -> float:
    return pen.punc_frac(row[text_col])


def upper_frac(row) -> float:
    return pen.upper_frac(row[text_col])


def repeat_char_frac(row) -> float:
    return pen.repeat_char_frac(row[text_col])


def repeat_substring_frac(row) -> float:
    return pen.repeat_substring_frac(row[text_col])


char_fns: Dict[str, Callable] = {
    "ch_digit_frac": digit_frac,
    "ch_letter_frac": letter_frac,
    "ch_space_frac": space_frac,
    "ch_punc_frac": punc_frac,
    "ch_upper_frac": upper_frac,
    "ch_repeat_char_frac": repeat_char_frac,
    #"ch_repeat_substring_frac": repeat_substring_frac,
}

for col, fn in char_fns.items():
    print(col)
    df[col] = df.progress_apply(fn, axis=1)
    df[col] = df[col].astype(np.float32)

ch_digit_frac


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [00:25<00:00, 8727.65it/s]


ch_letter_frac


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [00:28<00:00, 7722.66it/s]


ch_space_frac


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [00:26<00:00, 8412.33it/s]


ch_punc_frac


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [00:26<00:00, 8302.85it/s]


ch_upper_frac


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [00:25<00:00, 8532.49it/s]


ch_repeat_char_frac


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [01:20<00:00, 2743.91it/s]

CPU times: user 3min 32s, sys: 991 ms, total: 3min 33s
Wall time: 3min 34s





# Word and Sentence Features

In [4]:
text_col = "text_bsc"

def sent_len_mean(row) -> float:
    return pen.sentence_length_mean(row[text_col])


def sent_len_std(row) -> float:
    return pen.sentence_length_std(row[text_col])


def sent_len_delta_mean(row) -> float:
    return pen.sentence_length_delta_mean(row[text_col])


def sent_len_delta_std(row) -> float:
    return pen.sentence_length_delta_std(row[text_col])


fmap: Dict[str, Callable] = {
    "ws_sent_len_mean": sent_len_mean,
    "ws_sent_len_std": sent_len_std,
    "ws_sent_len_delta_mean": sent_len_delta_mean,
    "ws_sent_len_delta_std": sent_len_delta_std,
}
for col, fn in fmap.items():
    print(col)
    df[col] = df.progress_apply(fn, axis=1)
    df[col] = df[col].astype(np.float32)

ws_sent_len_mean


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [00:48<00:00, 4530.26it/s]


ws_sent_len_std


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [00:52<00:00, 4221.23it/s]


ws_sent_len_delta_mean


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [00:49<00:00, 4450.18it/s]


ws_sent_len_delta_std


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [00:52<00:00, 4226.02it/s]


# Textstat Features

In [5]:
text_col = "text_bsc"

def syllable_count(row) -> int:
    return textstat.syllable_count(row[text_col])


def lexicon_count(row) -> int:
    return textstat.lexicon_count(row[text_col])


def sentence_count(row) -> int:
    return textstat.sentence_count(row[text_col])


def syllables_per_word(row) -> float:
    return row["ts_syllable_count"] / (row["ts_lexicon_count"] + 1)


def syllables_per_sent(row) -> float:
    return row["ts_syllable_count"] / (row["ts_sentence_count"] + 1)


def words_per_sent(row) -> float:
    return row["ts_lexicon_count"] / (row["ts_sentence_count"] + 1)
    

def polysyllable_frac(row) -> float:
    return textstat.polysyllabcount(row[text_col]) / (row["ts_lexicon_count"] + 1)


def monosyllable_frac(row) -> float:
    return textstat.monosyllabcount(row[text_col]) / (row["ts_lexicon_count"] + 1)


def flesch_reading_ease(row) -> float:
    return textstat.flesch_reading_ease(row[text_col])


def flesch_kincaid_grade(row) -> float:
    return textstat.flesch_kincaid_grade(row[text_col])


def gunning_fog(row) -> float:
    return textstat.gunning_fog(row[text_col])


def smog_index(row) -> float:
    return textstat.smog_index(row[text_col])


def automated_readability_index(row) -> float:
    return textstat.automated_readability_index(row[text_col])


def coleman_liau_index(row) -> float:
    return textstat.coleman_liau_index(row[text_col])


def linsear_write_formula(row) -> float:
    return textstat.linsear_write_formula(row[text_col])


def dale_chall_readability_score(row) -> float:
    return textstat.dale_chall_readability_score(row[text_col])


def difficult_words(row) -> float:
    return textstat.difficult_words(row[text_col])


def spache_readability(row) -> float:
    return textstat.spache_readability(row[text_col])


def mcalpine_eflaw(row) -> float:
    return textstat.mcalpine_eflaw(row[text_col])


stage1: List[Tuple[str, Callable, Any]] = [
    ("ts_syllable_count", syllable_count, np.int32),
    ("ts_lexicon_count", lexicon_count, np.int32),
    ("ts_sentence_count", sentence_count, np.int32),
]
stage2: List[Tuple[str, Callable, Any]] = [
    ("ts_syllables_per_word", syllables_per_word, np.float32),
    ("ts_syllables_per_sent", syllables_per_sent, np.float32),
    ("ts_words_per_sent", words_per_sent, np.float32),
    ("ts_polysyllable_frac", polysyllable_frac, np.float32),
    ("ts_monosyllable_frac", monosyllable_frac, np.float32),
    ("ts_flesch_reading_ease", flesch_reading_ease, np.float32),
    ("ts_flesch_kincaid_grade", flesch_kincaid_grade, np.float32),
    ("ts_gunning_fog", gunning_fog, np.float32),
    ("ts_smog_index", smog_index, np.float32),
    ("ts_automated_readability_index", automated_readability_index, np.float32),
    ("ts_coleman_liau_index", coleman_liau_index, np.float32),
    ("ts_linsear_write_formula", linsear_write_formula, np.float32),
    ("ts_dale_chall_readability_score", dale_chall_readability_score, np.float32),
    ("ts_difficult_words", difficult_words, np.float32),
    ("ts_spache_readability", spache_readability, np.float32),
    ("ts_mcalpine_eflaw", mcalpine_eflaw, np.float32),
]
for col, fn, dtype in stage1:
    print(col)
    df[col] = df.progress_apply(fn, axis=1)
    df[col] = df[col].astype(dtype)
for col, fn, dtype in stage2:
    print(col)
    df[col] = df.progress_apply(fn, axis=1)
    df[col] = df[col].astype(dtype)

ts_syllable_count


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [01:37<00:00, 2278.43it/s]


ts_lexicon_count


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [00:16<00:00, 13280.79it/s]


ts_sentence_count


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [00:32<00:00, 6772.49it/s]


ts_syllables_per_word


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [00:00<00:00, 371275.29it/s]


ts_syllables_per_sent


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [00:00<00:00, 369826.00it/s]


ts_words_per_sent


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [00:00<00:00, 365511.04it/s]


ts_polysyllable_frac


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [02:06<00:00, 1747.41it/s]


ts_monosyllable_frac


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [02:07<00:00, 1737.70it/s]


ts_flesch_reading_ease


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [02:20<00:00, 1580.37it/s]


ts_flesch_kincaid_grade


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [02:19<00:00, 1589.34it/s]


ts_gunning_fog


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [02:36<00:00, 1416.79it/s]


ts_smog_index


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [02:39<00:00, 1389.42it/s]


ts_automated_readability_index


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [01:07<00:00, 3305.20it/s]


ts_coleman_liau_index


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [01:20<00:00, 2749.29it/s]


ts_linsear_write_formula


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [00:23<00:00, 9347.10it/s]


ts_dale_chall_readability_score


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [02:35<00:00, 1421.44it/s]


ts_difficult_words


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [01:41<00:00, 2189.96it/s]


ts_spache_readability


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [02:34<00:00, 1434.42it/s]


ts_mcalpine_eflaw


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [01:01<00:00, 3627.40it/s]


# VAD Features

In [6]:
vdf = pd.read_csv("input/NRC-VAD-Lexicon/NRC-VAD-Lexicon.txt", header=0, names=["term", "valence", "arousal", "dominance"], 
                 sep="\t", engine="c", low_memory=False)
cols = ["valence", "arousal", "dominance"]
vdf[cols] = vdf[cols].astype(np.float32)
vdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19970 entries, 0 to 19969
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   term       19969 non-null  object 
 1   valence    19970 non-null  float32
 2   arousal    19970 non-null  float32
 3   dominance  19970 non-null  float32
dtypes: float32(3), object(1)
memory usage: 390.2+ KB


In [7]:
vad: Dict[str, Tuple] = {}
for t in tqdm(vdf.itertuples()):
    v = getattr(t, "valence")
    a = getattr(t, "arousal")
    d = getattr(t, "dominance")
    t = str(getattr(t, "term")).strip().lower()
    vad[t] = (v, a, d)

19970it [00:00, 1557837.54it/s]


In [8]:
cols = ["va_valence_mean", "va_valence_std", "va_arousal_mean", "va_arousal_std", "va_dominance_mean", "va_dominance_std"]
rows = []
for text in tqdm(df["text_bow"]):
    vas, ars, dos = [], [], []
    tokens = text.split()
    for t in tokens:
        if t in vad:
            v, a, d = vad[t]
            vas.append(v)
            ars.append(a)
            dos.append(d)
    if len(vas) == 0:
        vas = [-1]
    if len(ars) == 0:
        ars = [-1]
    if len(dos)==0:
        dos = [-1]
    rows.append([np.mean(vas), np.std(vas), np.mean(ars), np.std(ars), np.mean(dos), np.std(dos)])
df[cols] = rows
df[cols] = df[cols].astype(np.float32)
del rows, vdf, vad
gc.collect()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [00:41<00:00, 5390.31it/s]


32

# TF-IDF Features

In [9]:
%%time
corpus = df["text_bow"].tolist()
tokenizer = fit_tokenizer(corpus=corpus)
dp = "output/tokenizer"
Path(dp).mkdir(parents=True, exist_ok=True)
tokenizer.save_pretrained(dp)
print(f"{repr(tokenizer)}\nmodel_input_names={tokenizer.model_input_names}")




PreTrainedTokenizerFast(name_or_path='', vocab_size=30522, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
model_input_names=['input_ids', 'token_type_ids', 'attention_mask']
CPU times: user 13min 30s, sys: 2.54 s, total: 

In [10]:
%%time
tokenized = [tokenizer.tokenize(x) for x in corpus]
print(tokenized[0])

['Ġcars', 'Ġcars', 'Ġhave', 'Ġbeen', 'Ġaround', 'Ġsince', 'Ġthey', 'Ġbecame', 'Ġfamous', 'Ġin', 'Ġthe', 'Ġ1900', 's', 'Ġwhen', 'Ġhenry', 'Ġford', 'Ġcreated', 'Ġand', 'Ġbuilt', 'Ġthe', 'Ġfirst', 'Ġmodel', 't', 'Ġcars', 'Ġhave', 'Ġplayed', 'Ġa', 'Ġmajor', 'Ġrole', 'Ġin', 'Ġour', 'Ġevery', 'Ġday', 'Ġlives', 'Ġsince', 'Ġthen', 'Ġbut', 'Ġnow', 'Ġpeople', 'Ġare', 'Ġstarting', 'Ġto', 'Ġquestion', 'Ġif', 'Ġlimiting', 'Ġcar', 'Ġusage', 'Ġwould', 'Ġbe', 'Ġa', 'Ġgood', 'Ġthing', 'Ġto', 'Ġme', 'Ġlimiting', 'Ġthe', 'Ġuse', 'Ġof', 'Ġcars', 'Ġmight', 'Ġbe', 'Ġa', 'Ġgood', 'Ġthing', 'Ġto', 'Ġdo', 'Ġin', 'Ġlike', 'Ġmatter', 'Ġof', 'Ġthis', 'Ġarticle', 'Ġin', 'Ġgerman', 'Ġsuburb', 'Ġlife', 'Ġgoes', 'Ġon', 'Ġwithout', 'Ġcars', 'Ġby', 'Ġelizabeth', 'Ġrosenthal', 'Ġstates', 'Ġhow', 'Ġautomobiles', 'Ġare', 'Ġthe', 'Ġlin', 'ch', 'pin', 'Ġof', 'Ġsuburbs', 'Ġwhere', 'Ġmiddle', 'Ġclass', 'Ġfamilies', 'Ġfrom', 'Ġeither', 'Ġshanghai', 'Ġor', 'Ġchicago', 'Ġtend', 'Ġto', 'Ġmake', 'Ġtheir', 'Ġhomes', 'Ġexperts', 'Ġs

In [11]:
%%time
vectorizer = TfidfVectorizer(
    ngram_range=(1,1), 
    lowercase=False, 
    sublinear_tf=True, 
    analyzer = 'word',
    tokenizer = lambda x: x,
    preprocessor = lambda x: x,
    token_pattern =None, 
    strip_accents=None
)
x = vectorizer.fit_transform(tokenized)
print(f"x={x.shape}")

x=(221612, 30457)
CPU times: user 24.4 s, sys: 548 ms, total: 25 s
Wall time: 25 s


In [12]:
%%time
cols = [f"tf_{f}" for f in vectorizer.get_feature_names_out()]
df[cols] = scipy.sparse.csr_matrix(x).todense().astype(np.float32)
df = df.copy()  # defragment

CPU times: user 1min 19s, sys: 30.6 s, total: 1min 49s
Wall time: 2min 6s


In [13]:
vocab = vectorizer.vocabulary_
print(f"{len(vocab):,} vocab")
idf = vectorizer.idf_
print(f"idf={idf.shape}")
with open("output/tfidf.json", "w") as f:
    json.dump({"vocabulary": vocab, "idf": idf.tolist()}, f)

30,457 vocab
idf=(30457,)


In [14]:
del vectorizer, corpus, vocab, idf
gc.collect()

32

# Review Data

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221612 entries, 0 to 221611
Columns: 30506 entries, row_id to tf_Ġzyg
dtypes: float32(30490), int16(2), int32(5), int8(1), object(8)
memory usage: 25.2+ GB


In [16]:
features = []
prefixes = ["ch_", "ws_", "ts_", "va_", "tf_Ġhe"]
for col in df.columns:
    for prefix in prefixes:
        if col.startswith(prefix):
            features.append(col)
features.sort()
df[features].describe(percentiles=percentiles)

Unnamed: 0,ch_digit_frac,ch_len,ch_letter_frac,ch_punc_frac,ch_repeat_char_frac,ch_space_frac,ch_upper_frac,tf_Ġhe,tf_Ġhead,tf_Ġheadache,tf_Ġheadaches,tf_Ġheaded,tf_Ġheading,tf_Ġheadlines,tf_Ġheadphones,tf_Ġheadquart,tf_Ġheadquartered,tf_Ġheadquarters,tf_Ġheads,tf_Ġheal,tf_Ġhealed,tf_Ġhealing,tf_Ġhealth,tf_Ġhealthcare,tf_Ġhealthier,tf_Ġhealthy,tf_Ġhealy,tf_Ġhear,tf_Ġheard,tf_Ġhearing,tf_Ġhearings,tf_Ġhears,tf_Ġheart,tf_Ġheartbeat,tf_Ġhearted,tf_Ġhearts,tf_Ġheat,tf_Ġheated,tf_Ġheath,tf_Ġheather,tf_Ġheating,tf_Ġheav,tf_Ġheaven,tf_Ġheavenly,tf_Ġheavens,tf_Ġheavier,tf_Ġheavily,tf_Ġheavy,tf_Ġheb,tf_Ġhebrew,tf_Ġhebrews,tf_Ġheck,tf_Ġhect,tf_Ġhectares,tf_Ġhectic,tf_Ġhector,tf_Ġhed,tf_Ġhedge,tf_Ġhedging,tf_Ġheed,tf_Ġheels,tf_Ġhefty,tf_Ġhege,tf_Ġhegel,tf_Ġhegemon,tf_Ġhegemonic,tf_Ġhegemony,tf_Ġheid,tf_Ġheide,tf_Ġheidegger,tf_Ġheidel,tf_Ġheidelberg,tf_Ġheidrun,tf_Ġheight,tf_Ġheighten,tf_Ġheightened,tf_Ġheights,tf_Ġheil,tf_Ġhein,tf_Ġheine,tf_Ġheineken,tf_Ġheinem,tf_Ġheinemann,tf_Ġheinous,tf_Ġheinz,tf_Ġheir,tf_Ġhel,tf_Ġheld,tf_Ġhelen,tf_Ġhelic,tf_Ġhelicop,tf_Ġhelicopter,tf_Ġhelicopters,tf_Ġhelium,tf_Ġhell,tf_Ġhellen,tf_Ġhellenistic,tf_Ġheller,tf_Ġhello,tf_Ġhelm,tf_Ġhelmet,tf_Ġhelp,tf_Ġhelped,tf_Ġhelper,tf_Ġhelpful,tf_Ġhelping,tf_Ġhelpless,tf_Ġhelplessness,tf_Ġhelps,tf_Ġhem,tf_Ġhemat,tf_Ġheming,tf_Ġhemingway,tf_Ġhemis,tf_Ġhemisphere,tf_Ġhemod,tf_Ġhemodialysis,tf_Ġhemoglobin,tf_Ġhemorrh,tf_Ġhemorrhage,tf_Ġhemp,tf_Ġhen,tf_Ġhence,tf_Ġhenceforth,tf_Ġhend,tf_Ġhenderson,tf_Ġhendric,tf_Ġhenn,tf_Ġhenri,tf_Ġhenry,tf_Ġhens,tf_Ġhep,tf_Ġhepat,tf_Ġhepatitis,tf_Ġher,tf_Ġherald,tf_Ġherb,tf_Ġherbal,tf_Ġherbert,tf_Ġherbs,tf_Ġherd,tf_Ġhere,tf_Ġhered,tf_Ġhereditary,tf_Ġheredity,tf_Ġherein,tf_Ġherit,tf_Ġheritage,tf_Ġherm,tf_Ġherman,tf_Ġhermene,tf_Ġhern,tf_Ġhernand,tf_Ġhernandez,tf_Ġhero,tf_Ġherod,tf_Ġheroes,tf_Ġheroic,tf_Ġheroin,tf_Ġheroine,tf_Ġheroism,tf_Ġhers,tf_Ġherself,tf_Ġhershey,tf_Ġherz,tf_Ġherzberg,tf_Ġhes,tf_Ġhesit,tf_Ġhesitant,tf_Ġhesitate,tf_Ġhesitation,tf_Ġhess,tf_Ġhester,tf_Ġheter,tf_Ġheterogeneity,tf_Ġheterogeneous,tf_Ġheterosexual,tf_Ġheur,tf_Ġheuristic,tf_Ġheuristics,tf_Ġhew,tf_Ġhewitt,tf_Ġhewlett,tf_Ġhex,tf_Ġhey,tf_Ġhezb,tf_Ġhezbollah,ts_automated_readability_index,ts_coleman_liau_index,ts_dale_chall_readability_score,ts_difficult_words,ts_flesch_kincaid_grade,ts_flesch_reading_ease,ts_gunning_fog,ts_lexicon_count,ts_linsear_write_formula,ts_mcalpine_eflaw,ts_monosyllable_frac,ts_polysyllable_frac,ts_sentence_count,ts_smog_index,ts_spache_readability,ts_syllable_count,ts_syllables_per_sent,ts_syllables_per_word,ts_words_per_sent,va_arousal_mean,va_arousal_std,va_dominance_mean,va_dominance_std,va_valence_mean,va_valence_std,ws_sent_len_delta_mean,ws_sent_len_delta_std,ws_sent_len_mean,ws_sent_len_std
count,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0
mean,0.007001,5965.726445,0.802702,0.025395,0.016239,0.164901,0.021519,0.012499,0.002527,0.000217,0.000192,0.000498,0.000291,0.000122,0.000105,5.1e-05,9.9e-05,0.000412,0.000689,0.000325,9.7e-05,0.000509,0.008158,0.003063,0.000993,0.002993,7.4e-05,0.00161,0.001703,0.001011,0.000106,0.000176,0.002728,0.000121,0.000211,0.000442,0.001016,0.000372,0.000299,9.5e-05,0.000332,7.9e-05,0.000455,0.00013,8.4e-05,0.000147,0.001249,0.001253,5.6e-05,0.00011,4.4e-05,0.000121,2.7e-05,7.1e-05,0.000209,5.8e-05,0.000218,0.00013,9.8e-05,0.00012,8.9e-05,0.000126,9e-06,7.7e-05,2e-05,9.1e-05,0.000153,9.1e-05,1.5e-05,4.5e-05,3e-06,5.6e-05,0.000236,0.000604,0.000102,0.000289,0.000378,6.4e-05,0.000125,1.9e-05,2.8e-05,1.5e-05,0.000282,0.000103,5.6e-05,0.000111,0.000384,0.002293,0.000181,1.7e-05,2e-06,7.4e-05,5.8e-05,5.8e-05,0.000505,2.6e-05,4.8e-05,6.9e-05,0.000592,0.000184,6.7e-05,0.013667,0.003362,0.000105,0.002598,0.003545,0.000305,0.000154,0.005218,0.000171,6.9e-05,2e-06,0.000126,1.3e-05,0.000176,2e-06,2.6e-05,7e-05,3.9e-05,5.1e-05,5.7e-05,0.000275,0.00433,8.7e-05,0.000104,0.000188,5.8e-05,5.8e-05,0.000153,0.00063,8.1e-05,0.000113,4e-05,0.000124,0.009192,0.000171,0.000169,0.000112,0.000169,0.000111,0.000114,0.004719,1.6e-05,0.000217,0.000104,0.000178,8.5e-05,0.000914,6.7e-05,0.000151,4.8e-05,5.2e-05,1.6e-05,0.000107,0.000727,3.7e-05,0.000464,0.000237,0.000172,0.00017,0.000147,0.000201,0.001217,0.000123,8e-05,7.3e-05,0.000214,0.000131,0.000257,0.000282,0.000196,0.000195,6e-05,0.000124,0.000116,0.000162,0.000248,3e-06,5.8e-05,4.8e-05,5e-05,5.4e-05,0.000101,5.5e-05,0.001188,5.43232e-07,5.6e-05,12.585382,11.386264,8.19298,159.381882,10.705375,53.626904,11.441067,950.561526,12.163027,27.494675,0.645215,0.152362,50.153976,12.730854,5.110836,1543.855085,29.810389,1.570054,18.975353,0.448143,0.153978,0.580518,0.164823,0.631729,0.182594,9.789777,8.315752,19.064102,10.486661
std,0.00985,6241.571249,0.017911,0.009563,0.005521,0.013395,0.013531,0.022851,0.011352,0.004215,0.003922,0.005328,0.004579,0.003255,0.003558,0.001954,0.002396,0.004816,0.006511,0.004936,0.002919,0.006173,0.020895,0.015831,0.010182,0.014085,0.002915,0.010679,0.010409,0.008887,0.00283,0.003707,0.01214,0.003185,0.004221,0.005356,0.008848,0.005127,0.004982,0.002941,0.005914,0.002581,0.005866,0.003146,0.002542,0.003417,0.007728,0.007911,0.002642,0.003299,0.002273,0.003643,0.001532,0.003138,0.00442,0.00277,0.004504,0.003501,0.003348,0.002955,0.002907,0.003009,0.00096,0.003204,0.001337,0.002795,0.003733,0.00304,0.001302,0.002927,0.000548,0.001901,0.004665,0.006193,0.002786,0.004051,0.005068,0.002841,0.003417,0.001498,0.00247,0.00137,0.003797,0.002717,0.002609,0.002874,0.005647,0.009764,0.00406,0.001409,0.000652,0.002815,0.002186,0.002424,0.006266,0.001665,0.002257,0.002777,0.008113,0.00358,0.002757,0.021709,0.012971,0.003334,0.0133,0.015429,0.004545,0.003406,0.015743,0.003849,0.002346,0.000403,0.00486,0.000974,0.004323,0.000341,0.001847,0.002535,0.001772,0.002089,0.00236,0.004911,0.012558,0.002638,0.003376,0.004466,0.00265,0.002531,0.003931,0.006955,0.003188,0.00335,0.001698,0.003706,0.021617,0.003302,0.003948,0.003536,0.003721,0.003031,0.003506,0.015327,0.001404,0.003921,0.003087,0.003154,0.002871,0.007914,0.002627,0.003888,0.002186,0.002341,0.001577,0.003466,0.007991,0.002193,0.006144,0.004321,0.004423,0.004067,0.00381,0.004009,0.008226,0.003254,0.00284,0.002999,0.004746,0.003264,0.004698,0.004879,0.004054,0.004605,0.003238,0.003279,0.002821,0.003176,0.004995,0.000476,0.002452,0.002485,0.002414,0.002593,0.003294,0.002185,0.010856,0.0002557301,0.00285,5.368936,2.752176,1.085742,140.554245,4.290843,16.841059,4.195405,962.11098,5.047263,14.684708,0.075219,0.057724,50.310293,2.257807,1.467629,1628.733001,9.569697,0.16387,6.549414,0.029354,0.019396,0.038843,0.014807,0.043931,0.030532,7.678156,5.760149,10.566523,6.125455
min,0.0,767.0,0.387132,0.0,0.0,0.113198,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.3,0.56,0.74,2.0,0.6,-1254.800049,2.7,106.0,0.818182,6.6,0.367925,0.0,1.0,0.0,2.07,177.0,6.481818,1.044855,5.165803,0.273268,0.070332,0.364898,0.074764,0.383705,0.086386,0.0,0.0,3.364583,0.0
1%,0.0,876.11,0.747467,0.010554,0.007808,0.140908,0.006141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.7,4.7,5.81,14.0,4.2,22.549999,6.08,151.0,4.777778,15.6,0.499614,0.03213,7.0,7.2,3.24,220.0,14.674299,1.209363,10.764706,0.374117,0.113492,0.490502,0.128913,0.511928,0.123592,4.151587,2.871617,10.2,3.971511
5%,0.0,1113.0,0.773763,0.014257,0.010052,0.146473,0.008511,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.1,6.55,6.41,24.0,6.1,31.309999,7.75,194.0,6.0,18.6,0.537269,0.05483,10.0,8.8,3.85,280.0,18.76,1.286335,13.0,0.400779,0.125767,0.515888,0.141038,0.55309,0.138389,5.266667,3.901057,12.233572,5.184129
10%,0.0,1396.0,0.782778,0.016114,0.011198,0.149538,0.010172,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.4,7.54,6.86,32.0,7.2,35.57,8.61,244.0,6.857143,20.200001,0.556373,0.072089,13.0,9.7,4.16,351.0,21.078207,1.340996,14.176471,0.412587,0.131816,0.529456,0.146807,0.574162,0.146159,5.89567,4.499135,13.345454,5.992673
20%,0.0,1904.0,0.791171,0.018487,0.012605,0.153454,0.013261,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,8.94,7.41,47.0,8.5,41.5,9.59,321.0,7.875,22.200001,0.580328,0.097923,17.0,10.8,4.48,482.0,23.885714,1.419355,15.583333,0.425564,0.138607,0.546735,0.15326,0.597298,0.15601,6.732394,5.292834,14.769231,7.124855
30%,0.001003,2371.0,0.796525,0.020397,0.013655,0.156561,0.015954,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,10.03,7.73,65.0,9.4,45.150002,10.23,400.0,8.833333,23.6,0.599364,0.119792,21.0,11.7,4.7,604.0,25.857143,1.482116,16.594595,0.434135,0.143467,0.559747,0.157616,0.612412,0.163921,7.416667,5.945591,15.904762,8.041165
40%,0.002517,3173.0,0.800752,0.022169,0.014609,0.159581,0.01827,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.9,10.91,7.99,89.0,10.1,49.080002,10.77,548.0,10.666667,24.9,0.616948,0.13881,27.0,12.4,4.88,805.0,27.571428,1.535714,17.481823,0.441382,0.147763,0.570885,0.161264,0.624457,0.171503,8.061919,6.566246,16.944445,8.872805


In [17]:
%%time
df.to_parquet(f"output/features.parquet", index=False)
assert df.notna().all(axis=None)

CPU times: user 1min 7s, sys: 6.32 s, total: 1min 13s
Wall time: 1min 16s


In [18]:
tim.stop()
print(f"Total time taken {str(tim.elapsed)}")

Total time taken 0:56:59.878683
