In [1]:
import os
import json
import gc
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from typing import Dict, List, Set, Tuple, NamedTuple, Callable, Any
import textstat
import scipy
import scml
from scml import pandasx as pdx
from daigt.preprocess import en as pen
from daigt.features import fit_tokenizer
from warnings import simplefilter 
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
tim = scml.Timer()
tim.start()
os.environ["TOKENIZERS_PARALLELISM"] = "false"
percentiles=[.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]
pd.set_option("use_inf_as_na", True)
pd.set_option("max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option('max_colwidth', 9999)
tqdm.pandas()
scml.seed_everything()
info = np.finfo(np.float16)
print(f"float16, min={info.min}, max={info.max}")

float16, min=-65504.0, max=65504.0


In [2]:
df = pd.read_parquet("input/white.parquet")
df = df.drop(columns=["text", "prompt"])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221612 entries, 0 to 221611
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   row_id          221612 non-null  int32  
 1   essay_id        221612 non-null  object 
 2   generated       221612 non-null  int8   
 3   source          221612 non-null  object 
 4   text_bsc        221612 non-null  object 
 5   text_bow        221612 non-null  object 
 6   text_bow_len    221612 non-null  int16  
 7   prompt_bsc      221612 non-null  object 
 8   prompt_bow      221612 non-null  object 
 9   prompt_bow_len  221612 non-null  int16  
 10  white_sim       221612 non-null  float32
dtypes: float32(1), int16(2), int32(1), int8(1), object(6)
memory usage: 12.9+ MB


# Character level features

In [3]:
%%time
text_col = "text_bsc"
col = "ch_len"
df[col] = df[text_col].str.len()
df[col] = df[col].astype(np.int32)

def digit_frac(row) -> float:
    return pen.digit_frac(row[text_col])


def letter_frac(row) -> float:
    return pen.letter_frac(row[text_col])


def space_frac(row) -> float:
    return pen.space_frac(row[text_col])


def punc_frac(row) -> float:
    return pen.punc_frac(row[text_col])


def upper_frac(row) -> float:
    return pen.upper_frac(row[text_col])


def repeat_char_frac(row) -> float:
    return pen.repeat_char_frac(row[text_col])


def repeat_substring_frac(row) -> float:
    return pen.repeat_substring_frac(row[text_col])


char_fns: Dict[str, Callable] = {
    "ch_digit_frac": digit_frac,
    "ch_letter_frac": letter_frac,
    "ch_space_frac": space_frac,
    "ch_punc_frac": punc_frac,
    "ch_upper_frac": upper_frac,
    "ch_repeat_char_frac": repeat_char_frac,
    #"ch_repeat_substring_frac": repeat_substring_frac,
}

for col, fn in char_fns.items():
    print(col)
    df[col] = df.progress_apply(fn, axis=1)
    df[col] = df[col].astype(np.float32)

ch_digit_frac


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [00:25<00:00, 8693.03it/s]


ch_letter_frac


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [00:28<00:00, 7839.84it/s]


ch_space_frac


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [00:26<00:00, 8412.11it/s]


ch_punc_frac


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [00:26<00:00, 8349.49it/s]


ch_upper_frac


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [00:25<00:00, 8577.45it/s]


ch_repeat_char_frac


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [01:20<00:00, 2768.32it/s]

CPU times: user 3min 31s, sys: 724 ms, total: 3min 32s
Wall time: 3min 32s





# Word and Sentence Features

In [4]:
text_col = "text_bsc"

def sent_len_mean(row) -> float:
    return pen.sentence_length_mean(row[text_col])


def sent_len_std(row) -> float:
    return pen.sentence_length_std(row[text_col])


def sent_len_delta_mean(row) -> float:
    return pen.sentence_length_delta_mean(row[text_col])


def sent_len_delta_std(row) -> float:
    return pen.sentence_length_delta_std(row[text_col])


fmap: Dict[str, Callable] = {
    "ws_sent_len_mean": sent_len_mean,
    "ws_sent_len_std": sent_len_std,
    "ws_sent_len_delta_mean": sent_len_delta_mean,
    "ws_sent_len_delta_std": sent_len_delta_std,
}
for col, fn in fmap.items():
    print(col)
    df[col] = df.progress_apply(fn, axis=1)
    df[col] = df[col].astype(np.float32)

ws_sent_len_mean


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [00:47<00:00, 4620.25it/s]


ws_sent_len_std


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [00:50<00:00, 4352.09it/s]


ws_sent_len_delta_mean


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [00:49<00:00, 4519.06it/s]


ws_sent_len_delta_std


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [00:51<00:00, 4307.25it/s]


# Textstat Features

In [5]:
text_col = "text_bsc"

def syllable_count(row) -> int:
    return textstat.syllable_count(row[text_col])


def lexicon_count(row) -> int:
    return textstat.lexicon_count(row[text_col])


def sentence_count(row) -> int:
    return textstat.sentence_count(row[text_col])


def syllables_per_word(row) -> float:
    return row["ts_syllable_count"] / (row["ts_lexicon_count"] + 1)


def syllables_per_sent(row) -> float:
    return row["ts_syllable_count"] / (row["ts_sentence_count"] + 1)


def words_per_sent(row) -> float:
    return row["ts_lexicon_count"] / (row["ts_sentence_count"] + 1)
    

def polysyllable_frac(row) -> float:
    return textstat.polysyllabcount(row[text_col]) / (row["ts_lexicon_count"] + 1)


def monosyllable_frac(row) -> float:
    return textstat.monosyllabcount(row[text_col]) / (row["ts_lexicon_count"] + 1)


def flesch_reading_ease(row) -> float:
    return textstat.flesch_reading_ease(row[text_col])


def flesch_kincaid_grade(row) -> float:
    return textstat.flesch_kincaid_grade(row[text_col])


def gunning_fog(row) -> float:
    return textstat.gunning_fog(row[text_col])


def smog_index(row) -> float:
    return textstat.smog_index(row[text_col])


def automated_readability_index(row) -> float:
    return textstat.automated_readability_index(row[text_col])


def coleman_liau_index(row) -> float:
    return textstat.coleman_liau_index(row[text_col])


def linsear_write_formula(row) -> float:
    return textstat.linsear_write_formula(row[text_col])


def dale_chall_readability_score(row) -> float:
    return textstat.dale_chall_readability_score(row[text_col])


def difficult_words(row) -> float:
    return textstat.difficult_words(row[text_col])


def spache_readability(row) -> float:
    return textstat.spache_readability(row[text_col])


def mcalpine_eflaw(row) -> float:
    return textstat.mcalpine_eflaw(row[text_col])


stage1: List[Tuple[str, Callable, Any]] = [
    ("ts_syllable_count", syllable_count, np.int32),
    ("ts_lexicon_count", lexicon_count, np.int32),
    ("ts_sentence_count", sentence_count, np.int32),
]
stage2: List[Tuple[str, Callable, Any]] = [
    ("ts_syllables_per_word", syllables_per_word, np.float32),
    ("ts_syllables_per_sent", syllables_per_sent, np.float32),
    ("ts_words_per_sent", words_per_sent, np.float32),
    ("ts_polysyllable_frac", polysyllable_frac, np.float32),
    ("ts_monosyllable_frac", monosyllable_frac, np.float32),
    ("ts_flesch_reading_ease", flesch_reading_ease, np.float32),
    ("ts_flesch_kincaid_grade", flesch_kincaid_grade, np.float32),
    ("ts_gunning_fog", gunning_fog, np.float32),
    ("ts_smog_index", smog_index, np.float32),
    ("ts_automated_readability_index", automated_readability_index, np.float32),
    ("ts_coleman_liau_index", coleman_liau_index, np.float32),
    ("ts_linsear_write_formula", linsear_write_formula, np.float32),
    ("ts_dale_chall_readability_score", dale_chall_readability_score, np.float32),
    ("ts_difficult_words", difficult_words, np.float32),
    ("ts_spache_readability", spache_readability, np.float32),
    ("ts_mcalpine_eflaw", mcalpine_eflaw, np.float32),
]
for col, fn, dtype in stage1:
    print(col)
    df[col] = df.progress_apply(fn, axis=1)
    df[col] = df[col].astype(dtype)
for col, fn, dtype in stage2:
    print(col)
    df[col] = df.progress_apply(fn, axis=1)
    df[col] = df[col].astype(dtype)

ts_syllable_count


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [01:34<00:00, 2357.18it/s]


ts_lexicon_count


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [00:16<00:00, 13441.01it/s]


ts_sentence_count


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [00:32<00:00, 6821.46it/s]


ts_syllables_per_word


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [00:00<00:00, 378380.86it/s]


ts_syllables_per_sent


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [00:00<00:00, 383058.41it/s]


ts_words_per_sent


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [00:00<00:00, 374574.19it/s]


ts_polysyllable_frac


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [02:04<00:00, 1783.29it/s]


ts_monosyllable_frac


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [02:03<00:00, 1801.31it/s]


ts_flesch_reading_ease


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [02:16<00:00, 1626.65it/s]


ts_flesch_kincaid_grade


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [02:17<00:00, 1614.18it/s]


ts_gunning_fog


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [02:34<00:00, 1430.93it/s]


ts_smog_index


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [02:40<00:00, 1379.61it/s]


ts_automated_readability_index


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [01:07<00:00, 3296.40it/s]


ts_coleman_liau_index


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [01:19<00:00, 2779.35it/s]


ts_linsear_write_formula


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [00:23<00:00, 9561.89it/s]


ts_dale_chall_readability_score


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [02:34<00:00, 1431.81it/s]


ts_difficult_words


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [01:47<00:00, 2059.54it/s]


ts_spache_readability


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [02:35<00:00, 1424.52it/s]


ts_mcalpine_eflaw


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [01:01<00:00, 3579.50it/s]


# VAD Features

In [6]:
vdf = pd.read_csv("input/NRC-VAD-Lexicon/NRC-VAD-Lexicon.txt", header=0, names=["term", "valence", "arousal", "dominance"], 
                 sep="\t", engine="c", low_memory=False)
cols = ["valence", "arousal", "dominance"]
vdf[cols] = vdf[cols].astype(np.float32)
vdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19970 entries, 0 to 19969
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   term       19969 non-null  object 
 1   valence    19970 non-null  float32
 2   arousal    19970 non-null  float32
 3   dominance  19970 non-null  float32
dtypes: float32(3), object(1)
memory usage: 390.2+ KB


In [7]:
vad: Dict[str, Tuple] = {}
for t in tqdm(vdf.itertuples()):
    v = getattr(t, "valence")
    a = getattr(t, "arousal")
    d = getattr(t, "dominance")
    t = str(getattr(t, "term")).strip().lower()
    vad[t] = (v, a, d)

19970it [00:00, 1370333.27it/s]


In [8]:
cols = ["va_valence_mean", "va_valence_std", "va_arousal_mean", "va_arousal_std", "va_dominance_mean", "va_dominance_std"]
rows = []
for text in tqdm(df["text_bow"]):
    vas, ars, dos = [], [], []
    tokens = text.split()
    for t in tokens:
        if t in vad:
            v, a, d = vad[t]
            vas.append(v)
            ars.append(a)
            dos.append(d)
    if len(vas) == 0:
        vas = [-1]
    if len(ars) == 0:
        ars = [-1]
    if len(dos)==0:
        dos = [-1]
    rows.append([np.mean(vas), np.std(vas), np.mean(ars), np.std(ars), np.mean(dos), np.std(dos)])
df[cols] = rows
df[cols] = df[cols].astype(np.float32)
del rows, vdf, vad
gc.collect()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 221612/221612 [00:45<00:00, 4878.50it/s]


32

# TF-IDF Features

In [9]:
%%time
corpus = df["text_bow"].tolist()
tokenizer = fit_tokenizer(corpus=corpus)
dp = "output/tokenizer"
Path(dp).mkdir(parents=True, exist_ok=True)
tokenizer.save_pretrained(dp)
print(f"{repr(tokenizer)}\nmodel_input_names={tokenizer.model_input_names}")




PreTrainedTokenizerFast(name_or_path='', vocab_size=30522, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
model_input_names=['input_ids', 'token_type_ids', 'attention_mask']
CPU times: user 13min 22s, sys: 4.42 s, total: 

In [10]:
%%time
tokenized = [tokenizer.tokenize(x) for x in corpus]
print(tokenized[0])

['Ġcars', 'Ġcars', 'Ġhave', 'Ġbeen', 'Ġaround', 'Ġsince', 'Ġthey', 'Ġbecame', 'Ġfamous', 'Ġin', 'Ġthe', 'Ġ1900', 's', 'Ġwhen', 'Ġhenry', 'Ġford', 'Ġcreated', 'Ġand', 'Ġbuilt', 'Ġthe', 'Ġfirst', 'Ġmodel', 't', 'Ġcars', 'Ġhave', 'Ġplayed', 'Ġa', 'Ġmajor', 'Ġrole', 'Ġin', 'Ġour', 'Ġevery', 'Ġday', 'Ġlives', 'Ġsince', 'Ġthen', 'Ġbut', 'Ġnow', 'Ġpeople', 'Ġare', 'Ġstarting', 'Ġto', 'Ġquestion', 'Ġif', 'Ġlimiting', 'Ġcar', 'Ġusage', 'Ġwould', 'Ġbe', 'Ġa', 'Ġgood', 'Ġthing', 'Ġto', 'Ġme', 'Ġlimiting', 'Ġthe', 'Ġuse', 'Ġof', 'Ġcars', 'Ġmight', 'Ġbe', 'Ġa', 'Ġgood', 'Ġthing', 'Ġto', 'Ġdo', 'Ġin', 'Ġlike', 'Ġmatter', 'Ġof', 'Ġthis', 'Ġarticle', 'Ġin', 'Ġgerman', 'Ġsuburb', 'Ġlife', 'Ġgoes', 'Ġon', 'Ġwithout', 'Ġcars', 'Ġby', 'Ġelizabeth', 'Ġrosenthal', 'Ġstates', 'Ġhow', 'Ġautomobiles', 'Ġare', 'Ġthe', 'Ġlin', 'ch', 'pin', 'Ġof', 'Ġsuburbs', 'Ġwhere', 'Ġmiddle', 'Ġclass', 'Ġfamilies', 'Ġfrom', 'Ġeither', 'Ġshanghai', 'Ġor', 'Ġchicago', 'Ġtend', 'Ġto', 'Ġmake', 'Ġtheir', 'Ġhomes', 'Ġexperts', 'Ġs

In [11]:
%%time
vectorizer = TfidfVectorizer(
    ngram_range=(3, 4), 
    max_features=50_000,
    lowercase=False, 
    sublinear_tf=True, 
    analyzer='word',
    tokenizer=lambda x: x,
    preprocessor=lambda x: x,
    token_pattern=None, 
    strip_accents=None,
    dtype=np.float32,  # saves memory! scipy.sparse does not support fp16
)
x = vectorizer.fit_transform(tokenized)
print(f"x={x.shape}")

x=(221612, 50000)
CPU times: user 17min 22s, sys: 23.8 s, total: 17min 46s
Wall time: 17min 51s


In [12]:
#del vectorizer,x
#gc.collect()

In [13]:
%%time
cols = ["_".join(["tf"] + f.split()) for f in vectorizer.get_feature_names_out()]
df[cols] = scipy.sparse.csr_matrix(x).todense().astype(np.float32)
df = df.copy()  # defragment

CPU times: user 3min 35s, sys: 51.4 s, total: 4min 26s
Wall time: 4min 54s


In [14]:
vocab = {k: int(v) for k,v in vectorizer.vocabulary_.items()}
print(f"{len(vocab):,} vocab")
idf = vectorizer.idf_
print(f"idf={idf.shape}")
with open("output/tfidf.json", "w") as f:
    json.dump({"vocabulary": vocab, "idf": idf.tolist()}, f)

50,000 vocab
idf=(50000,)


In [15]:
del vectorizer, x, corpus, vocab, idf, tokenizer, tokenized
gc.collect()

32

# Review Data

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221612 entries, 0 to 221611
Columns: 50047 entries, row_id to tf_Ġyoutube_Ġuploaded_Ġby
dtypes: float32(50033), int16(2), int32(5), int8(1), object(6)
memory usage: 41.3+ GB


In [17]:
features = []
prefixes = ["ch_", "ws_", "ts_", "va_", "tf_Ġhe"]
for col in df.columns:
    for prefix in prefixes:
        if col.startswith(prefix):
            features.append(col)
features.sort()
df[features].describe(percentiles=percentiles)

Unnamed: 0,ch_digit_frac,ch_len,ch_letter_frac,ch_punc_frac,ch_repeat_char_frac,ch_space_frac,ch_upper_frac,tf_Ġhe_Ġand_Ġhis,tf_Ġhe_Ġargued_Ġthat,tf_Ġhe_Ġargues_Ġthat,tf_Ġhe_Ġargues_Ġthat_Ġthe,tf_Ġhe_Ġasserts_Ġthat,tf_Ġhe_Ġbecame_Ġa,tf_Ġhe_Ġbelieved_Ġthat,tf_Ġhe_Ġbelieves_Ġthat,tf_Ġhe_Ġcame_Ġto,tf_Ġhe_Ġcan_Ġbe,tf_Ġhe_Ġclaimed_Ġthat,tf_Ġhe_Ġclaims_Ġthat,tf_Ġhe_Ġcould_Ġbe,tf_Ġhe_Ġcould_Ġhave,tf_Ġhe_Ġcould_Ġnot,tf_Ġhe_Ġdecided_Ġto,tf_Ġhe_Ġdecides_Ġto,tf_Ġhe_Ġdid_Ġnot,tf_Ġhe_Ġdid_Ġnot_Ġhave,tf_Ġhe_Ġdid_Ġnot_Ġwant,tf_Ġhe_Ġdoes_Ġnot,tf_Ġhe_Ġdoes_Ġnot_Ġhave,tf_Ġhe_Ġdoes_Ġnot_Ġwant,tf_Ġhe_Ġexplains_Ġthat,tf_Ġhe_Ġfailed_Ġto,tf_Ġhe_Ġfails_Ġto,tf_Ġhe_Ġhad_Ġa,tf_Ġhe_Ġhad_Ġbeen,tf_Ġhe_Ġhad_Ġno,tf_Ġhe_Ġhad_Ġnot,tf_Ġhe_Ġhad_Ġthe,tf_Ġhe_Ġhad_Ġto,tf_Ġhe_Ġhas_Ġa,tf_Ġhe_Ġhas_Ġalso,tf_Ġhe_Ġhas_Ġbeen,tf_Ġhe_Ġhas_Ġno,tf_Ġhe_Ġhas_Ġnot,tf_Ġhe_Ġhas_Ġthe,tf_Ġhe_Ġhas_Ġto,tf_Ġhe_Ġis_Ġa,tf_Ġhe_Ġis_Ġable,tf_Ġhe_Ġis_Ġable_Ġto,tf_Ġhe_Ġis_Ġalso,tf_Ġhe_Ġis_Ġan,tf_Ġhe_Ġis_Ġdoing,tf_Ġhe_Ġis_Ġgoing,tf_Ġhe_Ġis_Ġgoing_Ġto,tf_Ġhe_Ġis_Ġin,tf_Ġhe_Ġis_Ġnot,tf_Ġhe_Ġis_Ġstill,tf_Ġhe_Ġis_Ġthe,tf_Ġhe_Ġknew_Ġthat,tf_Ġhe_Ġmanaged_Ġto,tf_Ġhe_Ġneeds_Ġto,tf_Ġhe_Ġnotes_Ġthat,tf_Ġhe_Ġor_Ġshe,tf_Ġhe_Ġor_Ġshe_Ġcan,tf_Ġhe_Ġor_Ġshe_Ġdoes,tf_Ġhe_Ġor_Ġshe_Ġhas,tf_Ġhe_Ġor_Ġshe_Ġis,tf_Ġhe_Ġor_Ġshe_Ġmay,tf_Ġhe_Ġor_Ġshe_Ġmust,tf_Ġhe_Ġor_Ġshe_Ġshould,tf_Ġhe_Ġor_Ġshe_Ġwill,tf_Ġhe_Ġor_Ġshe_Ġwould,tf_Ġhe_Ġpoints_Ġout,tf_Ġhe_Ġpoints_Ġout_Ġthat,tf_Ġhe_Ġrealized_Ġthat,tf_Ġhe_Ġrealizes_Ġthat,tf_Ġhe_Ġrefers_Ġto,tf_Ġhe_Ġsaid_Ġthat,tf_Ġhe_Ġsays_Ġthat,tf_Ġhe_Ġseems_Ġto,tf_Ġhe_Ġshe_Ġcan,tf_Ġhe_Ġshe_Ġhas,tf_Ġhe_Ġshe_Ġis,tf_Ġhe_Ġshe_Ġshould,tf_Ġhe_Ġshe_Ġwill,tf_Ġhe_Ġshould_Ġbe,tf_Ġhe_Ġshould_Ġhave,tf_Ġhe_Ġstated_Ġthat,tf_Ġhe_Ġstates_Ġthat,tf_Ġhe_Ġsuggests_Ġthat,tf_Ġhe_Ġtold_Ġme,tf_Ġhe_Ġtried_Ġto,tf_Ġhe_Ġtries_Ġto,tf_Ġhe_Ġused_Ġto,tf_Ġhe_Ġuses_Ġthe,tf_Ġhe_Ġwanted_Ġto,tf_Ġhe_Ġwants_Ġto,tf_Ġhe_Ġwas_Ġa,tf_Ġhe_Ġwas_Ġable,tf_Ġhe_Ġwas_Ġable_Ġto,tf_Ġhe_Ġwas_Ġalso,tf_Ġhe_Ġwas_Ġan,tf_Ġhe_Ġwas_Ġborn,tf_Ġhe_Ġwas_Ġborn_Ġin,tf_Ġhe_Ġwas_Ġgoing,tf_Ġhe_Ġwas_Ġin,tf_Ġhe_Ġwas_Ġnot,tf_Ġhe_Ġwas_Ġthe,tf_Ġhe_Ġwas_Ġvery,tf_Ġhe_Ġwent_Ġto,tf_Ġhe_Ġwill_Ġbe,tf_Ġhe_Ġwill_Ġnot,tf_Ġhe_Ġwould_Ġbe,tf_Ġhe_Ġwould_Ġhave,tf_Ġhe_Ġwould_Ġnot,tf_Ġhead_Ġof_Ġstate,tf_Ġhead_Ġof_Ġthe,tf_Ġhealth_Ġand_Ġhuman,tf_Ġhealth_Ġand_Ġhuman_Ġservices,tf_Ġhealth_Ġand_Ġsafety,tf_Ġhealth_Ġand_Ġsocial,tf_Ġhealth_Ġand_Ġthe,tf_Ġhealth_Ġand_Ġwelfare,tf_Ġhealth_Ġand_Ġwell,tf_Ġhealth_Ġand_Ġwell_Ġbeing,tf_Ġhealth_Ġand_Ġwellbeing,tf_Ġhealth_Ġand_Ġwellness,tf_Ġhealth_Ġas_Ġwell,tf_Ġhealth_Ġcare_Ġand,tf_Ġhealth_Ġcare_Ġcosts,tf_Ġhealth_Ġcare_Ġdelivery,tf_Ġhealth_Ġcare_Ġfacilities,tf_Ġhealth_Ġcare_Ġfor,tf_Ġhealth_Ġcare_Ġin,tf_Ġhealth_Ġcare_Ġindustry,tf_Ġhealth_Ġcare_Ġis,tf_Ġhealth_Ġcare_Ġorganization,tf_Ġhealth_Ġcare_Ġorganizations,tf_Ġhealth_Ġcare_Ġprofessionals,tf_Ġhealth_Ġcare_Ġproviders,tf_Ġhealth_Ġcare_Ġsector,tf_Ġhealth_Ġcare_Ġservices,tf_Ġhealth_Ġcare_Ġsystem,tf_Ġhealth_Ġcare_Ġsystems,tf_Ġhealth_Ġcare_Ġthe,tf_Ġhealth_Ġcare_Ġto,tf_Ġhealth_Ġcare_Ġworkers,tf_Ġhealth_Ġeducation_Ġand,tf_Ġhealth_Ġeffects_Ġof,tf_Ġhealth_Ġin_Ġthe,tf_Ġhealth_Ġissues_Ġand,tf_Ġhealth_Ġit_Ġis,tf_Ġhealth_Ġneeds_Ġof,tf_Ġhealth_Ġof_Ġa,tf_Ġhealth_Ġof_Ġthe,tf_Ġhealth_Ġorganization_Ġwho,tf_Ġhealth_Ġproblems_Ġand,tf_Ġhealth_Ġservices_Ġresearch,tf_Ġhealth_Ġstatus_Ġof,tf_Ġhealthy_Ġpeople_Ġ2020,tf_Ġheart_Ġdisease_Ġand,tf_Ġheart_Ġof_Ġdarkness,tf_Ġheart_Ġof_Ġthe,tf_Ġheavily_Ġinfluenced_Ġby,tf_Ġheavily_Ġon_Ġthe,tf_Ġheight_Ġof_Ġthe,tf_Ġheld_Ġaccountable_Ġfor,tf_Ġheld_Ġby_Ġthe,tf_Ġheld_Ġin_Ġthe,tf_Ġheld_Ġresponsible_Ġfor,tf_Ġheld_Ġthat_Ġthe,tf_Ġhelp_Ġa_Ġlot,tf_Ġhelp_Ġa_Ġperson,tf_Ġhelp_Ġeach_Ġother,tf_Ġhelp_Ġfrom_Ġthe,tf_Ġhelp_Ġimprove_Ġthe,tf_Ġhelp_Ġin_Ġimproving,tf_Ġhelp_Ġin_Ġreducing,tf_Ġhelp_Ġin_Ġthe,tf_Ġhelp_Ġit_Ġto,tf_Ġhelp_Ġme_Ġin,tf_Ġhelp_Ġme_Ġto,tf_Ġhelp_Ġme_Ġwith,tf_Ġhelp_Ġof_Ġa,tf_Ġhelp_Ġof_Ġthe,tf_Ġhelp_Ġof_Ġthis,tf_Ġhelp_Ġof_Ġwhich,tf_Ġhelp_Ġother_Ġpeople,tf_Ġhelp_Ġpeople_Ġin,tf_Ġhelp_Ġpeople_Ġto,tf_Ġhelp_Ġreduce_Ġthe,tf_Ġhelp_Ġsomeone_Ġmake,tf_Ġhelp_Ġsomeone_Ġmake_Ġa,tf_Ġhelp_Ġstudents_Ġdevelop,tf_Ġhelp_Ġstudents_Ġlearn,tf_Ġhelp_Ġstudents_Ġto,tf_Ġhelp_Ġthe_Ġcommunity,tf_Ġhelp_Ġthe_Ġcompany,tf_Ġhelp_Ġthe_Ġcompany_Ġto,tf_Ġhelp_Ġthe_Ġenvironment,tf_Ġhelp_Ġthe_Ġfirm,tf_Ġhelp_Ġthe_Ġorganization,tf_Ġhelp_Ġthe_Ġpatient,tf_Ġhelp_Ġthe_Ġpeople,tf_Ġhelp_Ġthe_Ġstudent,tf_Ġhelp_Ġthe_Ġstudents,tf_Ġhelp_Ġthem_Ġdevelop,tf_Ġhelp_Ġthem_Ġget,tf_Ġhelp_Ġthem_Ġin,tf_Ġhelp_Ġthem_Ġlearn,tf_Ġhelp_Ġthem_Ġmake,tf_Ġhelp_Ġthem_Ġto,tf_Ġhelp_Ġthem_Ġunderstand,tf_Ġhelp_Ġthem_Ġwith,tf_Ġhelp_Ġto_Ġbuild,tf_Ġhelp_Ġto_Ġcreate,tf_Ġhelp_Ġto_Ġcreate_Ġa,tf_Ġhelp_Ġto_Ġidentify,tf_Ġhelp_Ġto_Ġimprove,tf_Ġhelp_Ġto_Ġimprove_Ġthe,tf_Ġhelp_Ġto_Ġincrease,tf_Ġhelp_Ġto_Ġmake,tf_Ġhelp_Ġto_Ġreduce,tf_Ġhelp_Ġto_Ġreduce_Ġthe,tf_Ġhelp_Ġto_Ġthe,tf_Ġhelp_Ġto_Ġunderstand,tf_Ġhelp_Ġus_Ġin,tf_Ġhelp_Ġus_Ġto,tf_Ġhelp_Ġwith_Ġthe,tf_Ġhelp_Ġyou_Ġget,tf_Ġhelp_Ġyou_Ġin,tf_Ġhelp_Ġyou_Ġmake,tf_Ġhelp_Ġyou_Ġmake_Ġa,tf_Ġhelp_Ġyou_Ġto,tf_Ġhelp_Ġyou_Ġwith,tf_Ġhelped_Ġin_Ġthe,tf_Ġhelped_Ġme_Ġto,tf_Ġhelped_Ġthe_Ġcompany,tf_Ġhelped_Ġthem_Ġto,tf_Ġhelpful_Ġfor_Ġthe,tf_Ġhelpful_Ġin_Ġthe,tf_Ġhelpful_Ġto_Ġthe,tf_Ġhelping_Ġthe_Ġcommunity,tf_Ġhelping_Ġtheir_Ġcommunities,tf_Ġhelping_Ġthem_Ġto,tf_Ġhelps_Ġin_Ġthe,tf_Ġhelps_Ġme_Ġto,tf_Ġhelps_Ġpeople_Ġto,tf_Ġhelps_Ġstudents_Ġto,tf_Ġhelps_Ġthe_Ġcompany,tf_Ġhelps_Ġthem_Ġto,tf_Ġhelps_Ġto_Ġcreate,tf_Ġhelps_Ġto_Ġidentify,tf_Ġhelps_Ġto_Ġimprove,tf_Ġhelps_Ġto_Ġreduce,tf_Ġhelps_Ġto_Ġunderstand,tf_Ġhelps_Ġus_Ġto,tf_Ġhence_Ġit_Ġis,tf_Ġhence_Ġleading_Ġto,tf_Ġhence_Ġmaking_Ġit,tf_Ġhence_Ġthe_Ġneed,tf_Ġhence_Ġthe_Ġneed_Ġfor,tf_Ġhence_Ġthe_Ġneed_Ġto,tf_Ġhence_Ġthere_Ġis,tf_Ġhence_Ġthey_Ġare,tf_Ġher_Ġability_Ġto,tf_Ġher_Ġand_Ġher,tf_Ġher_Ġas_Ġa,tf_Ġher_Ġfamily_Ġand,tf_Ġher_Ġfather_Ġs,tf_Ġher_Ġhusband_Ġand,tf_Ġher_Ġhusband_Ġs,tf_Ġher_Ġin_Ġthe,tf_Ġher_Ġlife_Ġand,tf_Ġher_Ġmother_Ġand,tf_Ġher_Ġmother_Ġs,tf_Ġher_Ġto_Ġbe,tf_Ġher_Ġto_Ġthe,tf_Ġhere_Ġand_Ġthere,tf_Ġhere_Ġare_Ġsome,tf_Ġhere_Ġi_Ġam,tf_Ġhere_Ġin_Ġthe,tf_Ġhere_Ġis_Ġa,tf_Ġhere_Ġis_Ġmy,tf_Ġhere_Ġis_Ġmy_Ġattempt,tf_Ġhere_Ġis_Ġmy_Ġessay,tf_Ġhere_Ġis_Ġthat,tf_Ġhere_Ġis_Ġthe,tf_Ġhere_Ġis_Ġto,tf_Ġhere_Ġit_Ġis,tf_Ġhey_Ġthere_Ġso,ts_automated_readability_index,ts_coleman_liau_index,ts_dale_chall_readability_score,ts_difficult_words,ts_flesch_kincaid_grade,ts_flesch_reading_ease,ts_gunning_fog,ts_lexicon_count,ts_linsear_write_formula,ts_mcalpine_eflaw,ts_monosyllable_frac,ts_polysyllable_frac,ts_sentence_count,ts_smog_index,ts_spache_readability,ts_syllable_count,ts_syllables_per_sent,ts_syllables_per_word,ts_words_per_sent,va_arousal_mean,va_arousal_std,va_dominance_mean,va_dominance_std,va_valence_mean,va_valence_std,ws_sent_len_delta_mean,ws_sent_len_delta_std,ws_sent_len_mean,ws_sent_len_std
count,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0,221612.0
mean,0.007001,5965.726445,0.802702,0.025395,0.016239,0.164901,0.021519,0.000135,0.000195,0.000355,0.0001,0.000122,0.00014,0.000316,0.000291,0.000121,0.000139,0.000111,0.000187,0.000127,0.000177,0.000439,0.000254,0.000149,0.001008,0.000123,0.000121,0.001003,0.00015,0.000115,0.000116,0.000109,0.000119,0.00036,0.000247,0.000142,0.000118,0.000164,0.000449,0.000327,0.000105,0.000318,0.000142,0.000142,0.000148,0.000345,0.000664,0.000154,0.000151,0.000248,0.000173,0.000122,0.000146,0.000123,0.000193,0.000588,0.00012,0.000433,0.000127,0.000249,0.000195,0.000145,0.001872,0.000223,0.000107,0.000239,0.000533,0.00013,0.000112,0.000166,0.000258,0.000134,0.000143,0.00011,0.000134,0.000115,0.000123,0.000287,0.000357,0.000139,0.000175,0.000196,0.00039,0.000153,0.000179,0.00015,0.000139,0.000144,0.000268,0.000101,0.000259,0.000195,0.000258,0.000215,0.000105,0.000413,0.000325,0.000693,0.00035,0.000348,0.000231,0.00016,0.000214,0.000111,0.000128,0.000235,0.000515,0.000405,0.000123,0.000172,0.000275,0.000161,0.000264,0.000264,0.000245,9.7e-05,0.000394,0.00025,0.00021,0.000399,0.000174,0.000224,8.9e-05,0.000278,0.000277,0.00011,9.4e-05,0.000111,0.00031,9e-05,0.000132,0.000138,0.000122,0.000187,9.7e-05,0.000144,6.9e-05,0.000133,0.0002,0.000276,0.00011,0.00035,0.000377,0.000112,0.00013,0.000112,0.000119,9.8e-05,9.3e-05,0.000159,0.000116,0.000124,0.000107,0.000114,0.000468,0.000133,0.000142,0.00011,0.00014,6.9e-05,0.00014,7.8e-05,0.000208,0.000124,0.000157,0.000115,0.000132,0.00023,0.000154,0.000132,9.6e-05,0.000162,0.000134,0.000259,0.000167,0.000174,8.9e-05,0.000122,0.000492,7.7e-05,0.000186,0.000459,0.000203,0.000297,0.000596,0.000108,0.000133,0.000166,0.000191,0.000211,0.000209,0.000189,0.000164,0.000232,0.000135,0.000264,0.00021,0.000285,0.000136,0.000138,7.4e-05,0.00011,0.000103,0.000135,0.000204,0.000256,0.000171,0.000146,0.000299,0.000152,0.00012,0.000576,0.000141,0.00014,0.000123,0.00018,0.000117,0.000106,0.00024,0.000118,0.000142,0.000138,0.000315,0.000149,0.000131,0.000134,0.000135,0.000324,0.000206,0.000179,0.000248,0.000304,0.000142,0.000308,0.000209,0.000131,0.000445,0.000123,9.5e-05,0.000113,0.000118,0.000115,0.000165,0.000121,0.000142,0.000266,0.000142,0.000154,0.000136,0.000118,0.000284,0.000113,9.7e-05,0.000102,0.000131,0.000153,0.000239,0.000428,0.000115,0.000106,0.000237,0.000119,0.000141,0.000154,0.00012,0.000167,0.000118,0.000188,0.000147,0.000149,0.000176,0.00024,0.000165,0.000179,0.000113,0.000171,0.000164,0.000135,0.000144,0.000341,0.000179,0.000154,0.000171,0.000481,0.000136,0.0002,0.000246,0.000381,9.5e-05,0.000234,0.000242,12.585382,11.386264,8.19298,159.381882,10.705375,53.626904,11.441067,950.561526,12.163027,27.494675,0.645215,0.152362,50.153976,12.730854,5.110836,1543.855085,29.810389,1.570054,18.975353,0.448143,0.153978,0.580518,0.164823,0.631729,0.182594,9.789777,8.315752,19.064102,10.486661
std,0.00985,6241.571249,0.017911,0.009563,0.005521,0.013395,0.013531,0.003541,0.004181,0.005444,0.002845,0.003249,0.003756,0.005269,0.005031,0.003542,0.003524,0.003282,0.00409,0.00375,0.004205,0.006165,0.00478,0.003742,0.008858,0.003433,0.003382,0.008704,0.003797,0.003202,0.003198,0.003294,0.003343,0.005515,0.004589,0.0036,0.003441,0.003892,0.006344,0.00532,0.003191,0.005129,0.003601,0.003764,0.003585,0.005454,0.007489,0.003641,0.003606,0.004492,0.003985,0.003647,0.003824,0.003509,0.004072,0.006815,0.003419,0.005991,0.003576,0.004718,0.00444,0.003562,0.010863,0.00415,0.002954,0.004105,0.005892,0.003255,0.002948,0.003647,0.004432,0.003427,0.003495,0.003069,0.003622,0.003318,0.003213,0.005413,0.005632,0.003526,0.003692,0.003811,0.005426,0.00369,0.003844,0.003696,0.003957,0.003588,0.004688,0.003057,0.005605,0.004247,0.004689,0.004476,0.0031,0.006126,0.005516,0.007543,0.005486,0.005468,0.004519,0.003853,0.004358,0.003238,0.003751,0.004488,0.006428,0.005874,0.003614,0.00427,0.004951,0.004001,0.004902,0.004987,0.004713,0.003196,0.005366,0.005405,0.004826,0.006368,0.004225,0.004311,0.003136,0.00507,0.005058,0.003379,0.003239,0.003142,0.005234,0.003087,0.003896,0.003691,0.003356,0.004112,0.003332,0.003708,0.00294,0.003874,0.004449,0.00531,0.00344,0.005781,0.006326,0.003431,0.003231,0.003202,0.003671,0.003188,0.003106,0.003704,0.003351,0.003341,0.003266,0.003051,0.005666,0.003338,0.003627,0.00314,0.003448,0.003495,0.003816,0.004268,0.004094,0.003422,0.003502,0.003357,0.003546,0.004048,0.0035,0.003399,0.003054,0.004393,0.003722,0.005407,0.003983,0.00394,0.00262,0.002967,0.005646,0.002341,0.004618,0.007288,0.005208,0.004703,0.006176,0.00298,0.00324,0.00455,0.004883,0.00474,0.004507,0.005057,0.004703,0.00536,0.003936,0.00541,0.005302,0.004344,0.003084,0.004245,0.002427,0.00276,0.003095,0.00408,0.004851,0.005099,0.004265,0.003938,0.005042,0.004182,0.003496,0.006784,0.003786,0.003989,0.003735,0.00421,0.003647,0.00296,0.004581,0.003183,0.003992,0.003611,0.005572,0.003757,0.003324,0.003268,0.004092,0.006412,0.004849,0.004747,0.00566,0.00616,0.00433,0.00673,0.005317,0.003142,0.00693,0.00298,0.002766,0.00324,0.003142,0.003316,0.004944,0.004961,0.003461,0.004301,0.00434,0.003949,0.004147,0.002917,0.004928,0.00334,0.002855,0.003164,0.003397,0.003508,0.005531,0.005296,0.002969,0.002828,0.004183,0.003094,0.003264,0.003375,0.003026,0.003732,0.003422,0.004177,0.003732,0.004073,0.004031,0.004982,0.003803,0.00405,0.003474,0.004408,0.00385,0.003643,0.00408,0.006718,0.004609,0.004251,0.004576,0.007273,0.003597,0.004751,0.004195,0.006068,0.002922,0.004784,0.005035,5.368936,2.752176,1.085742,140.554245,4.290843,16.841059,4.195405,962.11098,5.047263,14.684708,0.075219,0.057724,50.310293,2.257807,1.467629,1628.733001,9.569697,0.16387,6.549414,0.029354,0.019396,0.038843,0.014807,0.043931,0.030532,7.678156,5.760149,10.566523,6.125455
min,0.0,767.0,0.387132,0.0,0.0,0.113198,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.3,0.56,0.74,2.0,0.6,-1254.800049,2.7,106.0,0.818182,6.6,0.367925,0.0,1.0,0.0,2.07,177.0,6.481818,1.044855,5.165803,0.273268,0.070332,0.364898,0.074764,0.383705,0.086386,0.0,0.0,3.364583,0.0
1%,0.0,876.11,0.747467,0.010554,0.007808,0.140908,0.006141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.7,4.7,5.81,14.0,4.2,22.549999,6.08,151.0,4.777778,15.6,0.499614,0.03213,7.0,7.2,3.24,220.0,14.674299,1.209363,10.764706,0.374117,0.113492,0.490502,0.128913,0.511928,0.123592,4.151587,2.871617,10.2,3.971511
5%,0.0,1113.0,0.773763,0.014257,0.010052,0.146473,0.008511,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.1,6.55,6.41,24.0,6.1,31.309999,7.75,194.0,6.0,18.6,0.537269,0.05483,10.0,8.8,3.85,280.0,18.76,1.286335,13.0,0.400779,0.125767,0.515888,0.141038,0.55309,0.138389,5.266667,3.901057,12.233572,5.184129
10%,0.0,1396.0,0.782778,0.016114,0.011198,0.149538,0.010172,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.4,7.54,6.86,32.0,7.2,35.57,8.61,244.0,6.857143,20.200001,0.556373,0.072089,13.0,9.7,4.16,351.0,21.078207,1.340996,14.176471,0.412587,0.131816,0.529456,0.146807,0.574162,0.146159,5.89567,4.499135,13.345454,5.992673
20%,0.0,1904.0,0.791171,0.018487,0.012605,0.153454,0.013261,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,8.94,7.41,47.0,8.5,41.5,9.59,321.0,7.875,22.200001,0.580328,0.097923,17.0,10.8,4.48,482.0,23.885714,1.419355,15.583333,0.425564,0.138607,0.546735,0.15326,0.597298,0.15601,6.732394,5.292834,14.769231,7.124855
30%,0.001003,2371.0,0.796525,0.020397,0.013655,0.156561,0.015954,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,10.03,7.73,65.0,9.4,45.150002,10.23,400.0,8.833333,23.6,0.599364,0.119792,21.0,11.7,4.7,604.0,25.857143,1.482116,16.594595,0.434135,0.143467,0.559747,0.157616,0.612412,0.163921,7.416667,5.945591,15.904762,8.041165
40%,0.002517,3173.0,0.800752,0.022169,0.014609,0.159581,0.01827,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.9,10.91,7.99,89.0,10.1,49.080002,10.77,548.0,10.666667,24.9,0.616948,0.13881,27.0,12.4,4.88,805.0,27.571428,1.535714,17.481823,0.441382,0.147763,0.570885,0.161264,0.624457,0.171503,8.061919,6.566246,16.944445,8.872805


In [18]:
%%time
df.to_parquet(f"output/features.parquet", index=False)
assert df.notna().all(axis=None)

CPU times: user 1min 35s, sys: 5.85 s, total: 1min 41s
Wall time: 1min 41s


In [19]:
tim.stop()
print(f"Total time taken {str(tim.elapsed)}")

Total time taken 1:17:40.318822
