In [1]:
import os
import json
import gc
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer
from typing import Dict, List, Set, Tuple, NamedTuple, Callable
import scipy
import scml
from scml import pandasx as pdx
from daigt.preprocess import en as pen
# Time the whole notebook; the timer is stopped and reported in the last cell.
tim = scml.Timer()
tim.start()

# Environment switch read by the tokenizers library.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Percentiles passed to df.describe() in the review section.
percentiles = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]

# pandas options, set in one call as (name, value) pairs.
# NOTE(review): pandas deprecated "use_inf_as_na" in 2.1 -- confirm the pinned
# pandas version still accepts it. The final not-null assertion relies on this
# option to reject +/-inf values as well.
pd.set_option(
    "use_inf_as_na", True,
    "max_info_columns", 9999,
    "display.max_columns", 9999,
    "display.max_rows", 9999,
    "max_colwidth", 9999,
)

# Registers DataFrame.progress_apply, used for the feature loop below.
tqdm.pandas()

# Presumably seeds the random number generators with scml's default seed -- see scml.
scml.seed_everything()

# Print the int16 value range for reference.
info = np.iinfo(np.int16)
print(f"int16, min={info.min}, max={info.max}")

int16, min=-32768, max=32767


In [2]:
# Name of the DataFrame column whose text the character-level features are computed from.
text_col = "text_bsc"

In [3]:
# Load the preprocessed essays; the feature columns computed below are added to this frame.
df = pd.read_parquet("input/preprocess.parquet")
# Print the column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39515 entries, 0 to 39514
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   essay_id        39515 non-null  int64 
 1   generated       39515 non-null  int8  
 2   source          39515 non-null  object
 3   prompt          39515 non-null  object
 4   text            39515 non-null  object
 5   text_bsc        39515 non-null  object
 6   text_bow        39515 non-null  object
 7   text_bow_len    39515 non-null  int32 
 8   prompt_bsc      39515 non-null  object
 9   prompt_bow      39515 non-null  object
 10  prompt_bow_len  39515 non-null  int32 
dtypes: int32(2), int64(1), int8(1), object(7)
memory usage: 2.8+ MB


# Character-level features

In [4]:
%%time
# Character count of each text, stored as a 32-bit integer.
col = "ch_len"
df[col] = df[text_col].str.len().astype(np.int32)

def digit_frac(row) -> float:
    """Apply ``pen.digit_frac`` to the ``text_col`` entry of ``row``.

    Row-wise adapter for ``DataFrame.apply(..., axis=1)``: ``row`` is indexed
    with the module-level ``text_col`` and the value is passed to the helper
    unchanged. Presumably the result is the share of digit characters in the
    text -- confirm against ``daigt.preprocess.en``.
    """
    text = row[text_col]
    return pen.digit_frac(text)


def letter_frac(row) -> float:
    """Apply ``pen.letter_frac`` to the ``text_col`` entry of ``row``.

    Row-wise adapter for ``DataFrame.apply(..., axis=1)``: ``row`` is indexed
    with the module-level ``text_col`` and the value is passed to the helper
    unchanged. Presumably the result is the share of letter characters in the
    text -- confirm against ``daigt.preprocess.en``.
    """
    text = row[text_col]
    return pen.letter_frac(text)


def space_frac(row) -> float:
    """Apply ``pen.space_frac`` to the ``text_col`` entry of ``row``.

    Row-wise adapter for ``DataFrame.apply(..., axis=1)``: ``row`` is indexed
    with the module-level ``text_col`` and the value is passed to the helper
    unchanged. Presumably the result is the share of whitespace characters in
    the text -- confirm against ``daigt.preprocess.en``.
    """
    text = row[text_col]
    return pen.space_frac(text)


def punc_frac(row) -> float:
    """Apply ``pen.punc_frac`` to the ``text_col`` entry of ``row``.

    Row-wise adapter for ``DataFrame.apply(..., axis=1)``: ``row`` is indexed
    with the module-level ``text_col`` and the value is passed to the helper
    unchanged. Presumably the result is the share of punctuation characters in
    the text -- confirm against ``daigt.preprocess.en``.
    """
    text = row[text_col]
    return pen.punc_frac(text)


def upper_frac(row) -> float:
    """Apply ``pen.upper_frac`` to the ``text_col`` entry of ``row``.

    Row-wise adapter for ``DataFrame.apply(..., axis=1)``: ``row`` is indexed
    with the module-level ``text_col`` and the value is passed to the helper
    unchanged. Presumably the result is the share of upper-case characters in
    the text -- confirm against ``daigt.preprocess.en``.
    """
    text = row[text_col]
    return pen.upper_frac(text)


def repeat_char_frac(row) -> float:
    """Apply ``pen.repeat_char_frac`` to the ``text_col`` entry of ``row``.

    Row-wise adapter for ``DataFrame.apply(..., axis=1)``: ``row`` is indexed
    with the module-level ``text_col`` and the value is passed to the helper
    unchanged. Presumably the result measures how much of the text consists of
    repeated characters -- confirm against ``daigt.preprocess.en``.
    """
    text = row[text_col]
    return pen.repeat_char_frac(text)


def repeat_substring_frac(row) -> float:
    """Apply ``pen.repeat_substring_frac`` to the ``text_col`` entry of ``row``.

    Row-wise adapter for ``DataFrame.apply(..., axis=1)``: ``row`` is indexed
    with the module-level ``text_col`` and the value is passed to the helper
    unchanged. Presumably the result measures how much of the text consists of
    repeated substrings -- confirm against ``daigt.preprocess.en``.
    """
    text = row[text_col]
    return pen.repeat_substring_frac(text)


# Feature column name -> row-wise function that produces it. Insertion order
# is the order in which the features are computed.
char_fns: Dict[str, Callable] = dict(
    [
        ("ch_digit_frac", digit_frac),
        ("ch_letter_frac", letter_frac),
        ("ch_space_frac", space_frac),
        ("ch_punc_frac", punc_frac),
        ("ch_upper_frac", upper_frac),
        ("ch_repeat_char_frac", repeat_char_frac),
        ("ch_repeat_substring_frac", repeat_substring_frac),
    ]
)

# Compute every registered feature row by row (with a tqdm progress bar) and
# store it as float32. In the recorded run below, ch_repeat_substring_frac
# accounts for almost all of the ~35 minute wall time.
for col, fn in char_fns.items():
    print(col)
    df[col] = df.progress_apply(fn, axis=1).astype(np.float32)

ch_digit_frac


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 39515/39515 [00:01<00:00, 22320.63it/s]


ch_letter_frac


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 39515/39515 [00:01<00:00, 20269.40it/s]


ch_space_frac


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 39515/39515 [00:01<00:00, 21624.93it/s]


ch_punc_frac


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 39515/39515 [00:01<00:00, 21839.48it/s]


ch_upper_frac


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 39515/39515 [00:01<00:00, 22365.99it/s]


ch_repeat_char_frac


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 39515/39515 [00:05<00:00, 7412.32it/s]


ch_repeat_substring_frac


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 39515/39515 [35:14<00:00, 18.69it/s]

CPU times: user 35min 22s, sys: 4.17 s, total: 35min 26s
Wall time: 35min 28s





# Review Data

In [5]:
# Column dtypes and non-null counts after the character-level features were added.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39515 entries, 0 to 39514
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   essay_id                  39515 non-null  int64  
 1   generated                 39515 non-null  int8   
 2   source                    39515 non-null  object 
 3   prompt                    39515 non-null  object 
 4   text                      39515 non-null  object 
 5   text_bsc                  39515 non-null  object 
 6   text_bow                  39515 non-null  object 
 7   text_bow_len              39515 non-null  int32  
 8   prompt_bsc                39515 non-null  object 
 9   prompt_bow                39515 non-null  object 
 10  prompt_bow_len            39515 non-null  int32  
 11  ch_len                    39515 non-null  int32  
 12  ch_digit_frac             39515 non-null  float32
 13  ch_letter_frac            39515 non-null  float32
 14  ch_spa

In [6]:
# Summary statistics of the numeric columns at the percentiles configured in the first cell.
df.describe(percentiles=percentiles)

Unnamed: 0,essay_id,generated,text_bow_len,prompt_bow_len,ch_len,ch_digit_frac,ch_letter_frac,ch_space_frac,ch_punc_frac,ch_upper_frac,ch_repeat_char_frac,ch_repeat_substring_frac
count,39515.0,39515.0,39515.0,39515.0,39515.0,39515.0,39515.0,39515.0,39515.0,39515.0,39515.0,39515.0
mean,19757.0,0.252891,2195.112818,87.917854,2229.710085,0.001559,0.79973,0.17767,0.021042,0.015673,0.017053,0.002284
std,11407.14228,0.434675,1006.48068,161.0615,1022.272265,0.003094,0.015733,0.013207,0.00884,0.016745,0.005114,0.008225
min,0.0,0.0,234.0,2.0,238.0,0.0,0.501969,0.005906,0.0,0.0,0.00089,0.0
1%,395.14,0.0,773.0,2.0,784.14,0.0,0.759425,0.147226,0.007119,0.002213,0.007046,0.0
5%,1975.7,0.0,951.0,2.0,964.0,0.0,0.773998,0.154341,0.010773,0.006222,0.009468,0.0
10%,3951.4,0.0,1100.0,2.0,1116.0,0.0,0.780463,0.159455,0.012694,0.007421,0.010953,0.0
20%,7902.8,0.0,1359.0,2.0,1380.0,0.0,0.787473,0.166591,0.015101,0.008875,0.012829,0.0
30%,11854.2,0.0,1582.0,2.0,1606.0,0.0,0.792067,0.171273,0.016771,0.010243,0.014262,0.0
40%,15805.6,0.0,1792.0,2.0,1822.0,0.0,0.795972,0.175124,0.018211,0.011763,0.01548,0.001138


In [7]:
%%time
# Validate BEFORE persisting, so a frame with missing values never overwrites
# the feature file. With the "use_inf_as_na" option set in the first cell,
# notna() also rejects +/-inf values.
assert df.notna().all(axis=None), "features contain missing values; refusing to write"

# Make sure the target directory exists, so the long feature run cannot fail at
# the final write.
Path("output").mkdir(parents=True, exist_ok=True)
df.to_parquet("output/features.parquet", index=False)

CPU times: user 538 ms, sys: 47 ms, total: 585 ms
Wall time: 587 ms


In [8]:
# Stop the notebook-wide timer started in the first cell and report the total run time.
tim.stop()
print(f"Total time taken {tim.elapsed!s}")

Total time taken 0:35:29.821422
