In [1]:
import os
import json
import gc
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer
from typing import Dict, List, Set, Tuple, NamedTuple, Callable
import scipy
import scml
from scml import pandasx as pdx
from daigt.preprocess import en as pen
# Notebook-wide timer; it is stopped and reported in the last cell.
tim = scml.Timer()
tim.start()

# Turn off parallelism in the HuggingFace tokenizers library.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Percentiles passed to DataFrame.describe() in the review section.
percentiles = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]

# pandas options, applied in order.
# NOTE(review): "use_inf_as_na" is deprecated in recent pandas releases; the
# final notna() assertion of this notebook relies on it to also reject +/-inf,
# so it is kept here unchanged -- confirm against the pinned pandas version.
for option_name, option_value in (
    ("use_inf_as_na", True),
    ("max_info_columns", 9999),
    ("display.max_columns", 9999),
    ("display.max_rows", 9999),
    ("max_colwidth", 9999),
):
    pd.set_option(option_name, option_value)

# Register DataFrame.progress_apply / Series.progress_apply.
tqdm.pandas()

# Seed random number generators via the scml helper (library defaults).
scml.seed_everything()

# Print the int16 value range for reference.
info = np.iinfo(np.int16)
print(f"int16, min={info.min}, max={info.max}")

int16, min=-32768, max=32767


In [2]:
# Name of the DataFrame column holding the essay text that every
# character-level feature in this notebook is computed from.
text_col = "text_bsc"

In [3]:
# Load the essays table; features are appended to this frame as new columns.
df = pd.read_parquet("input/essays_tra.parquet")
# Print the schema, non-null counts and memory footprint of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1378 entries, 0 to 1377
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            1378 non-null   object
 1   prompt_id     1378 non-null   int64 
 2   text          1378 non-null   object
 3   generated     1378 non-null   int8  
 4   text_bsc      1378 non-null   object
 5   text_bow      1378 non-null   object
 6   text_bow_len  1378 non-null   int16 
dtypes: int16(1), int64(1), int8(1), object(4)
memory usage: 58.0+ KB


# Character-level features

In [4]:
%%time
# Text length in characters, stored as int32.
col = "ch_len"
df[col] = df[text_col].str.len().astype(np.int32)

def digit_frac(row) -> float:
    """Row-level adapter around ``pen.digit_frac``.

    Args:
        row: A DataFrame row (or any mapping) indexable by ``text_col``.

    Returns:
        The value ``pen.digit_frac`` returns for the row's text.
    """
    text = row[text_col]
    return pen.digit_frac(text)


def letter_frac(row) -> float:
    """Row-level adapter around ``pen.letter_frac``.

    Args:
        row: A DataFrame row (or any mapping) indexable by ``text_col``.

    Returns:
        The value ``pen.letter_frac`` returns for the row's text.
    """
    text = row[text_col]
    return pen.letter_frac(text)


def space_frac(row) -> float:
    """Row-level adapter around ``pen.space_frac``.

    Args:
        row: A DataFrame row (or any mapping) indexable by ``text_col``.

    Returns:
        The value ``pen.space_frac`` returns for the row's text.
    """
    text = row[text_col]
    return pen.space_frac(text)


def punc_frac(row) -> float:
    """Row-level adapter around ``pen.punc_frac``.

    Args:
        row: A DataFrame row (or any mapping) indexable by ``text_col``.

    Returns:
        The value ``pen.punc_frac`` returns for the row's text.
    """
    text = row[text_col]
    return pen.punc_frac(text)


def upper_frac(row) -> float:
    """Row-level adapter around ``pen.upper_frac``.

    Args:
        row: A DataFrame row (or any mapping) indexable by ``text_col``.

    Returns:
        The value ``pen.upper_frac`` returns for the row's text.
    """
    text = row[text_col]
    return pen.upper_frac(text)


def repeat_char_frac(row) -> float:
    """Row-level adapter around ``pen.repeat_char_frac``.

    Args:
        row: A DataFrame row (or any mapping) indexable by ``text_col``.

    Returns:
        The value ``pen.repeat_char_frac`` returns for the row's text.
    """
    text = row[text_col]
    return pen.repeat_char_frac(text)


def repeat_substring_frac(row) -> float:
    """Row-level adapter around ``pen.repeat_substring_frac``.

    Args:
        row: A DataFrame row (or any mapping) indexable by ``text_col``.

    Returns:
        The value ``pen.repeat_substring_frac`` returns for the row's text.
    """
    text = row[text_col]
    return pen.repeat_substring_frac(text)


# Output column name -> row-level function that computes that feature.
# Insertion order is the order in which the features are computed below.
char_fns: Dict[str, Callable] = dict(
    ch_digit_frac=digit_frac,
    ch_letter_frac=letter_frac,
    ch_space_frac=space_frac,
    ch_punc_frac=punc_frac,
    ch_upper_frac=upper_frac,
    ch_repeat_char_frac=repeat_char_frac,
    ch_repeat_substring_frac=repeat_substring_frac,
)

# Compute each character-level feature row by row and store it as float32.
for col, fn in char_fns.items():
    # Announce the feature so the progress bar that follows can be identified.
    print(col)
    # progress_apply is the tqdm-wrapped DataFrame.apply (enabled by tqdm.pandas()).
    df[col] = df.progress_apply(fn, axis=1).astype(np.float32)

ch_digit_frac


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1378/1378 [00:00<00:00, 15967.57it/s]


ch_letter_frac


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1378/1378 [00:00<00:00, 14144.35it/s]


ch_space_frac


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1378/1378 [00:00<00:00, 15000.81it/s]


ch_punc_frac


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1378/1378 [00:00<00:00, 15532.58it/s]


ch_upper_frac


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1378/1378 [00:00<00:00, 15524.28it/s]


ch_repeat_char_frac


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1378/1378 [00:00<00:00, 5237.13it/s]


ch_repeat_substring_frac


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1378/1378 [01:07<00:00, 20.50it/s]

CPU times: user 1min 7s, sys: 163 ms, total: 1min 7s
Wall time: 1min 7s





# Review

In [5]:
# Schema check after feature engineering: lists every column with its
# non-null count and dtype, including the newly added ch_* features.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1378 entries, 0 to 1377
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        1378 non-null   object 
 1   prompt_id                 1378 non-null   int64  
 2   text                      1378 non-null   object 
 3   generated                 1378 non-null   int8   
 4   text_bsc                  1378 non-null   object 
 5   text_bow                  1378 non-null   object 
 6   text_bow_len              1378 non-null   int16  
 7   ch_len                    1378 non-null   int32  
 8   ch_digit_frac             1378 non-null   float32
 9   ch_letter_frac            1378 non-null   float32
 10  ch_space_frac             1378 non-null   float32
 11  ch_punc_frac              1378 non-null   float32
 12  ch_upper_frac             1378 non-null   float32
 13  ch_repeat_char_frac       1378 non-null   float32
 14  ch_repea

In [6]:
# Summary statistics at the percentiles configured in the first cell.
df.describe(percentiles=percentiles)

Unnamed: 0,prompt_id,generated,text_bow_len,ch_len,ch_digit_frac,ch_letter_frac,ch_space_frac,ch_punc_frac,ch_upper_frac,ch_repeat_char_frac,ch_repeat_substring_frac
count,1378.0,1378.0,1378.0,1378.0,1378.0,1378.0,1378.0,1378.0,1378.0,1378.0,1378.0
mean,0.486212,0.002177,585.858491,3165.206821,0.003984,0.798985,0.175595,0.021436,0.017922,0.016204,0.002257
std,0.499991,0.046625,180.131658,920.312788,0.00336,0.009016,0.007268,0.005272,0.011855,0.003768,0.001918
min,0.0,0.0,237.0,1356.0,0.0,0.760303,0.147178,0.006163,0.0,0.007417,0.0
1%,0.0,0.0,311.77,1741.77,0.0,0.776091,0.160862,0.010023,0.006092,0.009212,0.0
5%,0.0,0.0,369.85,1998.7,0.0,0.783611,0.164497,0.012942,0.009578,0.011132,0.0
10%,0.0,0.0,401.0,2190.1,0.0,0.787834,0.166874,0.014732,0.011386,0.012077,0.0
20%,0.0,0.0,448.0,2443.0,0.001104,0.791445,0.169342,0.017033,0.01331,0.013293,0.00087
30%,0.0,0.0,484.1,2636.1,0.001753,0.794636,0.171425,0.018505,0.014698,0.014226,0.001165
40%,0.0,0.0,516.0,2778.0,0.002414,0.796908,0.17319,0.019932,0.015853,0.015095,0.001496


In [7]:
%%time
# Validate BEFORE persisting, so a feature table with missing values is never
# written to disk (previously the check ran only after the file was saved).
# With the "use_inf_as_na" option set in the first cell, notna() also rejects +/-inf.
assert df.notna().all(axis=None), "feature table contains missing values"

# Persist the original columns together with the new features; the index is a
# plain RangeIndex and is not stored.
df.to_parquet("output/features.parquet", index=False)

CPU times: user 30 ms, sys: 4.16 ms, total: 34.2 ms
Wall time: 31.3 ms


In [8]:
# Stop the notebook-wide timer started in the first cell and report its elapsed time.
tim.stop()
print(f"Total time taken {tim.elapsed!s}")

Total time taken 0:01:08.057794
