In [1]:
import os
import json
import gc
from datetime import datetime
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedGroupKFold
from tqdm import tqdm
from typing import Dict, List, Set, Tuple, NamedTuple, Callable
import scml
from scml import pandasx as pdx

In [2]:
# Suffix embedded in the names of the parquet files this notebook writes.
version = "02"
# Parquet inputs to load and combine; the comp dataset is currently switched off.
files = [
    # Path("input") / "comp.parquet",
    Path("input") / "persuade2.parquet",
]
# Fold count handed to StratifiedGroupKFold; only the first fold is kept as validation.
n_splits = 20

In [3]:
# Start the wall-clock timer; the last cell stops it and prints the elapsed time.
tim = scml.Timer()
tim.start()
# Exported to the process environment. Nothing in the visible cells reads it —
# presumably kept for parity with sibling notebooks that use tokenizers (confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Percentile grid; not referenced by any visible cell in this notebook.
percentiles = [.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]
# Use fully qualified option names: abbreviated keys are resolved by pattern
# matching and can become ambiguous if pandas adds similarly named options.
pd.set_option("display.max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option("display.max_colwidth", 9999)
# Register tqdm's progress_* methods on pandas objects.
tqdm.pandas()
# Seed the random number generators via scml (default seed is defined by scml).
# NOTE(review): the StratifiedGroupKFold shuffle below passes no random_state, so it
# presumably depends on this call having seeded NumPy's global RNG — confirm.
scml.seed_everything()
# Print the representable int16 range for reference.
info = np.iinfo(np.int16)
print(f"int16, min={info.min}, max={info.max}")

int16, min=-32768, max=32767


In [4]:
# Columns kept from every input file; a "source" column is added to record provenance.
cols = ["essay_id", "score", "prompt_title", "prompt", "full_text"]
if not files:
    raise ValueError("`files` is empty: there is nothing to load")
# Read only the required columns from each file and tag the rows with the file stem.
# `.assign` returns a new frame, so no column is ever set on a slice of another frame.
frames = [
    pd.read_parquet(filepath, columns=cols).assign(source=filepath.stem)
    for filepath in files
]
# Concatenate once instead of re-copying the accumulated frame on every iteration.
# The index is always reset, so `cmb` has a clean RangeIndex for any number of files.
cmb = pd.concat(frames, ignore_index=True)
del frames
cmb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25996 entries, 0 to 25995
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   essay_id      25996 non-null  object
 1   score         25996 non-null  int8  
 2   prompt_title  25996 non-null  object
 3   prompt        25996 non-null  object
 4   full_text     25996 non-null  object
 5   source        25996 non-null  object
dtypes: int8(1), object(5)
memory usage: 1.0+ MB


# Train/Validation Split

In [5]:
# Stratify on `score` while keeping every `prompt_title` group entirely on one side.
# No random_state is passed, so the shuffle draws from NumPy's global RNG state
# (presumably seeded by scml.seed_everything() in the setup cell — confirm).
splitter = StratifiedGroupKFold(n_splits=n_splits, shuffle=True)
# Placeholder feature array: the split is driven by `y` and `groups`.
dummy = np.zeros(len(cmb))
# Only the first fold is needed: its test indices become the validation set.
ti, vi = next(splitter.split(dummy, y=cmb["score"], groups=cmb["prompt_title"]))
tra = cmb.iloc[ti]
val = cmb.iloc[vi]
# Invariants of a grouped hold-out split.
assert len(tra) + len(val) == len(cmb), "train and validation must partition cmb"
assert len(val) > 0, "validation fold is empty"
assert set(tra["prompt_title"]).isdisjoint(val["prompt_title"]), (
    "a prompt_title appears in both train and validation"
)
tra.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24370 entries, 0 to 25995
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   essay_id      24370 non-null  object
 1   score         24370 non-null  int8  
 2   prompt_title  24370 non-null  object
 3   prompt        24370 non-null  object
 4   full_text     24370 non-null  object
 5   source        24370 non-null  object
dtypes: int8(1), object(5)
memory usage: 1.1+ MB


In [6]:
# Summarise the validation frame (row count, dtypes, non-null counts).
val.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1626 entries, 15073 to 16698
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   essay_id      1626 non-null   object
 1   score         1626 non-null   int8  
 2   prompt_title  1626 non-null   object
 3   prompt        1626 non-null   object
 4   full_text     1626 non-null   object
 5   source        1626 non-null   object
dtypes: int8(1), object(5)
memory usage: 77.8+ KB


In [7]:
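# Disabled: "str_level" is not among the columns loaded into `cmb`, so this lookup would raise KeyError.
#pdx.value_counts(tra["str_level"])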

In [8]:
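# Disabled: "str_level" is not among the columns loaded into `cmb`, so this lookup would raise KeyError.
#pdx.value_counts(val["str_level"])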

In [9]:
# Validate first, so a frame containing missing values is never written to disk.
assert tra.notna().all(axis=None), "training split contains missing values"
assert val.notna().all(axis=None), "validation split contains missing values"
# Make sure the target directory exists; the parquet writer does not create it.
Path("output").mkdir(parents=True, exist_ok=True)
# Persist both splits without the index; the file names carry the notebook version.
tra.to_parquet(f"output/tra_{version}.parquet", index=False)
val.to_parquet(f"output/val_{version}.parquet", index=False)

In [10]:
# Stop the timer started in the setup cell and report the elapsed time.
tim.stop()
print(f"Total time taken {tim.elapsed!s}")

Total time taken 0:00:00.559410
