In [1]:
import os
import json
import gc
from datetime import datetime
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
from typing import Dict, List, Set, Tuple, NamedTuple, Callable
import scml
from scml import pandasx as pdx

In [2]:
# Tag embedded in the names of the parquet files written at the end of the notebook.
version = "02"
# Input parquet files to combine; each file's stem is recorded as the row's "source".
files = [
    #Path("input/comp.parquet"),
    Path("input/persuade2.parquet"),
]
# Number of stratified folds. Only the first fold is held out as validation,
# so the validation share is roughly 1/n_splits of the rows.
n_splits = 20

In [3]:
# Start a timer for the whole run; it is stopped and reported in the last cell.
tim = scml.Timer()
tim.start()
# Sets TOKENIZERS_PARALLELISM to "false" for this process.
# NOTE(review): no tokenizer is used in the visible cells — confirm this is still needed.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Percentile grid. NOTE(review): not referenced in the visible cells — confirm it is still needed.
percentiles=[.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]
# Raise pandas display limits so info()/repr output is not truncated.
pd.set_option("max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option('max_colwidth', 9999)
# Register the progress_* methods on pandas objects.
tqdm.pandas()
# Presumably seeds the global RNGs (including NumPy's, which the fold shuffle
# below would draw from) — TODO confirm what scml.seed_everything covers and
# which default seed it uses.
scml.seed_everything()
# Print the representable range of int16.
info = np.iinfo(np.int16)
print(f"int16, min={info.min}, max={info.max}")

int16, min=-32768, max=32767


In [4]:
# Load every input file, keep only the modelling columns, and tag each row with
# the stem of the file it came from. The result is one combined frame `cmb`
# with a fresh 0..n-1 index.
cols = ["essay_id", "score", "full_text"]
if not files:
    raise ValueError("`files` is empty; nothing to load")
# - `columns=cols` avoids reading unused columns from disk.
# - `[cols]` pins the column order regardless of the parquet engine.
# - `.assign` returns a new frame, so no chained-assignment warning is raised
#   when adding "source" to a column subset.
frames = [
    pd.read_parquet(filepath, columns=cols)[cols].assign(source=filepath.stem)
    for filepath in files
]
# Concatenate once instead of growing the frame inside the loop.
cmb = pd.concat(frames, ignore_index=True)
cmb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25996 entries, 0 to 25995
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   essay_id   25996 non-null  object
 1   score      25996 non-null  int8  
 2   full_text  25996 non-null  object
 3   source     25996 non-null  object
dtypes: int8(1), object(3)
memory usage: 634.8+ KB


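# Train/Validation Split

The first fold of a stratified `n_splits`-fold partition (stratified on score and source) is held out as the validation set; the remaining rows form the training set.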

In [5]:
# Build one stratification label per row ("<score>_<source>") so the held-out
# fold preserves the joint score/source distribution. Vectorised string
# concatenation yields the same labels as formatting row by row.
cmb["str_level"] = cmb["score"].astype(str) + "_" + cmb["source"].astype(str)
str_levels = cmb["str_level"].tolist()

# NOTE(review): no random_state is passed, so the shuffle draws from NumPy's
# global RNG; reproducibility therefore depends on that RNG having been seeded
# earlier in the notebook — confirm.
splitter = StratifiedKFold(n_splits=n_splits, shuffle=True)
# Only the labels drive a stratified split; the features are a placeholder.
dummy = np.zeros(len(cmb))
# Use just the first fold: ~1/n_splits of the rows become validation.
ti, vi = next(splitter.split(dummy, y=str_levels))
tra = cmb.iloc[ti]
val = cmb.iloc[vi]
# The two parts must partition the combined frame exactly.
assert len(tra) + len(val) == len(cmb)
tra.info()

25996it [00:00, 1148499.81it/s]

<class 'pandas.core.frame.DataFrame'>
Index: 24696 entries, 0 to 25995
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   essay_id   24696 non-null  object
 1   score      24696 non-null  int8  
 2   full_text  24696 non-null  object
 3   source     24696 non-null  object
 4   str_level  24696 non-null  object
dtypes: int8(1), object(4)
memory usage: 988.8+ KB





In [6]:
# Column/dtype summary of the held-out validation fold (counterpart to tra.info() above).
val.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1300 entries, 17 to 25986
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   essay_id   1300 non-null   object
 1   score      1300 non-null   int8  
 2   full_text  1300 non-null   object
 3   source     1300 non-null   object
 4   str_level  1300 non-null   object
dtypes: int8(1), object(4)
memory usage: 52.1+ KB


In [7]:
# Distribution of the stratification labels in the training fold
# (the helper's output lists count and percent per label).
pdx.value_counts(tra["str_level"])

Unnamed: 0_level_0,count,percent
str_level,Unnamed: 1_level_1,Unnamed: 2_level_1
3_persuade2,7949,0.321874
4_persuade2,6395,0.258949
2_persuade2,5414,0.219226
5_persuade2,3132,0.126822
1_persuade2,977,0.039561
6_persuade2,829,0.033568


In [8]:
# Distribution of the stratification labels in the validation fold; the
# percentages should closely match the training fold if stratification worked.
pdx.value_counts(val["str_level"])

Unnamed: 0_level_0,count,percent
str_level,Unnamed: 1_level_1,Unnamed: 2_level_1
3_persuade2,419,0.322308
4_persuade2,336,0.258462
2_persuade2,285,0.219231
5_persuade2,165,0.126923
1_persuade2,51,0.039231
6_persuade2,44,0.033846


In [9]:
# Validate before persisting so a frame containing missing values is never
# written to disk.
assert tra.notna().all(axis=None)
assert val.notna().all(axis=None)

# Create the output directory if needed so a fresh checkout can run end to end.
out_dir = Path("output")
out_dir.mkdir(parents=True, exist_ok=True)
# index=False: the positional index left over from the split is not stored.
tra.to_parquet(out_dir / f"tra_{version}.parquet", index=False)
val.to_parquet(out_dir / f"val_{version}.parquet", index=False)

In [10]:
# Stop the notebook-wide timer and report how long the run took.
tim.stop()
print("Total time taken " + str(tim.elapsed))

Total time taken 0:00:00.569166
