In [1]:
import os
import json
import gc
from datetime import datetime
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
from typing import Dict, List, Set, Tuple, NamedTuple, Callable
import scml
from scml import pandasx as pdx

In [2]:
# Tag embedded in the names of the artifacts this notebook writes
# (output/tra_{version}.parquet and output/val_{version}.parquet).
version: str = "v01"

# Input datasets to combine. Each file's stem becomes the `source` label of its rows.
# NOTE(review): the disabled entries are .json files, while the loading cell
# reads every entry with pd.read_parquet -- they would need a different reader
# (or a parquet conversion) before being re-enabled.
files: List[Path] = [
    Path("input/comp.parquet"),
    # Path("input/nicholasbroad/nb01.json"),
    # Path("input/valentinwerner/vw03.json"),
    # Path("input/valentinwerner/pjm12.json"),
]

# Number of stratified folds. Only the first fold is kept as the validation
# set, so the hold-out is roughly 1/n_splits of the rows.
n_splits: int = 20

In [3]:
# Wall-clock timer for the whole run; it is stopped and reported in the last cell.
tim = scml.Timer()
tim.start()

# NOTE(review): presumably disables HuggingFace tokenizers' internal
# parallelism -- nothing in the visible cells uses a tokenizer; confirm it is needed.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Percentile grid (not referenced by the visible cells).
percentiles = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]

# Raise pandas' display limits so .info() and rendered frames are not truncated.
for option_name in (
    "max_info_columns",
    "display.max_columns",
    "display.max_rows",
    "max_colwidth",
):
    pd.set_option(option_name, 9999)

# Register tqdm's progress_* methods on pandas objects.
tqdm.pandas()

# NOTE(review): assumed to seed the global RNGs (including NumPy's, which the
# shuffled split below relies on) -- confirm against scml.
scml.seed_everything()

# Print the int16 value range for reference.
info = np.iinfo(np.int16)
print(f"int16, min={info.min}, max={info.max}")

int16, min=-32768, max=32767


In [4]:
# Load every input file, keep only the modelling columns, and tag each row
# with the stem of the file it came from (`source`), then stack everything
# into one frame `cmb` with a fresh 0..n-1 index.
cols = ["essay_id", "score", "clean_text"]
if not files:
    # Fail with a clear message instead of an AttributeError on `None.info()`.
    raise ValueError("`files` is empty: there is nothing to load")
frames = []
for filepath in files:
    # `columns=` lets the reader skip unused columns, and `.assign` returns a
    # new frame, so `source` is never set on a slice of another frame.
    df = pd.read_parquet(filepath, columns=cols).assign(source=filepath.stem)
    frames.append(df)
# Concatenate once at the end: concatenating inside the loop re-copies all
# previously loaded rows for every additional file.
cmb = pd.concat(frames, ignore_index=True)
cmb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17307 entries, 0 to 17306
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   essay_id    17307 non-null  object
 1   score       17307 non-null  int8  
 2   clean_text  17307 non-null  object
 3   source      17307 non-null  object
dtypes: int8(1), object(3)
memory usage: 422.7+ KB


# Train/Validation Split

In [5]:
# Hold out one stratified fold as the validation set: of the `n_splits` folds,
# the first fold's held-out indices become `val` and the remaining rows `tra`.
# NOTE(review): no `random_state` is passed, so the shuffle draws from NumPy's
# global RNG; reproducibility presumably relies on `scml.seed_everything()`
# having seeded it in the setup cell -- confirm.
splitter = StratifiedKFold(n_splits=n_splits, shuffle=True)
# The splitter stratifies on `y` only; X is just a placeholder of the right length.
dummy = np.zeros(len(cmb))
# Stratification label "<score>_<source>", built column-wise rather than with a
# Python-level loop over rows.
str_levels = (cmb["score"].astype(str) + "_" + cmb["source"].astype(str)).tolist()
cmb["str_level"] = str_levels
# Only the first (train, validation) index pair is needed.
ti, vi = next(iter(splitter.split(dummy, y=str_levels)))
tra = cmb.iloc[ti]
val = cmb.iloc[vi]
# The two parts must partition the combined frame.
assert len(tra) + len(val) == len(cmb), "train/validation split lost or duplicated rows"
tra.info()

17307it [00:00, 1137181.12it/s]

<class 'pandas.core.frame.DataFrame'>
Index: 16441 entries, 0 to 17306
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   essay_id    16441 non-null  object
 1   score       16441 non-null  int8  
 2   clean_text  16441 non-null  object
 3   source      16441 non-null  object
 4   str_level   16441 non-null  object
dtypes: int8(1), object(4)
memory usage: 658.3+ KB





In [6]:
# Summarise the held-out validation frame (row count, columns, dtypes, non-null counts).
val.info()

<class 'pandas.core.frame.DataFrame'>
Index: 866 entries, 12 to 17291
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   essay_id    866 non-null    object
 1   score       866 non-null    int8  
 2   clean_text  866 non-null    object
 3   source      866 non-null    object
 4   str_level   866 non-null    object
dtypes: int8(1), object(4)
memory usage: 34.7+ KB


In [7]:
# Distribution of the stratification label in the training part; per the
# rendered output, pdx.value_counts reports a count and a fraction per level.
pdx.value_counts(tra["str_level"])

Unnamed: 0_level_0,count,percent
str_level,Unnamed: 1_level_1,Unnamed: 2_level_1
3_comp,5966,0.362873
2_comp,4487,0.272915
4_comp,3729,0.226811
1_comp,1189,0.072319
5_comp,922,0.056079
6_comp,148,0.009002


In [8]:
# Same distribution for the validation part; the fractions should closely
# match the training ones if stratification worked.
pdx.value_counts(val["str_level"])

Unnamed: 0_level_0,count,percent
str_level,Unnamed: 1_level_1,Unnamed: 2_level_1
3_comp,314,0.362587
2_comp,236,0.272517
4_comp,197,0.227483
1_comp,63,0.072748
5_comp,48,0.055427
6_comp,8,0.009238


In [9]:
# Validate first, then persist: a frame containing missing values must never
# reach disk, so the checks run before any file is written.
assert tra.notna().all(axis=None), "tra contains missing values"
assert val.notna().all(axis=None), "val contains missing values"
# Create the target folder if needed so a fresh checkout does not fail on write.
Path("output").mkdir(parents=True, exist_ok=True)
# index=False: the positional index left over from the split is not stored.
tra.to_parquet(f"output/tra_{version}.parquet", index=False)
val.to_parquet(f"output/val_{version}.parquet", index=False)

In [10]:
# Stop the timer started in the setup cell and report the elapsed time.
tim.stop()
print(f"Total time taken {str(tim.elapsed)}")

Total time taken 0:00:00.510753
