In [1]:
from pathlib import Path
import os
import sys
import gc
import shutil
import json
import math
import uuid
from collections import defaultdict
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from tqdm import tqdm
import matplotlib.pyplot as plt
from typing import Dict, List, Tuple, NamedTuple, Callable, Iterable, Set, Optional, Any
import textstat
import scml
from scml import pandasx as pdx
import lalaes2 as mylib

In [2]:
version = "01"
n_splits = 5
text_col = "clean_text"
features = []

In [3]:
class ModelConf(NamedTuple):
    name: str
    directory: Path
    model_max_length: int
    batch_size: int
    model_class: str = "auto"


models_conf: List[ModelConf] = [
    ModelConf(
        name="deberta_base", 
        directory=Path("models/aes2/deberta_v3_base/20240610_172052"),
        model_max_length=512,
        batch_size=128,
    ),
]

In [4]:
tim = scml.Timer()
tim.start()
percentiles=[.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]
os.environ["TOKENIZERS_PARALLELISM"] = "false"
pd.set_option("max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option('max_colwidth', 9999)
tqdm.pandas()
scml.seed_everything()

In [5]:
df = pd.read_parquet(f"input/val_{version}.parquet")
df = df.drop(columns=["source", "str_level"])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 866 entries, 0 to 865
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   essay_id    866 non-null    object
 1   score       866 non-null    int8  
 2   clean_text  866 non-null    object
dtypes: int8(1), object(2)
memory usage: 14.5+ KB


# Base Model Inference

In [6]:
for mc in models_conf:
    print(mc.name)
    df[mc.name] = mylib.predict_holistic_score(
        ds=mylib.Aes2Dataset(
            tokenizer=AutoTokenizer.from_pretrained(mc.directory, model_max_length=mc.model_max_length),
            texts=df[text_col].tolist(),
        ),
        model=AutoModelForSequenceClassification.from_pretrained(mc.directory),
        batch_size=mc.batch_size,
        device=torch.device("cuda:1"),
        progress_bar=True,
    )
    features.append(mc.name)

deberta_base


predict hms score: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:10<00:00,  1.54s/it]


# Character & Word-level Features

In [7]:
%%time
col = "cw_len"
df[col] = df[text_col].str.len()
features.append(col)

CPU times: user 977 µs, sys: 247 µs, total: 1.22 ms
Wall time: 1.16 ms


In [8]:
def digit_frac(row) -> float:
    return mylib.digit_frac(row[text_col])


def letter_frac(row) -> float:
    return mylib.letter_frac(row[text_col])


def space_frac(row) -> float:
    return mylib.space_frac(row[text_col])


def punc_frac(row) -> float:
    return mylib.punc_frac(row[text_col])


def upper_frac(row) -> float:
    return mylib.upper_frac(row[text_col])


def repeat_char_frac(row) -> float:
    return mylib.repeat_char_frac(row[text_col])


def repeat_substring_frac(row) -> float:
    return mylib.repeat_substring_frac(row[text_col])


def unique_word_frac(row) -> float:
    return mylib.unique_word_frac(row[text_col])


sf = mylib.StopwordFraction()


def stopword_frac(row) -> float:
    return sf(row[text_col])



cw_fns: List[Tuple[str, Callable]] = [
    ("cw_digit_frac", digit_frac),
    ("cw_letter_frac", letter_frac),
    ("cw_space_frac", space_frac),
    ("cw_punc_frac", punc_frac),
    ("cw_upper_frac", upper_frac),
    ("cw_repeat_char_frac", repeat_char_frac),
    ("cw_repeat_substring_frac", repeat_substring_frac),
    ("cw_unique_word_frac", unique_word_frac),
    ("cw_stopword_frac", stopword_frac),
]   
for col, fn in cw_fns:
    print(col)
    df[col] = df.progress_apply(fn, axis=1)
    features.append(col)
df[features] = df[features].astype(np.float32)

cw_digit_frac


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 866/866 [00:00<00:00, 15239.02it/s]


cw_letter_frac


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 866/866 [00:00<00:00, 13538.54it/s]


cw_space_frac


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 866/866 [00:00<00:00, 14315.54it/s]


cw_punc_frac


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 866/866 [00:00<00:00, 13254.76it/s]


cw_upper_frac


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 866/866 [00:00<00:00, 13104.50it/s]


cw_repeat_char_frac


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 866/866 [00:00<00:00, 6056.90it/s]


cw_repeat_substring_frac


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 866/866 [00:37<00:00, 23.36it/s]


cw_unique_word_frac


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 866/866 [00:00<00:00, 33378.06it/s]


cw_stopword_frac


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 866/866 [00:00<00:00, 28849.04it/s]


# Textstat Features

In [9]:
def syllable_count(row) -> int:
    return textstat.syllable_count(row[text_col])


def lexicon_count(row) -> int:
    return textstat.lexicon_count(row[text_col])


def sentence_count(row) -> int:
    return textstat.sentence_count(row[text_col])


def syllables_per_word(row) -> float:
    return row["ts_syllable_count"] / (row["ts_lexicon_count"] + 1)


def syllables_per_sent(row) -> float:
    return row["ts_syllable_count"] / (row["ts_sentence_count"] + 1)


def words_per_sent(row) -> float:
    return row["ts_lexicon_count"] / (row["ts_sentence_count"] + 1)


def flesch_reading_ease(row) -> float:
    return textstat.flesch_reading_ease(row[text_col])


def flesch_kincaid_grade(row) -> float:
    return textstat.flesch_kincaid_grade(row[text_col])


def gunning_fog(row) -> float:
    return textstat.gunning_fog(row[text_col])


def smog_index(row) -> float:
    return textstat.smog_index(row[text_col])


def automated_readability_index(row) -> float:
    return textstat.automated_readability_index(row[text_col])


def coleman_liau_index(row) -> float:
    return textstat.coleman_liau_index(row[text_col])


def linsear_write_formula(row) -> float:
    return textstat.linsear_write_formula(row[text_col])


def dale_chall_readability_score(row) -> float:
    return textstat.dale_chall_readability_score(row[text_col])


def text_standard(row) -> float:
    return textstat.text_standard(row[text_col], float_output=True)


def mcalpine_eflaw(row) -> float:
    return textstat.mcalpine_eflaw(row[text_col])


textstat_fns: List[Tuple[str, Callable]] = [
    ("ts_syllable_count", syllable_count),
    ("ts_lexicon_count", lexicon_count),
    ("ts_sentence_count", sentence_count),
    ("ts_syllables_per_word", syllables_per_word),
    ("ts_syllables_per_sent", syllables_per_sent),
    ("ts_words_per_sent", words_per_sent),
    ("ts_flesch_reading_ease", flesch_reading_ease),
    ("ts_flesch_kincaid_grade", flesch_kincaid_grade),
    ("ts_gunning_fog", gunning_fog),
    ("ts_smog_index", smog_index),
    ("ts_automated_readability_index", automated_readability_index),
    ("ts_coleman_liau_index", coleman_liau_index),
    ("ts_linsear_write_formula", linsear_write_formula),
    ("ts_dale_chall_readability_score", dale_chall_readability_score),
    ("ts_text_standard", text_standard),
    ("ts_mcalpine_eflaw", mcalpine_eflaw),
]    
for col, fn in textstat_fns:
    print(col)
    df[col] = df.progress_apply(fn, axis=1)
    features.append(col)
df[features] = df[features].astype(np.float32)

ts_syllable_count


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 866/866 [00:00<00:00, 2472.00it/s]


ts_lexicon_count


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 866/866 [00:00<00:00, 27821.31it/s]


ts_sentence_count


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 866/866 [00:00<00:00, 16174.32it/s]


ts_syllables_per_word


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 866/866 [00:00<00:00, 169779.72it/s]


ts_syllables_per_sent


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 866/866 [00:00<00:00, 138535.69it/s]


ts_words_per_sent


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 866/866 [00:00<00:00, 87852.64it/s]


ts_flesch_reading_ease


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 866/866 [00:00<00:00, 3495.48it/s]


ts_flesch_kincaid_grade


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 866/866 [00:00<00:00, 3504.33it/s]


ts_gunning_fog


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 866/866 [00:00<00:00, 2859.57it/s]


ts_smog_index


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 866/866 [00:00<00:00, 2718.79it/s]


ts_automated_readability_index


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 866/866 [00:00<00:00, 8102.33it/s]


ts_coleman_liau_index


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 866/866 [00:00<00:00, 7259.50it/s]


ts_linsear_write_formula


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 866/866 [00:00<00:00, 8381.39it/s]


ts_dale_chall_readability_score


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 866/866 [00:00<00:00, 2894.90it/s]


ts_text_standard


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 866/866 [00:01<00:00, 826.18it/s]


ts_mcalpine_eflaw


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 866/866 [00:00<00:00, 9131.46it/s]


# Train/Test Split

In [10]:
splitter = StratifiedKFold(n_splits=n_splits, shuffle=True)
dummy = np.zeros(len(df))
col = "is_test"
df[col] = 0
for _, vi in splitter.split(dummy, y=df["score"].tolist()):
    df[col].iloc[vi] = 1
    break
df[col] = df[col].astype(np.int8)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df[col].iloc[vi] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col].iloc[vi] = 1


# Review Data

In [11]:
df[features] = df[features].astype(np.float32)
df = df.drop(columns=["clean_text"])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 866 entries, 0 to 865
Data columns (total 30 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   essay_id                         866 non-null    object 
 1   score                            866 non-null    int8   
 2   deberta_base                     866 non-null    float32
 3   cw_len                           866 non-null    float32
 4   cw_digit_frac                    866 non-null    float32
 5   cw_letter_frac                   866 non-null    float32
 6   cw_space_frac                    866 non-null    float32
 7   cw_punc_frac                     866 non-null    float32
 8   cw_upper_frac                    866 non-null    float32
 9   cw_repeat_char_frac              866 non-null    float32
 10  cw_repeat_substring_frac         866 non-null    float32
 11  cw_unique_word_frac              866 non-null    float32
 12  cw_stopword_frac      

In [12]:
df.describe(percentiles=percentiles)

Unnamed: 0,score,deberta_base,cw_len,cw_digit_frac,cw_letter_frac,cw_space_frac,cw_punc_frac,cw_upper_frac,cw_repeat_char_frac,cw_repeat_substring_frac,cw_unique_word_frac,cw_stopword_frac,ts_syllable_count,ts_lexicon_count,ts_sentence_count,ts_syllables_per_word,ts_syllables_per_sent,ts_words_per_sent,ts_flesch_reading_ease,ts_flesch_kincaid_grade,ts_gunning_fog,ts_smog_index,ts_automated_readability_index,ts_coleman_liau_index,ts_linsear_write_formula,ts_dale_chall_readability_score,ts_text_standard,ts_mcalpine_eflaw,is_test
count,866.0,866.0,866.0,866.0,866.0,866.0,866.0,866.0,866.0,866.0,866.0,866.0,866.0,866.0,866.0,866.0,866.0,866.0,866.0,866.0,866.0,866.0,866.0,866.0,866.0,866.0,866.0,866.0,866.0
mean,2.948037,3.145202,2060.63501,0.002611,0.795851,0.181177,0.020361,0.018148,0.015119,0.002907,0.533583,0.504496,509.431885,370.844116,19.625866,1.362833,26.972654,19.788095,68.661377,9.188222,10.897321,10.10739,11.009238,8.253776,10.499711,7.593972,9.80485,31.437298,0.200924
std,1.046139,0.9295,849.165894,0.00362,0.011515,0.010704,0.00671,0.009073,0.004385,0.007891,0.075234,0.048059,212.012878,147.202759,8.809137,0.09297,20.295984,15.702071,32.33408,12.169928,12.454148,1.943423,15.534221,1.855127,5.296445,1.734166,12.0592,45.39986,0.400923
min,1.0,1.406469,732.0,0.0,0.743163,0.155241,0.0,0.0,0.004212,0.0,0.264228,0.354054,190.0,151.0,1.0,1.108434,10.962963,8.5,-628.880005,2.2,3.91,0.0,2.8,3.19,2.8,5.38,0.0,12.1,0.0
1%,1.0,1.488527,843.0,0.0,0.76581,0.159538,0.006005,0.000906,0.006511,0.0,0.361339,0.385154,205.6,159.3,4.0,1.153609,13.469216,10.195938,37.6915,3.265,5.4165,5.765,3.565,3.9595,4.235606,5.8,5.0,14.765,0.0
5%,1.0,1.790974,969.0,0.0,0.776374,0.164993,0.010542,0.006928,0.008457,0.0,0.410804,0.427421,238.5,180.0,8.0,1.208756,16.40606,12.396826,53.077499,4.8,6.655,7.4,5.4,5.155,5.458333,6.1725,6.0,18.425,0.0
10%,2.0,1.988272,1087.5,0.0,0.781379,0.168216,0.012364,0.009094,0.00979,0.0,0.439701,0.446998,269.5,202.0,9.5,1.241224,17.884259,13.3625,57.400002,5.4,7.28,7.9,6.15,5.84,6.0,6.4,6.0,19.9,0.0
20%,2.0,2.232888,1314.0,0.0,0.786875,0.17193,0.015158,0.011138,0.011118,0.0,0.466775,0.467442,322.0,241.0,12.0,1.284289,19.76923,14.709678,61.900002,6.3,8.08,8.7,7.3,6.66,6.857143,6.71,7.0,21.700001,0.0
30%,2.0,2.532599,1482.0,0.0,0.790532,0.174986,0.01681,0.013238,0.012684,0.000778,0.493053,0.479837,364.5,274.0,14.0,1.314428,21.358655,15.666667,65.730003,6.9,8.57,9.3,8.1,7.245,7.666667,6.97,7.0,23.4,0.0
40%,3.0,2.775798,1700.0,0.000835,0.793531,0.177907,0.018519,0.014957,0.01381,0.00146,0.514706,0.492891,420.0,310.0,16.0,1.339181,22.853659,16.727272,68.599998,7.4,9.1,9.7,8.9,7.77,8.333333,7.23,8.0,24.799999,0.0


In [13]:
features.sort()
with open(f"output/features_{version}.json", "w") as f:
    json.dump({"feature_names": features}, f)

In [14]:
df.to_parquet(f"output/features_{version}.parquet", index=False)
assert df.notna().all(axis=None)

In [15]:
tim.stop()
print(f"Total time taken {str(tim.elapsed)}")

Total time taken 0:01:00.290712
