In [1]:
!pip install --upgrade --no-deps --force-reinstall /kaggle/input/pip-wheels/scikit_learn-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
!pip install --upgrade --no-deps --force-reinstall /kaggle/input/pip-wheels/pyphen-0.15.0-py3-none-any.whl
!pip install --upgrade --no-deps --force-reinstall /kaggle/input/pip-wheels/textstat-0.7.3-py3-none-any.whl

Processing /kaggle/input/pip-wheels/scikit_learn-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
Successfully installed scikit-learn-1.5.0
Processing /kaggle/input/pip-wheels/pyphen-0.15.0-py3-none-any.whl
Installing collected packages: pyphen
Successfully installed pyphen-0.15.0
Processing /kaggle/input/pip-wheels/textstat-0.7.3-py3-none-any.whl
Installing collected packages: textstat
Successfully installed textstat-0.7.3


In [2]:
from pathlib import Path
import os
import sys
import gc
import shutil
import json
import math
from collections import defaultdict
import numpy as np
import pandas as pd
import lightgbm as lgb
import sklearn
import textstat
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm
import matplotlib.pyplot as plt
from typing import Dict, List, Tuple, NamedTuple, Callable, Iterable, Set, Optional, Any
from importlib.metadata import version as _installed_version  # stdlib, Python 3.8+

# Log the library versions this run actually uses.
# `textstat.__version__` is a tuple and was recorded as (0, 7, 2) even though the 0.7.3 wheel
# had just been installed, so the installed distribution metadata is queried instead.
print(f"scikit-learn=={sklearn.__version__}, textstat=={_installed_version('textstat')}")

scikit-learn==1.5.0, textstat==(0, 7, 2)


In [3]:
class ModelConf(NamedTuple):
    """Settings for one fine-tuned base model used during inference."""

    # Short identifier; also used as the DataFrame column / feature name holding this model's predictions.
    name: str
    # Directory handed to `from_pretrained` for both the tokenizer and the model weights.
    directory: Path
    # Forwarded to the tokenizer as `model_max_length`.
    model_max_length: int
    # Batch size forwarded to `mylib.predict_holistic_score`.
    batch_size: int
        

class Conf(NamedTuple):
    """Notebook-wide configuration: input/output locations, model artifacts and score thresholds.

    Several path defaults are built from fields declared earlier in the class body,
    so the declaration order must be preserved.
    """

    # Debug switch; not read by any of the notebook cells shown here.
    debug: bool = False  
    # Root directory of all inputs; the competition data and the resource bundle live beneath it.
    input_dir: Path = Path("/kaggle/input")
    # Competition dataset directory; `test.csv` is read from here.
    comp_dir: Path = input_dir / "learning-agency-lab-automated-essay-scoring-2"
    # Scratch directory; not used by the cells shown here.
    temp_dir: Path = Path('/kaggle/temp')
    # Kaggle preserves up to 20GB written to the current directory (/kaggle/working/) as notebook
    # output when a version is created using "Save & Run All".
    working_dir: Path = Path('/kaggle/working')
    # Project resource bundle; its `src` sub-directory is appended to `sys.path` before importing `lalaes2`.
    resource_dir: Path = input_dir / "lib-lalaes2/lalaes2-0.1"
    # Data directory inside the resource bundle; not used by the cells shown here.
    data_dir: Path = resource_dir / "input"
    # Text dump of the trained LightGBM booster used for the final prediction.
    lgb_model_file: Path = resource_dir / "models/lgb/20240617_121519/lgb.txt" 
    # Base models whose predictions become input features of the LightGBM model.
    # NOTE: a list default is one shared object across all `Conf` instances; treat it as read-only.
    base_models: List[ModelConf] = [
        ModelConf(
            name="deberta_base",
            directory=resource_dir / "models/aes2/deberta_v3_base/20240615_063400",
            model_max_length=512,
            batch_size=32,
        ),
    ]
    # Five ascending cut points that split the final model's continuous output into six score bins.
    # Must remain a list: it is concatenated with other lists when the `pd.cut` bin edges are built.
    thresholds: List[float] = [1.5679287510412498, 2.5150679100687494, 3.4973218989187504, 4.4458222753237475, 5.526982087151248]

# Build the configuration from its defaults and echo it so the run log records the exact settings.
conf = Conf()
print(conf)

Conf(debug=False, input_dir=PosixPath('/kaggle/input'), comp_dir=PosixPath('/kaggle/input/learning-agency-lab-automated-essay-scoring-2'), temp_dir=PosixPath('/kaggle/temp'), working_dir=PosixPath('/kaggle/working'), resource_dir=PosixPath('/kaggle/input/lib-lalaes2/lalaes2-0.1'), data_dir=PosixPath('/kaggle/input/lib-lalaes2/lalaes2-0.1/input'), lgb_model_file=PosixPath('/kaggle/input/lib-lalaes2/lalaes2-0.1/models/lgb/20240617_121519/lgb.txt'), base_models=[ModelConf(name='deberta_base', directory=PosixPath('/kaggle/input/lib-lalaes2/lalaes2-0.1/models/aes2/deberta_v3_base/20240615_063400'), model_max_length=512, batch_size=32)], thresholds=[1.5679287510412498, 2.5150679100687494, 3.4973218989187504, 4.4458222753237475, 5.526982087151248])


In [4]:
# Pick the compute device: CUDA when a GPU is visible, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
    # Report every visible GPU together with its current memory usage.
    for gpu_id in range(torch.cuda.device_count()):
        print(f"device={gpu_id}, {torch.cuda.get_device_name(gpu_id)}")
        allocated = torch.cuda.memory_allocated(gpu_id) / 2**30
        reserved = torch.cuda.memory_reserved(gpu_id) / 2**30
        print('Mem Allocated:', round(allocated, 1), 'GB')
        print('Mem Cached:   ', round(reserved, 1), 'GB')
else:
    device = torch.device("cpu")
    print("cpu")

cpu


In [5]:
# Percentile grid; not referenced by the other cells shown here (presumably kept for ad-hoc `describe()` calls).
percentiles=[.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]
# Disable parallelism in the tokenizers library for this process.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Lift pandas display limits so wide frames and long strings are shown in full.
pd.set_option("max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option('max_colwidth', 9999)
# Register `progress_apply` on pandas objects (used by the feature cells below).
tqdm.pandas()
# Make the project libraries importable; these path entries must be added BEFORE the imports that follow.
sys.path.append(str(conf.input_dir / "sgcharts-ml/src"))
sys.path.append(str(conf.resource_dir / "src"))
import scml
from scml import nlp as snlp
from scml import pandasx as pdx
import lalaes2 as mylib
from warnings import simplefilter 
# Silence pandas PerformanceWarning messages.
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
# Called with its defaults; presumably seeds the random number generators — confirm in `scml`.
scml.seed_everything()

# Preprocess Text

In [6]:
# Load the competition test set into the working frame that all later cells extend with new columns.
df = pd.read_csv(conf.comp_dir / "test.csv")
# Text cleaner from the project library; it is called with one essay string at a time below.
basic = mylib.BasicPreprocessor()


def preprocess_text(fn, col) -> Callable:
    """Wrap a single-value function so it can be applied to whole rows.

    Args:
        fn: Callable that takes the value stored under `col`.
        col: Key (column name) to look up in each row.

    Returns:
        A function that accepts a row, reads `row[col]` and returns `fn` of that value.
    """
    def apply_to_row(row) -> str:
        value = row[col]
        return fn(value)

    return apply_to_row


# Name of the cleaned-text column; every feature function below reads the essay from this column.
text_col = "clean_text"
# Row-wise apply (axis=1) with a progress bar: run the cleaner on each row's `full_text`.
df[text_col] = df.progress_apply(preprocess_text(basic, "full_text"), axis=1)
df.info()

100%|██████████| 3/3 [00:00<00:00, 281.48it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   essay_id    3 non-null      object
 1   full_text   3 non-null      object
 2   clean_text  3 non-null      object
dtypes: object(3)
memory usage: 200.0+ bytes





# Feature Engineering

In [7]:
# Names of the columns fed to the final LightGBM model, appended in the order they are computed.
# That order is the column order of `df[features]` at prediction time.
features: List[str] = []

# Base Model Inference

In [8]:
# Run every configured base model over the cleaned essays. Each model's predictions are stored
# in a column named after the model, and that column is registered as a feature for the final model.
for mc in conf.base_models:
    print(mc.name)
    df[mc.name] = mylib.predict_holistic_score(
        ds=mylib.Aes2Dataset(
            # Tokenizer and weights are both loaded from the same model directory.
            tokenizer=AutoTokenizer.from_pretrained(mc.directory, model_max_length=mc.model_max_length),
            texts=df[text_col].tolist(),
        ),
        model=AutoModelForSequenceClassification.from_pretrained(mc.directory),
        batch_size=mc.batch_size,
        device=device,
        progress_bar=False,
    )
    features.append(mc.name)
# Preview the predictions of the last model processed (relies on the loop variable `mc` still being bound).
df[mc.name].head()

deberta_base




0    2.350891
1    2.673539
2    4.360243
Name: deberta_base, dtype: float32

# Character & Word-level Features

In [9]:
#%%time
#col = "cw_len"
#df[col] = df[text_col].str.len()
#features.append(col)

In [10]:
def digit_frac(row) -> float:
    """Return `mylib.digit_frac` of the cleaned text stored under `text_col` in `row`."""
    essay = row[text_col]
    return mylib.digit_frac(essay)


def letter_frac(row) -> float:
    """Return `mylib.letter_frac` of the cleaned text stored under `text_col` in `row`."""
    essay = row[text_col]
    return mylib.letter_frac(essay)


def space_frac(row) -> float:
    """Return `mylib.space_frac` of the cleaned text stored under `text_col` in `row`."""
    essay = row[text_col]
    return mylib.space_frac(essay)


def punc_frac(row) -> float:
    """Return `mylib.punc_frac` of the cleaned text stored under `text_col` in `row`."""
    essay = row[text_col]
    return mylib.punc_frac(essay)


def upper_frac(row) -> float:
    """Return `mylib.upper_frac` of the cleaned text stored under `text_col` in `row`."""
    essay = row[text_col]
    return mylib.upper_frac(essay)


def repeat_char_frac(row) -> float:
    """Return `mylib.repeat_char_frac` of the cleaned text stored under `text_col` in `row`."""
    essay = row[text_col]
    return mylib.repeat_char_frac(essay)


def repeat_substring_frac(row) -> float:
    """Return `mylib.repeat_substring_frac` of the cleaned text stored under `text_col` in `row`."""
    essay = row[text_col]
    return mylib.repeat_substring_frac(essay)


def unique_word_frac(row) -> float:
    """Return `mylib.unique_word_frac` of the cleaned text stored under `text_col` in `row`."""
    essay = row[text_col]
    return mylib.unique_word_frac(essay)


# Single scorer instance, constructed once and reused for every row.
sf = mylib.StopwordFraction()


def stopword_frac(row) -> float:
    """Return the result of calling the shared `sf` scorer on the row's cleaned text."""
    essay = row[text_col]
    return sf(essay)



# Registry of (feature column name, row-wise function) pairs for the character/word features.
# Commented-out entries are switched off; only the active ones are computed and registered.
cw_fns: List[Tuple[str, Callable]] = [
    #("cw_digit_frac", digit_frac),
    #("cw_letter_frac", letter_frac),
    #("cw_space_frac", space_frac),
    #("cw_punc_frac", punc_frac),
    #("cw_upper_frac", upper_frac),
    #("cw_repeat_char_frac", repeat_char_frac),
    #("cw_repeat_substring_frac", repeat_substring_frac),
    #("cw_unique_word_frac", unique_word_frac),
    ("cw_stopword_frac", stopword_frac),
]
for feature_name, feature_fn in cw_fns:
    print(feature_name)
    # Compute the feature row by row and register it as a model input.
    df[feature_name] = df.progress_apply(feature_fn, axis=1)
    features.append(feature_name)
# Store every feature collected so far as float32.
df[features] = df[features].astype(np.float32)

cw_stopword_frac


100%|██████████| 3/3 [00:00<00:00, 1586.55it/s]


# Textstat Features

In [11]:
def syllable_count(row) -> int:
    """Return `textstat.syllable_count` for the cleaned text stored under `text_col` in `row`."""
    essay = row[text_col]
    return textstat.syllable_count(essay)


def lexicon_count(row) -> int:
    """Return `textstat.lexicon_count` for the cleaned text stored under `text_col` in `row`."""
    essay = row[text_col]
    return textstat.lexicon_count(essay)


def sentence_count(row) -> int:
    """Return `textstat.sentence_count` for the cleaned text stored under `text_col` in `row`."""
    essay = row[text_col]
    return textstat.sentence_count(essay)


def syllables_per_word(row) -> float:
    """Ratio of the row's syllable count to its word count.

    Reads the precomputed `ts_syllable_count` and `ts_lexicon_count` values.
    The denominator is incremented by one, so a zero word count never divides by zero.
    """
    syllables = row["ts_syllable_count"]
    words = row["ts_lexicon_count"]
    return syllables / (words + 1)


def syllables_per_sent(row) -> float:
    """Ratio of the row's syllable count to its sentence count.

    Reads the precomputed `ts_syllable_count` and `ts_sentence_count` values.
    The denominator is incremented by one, so a zero sentence count never divides by zero.
    """
    syllables = row["ts_syllable_count"]
    sentences = row["ts_sentence_count"]
    return syllables / (sentences + 1)


def words_per_sent(row) -> float:
    """Ratio of the row's word count to its sentence count.

    Reads the precomputed `ts_lexicon_count` and `ts_sentence_count` values.
    The denominator is incremented by one, so a zero sentence count never divides by zero.
    """
    words = row["ts_lexicon_count"]
    sentences = row["ts_sentence_count"]
    return words / (sentences + 1)


def flesch_reading_ease(row) -> float:
    """Return `textstat.flesch_reading_ease` for the cleaned text stored under `text_col` in `row`."""
    essay = row[text_col]
    return textstat.flesch_reading_ease(essay)


def flesch_kincaid_grade(row) -> float:
    """Return `textstat.flesch_kincaid_grade` for the cleaned text stored under `text_col` in `row`."""
    essay = row[text_col]
    return textstat.flesch_kincaid_grade(essay)


def gunning_fog(row) -> float:
    """Return `textstat.gunning_fog` for the cleaned text stored under `text_col` in `row`."""
    essay = row[text_col]
    return textstat.gunning_fog(essay)


def smog_index(row) -> float:
    """Return `textstat.smog_index` for the cleaned text stored under `text_col` in `row`."""
    essay = row[text_col]
    return textstat.smog_index(essay)


def automated_readability_index(row) -> float:
    """Return `textstat.automated_readability_index` for the cleaned text stored under `text_col` in `row`."""
    essay = row[text_col]
    return textstat.automated_readability_index(essay)


def coleman_liau_index(row) -> float:
    """Return `textstat.coleman_liau_index` for the cleaned text stored under `text_col` in `row`."""
    essay = row[text_col]
    return textstat.coleman_liau_index(essay)


def linsear_write_formula(row) -> float:
    """Return `textstat.linsear_write_formula` for the cleaned text stored under `text_col` in `row`."""
    essay = row[text_col]
    return textstat.linsear_write_formula(essay)


def dale_chall_readability_score(row) -> float:
    """Return `textstat.dale_chall_readability_score` for the cleaned text stored under `text_col` in `row`."""
    essay = row[text_col]
    return textstat.dale_chall_readability_score(essay)


def text_standard(row) -> float:
    """Return `textstat.text_standard` for the row's cleaned text, requested with `float_output=True`."""
    essay = row[text_col]
    return textstat.text_standard(essay, float_output=True)


def mcalpine_eflaw(row) -> float:
    """Return `textstat.mcalpine_eflaw` for the cleaned text stored under `text_col` in `row`."""
    essay = row[text_col]
    return textstat.mcalpine_eflaw(essay)


# Registry of (column name, row-wise function) pairs for the textstat-based features.
# Order matters: the three `*_count` columns must be computed first because the
# `*_per_word` / `*_per_sent` ratio functions read them from the row.
textstat_fns: List[Tuple[str, Callable]] = [
    ("ts_syllable_count", syllable_count),
    ("ts_lexicon_count", lexicon_count),
    ("ts_sentence_count", sentence_count),
    ("ts_syllables_per_word", syllables_per_word),
    ("ts_syllables_per_sent", syllables_per_sent),
    ("ts_words_per_sent", words_per_sent),
    ("ts_flesch_reading_ease", flesch_reading_ease),
    ("ts_flesch_kincaid_grade", flesch_kincaid_grade),
    ("ts_gunning_fog", gunning_fog),
    ("ts_smog_index", smog_index),
    ("ts_automated_readability_index", automated_readability_index),
    ("ts_coleman_liau_index", coleman_liau_index),
    ("ts_linsear_write_formula", linsear_write_formula),
    ("ts_dale_chall_readability_score", dale_chall_readability_score),
    ("ts_text_standard", text_standard),
    ("ts_mcalpine_eflaw", mcalpine_eflaw),
]
for feature_name, feature_fn in textstat_fns:
    print(feature_name)
    df[feature_name] = df.progress_apply(feature_fn, axis=1)
    # Raw counts are kept as columns (the ratios need them) but are not model inputs.
    if not feature_name.endswith("_count"):
        features.append(feature_name)
# Store every registered feature as float32.
df[features] = df[features].astype(np.float32)

ts_syllable_count


100%|██████████| 3/3 [00:00<00:00, 118.43it/s]


ts_lexicon_count


100%|██████████| 3/3 [00:00<00:00, 1818.34it/s]


ts_sentence_count


100%|██████████| 3/3 [00:00<00:00, 1205.95it/s]


ts_syllables_per_word


100%|██████████| 3/3 [00:00<00:00, 2642.36it/s]


ts_syllables_per_sent


100%|██████████| 3/3 [00:00<00:00, 2696.15it/s]


ts_words_per_sent


100%|██████████| 3/3 [00:00<00:00, 2259.46it/s]


ts_flesch_reading_ease


100%|██████████| 3/3 [00:00<00:00, 2296.57it/s]


ts_flesch_kincaid_grade


100%|██████████| 3/3 [00:00<00:00, 2225.09it/s]


ts_gunning_fog


100%|██████████| 3/3 [00:00<00:00, 233.28it/s]


ts_smog_index


100%|██████████| 3/3 [00:00<00:00, 402.06it/s]


ts_automated_readability_index


100%|██████████| 3/3 [00:00<00:00, 1180.94it/s]


ts_coleman_liau_index


100%|██████████| 3/3 [00:00<00:00, 948.01it/s]


ts_linsear_write_formula


100%|██████████| 3/3 [00:00<00:00, 924.47it/s]


ts_dale_chall_readability_score


100%|██████████| 3/3 [00:00<00:00, 533.88it/s]


ts_text_standard


100%|██████████| 3/3 [00:00<00:00, 2109.10it/s]


ts_mcalpine_eflaw


100%|██████████| 3/3 [00:00<00:00, 1316.07it/s]


# Final Model Inference

In [12]:
%%time
# Load the trained LightGBM booster from its model file and score the engineered features.
# NOTE(review): the column order of `features` presumably has to match the order used in training — confirm.
model = lgb.Booster(model_file=conf.lgb_model_file)
# Despite the name, `logits` holds the booster's continuous predictions; the next cell bins them into scores.
logits = model.predict(df[features])
print(logits[:5])

[1.12307912 1.2214806  1.12307912]
CPU times: user 14.8 ms, sys: 3.58 ms, total: 18.3 ms
Wall time: 47.1 ms


In [13]:
# Turn the continuous predictions into discrete scores. The tuned thresholds are the interior
# bin edges, padded with -inf/+inf so that every prediction falls into exactly one bin.
bin_edges = [-np.inf, *conf.thresholds, np.inf]
y_pred = pd.cut(x=logits, bins=bin_edges, labels=mylib.Aes2Dataset.HOLISTIC_SCORE_LABELS)
df["score"] = y_pred.astype(np.int8)

# Write the submission file: one row per essay with its predicted score.
cols = ["essay_id", "score"]
sub = df[cols]
sub.to_csv("submission.csv", index=False)
sub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   essay_id  3 non-null      object
 1   score     3 non-null      int8  
dtypes: int8(1), object(1)
memory usage: 155.0+ bytes


In [14]:
# Show the first rows of the submission as a visual sanity check.
sub.head()

Unnamed: 0,essay_id,score
0,000d118,1
1,000fe60,1
2,001ab80,1


# Debug

In [15]:
#!pip list