In [1]:
# Install pinned wheels from the attached /kaggle/input/pip-wheels dataset.
# --no-deps skips dependency installation and --force-reinstall replaces any
# copy of the package that is already present in the image.
!pip install --upgrade --no-deps --force-reinstall /kaggle/input/pip-wheels/scikit_learn-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
!pip install --upgrade --no-deps --force-reinstall /kaggle/input/pip-wheels/bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl

Processing /kaggle/input/pip-wheels/scikit_learn-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
Successfully installed scikit-learn-1.5.0
Processing /kaggle/input/pip-wheels/bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.43.1


In [2]:
from pathlib import Path
import os
import sys
import gc
import re
import shutil
import json
import math
from collections import defaultdict
import jinja2
import numpy as np
import pandas as pd
import sklearn
import bitsandbytes
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForCausalLM, BitsAndBytesConfig
from tqdm import tqdm
import matplotlib.pyplot as plt
from typing import Dict, List, Tuple, NamedTuple, Callable, Iterable, Set, Optional, Any
# Report the versions that were actually imported, to confirm the wheel installs took effect.
print(f"scikit-learn=={sklearn.__version__}, bitsandbytes=={bitsandbytes.__version__}")

scikit-learn==1.5.0, bitsandbytes==0.43.1


In [3]:
class ModelConf(NamedTuple):
    """Immutable settings for one pretrained model (critique LLM or encoder)."""

    # Short identifier for the model.
    name: str
    # Local directory passed to `from_pretrained`.
    directory: Path
    # Passed to the tokenizer as `model_max_length`.
    model_max_length: int = 512
    # Batch size handed to the encoder prediction helper.
    batch_size: int = 16
    # Essays are cut to this many characters before being placed in the LLM prompt.
    char_limit: int = 1000
    # Loader mode for the LLM: "q4", "q8" and "fp16" are handled explicitly;
    # any other value (including the default) loads with torch_dtype="auto".
    load_in_dtype: str = "auto"
        

class Conf(NamedTuple):
    """Notebook-wide configuration: paths, LLM choice, encoder checkpoint and score thresholds.

    Field defaults are evaluated top to bottom in the class body, so the later
    path defaults are derived from the *default* `input_dir` / `resource_dir`;
    overriding those at construction time does not update the derived paths.
    The dict and list defaults are single objects shared by every instance.
    """

    debug: bool = False  
    # Root under which the input datasets and models are mounted.
    input_dir: Path = Path("/kaggle/input")
    # Competition data directory (test.csv is read from here).
    comp_dir: Path = input_dir / "learning-agency-lab-automated-essay-scoring-2"
    temp_dir: Path = Path('/kaggle/temp')
    # Up to 20GB may be written to the current directory (/kaggle/working/); it is
    # preserved as output when a version is created with "Save & Run All".
    working_dir: Path = Path('/kaggle/working')
    # Project library bundle: its "src" folder is added to sys.path and the
    # encoder checkpoint lives under it.
    resource_dir: Path = input_dir / "lib-lalaes2/lalaes2-0.1"
    data_dir: Path = resource_dir / "input"
    # Prompt variant: 2 and 3 select v2/v3; any other value falls back to v1.
    template_version: int = 3
    # Key into `llm_map` naming the LLM that writes the critiques.
    llm_key: str = "gemma"
    # Candidate critique LLMs. Entries that omit `load_in_dtype` use the
    # ModelConf default ("auto").
    llm_map: Dict[str, ModelConf] = {
        "qwen2": ModelConf(
            name="qwen2",
            directory = input_dir / "qwen2/transformers/qwen2-1.5b-instruct/1",
            model_max_length=8192,
            char_limit=4000,
            load_in_dtype="fp16",  # required for older gpus like P100,T4
        ),
        "gemma": ModelConf(
            name="gemma",
            directory = input_dir / "gemma/transformers/1.1-2b-it/1",
            model_max_length=8192,
            char_limit=4000,
            load_in_dtype="fp16",
        ),
        "phi2": ModelConf(
            name="phi2",
            directory = input_dir / "phi/transformers/2/1",
            model_max_length=2048,
            char_limit=4000,
            load_in_dtype="fp16",
        ),
        "mistral7b": ModelConf(
            name="mistral7b",
            directory = input_dir / "mistral/pytorch/7b-instruct-v0.1-hf/1",
            model_max_length=4096,
            char_limit=1000,
        ),
        "mixtral8x7b": ModelConf(
            name="mixtral8x7b",
            directory = input_dir / "mixtral/pytorch/8x7b-instruct-v0.1-hf/1",
            model_max_length=4096,
            char_limit=1000,
        ),
        # NOTE(review): the key says "mixtral" but the directory is
        # "mistral-7b-instruct-v02-fp16" - the key looks like a typo; confirm
        # before renaming because `llm_key` must match it.
        "mixtral7b02": ModelConf(
            name="mixtral7b02",
            directory = input_dir / "mistral-7b-instruct-v02-fp16",
            model_max_length=4096,
            char_limit=1000,
        ),
        "llama38b": ModelConf(
            name="llama38b",
            directory = input_dir / "llama-3/transformers/8b-hf/1",
            model_max_length=8192,
            char_limit=1000,
        ),
        "llama38bchat": ModelConf(
            name="llama38bchat",
            directory = input_dir / "llama-3/transformers/8b-chat-hf/1",
            model_max_length=8192,
            char_limit=1000,
        ),
    }
    # Encoder checkpoints; only the first entry is used for inference.
    models: List[ModelConf] = [
        ModelConf(
            name="deberta",
            directory=resource_dir / "models/aes2/deberta_v3_large/20240629_143241",
            model_max_length=1280,
            batch_size=16,
        ),
    ]
    # Interior bin edges used with pd.cut to turn the encoder output into a
    # discrete score; five edges give six bins.
    thresholds: List[float] = [
        1.668159504388556,
        2.309778795006449,
        3.002342870933531,
        4.3425596543567,
        6.027380459922358
    ]

        
# Build the default configuration and echo it so the run log records every setting.
conf = Conf()
print(conf)
# Jinja2 environment used to compile the prompt templates.
environment = jinja2.Environment()
# Settings of the critique LLM selected by `llm_key`.
# Note: `mc` is rebound to the encoder settings in the encoder inference cell.
mc = conf.llm_map[conf.llm_key]

Conf(debug=False, input_dir=PosixPath('/kaggle/input'), comp_dir=PosixPath('/kaggle/input/learning-agency-lab-automated-essay-scoring-2'), temp_dir=PosixPath('/kaggle/temp'), working_dir=PosixPath('/kaggle/working'), resource_dir=PosixPath('/kaggle/input/lib-lalaes2/lalaes2-0.1'), data_dir=PosixPath('/kaggle/input/lib-lalaes2/lalaes2-0.1/input'), template_version=3, llm_key='gemma', llm_map={'qwen2': ModelConf(name='qwen2', directory=PosixPath('/kaggle/input/qwen2/transformers/qwen2-1.5b-instruct/1'), model_max_length=8192, batch_size=16, char_limit=4000, load_in_dtype='fp16'), 'gemma': ModelConf(name='gemma', directory=PosixPath('/kaggle/input/gemma/transformers/1.1-2b-it/1'), model_max_length=8192, batch_size=16, char_limit=4000, load_in_dtype='fp16'), 'phi2': ModelConf(name='phi2', directory=PosixPath('/kaggle/input/phi/transformers/2/1'), model_max_length=2048, batch_size=16, char_limit=4000, load_in_dtype='fp16'), 'mistral7b': ModelConf(name='mistral7b', directory=PosixPath('/kagg

In [4]:
# Three prompt variants that share the same grading rubric and differ only in
# the instruction line that follows it. `{{ essay }}` is filled in at render time.
# v1: ask for the score only.
v1_template = environment.from_string(
"""You are a teacher grading a student's essay. Assign a score on a scale of 1 to 6 based on the following rubric. A score of 6 indicates clear and consistent mastery with minor errors, showcasing insightful development of a viewpoint, outstanding critical thinking, strong use of appropriate examples and evidence, well-organized structure with coherence and smooth progression of ideas, skillful language use with varied vocabulary and sentence structure, and minimal errors in grammar, usage, and mechanics. Scores decrease with occasional errors (score of 5), lapses in quality (score of 4), weaknesses in critical thinking and organization (score of 3), serious flaws in viewpoint and coherence (score of 2), and fundamental errors hindering meaning (score of 1).
In your response, output only the score without explanation.
[ESSAY] {{ essay }} [/ESSAY]
"""
)
# v2: ask for constructive feedback.
v2_template = environment.from_string(
"""You are a teacher grading a student's essay. Assign a score on a scale of 1 to 6 based on the following rubric. A score of 6 indicates clear and consistent mastery with minor errors, showcasing insightful development of a viewpoint, outstanding critical thinking, strong use of appropriate examples and evidence, well-organized structure with coherence and smooth progression of ideas, skillful language use with varied vocabulary and sentence structure, and minimal errors in grammar, usage, and mechanics. Scores decrease with occasional errors (score of 5), lapses in quality (score of 4), weaknesses in critical thinking and organization (score of 3), serious flaws in viewpoint and coherence (score of 2), and fundamental errors hindering meaning (score of 1).
Provide constructive feedback to improve the essay.
[ESSAY] {{ essay }} [/ESSAY]
"""
)
# v3: ask for the score plus areas for improvement.
v3_template = environment.from_string(
"""You are a teacher grading a student's essay. Assign a score on a scale of 1 to 6 based on the following rubric. A score of 6 indicates clear and consistent mastery with minor errors, showcasing insightful development of a viewpoint, outstanding critical thinking, strong use of appropriate examples and evidence, well-organized structure with coherence and smooth progression of ideas, skillful language use with varied vocabulary and sentence structure, and minimal errors in grammar, usage, and mechanics. Scores decrease with occasional errors (score of 5), lapses in quality (score of 4), weaknesses in critical thinking and organization (score of 3), serious flaws in viewpoint and coherence (score of 2), and fundamental errors hindering meaning (score of 1).
In your response, state the score and areas for improvement.
[ESSAY] {{ essay }} [/ESSAY]
"""
)
# Prompt template and generation budget per template version.
# Versions other than 2 or 3 fall back to the score-only v1 prompt with 16 new tokens.
_prompt_by_version = {
    2: (v2_template, 64),
    3: (v3_template, 128),
}
template, max_new_tokens = _prompt_by_version.get(
    conf.template_version,
    (v1_template, 16),
)

In [5]:
# Choose the compute device; when CUDA is present, list each GPU with its
# currently allocated and reserved memory in GB.
if not torch.cuda.is_available():
    device = torch.device("cpu")
    print("cpu")
else:
    device = torch.device("cuda")
    gpu_index = 0
    while gpu_index < torch.cuda.device_count():
        gpu_name = torch.cuda.get_device_name(gpu_index)
        allocated_gb = round(torch.cuda.memory_allocated(gpu_index)/1024**3,1)
        reserved_gb = round(torch.cuda.memory_reserved(gpu_index)/1024**3,1)
        print(f"device={gpu_index}, {gpu_name}")
        print('Mem Allocated:', allocated_gb, 'GB')
        print('Mem Cached:   ', reserved_gb, 'GB')
        gpu_index += 1

device=0, Tesla P100-PCIE-16GB
Mem Allocated: 0.0 GB
Mem Cached:    0.0 GB


In [6]:
# Percentile grid; not referenced by any other cell visible in this notebook.
percentiles=[.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]
# Set the Hugging Face tokenizers parallelism flag to off.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Effectively remove pandas display limits so frames and long text print in full.
pd.set_option("max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option('max_colwidth', 9999)
# Register the `progress_apply` family on pandas objects (used for the per-row steps below).
tqdm.pandas()
# The project libraries are shipped as datasets, not installed packages, so
# their source folders must be on sys.path before the imports that follow.
sys.path.append(str(conf.input_dir / "sgcharts-ml/src"))
sys.path.append(str(conf.resource_dir / "src"))
import scml
from scml import nlp as snlp
from scml import pandasx as pdx
import lalaes2 as mylib
from warnings import simplefilter 
# Silence pandas PerformanceWarning for the rest of the run.
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
# Presumably seeds the random number generators with the library default - confirm in scml.
scml.seed_everything()

In [7]:
# Load the competition test set.
df = pd.read_csv(conf.comp_dir / "test.csv")
# Text-cleaning callable from the project library; it is applied to the essays
# here and to the LLM output in `critique`.
basic = mylib.BasicPreprocessor()


def preprocess_text(fn, col) -> Callable:
    """Adapt a single-value function for row-wise use.

    Args:
        fn: Callable applied to one cell value.
        col: Key of the cell to read from each row.

    Returns:
        A one-argument callable that takes a row and returns ``fn(row[col])``.
    """
    return lambda row: fn(row[col])


# Clean every essay row by row (with a progress bar), overwriting the raw text column.
df["full_text"] = df.progress_apply(preprocess_text(basic, "full_text"), axis=1)
df.info()

100%|██████████| 3/3 [00:00<00:00, 395.74it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   essay_id   3 non-null      object
 1   full_text  3 non-null      object
dtypes: object(2)
memory usage: 176.0+ bytes





# LLM generates critique

In [8]:
# Tokenizer of the critique LLM, with its maximum length taken from the model settings.
tokenizer = AutoTokenizer.from_pretrained(mc.directory, model_max_length=mc.model_max_length)
# Use the EOS token for padding; `critique` passes the same id to generate() as pad_token_id.
tokenizer.pad_token = tokenizer.eos_token
# Dump the tokenizer settings to the log for inspection.
print(f"""{tokenizer}
model_input_names={tokenizer.model_input_names}
pad_token_id={tokenizer.pad_token_id}
""")

GemmaTokenizerFast(name_or_path='/kaggle/input/gemma/transformers/1.1-2b-it/1', vocab_size=256000, model_max_length=8192, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<bos>', 'eos_token': '<eos>', 'unk_token': '<unk>', 'pad_token': '<eos>', 'additional_special_tokens': ['<start_of_turn>', '<end_of_turn>']}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<eos>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("<bos>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("<mask>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	5: AddedToken("<2mass>", rstrip=False, lstrip=False, single_wo

In [9]:
%%time
# https://huggingface.co/blog/4bit-transformers-bitsandbytes
# https://huggingface.co/docs/transformers/v4.38.1/en/quantization#compute-data-type
# from qlora paper:
# 4-bit NormalFloat (NF4), a new data type that is information theoretically optimal for normally distributed weights
# Load the critique LLM in the precision named by `mc.load_in_dtype`:
#   "q4"   -> 4-bit NF4 quantization through bitsandbytes
#   "q8"   -> 8-bit quantization through bitsandbytes
#   "fp16" -> half-precision weights
#   other  -> torch_dtype="auto"
# Each branch only contributes keyword arguments, and one from_pretrained call
# at the end does the loading. Previously the "q4" and "q8" branches called
# `ModelForCausalLM`, a name that is never imported, so selecting either mode
# raised NameError; all modes now go through `AutoModelForCausalLM`.
load_kwargs = {"device_map": "auto"}
if mc.load_in_dtype=="q4":
    print("load 4bit")
    load_kwargs["quantization_config"] = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        # NOTE(review): the configuration comments say fp16 is required on older
        # GPUs (P100, T4); confirm bfloat16 compute is acceptable on the target GPU.
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,  # Uses a second quantization after the first one to save an additional 0.4 bits per parameter
    )
elif mc.load_in_dtype=="q8":
    print("load 8bit")
    load_kwargs["quantization_config"] = BitsAndBytesConfig(
        load_in_8bit=True,
    )
elif mc.load_in_dtype=="fp16":
    print("load fp16")
    load_kwargs["torch_dtype"] = torch.float16
    load_kwargs["revision"] = "float16"
else:
    print("load auto")
    load_kwargs["torch_dtype"] = "auto"

model = AutoModelForCausalLM.from_pretrained(mc.directory, **load_kwargs)

load fp16


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

CPU times: user 6.07 s, sys: 4.67 s, total: 10.7 s
Wall time: 34.5 s


In [10]:
# Log the module tree and the loaded configuration of the critique LLM.
print(model)
print(model.config)

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear(in_features=16384, out_features=2048, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_layernorm): GemmaRMSNorm()
      )
    )
    (norm): GemmaR

In [11]:
def critique(row) -> str:
    """Ask the LLM to grade one essay and return its cleaned reply.

    Inference and response extraction follow the usage example at
    https://huggingface.co/Qwen/Qwen2-7B-Instruct

    Args:
        row: Mapping with a "full_text" entry holding the essay.

    Returns:
        The generated text after cleaning with `basic`. A reply that is exactly
        one digit is returned as "SCORE OF <digit>".
    """
    # Only the first `char_limit` characters of the essay go into the prompt.
    essay_excerpt = str(row["full_text"])[:mc.char_limit]
    chat = [{"role": "user", "content": template.render(essay=essay_excerpt)}]
    chat_text = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    # NOTE(review): the rendered chat text is tokenized with default settings; if
    # the chat template already emits a BOS token this may add a second one - confirm.
    model_inputs = tokenizer(chat_text, truncation=True, return_tensors="pt").to(device)
    # Sampling is disabled, so temperature / top_p / top_k are not used.
    generation = model.generate(
        **model_inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        repetition_penalty=1.1,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Each output sequence starts with the prompt tokens; drop them so only the
    # newly generated tokens are decoded.
    prompt_len = model_inputs.input_ids.shape[-1]
    completion_ids = [sequence[prompt_len:] for sequence in generation]
    answer = basic(tokenizer.batch_decode(completion_ids, skip_special_tokens=True)[0])
    if len(answer)==1 and answer.isdigit():
        return f"SCORE OF {answer}"
    return answer


# Generate one critique per essay, one row at a time, with a progress bar.
df["critique"] = df.progress_apply(critique, axis=1)

100%|██████████| 3/3 [00:11<00:00,  3.70s/it]


In [12]:
# Show the first few generated critiques in full as a sanity check.
df["critique"].head().tolist()

['**Score: 5** **Areas for Improvement:** * **Critical Thinking:** Lacks deeper analysis and critique of the text. * **Organization:** Paragraphs lack smooth transitions and logical flow. * **Vocabulary:** Limited and repetitive use of words. * **Grammar and Usage:** Minor grammatical errors and typos. **Strengths:** * Clear and concise writing style * Relevant and informative content * Emphasis on sustainable practices',
 '**Score: 5** **Areas for Improvement:** * **Critical Thinking and Organization:** Lacks deeper analysis and justification of claims. * **Language Use:** Some sentences are repetitive or lack variety. * **Grammar and Mechanics:** Minor grammatical errors and typos detract from clarity. **Strengths:** * Clear and concise introduction that states the main point early on. * Provides specific examples and evidence to support arguments. * Uses relatable examples from Earth to illustrate points.',
 "**Score: 5** **Areas for Improvement:** * **Critical Thinking:** Lacks dee

In [13]:
# Release the critique LLM before the encoder is loaded.
del model
# Collect first: objects that are only kept alive by reference cycles are
# destroyed here, which returns their CUDA blocks to PyTorch's caching allocator.
gc.collect()
# Only now can empty_cache() hand those unused cached blocks back to the driver;
# calling it before the collection (as before) left them reserved.
torch.cuda.empty_cache()

21

# Encoder Model Inference

In [14]:
# Switch to the encoder checkpoint. This rebinds `mc`, `tokenizer` and `model`,
# which referred to the critique LLM in the cells above.
mc = conf.models[0]
tokenizer = AutoTokenizer.from_pretrained(mc.directory, model_max_length=mc.model_max_length)
model = AutoModelForSequenceClassification.from_pretrained(mc.directory)
# Run the project's prediction helper over each essay and its generated critique.
# The model is passed as loaded, along with the target device.
logits = mylib.predict_holistic_score(
    ds=mylib.Aes2Dataset(
        tokenizer=tokenizer,
        critiques=df["critique"].tolist(),
        texts=df["full_text"].tolist(),
    ),
    model=model,
    batch_size=mc.batch_size,
    device=device,
)
print(f"logits {logits.shape}")



logits (3,)


In [15]:
# Bucket the encoder outputs into discrete scores: the configured thresholds are
# the interior bin edges and the outer bins are left open on both sides.
score_edges = [-np.inf, *conf.thresholds, np.inf]
score_labels = mylib.Aes2Dataset.HOLISTIC_SCORE_LABELS
y_pred = pd.cut(x=logits, bins=score_edges, labels=score_labels)
df["score"] = y_pred.astype(np.int8)

# Write the two-column submission file and summarise it.
cols = ["essay_id", "score"]
sub = df[cols]
sub.to_csv("submission.csv", index=False)
sub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   essay_id  3 non-null      object
 1   score     3 non-null      int8  
dtypes: int8(1), object(1)
memory usage: 155.0+ bytes


In [16]:
# Preview the first rows of the submission.
sub.head()

Unnamed: 0,essay_id,score
0,000d118,3
1,000fe60,3
2,001ab80,4


# Debug

In [17]:
#!pip list

In [18]:
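# Unused draft of batched LLM inference, kept for reference only.
# It is stale: it refers to conf.char_limit, conf.llm_model and llm, none of which exist in this notebook.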
#prompts = [template.render(essay=text[:conf.char_limit]) for text in df["full_text"]]
#out = []
#i=0
#bsz=conf.llm_model.batch_size
#while i<len(prompts):
#    inputs = tokenizer(
#        prompts[i:i+bsz],
#        truncation=True,
#        padding="max_length",
#        return_tensors="pt",
#    ).to(device)
#    outputs = llm.generate(
#        **inputs, 
#        max_new_tokens=256,
#        do_sample=True,
#        temperature=1.0,
#        top_p=0.95,
#        top_k=40,
#        repetition_penalty=1.1,
#        pad_token_id=tokenizer.eos_token_id,
#    )
#    generated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
#    for generated_text in generated_texts:
#        parts = generated_text.split("[/INST]")
#        if len(parts)==2:
#            generated_text = parts[1]
#            generated_text = basic(generated_text)
#        else:
#            generated_text = "None"
#        out.append(generated_text)
#    i+=bsz
#df["prompt"] = out