In [1]:
from pathlib import Path
import os
import sys
import gc
import re
import shutil
import json
import math
import jinja2
from collections import defaultdict
import numpy as np
import pandas as pd
import bitsandbytes
import accelerate
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from tqdm import tqdm
import matplotlib.pyplot as plt
from typing import Dict, List, Tuple, NamedTuple, Callable, Iterable, Set, Optional, Any
import scml
import lalaes2 as mylib
# Record the installed accelerate and bitsandbytes versions in the cell output.
print(f"accelerate={accelerate.__version__}, bitsandbytes={bitsandbytes.__version__}")

accelerate=0.30.1, bitsandbytes=0.43.1


In [2]:
# Tag embedded in the output parquet filename.
version = "02"
# Selects which entry of corpus_map is loaded.
corpus_key = "persuade"
# Corpus name -> path of the CSV file to read for that corpus.
corpus_map = {
    "comp": Path("input/train.csv"),
    "persuade": Path("input/persuade20/persuade_2.0_human_scores_demo_id_github.csv"),
}
# Gates tokenizer/model loading and the critique preview cell.
critique_enable = True
# DataFrame column that receives the generated critiques.
critique_col = "critique_gemma_1_1_2b_it"
# Directory passed to from_pretrained for both the tokenizer and the model.
critique_model_dir = Path("huggingface/google/gemma-1.1-2b-it")
# Passed to the tokenizer as model_max_length.
critique_model_max_length = 8192
# Jinja2 environment used to compile the critique prompt template.
environment = jinja2.Environment()
# Preprocessors from the project library: `basic` is applied to the essay text and to
# the generated critiques; `bow` (constructed with drop_stopword=True) to the topic column.
basic = mylib.BasicPreprocessor()
bow = mylib.BowPreprocessor(drop_stopword=True)

In [3]:
# Timer for the whole run; it is stopped and reported in the last cell.
tim = scml.Timer()
tim.start()
percentiles = [.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]
# Set before any tokenizer is created.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Raise every pandas display limit to the same large value so frames, long text
# columns and .info() listings are not truncated.
for option_name in (
    "max_info_columns",
    "display.max_columns",
    "display.max_rows",
    "max_colwidth",
):
    pd.set_option(option_name, 9999)
# Registers the progress_apply family on pandas objects.
tqdm.pandas()
# Project helper; presumably seeds the RNGs used below (sampling, generation) -- TODO confirm.
scml.seed_everything()

In [4]:
# Pick the compute device: CUDA when available, otherwise CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
    bytes_per_gib = 1024**3
    # Report the name and current memory usage of every visible GPU.
    for i in range(torch.cuda.device_count()):
        print(f"device={i}, {torch.cuda.get_device_name(i)}")
        allocated_gib = torch.cuda.memory_allocated(i) / bytes_per_gib
        reserved_gib = torch.cuda.memory_reserved(i) / bytes_per_gib
        print('Mem Allocated:', round(allocated_gib, 1), 'GB')
        print('Mem Cached:   ', round(reserved_gib, 1), 'GB')
else:
    device = torch.device("cpu")
    print("cpu")

device=0, NVIDIA GeForce RTX 4070 Ti SUPER
Mem Allocated: 0.0 GB
Mem Cached:    0.0 GB
device=1, NVIDIA GeForce RTX 4070 Ti SUPER
Mem Allocated: 0.0 GB
Mem Cached:    0.0 GB


In [5]:
# Read the CSV of the corpus selected in the configuration cell and summarise it.
corpus_path = corpus_map[corpus_key]
df = pd.read_csv(corpus_path, low_memory=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25996 entries, 0 to 25995
Data columns (total 14 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   essay_id_comp               25996 non-null  object 
 1   full_text                   25996 non-null  object 
 2   holistic_essay_score        25996 non-null  int64  
 3   word_count                  25996 non-null  int64  
 4   prompt_name                 25996 non-null  object 
 5   task                        25996 non-null  object 
 6   assignment                  25996 non-null  object 
 7   source_text                 12875 non-null  object 
 8   gender                      25996 non-null  object 
 9   grade_level                 24828 non-null  float64
 10  ell_status                  24787 non-null  object 
 11  race_ethnicity              25996 non-null  object 
 12  economically_disadvantaged  20759 non-null  object 
 13  student_disability_status   208

In [6]:
# The PERSUADE corpus uses its own column names: map them onto the names the rest of
# the notebook uses, then keep a random half of the rows.
# NOTE(review): sample() is called without random_state, so the selection depends on
# the global RNG state -- confirm that scml.seed_everything() covers it.
if corpus_key == "persuade":
    persuade_renames = {
        "essay_id_comp": "essay_id",
        "holistic_essay_score": "score",
        "prompt_name": "topic",
        "assignment": "prompt",
    }
    df = df.rename(columns=persuade_renames).sample(frac=0.5)
# Store the score as int8.
df = df.astype({"score": np.int8})
# Keep only the identifier, label, topic and essay text.
cols = ["essay_id", "score", "topic", "full_text"]
df = df.loc[:, cols].copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12998 entries, 379 to 9765
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   essay_id   12998 non-null  object
 1   score      12998 non-null  int8  
 2   topic      12998 non-null  object
 3   full_text  12998 non-null  object
dtypes: int8(1), object(3)
memory usage: 418.9+ KB


In [7]:
def preprocess_text(fn, col) -> Callable:
    """Build a row-wise adapter that applies ``fn`` to one field of a row.

    Args:
        fn: Callable that receives the value stored under ``col``.
        col: Key looked up on each row.

    Returns:
        A one-argument function mapping a row to ``fn(row[col])``, suitable
        for ``DataFrame.apply(..., axis=1)``.
    """
    return lambda row: fn(row[col])


# Each entry pairs a list of columns with the preprocessor applied to them, in the
# order they are processed: essay text first, then the topic.
column_preprocessors = [
    (["full_text"], basic),
    (["topic"], bow),
]
for cols, fn in column_preprocessors:
    for col in cols:
        print(col)
        # Row-wise apply with a progress bar; the column is overwritten in place.
        df[col] = df.progress_apply(preprocess_text(fn, col), axis=1)

full_text


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12998/12998 [00:04<00:00, 2907.47it/s]


topic


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12998/12998 [00:06<00:00, 1926.82it/s]


# LLM-as-a-Judge

The LLM takes on the role of a teacher and generates a critique of each essay.

In [8]:
if critique_enable:
    # Load the tokenizer from the critique model directory with the configured length limit.
    tokenizer = AutoTokenizer.from_pretrained(
        critique_model_dir,
        model_max_length=critique_model_max_length,
    )
    # Use the end-of-sequence token for padding as well.
    tokenizer.pad_token = tokenizer.eos_token
    print(f"pad_token_id={tokenizer.pad_token_id}")

pad_token_id=1


In [9]:
if critique_enable:
    # Half-precision weights; device placement is delegated to device_map="auto".
    model_kwargs = dict(
        device_map="auto",
        torch_dtype=torch.float16,
        revision="float16",
    )
    model = AutoModelForCausalLM.from_pretrained(critique_model_dir, **model_kwargs)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
# `model` only exists when critique generation is enabled, so guard the architecture
# dump the same way the loading cells are guarded (avoids a NameError otherwise).
if critique_enable:
    print(model)

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear(in_features=16384, out_features=2048, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_layernorm): GemmaRMSNorm()
      )
    )
    (norm): GemmaR

In [11]:
# Prompt for the critique model: a grading instruction, the 6-point holistic rubric,
# and the essay (Jinja2 variable `essay`) wrapped in [ESSAY] ... [/ESSAY].
# The closing "[/INST]" marker ends the prompt; the critique is whatever the model
# generates after it.
# NOTE(review): [INST]/[/INST] are not Gemma's native chat-turn markers -- consider
# tokenizer.apply_chat_template if critique quality matters; confirm against the model card.
template = environment.from_string(
"""[INST] You are a teacher grading a student's essay. Assign a score between 1 (minimum) and 6 (maximum) based on the rubric below. Provide constructive feedback to improve the essay.
SCORE OF 6: An essay in this category demonstrates clear and consistent mastery, although it may have a few minor errors. A typical essay effectively and insightfully develops a point of view on the issue and demonstrates outstanding critical thinking; the essay uses clearly appropriate examples, reasons, and other evidence taken from the source text(s) to support its position; the essay is well organized and clearly focused, demonstrating clear coherence and smooth progression of ideas; the essay exhibits skillful use of language, using a varied, accurate, and apt vocabulary and demonstrates meaningful variety in sentence structure; the essay is free of most errors in grammar, usage, and mechanics.
SCORE OF 5: An essay in this category demonstrates reasonably consistent mastery, although it will have occasional errors or lapses in quality. A typical essay effectively develops a point of view on the issue and demonstrates strong critical thinking; the essay generally using appropriate examples, reasons, and other evidence taken from the source text(s) to support its position; the essay is well organized and focused, demonstrating coherence and progression of ideas; the essay exhibits facility in the use of language, using appropriate vocabulary demonstrates variety in sentence structure; the essay is generally free of most errors in grammar, usage, and mechanics. 
SCORE OF 4: An essay in this category demonstrates adequate mastery, although it will have lapses in quality. A typical essay develops a point of view on the issue and demonstrates competent critical thinking; the essay using adequate examples, reasons, and other evidence taken from the source text(s) to support its position; the essay is generally organized and focused, demonstrating some coherence and progression of ideas exhibits adequate; the essay may demonstrate inconsistent facility in the use of language, using generally appropriate vocabulary demonstrates some variety in sentence structure; the essay may have some errors in grammar, usage, and mechanics.
SCORE OF 3: An essay in this category demonstrates developing mastery, and is marked by ONE OR MORE of the following weaknesses: develops a point of view on the issue, demonstrating some critical thinking, but may do so inconsistently or use inadequate examples, reasons, or other evidence taken from the source texts to support its position; the essay is limited in its organization or focus, or may demonstrate some lapses in coherence or progression of ideas displays; the essay may demonstrate facility in the use of language, but sometimes uses weak vocabulary or inappropriate word choice and/or lacks variety or demonstrates problems in sentence structure; the essay may contain an accumulation of errors in grammar, usage, and mechanics.
SCORE OF 2: An essay in this category demonstrates little mastery, and is flawed by ONE OR MORE of the following weaknesses: develops a point of view on the issue that is vague or seriously limited, and demonstrates weak critical thinking; the essay provides inappropriate or insufficient examples, reasons, or other evidence taken from the source text to support its position; the essay is poorly organized and/or focused, or demonstrates serious problems with coherence or progression of ideas; the essay displays very little facility in the use of language, using very limited vocabulary or incorrect word choice and/or demonstrates frequent problems in sentence structure; the essay contains errors in grammar, usage, and mechanics so serious that meaning is somewhat obscured.
SCORE OF 1: An essay in this category demonstrates very little or no mastery, and is severely flawed by ONE OR MORE of the following weaknesses: develops no viable point of view on the issue, or provides little or no evidence to support its position; the essay is disorganized or unfocused, resulting in a disjointed or incoherent essay; the essay displays fundamental errors in vocabulary and/or demonstrates severe flaws in sentence structure; the essay contains pervasive errors in grammar, usage, or mechanics that persistently interfere with meaning.
[ESSAY] {{ essay }} [/ESSAY]
[/INST]"""
)

In [12]:
def critique(row) -> str:
    """Generate a critique of one essay with the critique model.

    The essay is rendered into the prompt template, tokenized, and passed to
    ``model.generate`` with sampling enabled, so the result is stochastic.

    Args:
        row: Mapping-like row providing the essay under the key ``"full_text"``.

    Returns:
        The generated critique after ``basic`` preprocessing, or the string
        ``"None"`` when nothing is left after preprocessing.
    """
    text = str(row["full_text"]).strip()
    prompt = template.render(essay=text)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    # Number of prompt tokens; generate() returns the prompt followed by the new tokens.
    prompt_length = inputs["input_ids"].shape[1]
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=True,
        temperature=1.0,
        top_p=0.95,
        top_k=40,
        repetition_penalty=1.1,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Decode only the newly generated tokens. Splitting the full decoded text on
    # "[/INST]" returns part of the prompt whenever the essay itself contains
    # that marker.
    generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    # If the model emits another "[/INST]", keep only the text before it, as the
    # marker-based split did.
    generated_text = generated_text.split("[/INST]", 1)[0]
    generated_text = basic(generated_text)
    if len(generated_text) == 0:
        # Placeholder so the column never holds an empty string.
        generated_text = "None"
    return generated_text


# The tokenizer and model are only loaded when critique generation is enabled, so
# gate the (very long-running) generation pass on the same flag.
if critique_enable:
    df[critique_col] = df.progress_apply(critique, axis=1)

 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                              | 10137/12998 [10:01:24<2:49:44,  3.56s/it]


KeyboardInterrupt: 

In [None]:
if critique_enable:
    # Show the first few generated critiques as a plain Python list.
    critique_preview = df[critique_col].head().tolist()
    print(critique_preview)

In [None]:
#if critique_enable:
#    del model
#    torch.cuda.empty_cache()
#    gc.collect()

# Review Data

In [None]:
# Summarise columns, dtypes and non-null counts of the final frame before saving.
df.info()

In [None]:
# Display the first rows of the final frame as a visual sanity check.
df.head()

In [None]:
# Validate before persisting so a frame with missing values is never written to disk.
assert df.notna().all(axis=None), "output frame contains missing values"
output_path = Path("output") / f"{corpus_key}_{version}.parquet"
# Create the output directory if needed, so the save cannot fail on a missing folder
# after the long-running cells have finished.
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_parquet(output_path, index=False)

In [None]:
# Stop the timer started in the setup cell and report the elapsed time.
tim.stop()
print(f"Total time taken {str(tim.elapsed)}")