# Imports

In [1]:
import itertools
import os
import sys

import dotenv
import numpy as np
import pandas as pd

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

from IPython.display import display

dotenv.load_dotenv()
sys.path.append("../../src")
import dataframe_utils
import GiveMeSomeCredit

# Variables

In [2]:
# Hugging Face model evaluated in this notebook.
MODEL_ID = "google/flan-t5-small"

# Selectors for the run: which description column and which question to use.
CLASSIFICATION_QUESTION_ID = 2
DESCRIPTION_COLUMN = 0

# Per-model caches keyed by model id. The helper functions fill them on first
# use so that a model, its tokenizer and its "yes"/"no" token ids are only
# resolved once per session.
_loaded_tokenizers = {}
_loaded_models = {}
_no_token_ids = {}
_yes_token_ids = {}

# Informational links derived from the model id.
MODEL_URL = f"https://huggingface.co/{MODEL_ID}"
API_URL = f"https://api-inference.huggingface.co/models/{MODEL_ID}"
print(f"Model URL: {MODEL_URL}")
print(f"API URL: {API_URL}")

# The access token must come from the environment; stop early when it is absent.
HF_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
if HF_TOKEN is None:
    raise RuntimeError("Please set HUGGINGFACEHUB_API_TOKEN in your environment variables.")
# Only a short prefix and suffix are echoed, never the whole token.
print(f"Huggingface Token: {HF_TOKEN[:3]}...{HF_TOKEN[-4:]}")

Model URL: https://huggingface.co/google/flan-t5-small
API URL: https://api-inference.huggingface.co/models/google/flan-t5-small
Huggingface Token: hf_...YjCe


# Load the Dataset

This section loads the necessary data into DataFrames and displays basic information.


## Load Data Descriptions

In [3]:
# Load the per-row text descriptions used to build the prompts.
descriptions_df = GiveMeSomeCredit.load_data_descriptions()

# Show the column summary with row/column truncation disabled.
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(
        dataframe_utils.describe_df(descriptions_df)
    )

# Preview the first five rows with untruncated, left-aligned text.
with pd.option_context("display.max_colwidth", None):
    display(
        descriptions_df.head(5).style.set_properties(**{"text-align": "left"})
    )

Unnamed: 0,dtype,count,non_null,null_count,unique,top,freq
Detailed Description,object,150000,150000,0,149326,The individual is 22 years old with a monthly ...,12


Unnamed: 0_level_0,Detailed Description
Row ID,Unnamed: 1_level_1
1,"The individual is 45 years old with a monthly income of $9,120 and supports 2 dependents. Their revolving utilization of unsecured lines is approximately 76.6%. Their debt ratio stands at about 80.3%. They have a total of 13 open credit lines and loans, including 6 real estate loans or lines. Over the past period, they have had 2 instances of being 30 to 59 days past due, 0 instances of being 60 to 89 days past due, 0 instances of being 90 or more days late."
2,"The individual is 40 years old with a monthly income of $2,600 and supports 1 dependent. Their revolving utilization of unsecured lines is approximately 95.7%. Their debt ratio stands at about 12.2%. They have a total of 4 open credit lines and loans, including 0 real estate loans or lines. Over the past period, they have had 0 instances of being 30 to 59 days past due, 0 instances of being 60 to 89 days past due, 0 instances of being 90 or more days late."
3,"The individual is 38 years old with a monthly income of $3,042 and supports 0 dependents. Their revolving utilization of unsecured lines is approximately 65.8%. Their debt ratio stands at about 8.5%. They have a total of 2 open credit lines and loans, including 0 real estate loans or lines. Over the past period, they have had 1 instance of being 30 to 59 days past due, 0 instances of being 60 to 89 days past due, 1 instance of being 90 or more days late."
4,"The individual is 30 years old with a monthly income of $3,300 and supports 0 dependents. Their revolving utilization of unsecured lines is approximately 23.4%. Their debt ratio stands at about 3.6%. They have a total of 5 open credit lines and loans, including 0 real estate loans or lines. Over the past period, they have had 0 instances of being 30 to 59 days past due, 0 instances of being 60 to 89 days past due, 0 instances of being 90 or more days late."
5,"The individual is 49 years old with a monthly income of $63,588 and supports 0 dependents. Their revolving utilization of unsecured lines is approximately 90.7%. Their debt ratio stands at about 2.5%. They have a total of 7 open credit lines and loans, including 1 real estate loans or lines. Over the past period, they have had 1 instance of being 30 to 59 days past due, 0 instances of being 60 to 89 days past due, 0 instances of being 90 or more days late."


## Load Classification Questions

In [4]:
# Load the classification questions; classify() looks them up by index label
# in the "Classification Question" column.
questions_df = GiveMeSomeCredit.load_classification_questions()

# Show the column summary with row/column truncation disabled.
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(
        dataframe_utils.describe_df(questions_df)
    )

# Preview up to five questions with untruncated, left-aligned text.
with pd.option_context("display.max_colwidth", None):
    display(
        questions_df.head(5).style.set_properties(**{"text-align": "left"})
    )

Unnamed: 0,dtype,count,non_null,null_count,unique,top,freq
Classification Question,object,4,4,0,4,Will this individual experience serious delinq...,1


Unnamed: 0_level_0,Classification Question
Classification Question ID,Unnamed: 1_level_1
0,Will this individual experience serious delinquency—defined as being 90 days or more past due—within the next two years?
1,Will this individual experience serious delinquency—defined as being 90 days or more past due—within the next two years? Answer with yes or no only.
2,Will this individual experience serious delinquency—defined as being 90 days or more past due—within the next two years? Answer with no or yes only.
3,Will this individual experience serious delinquency—defined as being 90 days or more past due—within the next two years? Answer with no or yes and explain your reasoning.


## Load LLM Responses

Temporarily load the LLM responses data for display. The variable is deleted afterward to prevent old data from being used—each time the data is needed, it should be reloaded to ensure the latest version is available.

In [5]:
# Load the stored LLM responses only for this preview.
responses_df = GiveMeSomeCredit.load_classification_responses()

# Show the column summary with row/column truncation disabled.
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(
        dataframe_utils.describe_df(responses_df)
    )
# Preview the first five rows with untruncated, left-aligned text.
with pd.option_context("display.max_colwidth", None):
    display(
        responses_df.head(5).style.set_properties(**{"text-align": "left"})
    )
    
# Drop the name so later cells cannot accidentally reuse this snapshot;
# they reload the responses when they need them.
del responses_df

Unnamed: 0,dtype,count,non_null,null_count,unique,top,freq,mean,std,min,25%,50%,75%,max
Row ID,int64,90000,90000,0,,,,74548.148,43327.774348,4.0,36814.25,74261.0,111940.25,149993.0
Model,object,90000,90000,0,1.0,google/flan-t5-small,90000.0,,,,,,,
Description Column,int64,90000,90000,0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Classification Question ID,int64,90000,90000,0,,,,1.0,0.816501,0.0,0.0,1.0,2.0,2.0
Prediction,object,90000,90000,0,3.0,no,61479.0,,,,,,,
Yes Probability,float64,90000,90000,0,,,,0.373138,0.080088,0.169391,0.273927,0.41593,0.435758,0.495583
No Probability,float64,90000,90000,0,,,,0.461125,0.155254,0.136118,0.255221,0.558641,0.577702,0.611146


Unnamed: 0,Row ID,Model,Description Column,Classification Question ID,Prediction,Yes Probability,No Probability
0,4,google/flan-t5-small,0,0,yes,0.248806,0.226628
1,12,google/flan-t5-small,0,0,yes,0.266791,0.23439
2,14,google/flan-t5-small,0,0,yes,0.224461,0.160168
3,24,google/flan-t5-small,0,0,yes,0.26158,0.260417
4,25,google/flan-t5-small,0,0,yes,0.25744,0.243585


# Helper Functions

## `get_llm_response`

In [6]:
def get_llm_response(prompt, model_id, hf_token=None):
    """Run one prompt through a cached seq2seq model.

    The tokenizer, the model and the vocabulary ids of the "yes" / "no"
    tokens are loaded on the first call for a given ``model_id`` and reused
    afterwards; ``hf_token`` is only forwarded during that first load.

    Args:
        prompt: Text passed to the tokenizer.
        model_id: Model identifier handed to ``from_pretrained``; also the
            key of the module-level caches.
        hf_token: Optional access token forwarded to ``from_pretrained``.

    Returns:
        A ``(response, yes_prob, no_prob)`` tuple: the decoded greedy
        generation, and the softmax probability mass that the first decoding
        step assigns to the "yes" and to the "no" token ids.
    """
    if model_id not in _loaded_models:
        # Cache writes happen in this order so a failed model load leaves the
        # model cache empty and the next call retries the whole setup.
        _loaded_tokenizers[model_id] = AutoTokenizer.from_pretrained(model_id, token=hf_token)
        _loaded_models[model_id] = AutoModelForSeq2SeqLM.from_pretrained(model_id, token=hf_token)
        _loaded_models[model_id].eval()

        fresh_tokenizer = _loaded_tokenizers[model_id]
        for word, id_cache in (("yes", _yes_token_ids), ("no", _no_token_ids)):
            id_cache[model_id] = fresh_tokenizer.convert_tokens_to_ids(
                fresh_tokenizer.tokenize(word)
            )

    tok = _loaded_tokenizers[model_id]
    net = _loaded_models[model_id]
    encoded = tok(prompt, return_tensors="pt").to(net.device)

    # Greedy decoding that also returns the per-step scores.
    generation_options = {
        "max_new_tokens": 50,
        "do_sample": False,
        "pad_token_id": tok.eos_token_id,
        "output_scores": True,
        "return_dict_in_generate": True,
    }
    with torch.no_grad():
        generated = net.generate(**encoded, **generation_options)
        # Scores of decoding step 0, normalised over the vocabulary axis.
        first_step_probs = torch.softmax(generated.scores[0], dim=-1)

    response = tok.decode(generated.sequences[0], skip_special_tokens=True)
    yes_prob = first_step_probs[0, _yes_token_ids[model_id]].sum().item()
    no_prob = first_step_probs[0, _no_token_ids[model_id]].sum().item()
    return response, yes_prob, no_prob


## `get_batch_llm_responses`

In [7]:
# Reference implementation kept for comparison: it answers the prompts one at
# a time through `get_llm_response` instead of in a single padded batch.
# def get_batch_llm_responses(prompts, model_id, hf_token=None):
#     results = [
#         get_llm_response(prompt, model_id=model_id, hf_token=hf_token)
#         for prompt in prompts
#     ]
#     responses = [result[0] for result in results]
#     yes_probs = [result[1] for result in results]
#     no_probs = [result[2] for result in results]
#     return responses, yes_probs, no_probs

def get_batch_llm_responses(prompts, model_id, hf_token=None):
    """Generate greedy answers for a batch of prompts and score "yes" / "no".

    The tokenizer, the model and the vocabulary ids of the "yes" / "no"
    tokens are loaded on the first non-empty call for a given ``model_id``
    and cached in the module-level dictionaries; ``hf_token`` is only
    forwarded during that first load.

    Args:
        prompts: Prompt strings. Any iterable of strings is accepted (list,
            tuple, generator, pandas Series); a single string is treated as
            a batch of one.
        model_id: Model identifier handed to ``from_pretrained``; also the
            key of the module-level caches.
        hf_token: Optional access token forwarded to ``from_pretrained``.

    Returns:
        A ``(responses, yes_probs, no_probs)`` tuple of three lists with one
        entry per prompt: the decoded generation, and the softmax
        probability mass that the first decoding step assigns to the "yes"
        and to the "no" token ids. All three lists are empty when
        ``prompts`` is empty.
    """
    # Normalise the input: a bare string must not be split into characters,
    # and one-shot iterables are materialised so they can be checked for
    # emptiness before being tokenized.
    if isinstance(prompts, str):
        prompts = [prompts]
    else:
        prompts = list(prompts)

    # Nothing to do for an empty batch; returning here also avoids loading
    # the model just to process zero prompts.
    if not prompts:
        return [], [], []

    if model_id not in _loaded_models:
        _loaded_tokenizers[model_id] = AutoTokenizer.from_pretrained(model_id, token=hf_token)
        _loaded_models[model_id] = AutoModelForSeq2SeqLM.from_pretrained(model_id, token=hf_token)
        _loaded_models[model_id].eval()

        tokenizer = _loaded_tokenizers[model_id]
        _yes_token_ids[model_id] = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("yes"))
        _no_token_ids[model_id] = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("no"))

    tokenizer = _loaded_tokenizers[model_id]
    model = _loaded_models[model_id]

    # Pad to a common length so the prompts form one tensor batch.
    # NOTE(review): truncation=True can cut off the end of an over-long
    # prompt, which is where the question is appended - confirm prompts fit.
    inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=50,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
            output_scores=True,
            return_dict_in_generate=True
        )

        # Decode the generated token ids back to text, one string per prompt.
        responses = [tokenizer.decode(seq, skip_special_tokens=True) for seq in outputs.sequences]

        # Probabilities come from the first decoding step only.
        first_step_logits = outputs.scores[0]  # shape: (batch, vocab_size)
        probs = torch.nn.functional.softmax(first_step_logits, dim=-1)
        yes_probs = probs[:, _yes_token_ids[model_id]].sum(dim=1).cpu().tolist()
        no_probs = probs[:, _no_token_ids[model_id]].sum(dim=1).cpu().tolist()

    return responses, yes_probs, no_probs

## `chunked`

In [8]:
def chunked(iterable, n):
    """Yield successive lists of at most ``n`` items taken from ``iterable``.

    Every chunk except possibly the last has exactly ``n`` items; an empty
    iterable yields nothing.

    Args:
        iterable: Any iterable; it is consumed lazily, one chunk at a time.
        n: Maximum chunk size, a positive integer.

    Yields:
        Lists of consecutive items, in the original order.

    Raises:
        ValueError: If ``n`` is smaller than 1. Because this is a generator,
            the error surfaces when iteration starts, not at call time.
    """
    # A chunk size of 0 would otherwise end the loop immediately and silently
    # drop every item.
    if n < 1:
        raise ValueError(f"chunk size must be a positive integer, got {n!r}")

    it = iter(iterable)
    while True:
        chunk = list(itertools.islice(it, n))
        if not chunk:
            return
        yield chunk

## `classify`

In [9]:
def classify(
    descriptions_df, questions_df, responses_df,
    validation_ids, model_id, description_index, question_index,
    hf_token, batch_size=20, sleep_seconds=2.5
):
    """Classify every validation row that has no stored response yet.

    Rows of ``validation_ids`` that already appear in ``responses_df`` for
    the same model, description column and question are skipped. The
    remaining rows are prompted in batches (description text, a space, then
    the question) and each batch is saved as soon as it is answered.

    Args:
        descriptions_df: Frame indexed by row id holding the descriptions.
        questions_df: Frame with a "Classification Question" column, indexed
            by question id.
        responses_df: Previously stored responses, used only to work out
            which rows are still missing. It is not modified.
        validation_ids: Row ids that should have a response.
        model_id: Model identifier passed to ``get_batch_llm_responses``.
        description_index: Positional index of the description column.
        question_index: Index label of the question in ``questions_df``.
        hf_token: Access token forwarded to ``get_batch_llm_responses``.
        batch_size: Number of prompts per batch; must be at least 1.
        sleep_seconds: Pause after each batch; 0 or less disables it.

    Returns:
        The responses reloaded after all batches were saved. The previous
        ``responses_df[:] = ...`` refresh aligned on the existing index and
        therefore could never add newly saved rows, so the fresh frame is
        returned instead; assign the result if it is needed.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    import time  # imported once here rather than on every loop iteration

    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    description_column = descriptions_df.columns[description_index]
    question = questions_df.loc[question_index, "Classification Question"]
    print(f"Question: {question}")
    print()

    # Responses already stored for exactly this model / column / question.
    filtered_df = responses_df[
        (responses_df["Model"] == model_id)
        & (responses_df["Description Column"] == description_index)
        & (responses_df["Classification Question ID"] == question_index)
    ]

    # Sorted so that batches are processed in a reproducible order.
    missing_indices = sorted(set(validation_ids) - set(filtered_df["Row ID"]))

    total = len(missing_indices)
    n_batches = int(np.ceil(total / batch_size))
    print(f"{total} samples")
    print(f"{n_batches} batches of size {batch_size}")
    print("batch ", end=" ")
    for ii, batch_indices in enumerate(chunked(missing_indices, batch_size)):
        # Flush so the progress counter appears while the batch is running.
        print(ii, end=" ", flush=True)
        batch_df = descriptions_df.loc[batch_indices]
        prompts = (batch_df[description_column] + " " + question).tolist()
        responses, yes_probs, no_probs = get_batch_llm_responses(
            prompts, model_id=model_id, hf_token=hf_token
        )

        # Scalar values are broadcast to the length of the list columns.
        results_df = pd.DataFrame({
            "Row ID": batch_indices,
            "Model": model_id,
            "Description Column": description_index,
            "Classification Question ID": question_index,
            "Prediction": responses,
            "Yes Probability": yes_probs,
            "No Probability": no_probs
        })
        # Saved per batch so an interrupted run keeps the finished batches.
        GiveMeSomeCredit.save_classification_responses(results_df, suppress_logs=True)

        if sleep_seconds > 0:
            time.sleep(sleep_seconds)
    print()

    # Hand back the latest stored data, including the rows saved above.
    return GiveMeSomeCredit.load_classification_responses()

# Classify Examples

In [10]:
# Run the classification for the configured model, description column and
# question. The responses are loaded right here, at call time, so the
# missing-row check inside classify() does not use an older snapshot.
classify(
    descriptions_df=descriptions_df,
    questions_df=questions_df,
    responses_df=GiveMeSomeCredit.load_classification_responses(),
    validation_ids = GiveMeSomeCredit.get_validation_row_ids(),
    model_id=MODEL_ID,
    description_index=DESCRIPTION_COLUMN,
    question_index=CLASSIFICATION_QUESTION_ID,
    hf_token=HF_TOKEN
)

Question: Will this individual experience serious delinquency—defined as being 90 days or more past due—within the next two years? Answer with no or yes only.

0 samples
0 batches of size 20
batch  


# Check Results

In [11]:
# Reload the stored responses so the check reflects what was just saved.
responses_df = GiveMeSomeCredit.load_classification_responses()

# Show the column summary with row/column truncation disabled.
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(
        dataframe_utils.describe_df(responses_df)
    )
# Preview both ends of the table: the first five and the last five rows.
with pd.option_context("display.max_colwidth", None):
    display(
        responses_df.head(5).style.set_properties(**{"text-align": "left"}),
        responses_df.tail(5).style.set_properties(**{"text-align": "left"})
    )
    
# Drop the name so this snapshot cannot be reused once it is out of date.
del responses_df

Unnamed: 0,dtype,count,non_null,null_count,unique,top,freq,mean,std,min,25%,50%,75%,max
Row ID,int64,90000,90000,0,,,,74548.148,43327.774348,4.0,36814.25,74261.0,111940.25,149993.0
Model,object,90000,90000,0,1.0,google/flan-t5-small,90000.0,,,,,,,
Description Column,int64,90000,90000,0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Classification Question ID,int64,90000,90000,0,,,,1.0,0.816501,0.0,0.0,1.0,2.0,2.0
Prediction,object,90000,90000,0,3.0,no,61479.0,,,,,,,
Yes Probability,float64,90000,90000,0,,,,0.373138,0.080088,0.169391,0.273927,0.41593,0.435758,0.495583
No Probability,float64,90000,90000,0,,,,0.461125,0.155254,0.136118,0.255221,0.558641,0.577702,0.611146


Unnamed: 0,Row ID,Model,Description Column,Classification Question ID,Prediction,Yes Probability,No Probability
0,4,google/flan-t5-small,0,0,yes,0.248806,0.226628
1,12,google/flan-t5-small,0,0,yes,0.266791,0.23439
2,14,google/flan-t5-small,0,0,yes,0.224461,0.160168
3,24,google/flan-t5-small,0,0,yes,0.26158,0.260417
4,25,google/flan-t5-small,0,0,yes,0.25744,0.243585


Unnamed: 0,Row ID,Model,Description Column,Classification Question ID,Prediction,Yes Probability,No Probability
89995,149984,google/flan-t5-small,0,2,no,0.414106,0.583897
89996,149985,google/flan-t5-small,0,2,no,0.414839,0.583221
89997,149987,google/flan-t5-small,0,2,no,0.425915,0.57212
89998,149992,google/flan-t5-small,0,2,no,0.422142,0.575839
89999,149993,google/flan-t5-small,0,2,no,0.418228,0.579752
