# 01_llm_preparation

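This notebook creates natural language descriptions from the [GiveMeSomeCredit](https://www.kaggle.com/c/GiveMeSomeCredit/rules) dataset. It loads the training dataset, applies a structured-to-text mapping to every row (no filtering by validation ID is performed in this notebook), and saves the resulting descriptions to a CSV file for downstream use. It also defines and saves the classification questions and reasoning prompts that accompany those descriptions.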


# Imports

In [1]:
import sys

import pandas as pd

from IPython.display import display

sys.path.append("../../src")
import dataframe_utils
import GiveMeSomeCredit

# Load the Dataset

This section loads the training data into a DataFrame and displays its basic information.


In [2]:
# Load the training split through the project loader. verbose=True is passed so
# that the loader reports on what it read (presumably the summary tables shown
# in this cell's output — confirm against GiveMeSomeCredit.load_training_data).
data_df = GiveMeSomeCredit.load_training_data(verbose=True)

/Users/rina/llm-classification/data/GiveMeSomeCredit/raw/cs-training.csv Memory Usage: 13.73 MB


Unnamed: 0,dtype,count,non_null,null_count,mean,std,min,25%,50%,75%,max
SeriousDlqin2yrs,int64,150000,150000,0,0.06684,0.249746,0.0,0.0,0.0,0.0,1.0
RevolvingUtilizationOfUnsecuredLines,float64,150000,150000,0,6.048438,249.755371,0.0,0.029867,0.154181,0.559046,50708.0
age,int64,150000,150000,0,52.295207,14.771866,0.0,41.0,52.0,63.0,109.0
NumberOfTime30-59DaysPastDueNotWorse,int64,150000,150000,0,0.421033,4.192781,0.0,0.0,0.0,0.0,98.0
DebtRatio,float64,150000,150000,0,353.005076,2037.818523,0.0,0.175074,0.366508,0.868254,329664.0
MonthlyIncome,float64,120269,120269,29731,6670.221237,14384.674215,0.0,3400.0,5400.0,8249.0,3008750.0
NumberOfOpenCreditLinesAndLoans,int64,150000,150000,0,8.45276,5.145951,0.0,5.0,8.0,11.0,58.0
NumberOfTimes90DaysLate,int64,150000,150000,0,0.265973,4.169304,0.0,0.0,0.0,0.0,98.0
NumberRealEstateLoansOrLines,int64,150000,150000,0,1.01824,1.129771,0.0,0.0,1.0,2.0,54.0
NumberOfTime60-89DaysPastDueNotWorse,int64,150000,150000,0,0.240387,4.155179,0.0,0.0,0.0,0.0,98.0


Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
3,0,0.658180,38,1,0.085113,3042.0,2,1,0,0,0.0
4,0,0.233810,30,0,0.036050,3300.0,5,0,0,0,0.0
5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
149996,0,0.040674,74,0,0.225131,2100.0,4,0,1,0,0.0
149997,0,0.299745,44,0,0.716562,5584.0,4,0,1,0,2.0
149998,0,0.246044,58,0,3870.000000,,18,0,1,0,0.0
149999,0,0.000000,30,0,0.000000,5716.0,4,0,0,0,0.0


# Generating Natural Language Descriptions

## `get_nl_description`

This function generates a natural language summary of an individual's financial and credit profile based on optional input features. The output is a human-readable paragraph.


In [3]:
def get_nl_description(
    revolving_utilization=None,
    age=None,
    num_30_59_days_past_due=None,
    debt_ratio=None,
    monthly_income=None,
    num_open_credit_lines=None,
    num_90_days_late=None,
    num_real_estate_loans=None,
    num_60_89_days_past_due=None,
    num_dependents=None,
):
    """Build a one-paragraph English description of a credit profile.

    Every argument is optional. A missing value must be passed as ``None``
    (NaN is not treated as missing here and would make ``int()`` raise).

    Args:
        revolving_utilization: Fraction (1.0 == 100%), rendered as a percentage.
        age: Age in years; truncated to an integer.
        num_30_59_days_past_due: Count of 30-59 day past-due events.
        debt_ratio: Fraction (1.0 == 100%), rendered as a percentage.
        monthly_income: Rendered as whole dollars with thousands separators.
        num_open_credit_lines: Count of open credit lines and loans.
        num_90_days_late: Count of 90+ day late events.
        num_real_estate_loans: Count of real estate loans or lines. Only
            mentioned when ``num_open_credit_lines`` is also provided, because
            it is appended to that sentence.
        num_60_89_days_past_due: Count of 60-89 day past-due events.
        num_dependents: Number of dependents; truncated to an integer.

    Returns:
        str: Sentences joined by single spaces. The opening sentence about age,
        income and dependents is always present (with "not provided" /
        "unspecified" / "unknown" wording for missing values); all other
        sentences are omitted when their inputs are ``None``.
    """

    def plural_suffix(count):
        # "s" for every count except exactly 1, so 0 reads "0 instances".
        return "" if count == 1 else "s"

    parts = []

    if age is not None:
        age_part = f"The individual is {int(age)} years old"
    else:
        age_part = "The individual’s age is not provided"

    if monthly_income is not None:
        income_part = f"with a monthly income of ${monthly_income:,.0f}"
    else:
        income_part = "with an unspecified monthly income"

    if num_dependents is not None:
        dependents_part = (
            f"and supports {int(num_dependents)} dependent{plural_suffix(num_dependents)}."
        )
    else:
        dependents_part = "and the number of dependents is unknown."

    parts.append(f"{age_part} {income_part} {dependents_part}")

    if revolving_utilization is not None:
        parts.append(f"Their revolving utilization of unsecured lines is approximately {revolving_utilization:.1%}.")

    if debt_ratio is not None:
        parts.append(f"Their debt ratio stands at about {debt_ratio:.1%}.")

    if num_open_credit_lines is not None:
        credit_sentence = f"They have a total of {int(num_open_credit_lines)} open credit lines and loans"

        if num_real_estate_loans is not None:
            # Pluralise by count: previously a count of 1 produced
            # "1 real estate loans or lines".
            suffix = plural_suffix(num_real_estate_loans)
            credit_sentence += (
                f", including {int(num_real_estate_loans)} real estate loan{suffix} or line{suffix}."
            )
        else:
            credit_sentence += "."

        parts.append(credit_sentence)

    # Delinquency history: one clause per bucket that was supplied, in order of severity.
    payment_parts = []
    if num_30_59_days_past_due is not None:
        payment_parts.append(
            f"{int(num_30_59_days_past_due)} instance{plural_suffix(num_30_59_days_past_due)} of being 30 to 59 days past due"
        )
    if num_60_89_days_past_due is not None:
        payment_parts.append(
            f"{int(num_60_89_days_past_due)} instance{plural_suffix(num_60_89_days_past_due)} of being 60 to 89 days past due"
        )
    if num_90_days_late is not None:
        payment_parts.append(
            f"{int(num_90_days_late)} instance{plural_suffix(num_90_days_late)} of being 90 or more days late"
        )

    if payment_parts:
        parts.append("Over the past period, they have had " + ", ".join(payment_parts) + ".")

    return " ".join(parts)


## `map_row_to_args`

This function converts a pandas DataFrame row into a dictionary of keyword arguments suitable for passing to the `get_nl_description` function. Missing values are converted to `None`.


In [4]:
def map_row_to_args(row):
    """Translate a DataFrame row into keyword arguments for ``get_nl_description``.

    Args:
        row: A mapping-like row exposing ``.items()`` (e.g. a ``pandas.Series``)
            keyed by the dataset's column names.

    Returns:
        dict: Keyword-argument name -> value, in the row's own column order.
        Columns that have no corresponding argument are skipped, and values
        that ``pd.isna`` reports as missing are replaced by ``None``.
    """
    columns_to_args = {
        "RevolvingUtilizationOfUnsecuredLines": "revolving_utilization",
        "age": "age",
        "NumberOfTime30-59DaysPastDueNotWorse": "num_30_59_days_past_due",
        "DebtRatio": "debt_ratio",
        "MonthlyIncome": "monthly_income",
        "NumberOfOpenCreditLinesAndLoans": "num_open_credit_lines",
        "NumberOfTimes90DaysLate": "num_90_days_late",
        "NumberRealEstateLoansOrLines": "num_real_estate_loans",
        "NumberOfTime60-89DaysPastDueNotWorse": "num_60_89_days_past_due",
        "NumberOfDependents": "num_dependents",
    }

    kwargs = {}
    for column, raw_value in row.items():
        if column not in columns_to_args:
            # Not an input to the description (e.g. the target label).
            continue
        kwargs[columns_to_args[column]] = None if pd.isna(raw_value) else raw_value
    return kwargs

# Data Descriptions

## Create Natural Language Descriptions

This code uses the `get_nl_description` function to generate natural language descriptions of the data for classification and reasoning tasks.

In [5]:
# Turn each row of data_df into its description: the row is first converted to
# keyword arguments, then rendered to text.
row_descriptions = data_df.apply(
    lambda row: get_nl_description(**map_row_to_args(row)),
    axis=1,
)
descriptions_df = row_descriptions.to_frame(name="Detailed Description")
descriptions_df.index.name = "Row ID"

# Lift the column-width cap just for this preview so the text is not truncated.
left_aligned = {"text-align": "left"}
with pd.option_context("display.max_colwidth", None):
    display(descriptions_df.head(5).style.set_properties(**left_aligned))

Unnamed: 0_level_0,Detailed Description
Row ID,Unnamed: 1_level_1
1,"The individual is 45 years old with a monthly income of $9,120 and supports 2 dependents. Their revolving utilization of unsecured lines is approximately 76.6%. Their debt ratio stands at about 80.3%. They have a total of 13 open credit lines and loans, including 6 real estate loans or lines. Over the past period, they have had 2 instances of being 30 to 59 days past due, 0 instances of being 60 to 89 days past due, 0 instances of being 90 or more days late."
2,"The individual is 40 years old with a monthly income of $2,600 and supports 1 dependent. Their revolving utilization of unsecured lines is approximately 95.7%. Their debt ratio stands at about 12.2%. They have a total of 4 open credit lines and loans, including 0 real estate loans or lines. Over the past period, they have had 0 instances of being 30 to 59 days past due, 0 instances of being 60 to 89 days past due, 0 instances of being 90 or more days late."
3,"The individual is 38 years old with a monthly income of $3,042 and supports 0 dependents. Their revolving utilization of unsecured lines is approximately 65.8%. Their debt ratio stands at about 8.5%. They have a total of 2 open credit lines and loans, including 0 real estate loans or lines. Over the past period, they have had 1 instance of being 30 to 59 days past due, 0 instances of being 60 to 89 days past due, 1 instance of being 90 or more days late."
4,"The individual is 30 years old with a monthly income of $3,300 and supports 0 dependents. Their revolving utilization of unsecured lines is approximately 23.4%. Their debt ratio stands at about 3.6%. They have a total of 5 open credit lines and loans, including 0 real estate loans or lines. Over the past period, they have had 0 instances of being 30 to 59 days past due, 0 instances of being 60 to 89 days past due, 0 instances of being 90 or more days late."
5,"The individual is 49 years old with a monthly income of $63,588 and supports 0 dependents. Their revolving utilization of unsecured lines is approximately 90.7%. Their debt ratio stands at about 2.5%. They have a total of 7 open credit lines and loans, including 1 real estate loans or lines. Over the past period, they have had 1 instance of being 30 to 59 days past due, 0 instances of being 60 to 89 days past due, 0 instances of being 90 or more days late."


## Save and Verify Natural Language Descriptions Data

In [6]:
# Persist the descriptions, then load them back through the project loader and
# summarise the result as a round-trip check.
GiveMeSomeCredit.save_data_descriptions(descriptions_df)

reloaded_descriptions_df = GiveMeSomeCredit.load_data_descriptions()
dataframe_utils.print_dataframe_info(reloaded_descriptions_df)

2025-09-12 19:44:08,051 - INFO - Saved DataFrame to processed directory: /Users/rina/llm-classification/data/GiveMeSomeCredit/processed/data_descriptions.csv


DataFrame Memory Usage: 75.42 MB


Unnamed: 0,dtype,count,non_null,null_count,unique,top,freq
Detailed Description,object,150000,150000,0,149326,The individual is 22 years old with a monthly ...,12


Unnamed: 0_level_0,Detailed Description
Row ID,Unnamed: 1_level_1
1,The individual is 45 years old with a monthly ...
2,The individual is 40 years old with a monthly ...
3,The individual is 38 years old with a monthly ...
4,The individual is 30 years old with a monthly ...
5,The individual is 49 years old with a monthly ...
...,...
149996,The individual is 74 years old with a monthly ...
149997,The individual is 44 years old with a monthly ...
149998,The individual is 58 years old with an unspeci...
149999,The individual is 30 years old with a monthly ...


# Classification Questions

## Create Natural Language Classification Questions

In [7]:
# Four variants of the same question: no answer instruction, "yes or no",
# "no or yes" (reversed option order), and "no or yes" plus an explanation.
questions = [
    (
        "Will this individual experience serious delinquency—defined as being "
        "90 days or more past due—within the next two years?"
    ),
    (
        "Will this individual experience serious delinquency—defined as being "
        "90 days or more past due—within the next two years? "
        "Answer with yes or no only."
    ),
    (
        "Will this individual experience serious delinquency—defined as being "
        "90 days or more past due—within the next two years? "
        "Answer with no or yes only."
    ),
    (
        "Will this individual experience serious delinquency—defined as being "
        "90 days or more past due—within the next two years? "
        "Answer with no or yes and explain your reasoning."
    ),
]

# The positional index (0..3) serves as the question ID.
questions_df = pd.DataFrame({"Classification Question": questions}).rename_axis(
    "Classification Question ID"
)

# Show the full question text, left-aligned, without column truncation.
with pd.option_context("display.max_colwidth", None):
    questions_styler = questions_df.style.set_properties(**{"text-align": "left"})
    display(questions_styler)

Unnamed: 0_level_0,Classification Question
Classification Question ID,Unnamed: 1_level_1
0,Will this individual experience serious delinquency—defined as being 90 days or more past due—within the next two years?
1,Will this individual experience serious delinquency—defined as being 90 days or more past due—within the next two years? Answer with yes or no only.
2,Will this individual experience serious delinquency—defined as being 90 days or more past due—within the next two years? Answer with no or yes only.
3,Will this individual experience serious delinquency—defined as being 90 days or more past due—within the next two years? Answer with no or yes and explain your reasoning.


## Save and Verify Natural Language Classification Questions

In [8]:
# Persist the questions, then load them back through the project loader and
# summarise the result as a round-trip check.
GiveMeSomeCredit.save_classification_questions(questions_df)

reloaded_questions_df = GiveMeSomeCredit.load_classification_questions()
dataframe_utils.print_dataframe_info(reloaded_questions_df)

2025-09-12 19:44:08,877 - INFO - Saved DataFrame to processed directory: /Users/rina/llm-classification/data/GiveMeSomeCredit/processed/classification_questions.csv


DataFrame Memory Usage: 0.00 MB


Unnamed: 0,dtype,count,non_null,null_count,unique,top,freq
Classification Question,object,4,4,0,4,Will this individual experience serious delinq...,1


Unnamed: 0_level_0,Classification Question
Classification Question ID,Unnamed: 1_level_1
0,Will this individual experience serious delinq...
1,Will this individual experience serious delinq...
2,Will this individual experience serious delinq...
3,Will this individual experience serious delinq...


# Reasoning Prompts

## Create Natural Language Reasoning Prompts

In [9]:
# Two follow-up prompts: an unconstrained one and one that limits the answer
# to a single plain-text paragraph.
reasoning_prompts = [
    "Explain your reasoning.",
    "Explain your reasoning using a single paragraph of plain text.",
]

# The positional index (0..1) serves as the prompt ID.
reasoning_prompts_df = pd.DataFrame({"Reasoning Prompt": reasoning_prompts}).rename_axis(
    "Reasoning Prompt ID"
)

# Show the full prompt text, left-aligned, without column truncation.
with pd.option_context("display.max_colwidth", None):
    prompts_styler = reasoning_prompts_df.style.set_properties(**{"text-align": "left"})
    display(prompts_styler)

Unnamed: 0_level_0,Reasoning Prompt
Reasoning Prompt ID,Unnamed: 1_level_1
0,Explain your reasoning.
1,Explain your reasoning using a single paragraph of plain text.


## Save and Verify Natural Language Reasoning Prompts

In [10]:
# Persist the reasoning prompts, then load them back through the project loader
# and summarise the result as a round-trip check.
GiveMeSomeCredit.save_reasoning_prompts(reasoning_prompts_df)

reloaded_reasoning_prompts_df = GiveMeSomeCredit.load_reasoning_prompts()
dataframe_utils.print_dataframe_info(reloaded_reasoning_prompts_df)

2025-09-12 19:44:08,907 - INFO - Saved DataFrame to processed directory: /Users/rina/llm-classification/data/GiveMeSomeCredit/processed/reasonings_prompts.csv


DataFrame Memory Usage: 0.00 MB


Unnamed: 0,dtype,count,non_null,null_count,unique,top,freq
Reasoning Prompt,object,2,2,0,2,Explain your reasoning.,1


Unnamed: 0_level_0,Reasoning Prompt
Reasoning Prompt ID,Unnamed: 1_level_1
0,Explain your reasoning.
1,Explain your reasoning using a single paragrap...
