In [1]:
import pandas as pd
import json
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the raw credit-risk CSV into a DataFrame.
# NOTE(review): `df` is not referenced again in the cells shown here; the preprocessing
# helper below is given the same file path rather than this frame.
df = pd.read_csv("credit_risk_reto.csv")

# EDA

1. The dataset consists of 1,000 rows.
2. The `Saving accounts` and `Checking account` columns are the only ones with missing values.

In [3]:
from preprocessing import pre_processing_data, JOB_DESCRIPTION, ACCOUNT_CATEGORY_CONVERSION

In [4]:
# Run the project preprocessing helper on the raw CSV path. It returns two values,
# unpacked here as the processed DataFrame and `account_size_stats`.
# NOTE(review): the actual transformations live in the `preprocessing` module, outside this notebook.
df_process, account_size_stats = pre_processing_data("credit_risk_reto.csv")

  savings_account = columns[0]
  checking_account = columns[1]


You are a credit analyst, deciding if a given loan is good or bad. "good", the loan should be paid without issues. "bad", it is probable that it is paid in delay or not paid at all.
Consider the following variables to decide:
- Type of job: {{job}}. Options ["unskilled and non-resident", "unskilled and resident", "skilled", "highly skilled"]
- Housing Situation: {{housing}}. Options: ["free", "own", "rent"]
- Loan Category: {{category}}. Options: ["self development", "maintenance", "recreational"]
- Account Size: {{account}}. Options: ["no data", "little", "moderate", "rich", "quite rich"]
- Monthly Payment in usd: {{monthly_payment}}. Numerical type.
Just respond in json format, with the "loan": "bad" or "good". 

In [6]:
from bedrock_mistral_prediction import classify_using_llm

In [None]:
# Presumably classifies every processed row through the project LLM helper — confirm in
# bedrock_mistral_prediction.
# NOTE(review): this cell has no execution count (In [None]) and `df_process_with_classes`
# is not used by any later cell shown here.
df_process_with_classes = classify_using_llm(df_process)

In [25]:
import boto3

In [26]:
# Bedrock runtime client for us-east-1. call_model() further down uses it through the
# module-level name `client`, so this cell must run before any inference cell.
client = boto3.client('bedrock-runtime', region_name='us-east-1')

In [13]:
# Peek at the first processed row to check the derived columns.
df_process.iloc[0]

Sex                             male
Job                          skilled
Housing                          own
Credit amount                   1169
Duration                           6
Purpose                     radio/TV
Monthly_Payment                195.0
Loan Category           recreational
Age Group                      61_70
Account Size                  little
High Monthly_Payment          normal
Name: 0, dtype: object

In [27]:
# NOTE(review): `df_converted` is never assigned in the cells shown here (hidden kernel state?).
# Its first row prints identically to df_process.iloc[0], so it is presumably derived from
# df_process — define it explicitly so Restart & Run All works.
df_converted.iloc[0]

Sex                             male
Job                          skilled
Housing                          own
Credit amount                   1169
Duration                           6
Purpose                     radio/TV
Monthly_Payment                195.0
Loan Category           recreational
Age Group                      61_70
Account Size                  little
High Monthly_Payment          normal
Name: 0, dtype: object

In [28]:
from tqdm.notebook import tqdm

In [41]:
def return_prompt(job, housing, category, account, high_payment, months):
    """Build the single-line classification prompt for one loan.

    Each argument is interpolated verbatim into the instruction text; the five
    categorical values are wrapped in single quotes, ``months`` is not.

    Args:
        job: Job description label.
        housing: Housing situation label.
        category: Loan category label.
        account: Account size label.
        high_payment: Monthly-payment level label.
        months: Loan duration in months.

    Returns:
        The prompt as one string, sentences separated by single spaces.

    NOTE(review): the option list advertises 'self development', while the
    displayed data holds 'development' in "Loan Category" — confirm which
    spelling is intended.
    """
    sentences = [
        "You are a credit analyst, deciding if a given loan is good or bad.",
        "'good', the loan should be paid without issues.",
        "'bad', it is probable that it is paid in delay or not paid at all.",
        "Consider the following variables to decide:",
        f"Type of job: '{job}'.",
        "Options ['unskilled and non-resident', 'unskilled and resident', 'skilled', 'highly skilled'].",
        f"Housing Situation: '{housing}'.",
        "Options: ['free', 'own', 'rent'].",
        f"Loan Category: '{category}'.",
        "Options: ['self development', 'maintenance', 'recreational'].",
        f"Account Size: '{account}'.",
        "Options: ['no data', 'little', 'moderate', 'rich', 'quite rich'].",
        f"High Monthly Payment: '{high_payment}'.",
        "Options: ['normal', 'high'].",
        f"Duration (in months): {months}.",
        "Just respond in json format, with the 'loan': 'bad' or 'good'.",
    ]
    return " ".join(sentences)

In [42]:
import tiktoken
# Tokenizer used only to estimate prompt lengths for the cost calculation below.
# NOTE(review): cl100k_base is an OpenAI encoding, not the Mistral tokenizer, so the
# resulting counts are an approximation — confirm that is acceptable for the estimate.
tokenizer = tiktoken.get_encoding("cl100k_base")

In [49]:
# Count the prompt tokens for every row so the input cost can be estimated
# before any model call is made.
# The encoding is loop-invariant, so it is looked up once here instead of
# once per row inside the loop.
tokenizer = tiktoken.get_encoding("cl100k_base")

tokens = []
for _, row in tqdm(df_converted.iterrows(), total=len(df_converted)):
    # Build exactly the prompt that the inference loop sends for this row.
    prompt = return_prompt(
        job=row["Job"],
        housing=row["Housing"],
        category=row["Loan Category"],
        account=row["Account Size"],
        high_payment=row["High Monthly_Payment"],
        months=row["Duration"],
    )
    tokens.append(len(tokenizer.encode(prompt)))

# One token count per row, in row order.
df_converted["Prompt Length"] = tokens

  0%|          | 0/1000 [00:00<?, ?it/s]

In [51]:
# Input price in USD per token, keyed by Bedrock model id (0.001 USD per 1,000 tokens).
# NOTE(review): confirm the figure against current Bedrock pricing.
prices = {"mistral.mistral-small-2402-v1:0": 0.001 * 10**-3}

In [60]:
# Estimated input cost = price per token x total prompt tokens over all rows.
print(f"Running mistral small with cost: {prices['mistral.mistral-small-2402-v1:0']*df_converted['Prompt Length'].sum()} USD")
# Output cost is neglected because the expected response is very short.

Running mistral small with cost: 0.191899 USD


### Running Mistral

In [71]:
import time

def call_model(model_id, prompt):
    """Send one instruction prompt to a Bedrock model and return the raw response body.

    Args:
        model_id: Bedrock model identifier, passed through as ``modelId``.
        prompt: Instruction text; it is wrapped in ``<s>[INST] ... [/INST]``
            before being sent.

    Returns:
        The response body decoded as UTF-8 text. It is still JSON-encoded;
        callers parse it themselves.

    Raises:
        Whatever ``client.invoke_model`` raises (the calling loop retries on
        ``client.exceptions.ThrottlingException``).
    """
    # Serialise with json.dumps rather than string interpolation so that a
    # prompt containing double quotes, backslashes or newlines still yields a
    # valid JSON request body.
    request_body = json.dumps(
        {
            "prompt": f"<s>[INST] {prompt} [/INST]",
            "max_tokens": 12,
            "temperature": 0,
            "top_p": 1,
            "top_k": 10,
        }
    )
    # Uses the module-level Bedrock runtime `client`.
    response = client.invoke_model(
        modelId=model_id,
        contentType="application/json",
        accept="application/json",
        body=request_body,
    )
    return response["body"].read().decode("utf-8")

model_id = "mistral.mistral-small-2402-v1:0"

# Query the model once per row, collecting raw response bodies in row order.
# A row that is still throttled after five attempts contributes None.
responses = []
for i, row in tqdm(df_converted.iterrows(), total=len(df_converted)):
    prompt = return_prompt(
        job=row["Job"],
        housing=row["Housing"],
        category=row["Loan Category"],
        account=row["Account Size"],
        high_payment=row["High Monthly_Payment"],
        months=row["Duration"],
    )

    # Up to five attempts with exponential backoff (1, 2, 4, 8, 16 seconds).
    for attempt in range(5):
        try:
            responses.append(call_model(model_id, prompt))
            break
        except client.exceptions.ThrottlingException:
            backoff = 2 ** attempt
            print(f"{i} ThrottlingException: Retrying in {backoff} seconds...")
            time.sleep(backoff)
    else:
        # Reached only when no attempt hit `break`, i.e. every attempt was throttled.
        print("Failed to get response after multiple retries.")
        responses.append(None)

    # Fixed pause between rows to stay under the service rate limit.
    time.sleep(5)

  0%|          | 0/1000 [00:00<?, ?it/s]

1 ThrottlingException: Retrying in 1 seconds...
3 ThrottlingException: Retrying in 1 seconds...
4 ThrottlingException: Retrying in 1 seconds...
6 ThrottlingException: Retrying in 1 seconds...
7 ThrottlingException: Retrying in 1 seconds...
8 ThrottlingException: Retrying in 1 seconds...
9 ThrottlingException: Retrying in 1 seconds...
11 ThrottlingException: Retrying in 1 seconds...
14 ThrottlingException: Retrying in 1 seconds...
15 ThrottlingException: Retrying in 1 seconds...
16 ThrottlingException: Retrying in 1 seconds...
18 ThrottlingException: Retrying in 1 seconds...
24 ThrottlingException: Retrying in 1 seconds...
25 ThrottlingException: Retrying in 1 seconds...
26 ThrottlingException: Retrying in 1 seconds...
27 ThrottlingException: Retrying in 1 seconds...
29 ThrottlingException: Retrying in 1 seconds...
33 ThrottlingException: Retrying in 1 seconds...
34 ThrottlingException: Retrying in 1 seconds...
35 ThrottlingException: Retrying in 1 seconds...
36 ThrottlingException: Ret

In [83]:
# Sanity-check the first response. Two decodes are needed: the response body is JSON,
# and its outputs[0].text field is itself a JSON string holding the verdict.
json.loads(json.loads(responses[0])['outputs'][0]['text'])

{'loan': 'good'}

In [127]:
def _parse_loan_label(raw_response):
    """Extract the loan label from one raw model response body.

    Args:
        raw_response: JSON text returned by call_model, or None when the
            request failed after all retries.

    Returns:
        The value of the "loan" field as a string, or None when there was no
        response to parse.
    """
    # The inference loop appends None for rows that never got a response;
    # keep them as missing values so the list stays aligned with the rows.
    if raw_response is None:
        return None

    text = json.loads(raw_response)["outputs"][0]["text"]
    try:
        return json.loads(text)["loan"]
    except (ValueError, KeyError, TypeError):
        # The completion is not valid JSON (e.g. cut off mid-string after
        # "potentially risky"): drop the opening brace, close the dangling
        # quote, and take whatever follows the first colon.
        string_with_info = text.replace("{\n", "").replace("risky", 'risky"').strip()
        return string_with_info.split(":")[1].strip().replace('"', "")


# One label per response, in the same order as `responses`.
clasifications = [_parse_loan_label(response) for response in responses]

In [130]:
# Attach the parsed labels as a new column; relies on `clasifications` holding exactly
# one entry per row, in row order.
df_converted["Prediction Mistral Small"] = clasifications

In [133]:
# Fold the off-spec label "potentially risky" into "bad", then show the class balance.
df_converted["Prediction Mistral Small"] = df_converted["Prediction Mistral Small"].replace("potentially risky", "bad")
df_converted["Prediction Mistral Small"].value_counts()

Prediction Mistral Small
good    813
bad     187
Name: count, dtype: int64

In [135]:
# Display the frame with the prediction columns.
# NOTE(review): the rendered output includes a "Prediction Mistral" column that no cell
# shown here creates — presumably left over from an earlier, since-removed cell.
df_converted

Unnamed: 0,Sex,Job,Housing,Credit amount,Duration,Purpose,Monthly_Payment,Loan Category,Age Group,Account Size,High Monthly_Payment,Prompt Length,Prediction Mistral,Prediction Mistral Small
0,male,skilled,own,1169,6,radio/TV,195.0,recreational,61_70,little,normal,192,"{""outputs"":[{""text"":"" {\n \""loan\"": \""good\""\...",good
1,female,skilled,own,5951,48,radio/TV,124.0,recreational,21_30,moderate,normal,193,"{""outputs"":[{""text"":"" {\n \""loan\"": \""good\""\...",good
2,male,unskilled and resident,own,2096,12,education,175.0,development,41_50,little,normal,193,"{""outputs"":[{""text"":"" {\n \""loan\"": \""bad\""\n...",bad
3,male,skilled,free,7882,42,furniture/equipment,188.0,maintenance,41_50,little,normal,190,"{""outputs"":[{""text"":"" {\n \""loan\"": \""good\""\...",good
4,male,skilled,free,4870,24,car,203.0,development,51_60,little,high,190,"{""outputs"":[{""text"":"" {\n \""loan\"": \""good\""\...",good
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,female,unskilled and resident,own,1736,12,furniture/equipment,145.0,maintenance,31_40,little,normal,193,"{""outputs"":[{""text"":"" {\n \""loan\"": \""bad\""\n...",bad
996,male,highly skilled,own,3857,30,car,129.0,development,41_50,little,normal,191,"{""outputs"":[{""text"":"" {\n \""loan\"": \""good\""\...",good
997,male,skilled,own,804,12,radio/TV,67.0,recreational,31_40,little,normal,192,"{""outputs"":[{""text"":"" {\n \""loan\"": \""good\""\...",good
998,male,skilled,free,1845,45,radio/TV,41.0,recreational,21_30,little,normal,192,"{""outputs"":[{""text"":"" {\n \""loan\"": \""good\""\...",good


In [134]:
# Persist the frame, predictions included, without writing the index column.
# NOTE(review): this cell's execution count (134) is lower than the display cell above (135),
# so the cells were run out of order — re-run top to bottom before trusting the saved file.
df_converted.to_csv("credit_risk_reto_processed.csv", index=False)