# Import libraries

In [3]:
import pandas as pd
import numpy as np
from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

import matplotlib.pyplot as plt
import seaborn as sns

# import scipy.stats as stats
# from statsmodels.formula.api import ols
# import statsmodels.api as sm

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score
import sklearn.metrics as metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
# from tabulate import tabulate

import warnings

# NOTE(review): with no category or module filter this silences every warning for
# the rest of the session, including ones raised by the libraries imported above.
warnings.filterwarnings("ignore")

In [4]:
import os
from dotenv import load_dotenv

In [8]:
import os
from dotenv import load_dotenv
# Load environment variables from the backend's .env file. The path is relative,
# so it resolves against the directory the kernel was started in.
dotenv_path = "../backend/.env"
load_dotenv(dotenv_path=dotenv_path)

True

In [9]:
# NOTE(review): this repeats the load_dotenv call made in the previous cell with the
# same path; it looks like a leftover re-run rather than a required step.
load_dotenv(dotenv_path=dotenv_path)

True

# Load Data

In [11]:
import nbformat

def write_to_markdown(markdown_content, markdown_file):
    """
    Save markdown text to disk as UTF-8 and hand back the destination path.

    The file is opened in write mode, so an existing file at the same location
    is overwritten.

    Args:
        markdown_content (str): Text to write.
        markdown_file (str): Destination path, passed straight to ``open``.

    Returns:
        The ``markdown_file`` value exactly as it was passed in.
    """
    with open(markdown_file, mode="w", encoding="utf-8") as out:
        out.write(markdown_content)

    return markdown_file

def extract_markdown_from_notebook(notebook_path):
    """
    Collect the source text of every markdown cell in a Jupyter notebook.

    The notebook is opened as UTF-8 and parsed with ``nbformat.read`` (as version 4).
    Markdown cell sources are kept in notebook order and joined with a blank line
    between consecutive cells; code cells are ignored.

    Args:
        notebook_path (str): Path to the ``.ipynb`` file.

    Returns:
        str: The concatenated markdown (empty string when there are no markdown cells).
    """
    with open(notebook_path, "r", encoding="utf-8") as handle:
        parsed = nbformat.read(handle, as_version=4)

    sources = []
    for cell in parsed["cells"]:
        if cell["cell_type"] == "markdown":
            sources.append(cell["source"])

    return "\n\n".join(sources)

def notebook_to_markdown(notebook_path, markdown_path):
    """
    Export the markdown cells of a notebook to a markdown file.

    Args:
        notebook_path (str): Path to the source ``.ipynb`` file.
        markdown_path (str): Path of the markdown file to write.

    Returns:
        Whatever ``write_to_markdown`` returns for ``markdown_path``.
    """
    return write_to_markdown(
        markdown_content=extract_markdown_from_notebook(notebook_path),
        markdown_file=markdown_path,
    )

# Example usage
notebook_path = "../data/loan-prediction-eda.ipynb"  # Replace with the correct path
markdown_path = "../data/loan-prediction-eda.md"  # Replace with desired output path

try:
    # Parse the notebook once and reuse the text for both the length report and the
    # export. Calling notebook_to_markdown here would read and parse the same
    # notebook a second time.
    extracted_markdown = extract_markdown_from_notebook(notebook_path)
    print("Extracted Markdown Content Length:", len(extracted_markdown))
    saved_path = write_to_markdown(extracted_markdown, markdown_path)
    print(f"Markdown saved to: {saved_path}")
except FileNotFoundError:
    print("Notebook file not found.")
except Exception as e:
    # Best-effort demo cell: report the problem instead of stopping the notebook.
    print(f"An error occurred: {e}")


Extracted Markdown Content Length: 31645
Markdown saved to: ../data/loan-prediction-eda.md


In [12]:
# Example: pull the markdown text out of the notebook and preview it
notebook_path = "../data/loan-prediction-eda.ipynb"
notebook_content = extract_markdown_from_notebook(notebook_path)
# Header line first, then only the first 500 characters to keep the output short
print("Extracted Notebook Content:", notebook_content[:500], sep="\n")

Extracted Notebook Content:
#### What is loan?
<br>
"In finance, a loan is the transfer of money by one party to another with an agreement to pay it back. The recipient, or borrower, incurs a debt and is usually required to pay interest for the use of the money." 
<br>

Resource by wikipedia: https://en.wikipedia.org/wiki/Loan

#### How the loan process works?
<br>
According to the article in investopedia.com: "When someone needs money, they apply for a loan from a bank, corporation, government, or other entity. The borrow


In [19]:
notebook_content

'#### What is loan?\n<br>\n"In finance, a loan is the transfer of money by one party to another with an agreement to pay it back. The recipient, or borrower, incurs a debt and is usually required to pay interest for the use of the money." \n<br>\n\nResource by wikipedia: https://en.wikipedia.org/wiki/Loan\n#### How the loan process works?\n<br>\nAccording to the article in investopedia.com: "When someone needs money, they apply for a loan from a bank, corporation, government, or other entity. The borrower may be required to provide specific details such as the reason for the loan, their financial history, Social Security number (SSN), and other information. The lender reviews this information as well as a person\'s debt-to-income (DTI) ratio to determine if the loan can be paid back.\n<br>\nBased on the applicant\'s creditworthiness, the lender either denies or approves the application. The lender must provide a reason should the loan application be denied. If the application is approv

## Text splitter to split Documents

In [60]:
from llama_index.core import Document
# Wrap the extracted markdown text in a single LlamaIndex Document.
# NOTE(review): `document` is not referenced by any later cell visible here; the
# chunking cell below reads the exported .md file instead — confirm it is still needed.
document = Document(text=notebook_content)

In [None]:
from llama_index.core.node_parser import MarkdownNodeParser
from llama_index.readers.file import FlatReader
from pathlib import Path
from llama_index.core.schema import TextNode

def chunk_markdown(markdown_path):
    """
    Load a markdown file and split it into nodes with LlamaIndex.

    The file is read with ``FlatReader`` and the resulting documents are passed to
    ``MarkdownNodeParser``.

    Args:
        markdown_path (str): Path to the markdown file.

    Returns:
        tuple: ``(documents, nodes)`` — the value returned by
        ``FlatReader().load_data`` and the nodes that
        ``MarkdownNodeParser().get_nodes_from_documents`` produced from it.
    """
    # Read the markdown file
    source_path = Path(markdown_path)
    documents = FlatReader().load_data(source_path)

    # Split the loaded documents with LlamaIndex's MarkdownNodeParser
    nodes = MarkdownNodeParser().get_nodes_from_documents(documents)
    return documents, nodes


# Example usage
markdown_path = "../data/loan-prediction-eda.md"  # Path to the markdown file

try:
    markdown_content, md_chunk_nodes = chunk_markdown(markdown_path)
    print(f"Generated {len(md_chunk_nodes)} chunks.")
    # Preview the first 5 chunks: heading line, chunk text, metadata, blank separator
    for i, node in enumerate(md_chunk_nodes[:5]):
        print(
            f"Chunk {i+1}:",
            node.get_content(),
            node.get_metadata_str(),
            "",
            sep="\n",
        )
except FileNotFoundError:
    print("Markdown file not found.")
except Exception as e:
    print(f"An error occurred: {e}")


Generated 20 chunks.
Chunk 1:
#### What is loan?
<br>
"In finance, a loan is the transfer of money by one party to another with an agreement to pay it back. The recipient, or borrower, incurs a debt and is usually required to pay interest for the use of the money." 
<br>

Resource by wikipedia: https://en.wikipedia.org/wiki/Loan
filename: loan-prediction-eda.md
extension: .md
header_path: /

Chunk 2:
#### How the loan process works?
<br>
According to the article in investopedia.com: "When someone needs money, they apply for a loan from a bank, corporation, government, or other entity. The borrower may be required to provide specific details such as the reason for the loan, their financial history, Social Security number (SSN), and other information. The lender reviews this information as well as a person's debt-to-income (DTI) ratio to determine if the loan can be paid back.
<br>
Based on the applicant's creditworthiness, the lender either denies or approves the application. The lender

## Generate Embeddings

In [None]:
# sentence transformers
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.embeddings.openai import OpenAIEmbedding

# HuggingFace-backed sentence-transformers model.
# (Not referenced by the later cells visible in this notebook.)
huggingface_embeddings = HuggingFaceEmbedding(model_name="sentence-transformers/all-mpnet-base-v2")

# OpenAI embedding model; this is the one used to embed the chunks below.
openai_embeddings = OpenAIEmbedding(model="text-embedding-ada-002")


In [107]:
# Embed all chunks through the embedding model's batch API instead of issuing one
# request per node. The text sent for each node still includes its metadata.
node_texts = [node.get_content(metadata_mode="all") for node in md_chunk_nodes]
node_embeddings = openai_embeddings.get_text_embedding_batch(node_texts)

# Vectors are attached by position, so fail loudly if the counts ever disagree.
assert len(node_embeddings) == len(md_chunk_nodes), "embedding count does not match node count"
for node, node_embedding in zip(md_chunk_nodes, node_embeddings):
    node.embedding = node_embedding

## Load Nodes into a Vector Store

In [93]:
from pinecone import Pinecone, Index, ServerlessSpec

In [94]:
# Read the Pinecone key from the environment; the subscript lookup raises KeyError
# when the variable is unset. Presumably it is populated by the load_dotenv call
# near the top of the notebook — verify against the .env file.
api_key = os.environ["PINECONE_API_KEY"]
pc = Pinecone(api_key=api_key)

In [95]:
# Name of the Pinecone index that the following cells create (if missing) and open.
index_name = "llamaindex-rag-fs"

In [96]:
# dimensions are for text-embedding-ada-002
# Create the index only when no index with this name is listed yet, so re-running
# the cell does not try to create it a second time.
# NOTE(review): an index that already exists is reused as-is; its dimension and
# metric are not checked against the values below.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        index_name,
        dimension=1536,
        # NOTE(review): the retrieval cell further down prints this store's score as
        # "Similarity Score" — confirm how scores from a euclidean index should be
        # read, or whether cosine was intended.
        metric="euclidean",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [97]:
# Handle to the index named above, obtained from the Pinecone client.
pinecone_index = pc.Index(index_name)

In [99]:
from llama_index.vector_stores.pinecone import PineconeVectorStore
# LlamaIndex vector-store wrapper around the Pinecone index handle.
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)

In [108]:
# Push the embedded chunk nodes into the vector store; the cell output is the list
# of ids returned by add().
# NOTE(review): if the chunking cell is re-run before this one, the nodes may get
# fresh ids and be stored again alongside the old ones — confirm before re-running.
vector_store.add(md_chunk_nodes)

Upserted vectors: 100%|██████████| 20/20 [00:01<00:00, 16.63it/s]


['c3f75d5c-afd6-4420-88f9-a517a0908e2b',
 'c194ce2b-a020-4305-bcb0-aa0beeaddf69',
 'a07be9d1-9fe9-4aea-bfd4-fece2292e820',
 '4ed6fbfb-5426-4202-93f2-de19a03f0403',
 '5acf7f0f-4ff2-4f0e-8844-804c80dfe122',
 'a822768b-8015-49ff-8e90-2776a385dc03',
 'b82c42bb-a741-43c1-9ca4-de296550cd56',
 '3e263b84-f877-4aa7-9ef1-142b082aab35',
 '204c7314-7a08-4ec9-941c-c9691a23b962',
 '4769a99b-34b0-4fc1-a4b7-0d9514fa7d2c',
 '87b21693-3c21-4dbe-8c9b-87492ebd5576',
 'd088bf2f-5a8e-4717-9118-a4dcbfee3cb9',
 '071b2d97-1ae4-4c07-b7bf-bac9f1307fd4',
 '0737165c-d54e-4e25-8893-87f271359f57',
 '89b002d5-5eac-4b46-9ded-1df48498b00a',
 '691a666f-0f0e-4d7b-a1d3-b606935da57e',
 '3767b125-f1f5-420c-9ae2-959fc8ff843f',
 '05c507c9-8d99-4931-bccd-2d5f937664d4',
 'b8192456-021f-4d8f-b0cd-db654553b279',
 'ff2c9346-3719-4bd6-9977-993268ba64fa']

## Retrieve and Query from the Vector Store

In [109]:
from llama_index.core import VectorStoreIndex
from llama_index.core import StorageContext

In [111]:
# Build an index object on top of the vector store that was populated above.
# NOTE(review): no embed_model is passed, so query embedding relies on whatever
# LlamaIndex is configured to use by default — confirm it matches the model used to
# embed the nodes (text-embedding-ada-002).
index = VectorStoreIndex.from_vector_store(vector_store)

In [112]:
# Query engine created with default settings (no arguments are passed).
query_engine = index.as_query_engine()

In [145]:
# Question sent to the query engine in the next cell.
query_str = "how many credit records should be to avoid being rejected ?"

In [146]:
# Run the question through the query engine and show the generated answer.
# print() already converts its argument with str(), so no explicit cast is needed.
response = query_engine.query(query_str)
print(response)

To avoid being rejected, credit records should have credit scores above 740.


In [147]:
# Check what kind of object the response exposes as a source node (first one only).
type(response.source_nodes[0])

llama_index.core.schema.NodeWithScore

In [148]:
# Inspect the result to see which documents are retrieved
for node_with_score in response.source_nodes:
    node = node_with_score.node  # Access the Node object from NodeWithScore
    print("Node id:", node.id_)  # Access the similarity score
    print("Node metadata:", node.get_content())  # Access the similarity score
    print("Similarity Score:", node_with_score.score)  # Access the similarity score
    print("---------------------")

Node id: 4769a99b-34b0-4fc1-a4b7-0d9514fa7d2c
Node metadata: #### 2. Credit Score

I am amazed to see the credit scores around 540 - 550 separate the loan status into two parts in a very clear way. In the other words, the loan status is highly related to the credit score.
<br>
It is also interesting to see the credit score that separates the loan status is not 579 which is the highest score of the "poor" credit score. In the other words, the poor credit scores which are above 540 - 550 still have a good chance of being approved by loan lenders. This could be attributed to lenders' flexibility or specific factors that impact approval decisions.

However, we also notice a puzzling trend: some of the applicants with high credit scores (above 740) were still rejected. Few of them have only applied for a small amount of the loan which is below the median amount. What is the reason?

By creating the subset, we have more than 5 applicants with more than 740 credit scores, their applications w