In [1]:
!pip install faker

Collecting faker
  Downloading Faker-30.6.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-30.6.0-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-30.6.0


In [2]:
from faker import Faker
import random

fake = Faker()
from datetime import date
from dateutil.relativedelta import relativedelta

six_months = date.today() - relativedelta(months=+6)
three_months = date.today() - relativedelta(months=+3)
months = [three_months, six_months]

# Generate demographic and personal information
def generate_customer_data():
    age = random.randint(20, 70)
    gender = random.choice(['Male', 'Female'])
    marital_status = random.choice(['Single', 'Married', 'Divorced', 'Widowed'])
    income_level = random.choice(['Low', 'Medium', 'High'])
    education = random.choice(['High School', 'College', 'University'])
    occupation = fake.job()
    residential_status = random.choice(['Owns house', 'Rents', 'Living with parents'])
    dependents = random.randint(0, 5),  # Number of dependents
    debt_to_income = round(random.uniform(0.1, 0.5), 2),  # Debt-to-income ratio
    credit_bureau = random.randint(760, 850)

    return {
        'Age': age,
        'Gender': gender,
        'Marital Status': marital_status,
        'Income Level': income_level,
        'Education': education,
        'Occupation': occupation,
        'Residential Status': residential_status,
        'Dependents': dependents,
        'Debt-to-Income': debt_to_income,
        'Credit_Bureau': credit_bureau
    }

# Function to generate bureau product inquiries
def generate_inquiries(last_months):
    inquiries = []
    today = fake.date_this_month()

    # Generate inquiries for the last `last_months` period
    for _ in range(random.randint(1, 5)):  # Random number of inquiries
        inquiry_date = fake.date_between(start_date=last_months, end_date=today)
        product_type = random.choice(['Personal Loan', 'Credit Card', 'Mortgage'])
        inquiries.append({'product_name': product_type, 'date': inquiry_date})

    return inquiries if inquiries else []

In [3]:
# Function to generate dataset
def generate_dataset(num_rows,months):
    data_rows = []

    for _ in range(num_rows):
        customer_data = generate_customer_data()
        last_3_months_inquiries = generate_inquiries(months[0])
        last_6_months_inquiries = generate_inquiries(months[1])

        # Initialize columns for each product type
        customer_row = {
            'Customer ID': fake.uuid4(),
            'Age': customer_data['Age'],
            'Gender': customer_data['Gender'],
            'Marital Status': customer_data['Marital Status'],
            'Income Level': customer_data['Income Level'],
            'Education': customer_data['Education'],
            'Occupation': customer_data['Occupation'],
            'Residential Status': customer_data['Residential Status'],
            'Dependents': customer_data['Dependents'],
            'Debt-to-Income': customer_data['Debt-to-Income'],
            'Credit_Bureau': customer_data['Credit_Bureau']
        }

        # Process last 3 months inquiries
        for product_type in ['Personal Loan', 'Credit Card', 'Mortgage']:
            inq_in_last_3_months = any(inq['product_name'] == product_type for inq in last_3_months_inquiries)
            customer_row[f'last_3months_{product_type.replace(" ", "_").lower()}_inq'] = inq_in_last_3_months

        # Process last 6 months inquiries
        for product_type in ['Personal Loan', 'Credit Card', 'Mortgage']:
            inq_in_last_6_months = any(inq['product_name'] == product_type for inq in last_6_months_inquiries)
            customer_row[f'last_6months_{product_type.replace(" ", "_").lower()}_inq'] = inq_in_last_6_months

        data_rows.append(customer_row)

    return data_rows

# Example usage to generate 50 rows of data
dataset = generate_dataset(50, months)

In [4]:
# Example usage to generate 50 rows of data
dataset = generate_dataset(50, months)

In [5]:
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore')
df = pd.DataFrame(dataset)
df.to_csv("products_info.csv")

In [6]:
df.head()

Unnamed: 0,Customer ID,Age,Gender,Marital Status,Income Level,Education,Occupation,Residential Status,Dependents,Debt-to-Income,Credit_Bureau,last_3months_personal_loan_inq,last_3months_credit_card_inq,last_3months_mortgage_inq,last_6months_personal_loan_inq,last_6months_credit_card_inq,last_6months_mortgage_inq
0,926b9acd-fde2-40d7-99b7-f59c10ee22ea,53,Female,Married,Low,High School,"Engineer, control and instrumentation",Living with parents,"(4,)","(0.34,)",775,True,False,False,True,True,True
1,551a2b78-252d-499c-832e-ff67f24acd7b,21,Female,Single,Medium,High School,Marketing executive,Living with parents,"(0,)","(0.17,)",842,True,False,True,True,False,False
2,b7b1a011-d743-4c4e-9338-e745f9d14b3b,51,Male,Single,High,College,Ophthalmologist,Rents,"(4,)","(0.39,)",819,True,True,True,False,True,True
3,9f2597f8-430f-49eb-835a-4059715b5ace,30,Male,Divorced,Low,University,Fast food restaurant manager,Owns house,"(2,)","(0.38,)",760,True,True,True,False,False,True
4,4dce7345-ad7b-4b0c-8be2-3a0287fdd0d8,41,Male,Single,Medium,College,Pensions consultant,Living with parents,"(3,)","(0.17,)",798,False,False,True,False,True,True


In [7]:
dataset[0]

{'Customer ID': '926b9acd-fde2-40d7-99b7-f59c10ee22ea',
 'Age': 53,
 'Gender': 'Female',
 'Marital Status': 'Married',
 'Income Level': 'Low',
 'Education': 'High School',
 'Occupation': 'Engineer, control and instrumentation',
 'Residential Status': 'Living with parents',
 'Dependents': (4,),
 'Debt-to-Income': (0.34,),
 'Credit_Bureau': 775,
 'last_3months_personal_loan_inq': True,
 'last_3months_credit_card_inq': False,
 'last_3months_mortgage_inq': False,
 'last_6months_personal_loan_inq': True,
 'last_6months_credit_card_inq': True,
 'last_6months_mortgage_inq': True}

In [8]:
df['content'] = [f"Based on the following customer data: {data}, suggest suitable banking lending products." for data in dataset]
df.head()

Unnamed: 0,Customer ID,Age,Gender,Marital Status,Income Level,Education,Occupation,Residential Status,Dependents,Debt-to-Income,Credit_Bureau,last_3months_personal_loan_inq,last_3months_credit_card_inq,last_3months_mortgage_inq,last_6months_personal_loan_inq,last_6months_credit_card_inq,last_6months_mortgage_inq,content
0,926b9acd-fde2-40d7-99b7-f59c10ee22ea,53,Female,Married,Low,High School,"Engineer, control and instrumentation",Living with parents,"(4,)","(0.34,)",775,True,False,False,True,True,True,Based on the following customer data: {'Custom...
1,551a2b78-252d-499c-832e-ff67f24acd7b,21,Female,Single,Medium,High School,Marketing executive,Living with parents,"(0,)","(0.17,)",842,True,False,True,True,False,False,Based on the following customer data: {'Custom...
2,b7b1a011-d743-4c4e-9338-e745f9d14b3b,51,Male,Single,High,College,Ophthalmologist,Rents,"(4,)","(0.39,)",819,True,True,True,False,True,True,Based on the following customer data: {'Custom...
3,9f2597f8-430f-49eb-835a-4059715b5ace,30,Male,Divorced,Low,University,Fast food restaurant manager,Owns house,"(2,)","(0.38,)",760,True,True,True,False,False,True,Based on the following customer data: {'Custom...
4,4dce7345-ad7b-4b0c-8be2-3a0287fdd0d8,41,Male,Single,Medium,College,Pensions consultant,Living with parents,"(3,)","(0.17,)",798,False,False,True,False,True,True,Based on the following customer data: {'Custom...


In [9]:
df['content'][0]

"Based on the following customer data: {'Customer ID': '926b9acd-fde2-40d7-99b7-f59c10ee22ea', 'Age': 53, 'Gender': 'Female', 'Marital Status': 'Married', 'Income Level': 'Low', 'Education': 'High School', 'Occupation': 'Engineer, control and instrumentation', 'Residential Status': 'Living with parents', 'Dependents': (4,), 'Debt-to-Income': (0.34,), 'Credit_Bureau': 775, 'last_3months_personal_loan_inq': True, 'last_3months_credit_card_inq': False, 'last_3months_mortgage_inq': False, 'last_6months_personal_loan_inq': True, 'last_6months_credit_card_inq': True, 'last_6months_mortgage_inq': True}, suggest suitable banking lending products."

In [10]:
!pip install langchain langchain-community langchain-core transformers

Collecting langchain
  Downloading langchain-0.3.4-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.3-py3-none-any.whl.metadata (2.8 kB)
Collecting langchain-core
  Downloading langchain_core-0.3.12-py3-none-any.whl.metadata (6.3 kB)
Collecting langchain-text-splitters<0.4.0,>=0.3.0 (from langchain)
  Downloading langchain_text_splitters-0.3.0-py3-none-any.whl.metadata (2.3 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.136-py3-none-any.whl.metadata (13 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.6.0-py3-none-any.whl.metadata (3.5 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain-core)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (fro

In [11]:
from langchain.docstore.document import Document

# Prepare documents for LangChain
documents = []
for _, row in df.iterrows():
    documents.append(Document(page_content=row["content"], metadata={"class": row["Age"]}))

In [12]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.2.0-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.2.0-py3-none-any.whl (255 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.2/255.2 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.2.0


In [13]:
!pip install chromadb
!pip install bitsandbytes accelerate

Collecting chromadb
  Downloading chromadb-0.5.15-py3-none-any.whl.metadata (6.8 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.2-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.32.0-py3-none-any.whl.metadata (6.6 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.7.0-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.19.2-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.27.0-py3

In [14]:
from langchain_community.embeddings import HuggingFaceEmbeddings
hg_embeddings = HuggingFaceEmbeddings()

  hg_embeddings = HuggingFaceEmbeddings()
  hg_embeddings = HuggingFaceEmbeddings()


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [15]:
from langchain.vectorstores import Chroma

persist_directory = '/content/'

langchain_chroma = Chroma.from_documents(
    documents=documents,
    collection_name="recommendation_engine",
    embedding=hg_embeddings,
    persist_directory=persist_directory
)

In [16]:
from torch import cuda, bfloat16
import torch
import transformers
from transformers import AutoTokenizer
from time import time
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma

model_id = 'HuggingFaceH4/zephyr-7b-beta'

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

print(device)

cuda:0


In [17]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

model_config = transformers.AutoConfig.from_pretrained(
   model_id,
    trust_remote_code=True,
    max_new_tokens=1024
)
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

In [18]:
# Initialize the query pipeline with increased max_length
query_pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    max_length=6000,  # Increase max_length
    max_new_tokens=500,  # Control the number of new tokens generated
    device_map="auto",
)

In [19]:
from IPython.display import display, Markdown
def colorize_text(text):
    for word, color in zip(["Reasoning", "Question", "Answer", "Total time"], ["blue", "red", "green", "magenta"]):
        text = text.replace(f"{word}:", f"\n\n**<font color='{color}'>{word}:</font>**")
    return text

llm = HuggingFacePipeline(pipeline=query_pipeline)

question = "What is Recommendation Engie and How it used in Finance Domain?"
response = llm(prompt=question)

full_response =  f"Question: {question}\nAnswer: {response}"
display(Markdown(colorize_text(full_response)))

  llm = HuggingFacePipeline(pipeline=query_pipeline)
  response = llm(prompt=question)
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=500) and `max_length`(=6000) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)




**<font color='red'>Question:</font>** What is Recommendation Engie and How it used in Finance Domain?


**<font color='green'>Answer:</font>** What is Recommendation Engie and How it used in Finance Domain?

Recommendation Engie is a sub-field of Machine Learning which is used to predict the preferences or the interests of a user based on their past behavior. It is used to suggest products, services, or content to the users based on their preferences.

In the finance domain, recommendation engines are used to suggest investment opportunities, financial products, and services to the users based on their financial goals, risk tolerance, and investment history.

The recommendation engine in finance domain is used to:

1. Personalize the investment experience: By analyzing the user's financial data, investment history, and risk tolerance, the recommendation engine can suggest investment opportunities that are tailored to the user's specific needs and goals.

2. Improve investment performance: By suggesting investment opportunities that are aligned with the user's financial goals and risk tolerance, the recommendation engine can help the user achieve better investment performance.

3. Reduce investment risk: By suggesting investment opportunities that are aligned with the user's risk tolerance, the recommendation engine can help the user avoid investments that are too risky for their financial situation.

4. Enhance user experience: By providing personalized investment recommendations, the recommendation engine can enhance the user's overall investment experience and make it more convenient and efficient.

Some popular recommendation algorithms used in finance domain are:

1. Collaborative Filtering: This algorithm suggests investment opportunities based on the user's past investment behavior and the investment behavior of similar users.

2. Content-Based Filtering: This algorithm suggests investment opportunities based on the user's investment preferences and financial goals.

3. Hybrid Filtering: This algorithm combines Collaborative Filtering and Content-Based Filtering to provide more accurate and personalized investment recommendations.

4. Deep Learning-based Recommendation: This algorithm uses neural networks to analyze the user's financial data and investment history to suggest investment opportunities.

In summary, recommendation engines are a powerful tool in the finance domain that can help users achieve better investment performance, reduce investment risk, and enhance their overall investment experience. By using recommendation algorithms like Collaborative Filtering, Content-Based Filtering, Hybrid Filtering, and Deep Learning-based Recommendation, financial institutions can provide more personalized and accurate investment recommendations to their users.