# Building the chatbot using Deepseek R1 1.5B model on Ollama - hosted locally

## Running on a local system

In [1]:
#import libraries
from openai import OpenAI
import pdfplumber
import re
import pandas as pd
import json
pdf_path = "NipponIndia-Short-Term-Fund-Jan-2024.pdf"

### Using Ollama as a server for the locally stored DeepSeek R1 1.5B model

In [7]:
# OpenAI-compatible client aimed at a server on this machine (base_url is localhost:11434).
# NOTE(review): "ollama" looks like a placeholder key rather than a real secret —
# confirm the local server does not validate it.
client = OpenAI(api_key="ollama", base_url="http://localhost:11434/v1/")

### Enable Custom Knowledge to DeepSeek

In [9]:
def get_response(messages, model="deepseek-r1:1.5b", temperature=0.7, top_p=0.85):
    """Send a chat conversation to the model and return its answer without reasoning traces.

    Args:
        messages: Chat messages forwarded unchanged to the chat-completions call.
        model: Name of the model to query.
        temperature: Sampling temperature forwarded to the API.
        top_p: Nucleus-sampling threshold forwarded to the API.

    Returns:
        The first choice's text with every ``<think>...</think>`` block removed
        and surrounding whitespace stripped. An empty string is returned when
        the reply carries no text content.
    """
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,  # Controls randomness (lower = more predictable)
        top_p=top_p,  # Limits vocabulary diversity
    )

    # The reply's content may be None; fall back to "" so re.sub does not raise TypeError.
    raw_text = response.choices[0].message.content or ""

    # DOTALL lets the non-greedy match span the multi-line reasoning block.
    cleaned_text = re.sub(r'<think>.*?</think>', '', raw_text, flags=re.DOTALL)
    return cleaned_text.strip()

# Example usage
# messages = [{'role': 'user', 'content': 'Tell me about space exploration.'}]
# print(get_response(messages, temperature=0.6, top_p=0.85))


In [11]:
# Ask a question with no supporting context: the message list holds only the user prompt.
prompt='What is the theme of DHS 2025?'
messages=[{"role":"user","content":prompt}]
print(get_response(messages))

I am sorry, I cannot answer that question.


**Loading the PDF document for context**

This was a big challenge because the document's layout is complex. The strategy I used is as follows:

* Extract text from page 1
* Extract the tables and text separately
* Format the extracted tables correctly

In [3]:
def extract_specific_table(pdf_path, page_num, table_index):
    """Return one table from a single page of a PDF.

    Args:
        pdf_path: Path of the PDF handed to ``pdfplumber.open``.
        page_num: 1-based page number.
        table_index: 0-based position of the table within the list returned
            by ``page.extract_tables()``.

    Returns:
        The selected table, or ``None`` when ``table_index`` is negative or
        not smaller than the number of tables on the page.

    Raises:
        IndexError: If ``page_num`` is not between 1 and the page count.
    """
    with pdfplumber.open(pdf_path) as pdf:
        page_count = len(pdf.pages)
        # Reject page_num <= 0 explicitly: page_num - 1 would otherwise become a
        # negative index and silently select a page counted from the end.
        if not 1 <= page_num <= page_count:
            raise IndexError(
                f"page_num {page_num} is out of range (document has {page_count} pages)"
            )
        tables = pdf.pages[page_num - 1].extract_tables()

    if 0 <= table_index < len(tables):  # Ensure valid index
        return tables[table_index]
    return None

def format_table(data):
    """Convert a two-column extracted table into a ``{key: value}`` dict.

    A key cell containing newlines is treated as several keys stacked in one
    cell. The first part keeps the row's value. Each further part fills the
    next row's key if that key is ``None``; otherwise it becomes its own entry
    with an empty-string value.

    Args:
        data: Sequence of two-item rows ``[key, value]``, or ``None``/empty.
            The caller's rows are not modified.

    Returns:
        Dict mapping each key to its value, with ``None`` values replaced by
        ``" "``. A ``None`` key that was not back-filled is stored as ``""``.
        Returns ``{}`` when ``data`` is ``None`` or empty. When a key repeats,
        the last value wins.

    Raises:
        ValueError: If a row does not unpack into exactly two items.
    """
    if not data:
        return {}

    # Copy each row so back-filling a None key never mutates the caller's table.
    rows = [list(row) for row in data]
    formatted_data = []

    for i, (key, value) in enumerate(rows):
        if key is None:
            # No preceding multi-line key supplied a label for this row.
            key = ""

        if "\n" not in key:
            formatted_data.append([key, value])
            continue

        # If key contains newline, split it and distribute parts across rows
        first_key, *extra_keys = key.split("\n")
        formatted_data.append([first_key, value])  # First part remains
        for extra_key in extra_keys:
            if i + 1 < len(rows) and rows[i + 1][0] is None:
                # Picked up when the loop reaches row i + 1.
                rows[i + 1][0] = extra_key
            else:
                formatted_data.append([extra_key, ""])  # Otherwise, add separately

    return {key: (value if value is not None else " ") for key, value in formatted_data}


def extract_sections(pdf_path, page_num, bbox_list):
    """Extract titled text sections from rectangular regions of one PDF page.

    Args:
        pdf_path: Path of the PDF handed to ``pdfplumber.open``.
        page_num: 1-based page number.
        bbox_list: Iterable of bounding boxes, each passed unchanged to
            ``page.within_bbox``.

    Returns:
        Dict mapping the first text line of each region to the remaining
        lines joined by single spaces. Regions that yield no text are
        skipped; a repeated title overwrites the earlier entry.

    Raises:
        IndexError: If ``page_num`` is not between 1 and the page count.
    """
    extracted_data = {}

    with pdfplumber.open(pdf_path) as pdf:
        page_count = len(pdf.pages)
        # Reject page_num <= 0 explicitly: page_num - 1 would otherwise become a
        # negative index and silently select a page counted from the end.
        if not 1 <= page_num <= page_count:
            raise IndexError(
                f"page_num {page_num} is out of range (document has {page_count} pages)"
            )
        page = pdf.pages[page_num - 1]

        for bbox in bbox_list:
            text = page.within_bbox(bbox).extract_text()
            if not text:
                continue

            # First line is taken as the section title, the rest as its content.
            title, *body_lines = text.split("\n")
            extracted_data[title] = " ".join(body_lines)

    return extracted_data

def extract_text_from_first_page(pdf_path):
    """Return the first page's text flattened onto one line.

    Newlines are replaced by spaces and the result is stripped. When the page
    yields no text, the string "No text found" is returned instead.
    """
    with pdfplumber.open(pdf_path) as document:
        raw_text = document.pages[0].extract_text()  # pages are zero-indexed

    if not raw_text:
        return "No text found"

    flattened = raw_text.replace('\n', ' ')
    return flattened.strip()


In [5]:
# Key/value tables from page 2 (table_index is 0-based, so index 0 is Table 1).
scheme_info = format_table(extract_specific_table(pdf_path, page_num=2, table_index=2)) # Table 3 on pg2
scheme_attribute = format_table(extract_specific_table(pdf_path, page_num=2, table_index=0)) # Table 1 on pg2

# Table 2 on page 2 is a long multi-column table, so it is loaded into a DataFrame
# instead of going through format_table.
portfolio_details_tab = extract_specific_table(pdf_path, page_num=2, table_index=1)
# Row 1 supplies the column headers; line breaks inside a header become spaces and
# empty header cells are named "Unknown".
# NOTE(review): row 0 is skipped — presumably a title row; confirm against the PDF.
columns = [col.replace("\n", " ") if col else "Unknown" for col in portfolio_details_tab[1]]
# Everything after the header row is treated as data.
data = portfolio_details_tab[2:]
portfolio_details_df = pd.DataFrame(data, columns=columns)

# Serialise as a JSON string, one object per table row.
portfolio_details = portfolio_details_df.to_json(orient="records", indent=4) 


# Text sections from page 2
# Bounding boxes were chosen manually (adjust based on your debugging).
# NOTE(review): values are assumed to follow pdfplumber's (x0, top, x1, bottom) order — confirm.
bbox_list = [
    (40, 30, 300, 80),  # "About Nippon" section
    (40, 80, 300, 210),  # "Who Should Invest" section
    (40, 220, 300, 310)  # "Current Strategy" section
]

# Maps each section's first line (its title) to the rest of its text.
about_scheme = extract_sections(pdf_path,2, bbox_list)

# Full text of the first page, flattened onto a single line
first_page_text = extract_text_from_first_page(pdf_path)

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

In [41]:
scheme_attribute

{'Scheme Attributes as on January 31, 2024': ' ',
 'Weighted Average YTM*': '7.85%',
 'Weighted Average Maturity': '3.47 Years',
 'Modified Duration': '2.69 Years'}

In [7]:
scheme_info

{'Inception Date': 'December 18, 2002',
 'Fund Manager#': 'Sushil Budhia, Vivek Sharma',
 'Minimum Application': 'Rs. 5,000 & in multiples of Re.1 thereafter',
 'Benchmark': '',
 'Amount': 'CRISIL Short Duration B-II Index',
 'Month end AUM as on': 'Rs. 5,967.53Crs',
 'January 31, 2024': '',
 'Plans & Options': 'i)Growth Plan/Direct Plan - Growth Plan:\nGrowth Option\nii) a) IDCW Plan/ Direct Plan - IDCW Plan\n- IDCW Option\nb) IDCW Plan/ Direct Plan - IDCW Plan\nwith the frequency of Monthly,\nQuarterly)\nBoth the above mentioned IDCW\nplan/ Direct Plan - IDCW Plan offers\nPayout of IDCW and Reinvestment\nof IDCW facility.',
 '(** If charged, the same shall': '',
 'be credited to the scheme': '',
 'immediately net of goods &': '',
 'service tax, if any)': '',
 'Exit Load**': 'Nil',
 '': ' '}

In [11]:
print(portfolio_details)

[
    {
        "Company\/Issuer":"Certificate of Deposit",
        "Rating":"",
        "% of Assets":"1.56%"
    },
    {
        "Company\/Issuer":"Axis Bank Limited",
        "Rating":"CRISIL A1+",
        "% of Assets":"1.56%"
    },
    {
        "Company\/Issuer":"Commercial Paper",
        "Rating":"",
        "% of Assets":"0.82%"
    },
    {
        "Company\/Issuer":"HDFC Bank Limited",
        "Rating":"CRISIL A1+",
        "% of Assets":"0.82%"
    },
    {
        "Company\/Issuer":"Corporate Bond",
        "Rating":"",
        "% of Assets":"57.43%"
    },
    {
        "Company\/Issuer":"REC Limited",
        "Rating":"CRISIL AAA",
        "% of Assets":"6.83%"
    },
    {
        "Company\/Issuer":"Small Industries Dev Bank of India",
        "Rating":"ICRA\nAAA\/CRISIL\nAAA",
        "% of Assets":"5.21%"
    },
    {
        "Company\/Issuer":"National Bank For Agriculture and Rural\nDevelopment",
        "Rating":"CRISIL\nAAA\/ICRA\nAAA",
        "% of Assets":"5.