### TFIDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import os
import pdfplumber
import joblib

#### Ingestion

In [2]:
def split_pdf_into_chunks(pdf_path, max_tokens=5, overlap=2):
    """Split a PDF's text into overlapping, whitespace-tokenized chunks.

    Every page is chunked on its own, so a chunk never spans a page
    boundary. Consecutive chunks of a page share ``overlap`` tokens.

    Args:
        pdf_path: Path of the PDF file to read.
        max_tokens: Maximum number of whitespace-separated tokens per chunk.
        overlap: Number of tokens repeated between neighbouring chunks.
            Must be smaller than ``max_tokens``.

    Returns:
        A list of ``[filename, pdf_path, page_number, chunk_text]`` rows,
        where ``page_number`` is 1-based.

    Raises:
        ValueError: If ``max_tokens - overlap`` is not positive, because the
            chunk window could then never advance.
    """
    # Validate up front: a zero step makes range() fail with an opaque
    # message, and a negative step would silently produce no chunks at all.
    step = max_tokens - overlap
    if step <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than max_tokens ({max_tokens})"
        )

    filename = os.path.basename(pdf_path)
    chunks = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            # Guard against pages for which no text comes back (e.g. scanned
            # images): treat a None/empty result as "no tokens".
            text = page.extract_text() or ""

            tokens = text.split()  # Simple tokenization by whitespace

            for start in range(0, len(tokens), step):
                window = tokens[start:start + max_tokens]

                # Store source file, page number and text chunk
                chunks.append([filename, pdf_path, page_num, ' '.join(window)])

    return chunks


In [None]:
# Build the path with os.path.join instead of a hard-coded backslash so the
# notebook also runs on hosts whose path separator is not "\".
pdf_path = os.path.join('data', 'data.pdf')

# 250-token windows, with 50 tokens shared between neighbouring chunks.
chunks = split_pdf_into_chunks(pdf_path, max_tokens=250, overlap=50)

In [None]:
# Tabulate the chunk rows: one row per chunk, columns in the same order as
# the fields of each row.
df = pd.DataFrame(
    data=chunks,
    columns=['filename', 'filepath', 'pg_no', 'text'],
)


In [6]:
# Vectorizer that drops English stop words before weighting terms.
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary from the chunk texts and build their TF-IDF matrix
# in a single pass (one matrix row per chunk).
corpus = df['text']
tfidf_matrix = tfidf.fit_transform(corpus)

In [7]:
# Bundle everything the inference cells need. They unpack this list by
# position, so the order of the entries must not change.
pkl_var = [
    tfidf,         # fitted vectorizer
    tfidf_matrix,  # TF-IDF matrix of df['text']
    df,            # chunk table
]

In [None]:
# Persist the bundle so inference can run without re-reading the PDF.
joblib.dump(value=pkl_var, filename='tfidf_datadf_01.pkl')

['tfidf_datadf_01.pkl']

#### Inference

In [1]:
from llama_index.llms.ollama import Ollama
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
import joblib

In [2]:
# NOTE: joblib.load unpickles the file, so only load artifacts written by a
# trusted run of the ingestion cells.
keyword_search_var = joblib.load('tfidf_datadf_01.pkl')

# The bundle is positional: vectorizer, TF-IDF matrix, chunk table. Slicing
# keeps the "first three entries" semantics of plain indexing.
tfidf, tfidf_matrix, df = keyword_search_var[:3]

In [None]:
# The query is wrapped in a list because the vectorizer works on a
# collection of documents, not on a bare string.
query = ['explain homeostasis ?']
# Reuse the already-fitted vectorizer so the query lands in the same
# TF-IDF feature space as the chunk matrix.
query_matrix = tfidf.transform(query)

In [None]:
# Cosine similarity of every chunk against the query.
cosine_sim = cosine_similarity(tfidf_matrix, query_matrix)

# Flatten to one score per chunk so the ranking compares plain numbers
# instead of 1-element array rows.
scores = cosine_sim.ravel()

# Row positions of the best-matching chunks, most similar first. sorted() is
# stable, so chunks with equal scores keep their original relative order.
top_k = 5
ranked = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)

# NOTE: despite the name, these are row positions in `df` (chunk indices),
# not PDF page numbers.
pg_list = ranked[:top_k]
pg_list

In [5]:
# Rows are taken in the order given by pg_list; use sorted(pg_list) instead
# if the chunks should appear in document order.
chunk_df = df.iloc[pg_list, :]

# Prefix every chunk with its label and put each on its own line. Using the
# label as the join separator left the first chunk unlabeled and glued each
# chunk's last word to the next label.
context = "\n".join(f"Chunk: {text}" for text in chunk_df['text'].to_list())

In [6]:
# NOTE(review): the name must match a model available on the local Ollama
# server -- confirm that 'llama3b' is the intended tag.
model_name = 'llama3b'

# Sampling settings go through the wrapper's own `temperature` field and
# `additional_kwargs` (forwarded as Ollama request options).
# `generate_kwargs` / `device_map` are not parameters of this wrapper, so the
# values passed that way did not reach the model.
# NOTE(review): confirm against the installed llama-index-llms-ollama version.
llm = Ollama(
    model=model_name,
    temperature=0.1,
    additional_kwargs={"top_k": 5, "top_p": 0.7},
)

In [9]:
# Question first, then the retrieved context on its own lines, then the
# fallback rule. The stray trailing "# " line and the "." glued onto the
# context were removed, and the fallback now says what to answer.
prompt = f"""Generate answer for question {query[0]} with following context / chunks:\n{context}\nIf you don't know the answer, say that you don't know and do not include any other text in the response."""

In [10]:
# Send the assembled prompt to the LLM and keep the completion object; the
# prompt is passed positionally on purpose.
response = llm.complete(prompt)

In [None]:
# Show only the generated text of the completion.
print(response.text)