In [1]:
directory_path = "/Mistral_/Data_Abstract"

In [2]:
import os
import uuid
import re
import json
from transformers import AutoTokenizer,AutoModel
import numpy as np
import torch
for filename in os.listdir(directory_path):
    file_path = os.path.join(directory_path, filename)
    base = os.path.basename(file_path)
    sku = os.path.splitext(base)[0]
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
doc_id = str(uuid.uuid4())


In [4]:
def chunking(directory_path, model_name, chunk_size,para_seperator="/n/n", separator=" "):

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    documents = {}
    all_chunks = {}
    for filename in os.listdir(directory_path):
        file_path = os.path.join(directory_path, filename)
        base = os.path.basename(file_path)
        sku = os.path.splitext(base)[0]
        if os.path.isfile(file_path):
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()

            doc_id = str(uuid.uuid4())

            paragraphs = re.split(para_seperator, text)

            for paragraph in paragraphs:
                words = paragraph.split(separator)
                current_chunk_str = ""
                chunk = []
                for word in words:
                    if current_chunk_str:
                        new_chunk = current_chunk_str + separator + word
                    else:
                        new_chunk = current_chunk_str + word
                    if len(tokenizer.tokenize(new_chunk)) <= chunk_size:
                        current_chunk_str = new_chunk
                    else:
                        if current_chunk_str:
                            chunk.append(current_chunk_str)
                        current_chunk_str = word

                if current_chunk_str:
                    chunk.append(current_chunk_str)

                for chunk in chunk:
                    chunk_id = str(uuid.uuid4())
                    all_chunks[chunk_id] = {"text": chunk, "metadata": {"file_name":sku}}
        documents[doc_id] = all_chunks
    return documents

In [5]:
model_name = "BAAI/bge-small-en-v1.5"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def map_document_embeddings(documents, tokenizer, model):
    mapped_document_db = {}
    for id, dict_content in documents.items():
        mapped_embeddings = {}
        for content_id, text_content in dict_content.items():
            text = text_content.get("text")
            inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
            with torch.no_grad():
                embeddings = model(**inputs).last_hidden_state.mean(dim=1).squeeze().tolist()
            mapped_embeddings[content_id] = embeddings
        mapped_document_db[id] = mapped_embeddings
    return mapped_document_db

In [6]:
path = " /Users/pookie/PycharmProjects/PythonProject1/.venv/Mistral_/Data.json "

In [7]:
def save_json(path, data):
    with open(path, 'w') as f:
        json.dump(data, f, indent=4)

In [8]:
Chunking = chunking(directory_path, model_name, chunk_size=100)
Embeddings_map = map_document_embeddings(Chunking,AutoTokenizer.from_pretrained(model_name),AutoModel.from_pretrained(model_name) )

In [11]:
def retrieve_information(query, top_k):
    query_inputs = tokenizer(query, return_tensors="pt", padding=True, truncation=True)
    query_embeddings = model(**query_inputs).last_hidden_state.mean(dim=1).squeeze()
    query_embeddings=query_embeddings.tolist()
    query_embeddings=np.array(query_embeddings)

    scores = {}
    for doc_id, chunk_dict in Embeddings_map.items():
        for chunk_id, chunk_embeddings in chunk_dict.items():
            chunk_embeddings = np.array(chunk_embeddings)

            normalized_query = np.linalg.norm(query_embeddings)
            normalized_chunk = np.linalg.norm(chunk_embeddings)

            if normalized_chunk == 0 or normalized_query == 0:
                score == 0
            else:
                score = np.dot(chunk_embeddings, query_embeddings)/ (normalized_chunk * normalized_query)

            scores[(doc_id, chunk_id )] = score
    sorted_scores = sorted(scores.items(), key=lambda item: item[1], reverse=True)[:top_k]
    top_results=[]
    for ((doc_id, chunk_id), score) in sorted_scores:
        results = (doc_id, chunk_id, score)
        top_results.append(results)

    return top_results

In [12]:
save_json('doc_store.json', Chunking)
save_json('vector_store.json', Embeddings_map)


In [13]:
def read_json(path):
    with open(path, 'r') as f:
        data = json.load(f)
    return data

document_data= read_json("doc_store.json")
vector_data= read_json("vector_store.json")

In [14]:
query = " What is the title of the paper ?"

In [15]:
first_match = retrieve_information(query,1)
first_match

[('193a774a-9f56-40c9-b1fd-80a802cf90b3',
  'b87151de-b7e2-45a7-b496-8b8b110903f4',
  0.5217984002372201)]

In [16]:
doc_id = first_match[0][0]
doc_id

'193a774a-9f56-40c9-b1fd-80a802cf90b3'

In [17]:
chunk_id = first_match[0][1]

In [18]:
related_text = document_data[doc_id][chunk_id]
print(related_text)

{'text': 'on\npharmaceuticals, addressing astronaut safety,\ndrug discovery acceleration, benefits for Earth,\nand drug development needs for space missions.\nInnovations like 3D printing and microfluidics\nare crucial for maintaining medication efficacy\nand supporting long-duration missions, ensuring\nthe success and sustainability of human space\nexploration.\nWritten by : Saksham Tiwari and Heramb\n\nPaper title: Unraveling the financial dimension of Space endeavors\nAbstract : Astro-economics, an emerging interdisciplinary field, examines the economic principles\nand implications of human activities in', 'metadata': {'file_name': 'Abstract'}}


In [19]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"


In [20]:
import os
from langchain_mistralai import ChatMistralAI  # Changed import
from langchain.prompts import ChatPromptTemplate
from dotenv import load_dotenv

load_dotenv()

# Initialize Mistral model (replaces OpenAI setup)
model = ChatMistralAI(
    mistral_api_key=os.environ["MISTRAL_API_KEY"],
    model="mistral-large-latest"
)

In [21]:
def generate_lin_response(mistral_model, query, relavent_text):
    template = """
    You are an intelligent search engine. You will be provided with some retrieved context, as well as the users query.

    Your job is to understand the request, and answer based on the retrieved context.
    Here is context:
    <context>
    {context}
    </context>

    Question: {question}
    """

    prompt = ChatPromptTemplate.from_template(template=template)
    chain = prompt | mistral_model

    # Fixed parameter casing (Context -> context)
    response = chain.invoke({"context": relavent_text["text"], "question": query})
    return response

In [22]:
# Usage remains the same
generate_lin_response(model, query, related_text)

AIMessage(content='The title of the paper is "Unraveling the financial dimension of Space endeavors."', additional_kwargs={}, response_metadata={'token_usage': {'prompt_tokens': 221, 'total_tokens': 241, 'completion_tokens': 20}, 'model': 'mistral-large-latest', 'finish_reason': 'stop'}, id='run-2647db0a-fee2-4fa6-ad29-3873a134b959-0', usage_metadata={'input_tokens': 221, 'output_tokens': 20, 'total_tokens': 241})