In [76]:
import pandas as pd
import numpy as np
from langchain.document_loaders.pdf import PyPDFDirectoryLoader

In [77]:
DATA_PATH = r'C:\QpiAi'

In [5]:
def load_documents():
    document_loader = PyPDFDirectoryLoader(DATA_PATH)
    return document_loader.load()
document = load_documents()
len(document)

103

In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [7]:
def split_documents(documents: list[document]):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1700,
        chunk_overlap=170,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_documents(documents)
doc = split_documents(document)
len(doc)

262

In [78]:
stored_meta = pd.read_csv('arxiv_metadata.csv')

In [79]:
stored_meta

Unnamed: 0,arxiv_id,Author,Title,Abstract,publication_date
0,2409.18128,"Wenliang Zhao, Minglei Shi, Xumin Yu, Jie Zhou...",FlowTurbo: Towards Real-time Flow-Based Image ...,Building on the success of diffusion models in...,2024-09
1,2409.18127,"Fangzhou Hong, Vladimir Guzov, Hyo Jin Kim, Yu...",EgoLM: Multi-Modal Language Model of Egocentri...,"As the prevalence of wearable devices, learnin...",2024-09
2,2409.18125,"Chenming Zhu, Tai Wang, Wenwei Zhang, Jiangmia...",LLaVA-3D: A Simple yet Effective Pathway to Em...,Recent advancements in Large Multimodal Models...,2024-09
3,2409.18124,"Jing He, Haodong Li, Wei Yin, Yixun Liang, Leh...",Lotus: Diffusion-based Visual Foundation Model...,Leveraging the visual priors of pre-trained te...,2024-09
4,2409.18119,"Yuexi Du, John Onofrey, Nicha C. Dvornek",Multi-View and Multi-Scale Alignment for Contr...,Contrastive Language-Image Pre-training (CLIP)...,2024-09


In [10]:
def calculate_chunk_ids(chunks):
    # This will create IDs like "data/monopoly.pdf:6:2"
    # Page Source : Page Number : Chunk Index
    last_page_id = None
    current_chunk_index = 0

    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        current_page_id = f"{source}:{page}"

        # If the page ID is the same as the last one, increment the index.
        if current_page_id == last_page_id:
            current_chunk_index += 1
        else:
            current_chunk_index = 0

        # Calculate the chunk ID.
        chunk_id = f"{current_page_id}:{current_chunk_index}"
        last_page_id = current_page_id

        # Add it to the page meta-data.
        chunk.metadata["id"] = chunk_id

    return chunks

In [11]:
def metadata_adding(doc, stored_meta):
    document = doc
    stored_metadata = stored_meta

    for doc in document:
        # Extract the paper ID from the source filename
        source_path = doc.metadata['source']
        
        # Find the corresponding metadata entry using pandas DataFrame filtering
        matching_metadata = stored_metadata[
            stored_metadata['arxiv_id'].astype(str).str.contains(
                source_path.split('_')[-1].replace('.pdf', '')
            )
        ]

        if not matching_metadata.empty:
            # Access values from the matching metadata entry
            author = matching_metadata.iloc[0]['Author']
            publication_date = matching_metadata.iloc[0]['publication_date']
            
            # Add the author and publication_date directly to the chunk's metadata
            doc.metadata['Author'] = author
            doc.metadata['publication_date'] = publication_date

    return document  # Return the document with the updated metadata

In [81]:
from sentence_transformers import SentenceTransformer

In [82]:
model = SentenceTransformer('all-MiniLM-L6-v2')



In [80]:
api_key = 'd7204d21-cb62-4544-b49c-9169b420c0e1'

In [15]:
import time
from pinecone import Pinecone, ServerlessSpec
from pinecone.exceptions import PineconeApiException

In [None]:
def add_to_pinecone(chunks: list[document]):
    pc = Pinecone(api_key=api_key)
    index_name = 'embeddings6'

    # Initialize Pinecone
    try:
        if index_name not in pc.list_indexes():
            pc.create_index(name=index_name, dimension=384, spec=ServerlessSpec(
                cloud="aws",
                region="us-east-1"))
            print('✅ Creating new index')
    except PineconeApiException as e:
        print(f'✅ Index "{index_name}" already exists')

    # Connect to the existing index
    index = pc.Index(index_name)

    # Calculate Page IDs.
    chunks_with_ids = calculate_chunk_ids(chunks)
    chunks_with_ids = metadata_adding(chunks_with_ids, stored_meta)

    # Fetch existing IDs
    existing_ids = set()
    ids_to_check = [chunk.metadata["id"] for chunk in chunks_with_ids]

    try:
        fetch_response = index.fetch(ids=ids_to_check)
        if fetch_response and 'vectors' in fetch_response:

            existing_ids = set(fetch_response['vectors'].keys())
    except Exception as e:
        print(f"Error fetching existing IDs: {e}")

    print(f"Number of existing documents in DB: {len(existing_ids)}")

    # Check for new chunks
    new_chunks = []
    for chunk in chunks_with_ids:
        if chunk.metadata["id"] not in existing_ids:
            new_chunks.append(chunk)
        else:
            print(f"Duplicate found for ID: {chunk.metadata['id']} - not adding.")

    if len(new_chunks):
        print(f"👉 Adding new documents: {len(new_chunks)}")

        # Prepare all new chunks for upsert
        new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks]
        batch_texts = [str(chunk.page_content) for chunk in new_chunks]  # Force conversion to str
        embedded_texts = [model.encode(text) for text in batch_texts]  # Get embeddings
        
        vectors_with_metadata = []
        
        for idx, embedding in enumerate(embedded_texts):
            authors_list = [author.strip() for author in new_chunks[idx].metadata.get('Author', 'Unknown').split(',')]
            
            vectors_with_metadata.append({
                'id': new_chunk_ids[idx],
                'values': embedding,
                'metadata': {
                    'text': batch_texts[idx],  # Store the text as metadata
                    'Author': authors_list,  # Author metadata
                    'publication_date': new_chunks[idx].metadata.get('publication_date', 'Unknown')  # Publication date metadata
                }
            })
        
        # Upsert embeddings into Pinecone
        try:
            index.upsert(vectors=vectors_with_metadata)
            print("✅ All new documents added")
        except Exception as e:
            print(f"Error upserting documents: {e}")
    else:
        print("✅ No new documents to add")

In [2]:
pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\ashut\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [18]:
pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
Collecting spacy<3.6.0,>=3.5.0
  Downloading spacy-3.5.4-cp310-cp310-win_amd64.whl (12.2 MB)
Collecting smart-open<7.0.0,>=5.2.1
  Downloading smart_open-6.4.0-py3-none-any.whl (57 kB)
Collecting thinc<8.2.0,>=8.1.8
  Downloading thinc-8.1.12-cp310-cp310-win_amd64.whl (1.5 MB)
Collecting typer<0.10.0,>=0.3.0
  Downloading typer-0.9.4-py3-none-any.whl (45 kB)
Collecting pathy>=0.10.0
  Downloading pathy-0.11.0-py3-none-any.whl (47 kB)
Collecting pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4
  Downloading pydantic-1.10.18-cp310-cp310-win_amd64.whl (2.1 MB)
Collecting pathlib-abc==0.1.1
  Downloading pathlib_abc-0.1.1-py3-none-any.whl (23 kB)
Installing collected packages: pydantic, typer, smart-open, pathlib-abc, thinc, pathy, spacy, en-core-web-sm
  Attempting uninstall: pydantic
    Found existing installation: pydantic 

ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\ashut\\AppData\\Local\\Programs\\Python\\Python310\\Lib\\site-packages\\~hinc\\backends\\cblas.cp310-win_amd64.pyd'
Consider using the `--user` option or check the permissions.

You should consider upgrading via the 'c:\Users\ashut\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [20]:
!python -m spacy download en_core_web_lg 

Traceback (most recent call last):
  File "c:\Users\ashut\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 187, in _run_module_as_main
    mod_name, mod_spec, code = _get_module_details(mod_name, _Error)
  File "c:\Users\ashut\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 146, in _get_module_details
    return _get_module_details(pkg_main_name, error)
  File "c:\Users\ashut\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 110, in _get_module_details
    __import__(pkg_name)
  File "c:\Users\ashut\AppData\Local\Programs\Python\Python310\lib\site-packages\spacy\__init__.py", line 13, in <module>
    from . import pipeline  # noqa: F401
  File "c:\Users\ashut\AppData\Local\Programs\Python\Python310\lib\site-packages\spacy\pipeline\__init__.py", line 2, in <module>
    from .dep_parser import DependencyParser
  File "spacy\pipeline\dep_parser.pyx", line 1, in init spacy.pipeline.dep_parser
  File "spacy\pipeline\transition_parser.pyx", line 1, in init sp

In [22]:
!python -m spacy download en

Traceback (most recent call last):
  File "c:\Users\ashut\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 187, in _run_module_as_main
    mod_name, mod_spec, code = _get_module_details(mod_name, _Error)
  File "c:\Users\ashut\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 146, in _get_module_details
    return _get_module_details(pkg_main_name, error)
  File "c:\Users\ashut\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 110, in _get_module_details
    __import__(pkg_name)
  File "c:\Users\ashut\AppData\Local\Programs\Python\Python310\lib\site-packages\spacy\__init__.py", line 13, in <module>
    from . import pipeline  # noqa: F401
  File "c:\Users\ashut\AppData\Local\Programs\Python\Python310\lib\site-packages\spacy\pipeline\__init__.py", line 2, in <module>
    from .dep_parser import DependencyParser
  File "spacy\pipeline\dep_parser.pyx", line 1, in init spacy.pipeline.dep_parser
  File "spacy\pipeline\transition_parser.pyx", line 1, in init sp

In [23]:
!python -m spacy download en_core_web_sm


SyntaxError: invalid syntax. Perhaps you forgot a comma? (2997175802.py, line 2)

In [25]:
!pip install .tar.gz archive from path or URL
!pip install /Users/you/en_core_web_sm-2.2.0.tar.gz

Processing c:\qpiai\.tar.gz


ERROR: Could not install packages due to an OSError: [Errno 2] No such file or directory: 'C:\\QpiAi\\.tar.gz'

You should consider upgrading via the 'C:\Users\ashut\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


Processing c:\users\you\en_core_web_sm-2.2.0.tar.gz


ERROR: Could not install packages due to an OSError: [Errno 2] No such file or directory: 'C:\\Users\\you\\en_core_web_sm-2.2.0.tar.gz'

You should consider upgrading via the 'C:\Users\ashut\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [27]:
import spacy.cli
spacy.cli.download("en_core_web_lg")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [28]:
import spacy
from fuzzywuzzy import fuzz, process
from pinecone import Pinecone

# Load Spacy NER model (using 'en_core_web_sm' or any other suitable model)
nlp = spacy.load("en_core_web_lg")

def extract_authors(query_text: str):
    """Extracts potential author names from the query using NER."""
    doc = nlp(query_text)
    authors = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]  # 'PERSON' label for detecting names
    return authors

def find_best_match(author_name: str, author_list: list):
    """Use fuzzy matching to find the closest match for an author name."""
    best_match = process.extractOne(author_name, author_list, scorer=fuzz.token_sort_ratio)
    return best_match

def retrieve_from_pinecone(query_text: str, top_k=5):
    # Initialize Pinecone
    pc = Pinecone(api_key=api_key)  # Update with your API key and environment
    index_name = 'embeddings5'

    # Connect to the existing index
    index = pc.Index(index_name)

    # Embed the query text using the same model used for upserts
    query_embedding = model.encode(query_text).tolist()  # Encode query into embedding

    # Perform similarity search in Pinecone
    results = index.query(vector=query_embedding, top_k=top_k, include_values=True, include_metadata=True)

    # Extract author names from query using NER
    query_authors = extract_authors(query_text)
    
    if query_authors:
        print(f"Detected potential authors in query: {query_authors}")
    else:
        print("No authors detected in the query")

    print(f"Top {top_k} results for the query '{query_text}':")
    
    # Iterate over search results
    for i, match in enumerate(results['matches']):
        print(f"\nResult {i+1}:")
        print(f"  - ID: {match['id']}")
        print(f"  - Score: {match['score']}")
        
        # Get authors from metadata
        result_authors = match['metadata'].get('Author', [])
        
        # Compare query authors with result authors using fuzzy matching
        if query_authors:
            for query_author in query_authors:
                best_match = find_best_match(query_author, result_authors)
                
                if best_match and best_match[1] > 80:  # Match threshold (tune based on your need)
                    print(f"  - Matched Author: {best_match[0]} (Similarity: {best_match[1]})")
                else:
                    print(f"  - No close match found for: {query_author}")
        else:
            print(f"  - Metadata Authors: {result_authors}")

    return results

# Example usage
query = "give the research paper by Yuexi Du"
top_k_results = retrieve_from_pinecone(query, top_k=10)

Detected potential authors in query: ['Yuexi Du']
Top 10 results for the query 'give the research paper by Yuexi Du published in September':

Result 1:
  - ID: C:\QpiAi\paper_2409.18125.pdf:8:3
  - Score: 0.364301205
  - No close match found for: Yuexi Du

Result 2:
  - ID: C:\QpiAi\paper_2409.18124.pdf:0:0
  - Score: 0.287131101
  - No close match found for: Yuexi Du

Result 3:
  - ID: C:\QpiAi\paper_2409.18128.pdf:10:2
  - Score: 0.267722964
  - No close match found for: Yuexi Du

Result 4:
  - ID: C:\QpiAi\paper_2409.18128.pdf:11:2
  - Score: 0.236149743
  - No close match found for: Yuexi Du

Result 5:
  - ID: C:\QpiAi\paper_2409.18124.pdf:10:1
  - Score: 0.228472888
  - No close match found for: Yuexi Du

Result 6:
  - ID: C:\QpiAi\paper_2409.18124.pdf:10:0
  - Score: 0.222014651
  - No close match found for: Yuexi Du

Result 7:
  - ID: C:\QpiAi\paper_2409.18124.pdf:5:0
  - Score: 0.212008938
  - No close match found for: Yuexi Du

Result 8:
  - ID: C:\QpiAi\paper_2409.18124.pdf:5

In [None]:
query = "Find papers by author: John Doe"
author = extract_author(query)

In [30]:
stored_meta['Author']

0    Wenliang Zhao, Minglei Shi, Xumin Yu, Jie Zhou...
1    Fangzhou Hong, Vladimir Guzov, Hyo Jin Kim, Yu...
2    Chenming Zhu, Tai Wang, Wenwei Zhang, Jiangmia...
3    Jing He, Haodong Li, Wei Yin, Yixun Liang, Leh...
4             Yuexi Du, John Onofrey, Nicha C. Dvornek
Name: Author, dtype: object

In [35]:
all_authors = stored_meta['Author'].str.split(',').tolist()
flattened_authors = [author.strip() for sublist in all_authors for author in sublist]

# Step 3: Get the unique authors
unique_authors = list(set(flattened_authors))

In [36]:
unique_authors

['Leheng Li',
 'Minglei Shi',
 'Tai Wang',
 'Nicha C. Dvornek',
 'Fangzhou Hong',
 'Hongbo Liu',
 'Jiangmiao Pang',
 'Kaiqiang Zhou',
 'Jing He',
 'Wenwei Zhang',
 'Haodong Li',
 'Jie Zhou',
 'Yuting Ye',
 'Ying-Cong Chen',
 'Richard Newcombe',
 'Yuexi Du',
 'Ziwei Liu',
 'Vladimir Guzov',
 'Bingbing Liu',
 'Chenming Zhu',
 'Yixun Liang',
 'Wenliang Zhao',
 'Lingni Ma',
 'Wei Yin',
 'Hyo Jin Kim',
 'John Onofrey',
 'Xihui Liu',
 'Xumin Yu',
 'Jiwen Lu']

In [48]:
nlp = spacy.load('en_core_web_lg')

# Function to extract potential author name using NER and regex
def extract_author_ner(query):
    doc = nlp(query)
    # Loop over entities to find PERSON entities (likely to be authors)
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            return ent.text.strip()  # Return the first person name found
    return None

In [49]:
extract_author_ner(query)

'Yuexi Du'

In [51]:
extracted_authors = extract_author_ner(query)

In [52]:
def fuzzy_match_author(extracted_author, unique_authors, threshold=80):
    # Use fuzzy matching to find the best match
    matches = process.extractBests(extracted_author, unique_authors, score_cutoff=threshold)
    return matches

In [53]:
fuzzy_match_author(extracted_authors,unique_authors)

[('Yuexi Du', 100)]

In [58]:
def retrieve_from_pinecone(query_text: str, top_k=5, author_name: str = None):
    # Initialize Pinecone
    pc = Pinecone(api_key=api_key)  # Update with your API key and environment
    index_name = 'embeddings5'

    # Connect to the existing index
    index = pc.Index(index_name)

    # Embed the query text using the same model used for upserts
    query_embedding = model.encode(query_text).tolist()  # Encode query into embedding

    # Build the metadata filter if an author name is provided
    metadata_filter = None
    if author_name:
        metadata_filter = {
            "Author": {"$in": author_name}  # Filter using the $in operator for the author name
        }

    # Perform similarity search in Pinecone with metadata filtering
    results = index.query(
        vector=query_embedding,
        top_k=top_k,
        include_values=True,
        include_metadata=True,
        filter=metadata_filter  # Apply the metadata filter
    )

    # Display the results
    print(f"Top {top_k} results for the query '{query_text}' with author filtering:")
    for i, match in enumerate(results['matches']):
        print(f"\nResult {i+1}:")
        print(f"  - ID: {match['id']}")
        print(f"  - Score: {match['score']}")
        print(f"  - Metadata: {match.get('metadata', {})}")

    return results

In [68]:
author_name = "Yuexi Du"
query = "give the research paper by published on september"
# Retrieve and filter results by author
retrieve_from_pinecone(query, top_k=10, author_name=author_name)

PineconeApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Mon, 30 Sep 2024 22:41:12 GMT', 'Content-Type': 'application/json', 'Content-Length': '137', 'Connection': 'keep-alive', 'x-pinecone-request-latency-ms': '1', 'x-pinecone-request-id': '3019207225874154891', 'x-envoy-upstream-service-time': '1', 'server': 'envoy'})
HTTP response body: {"code":3,"message":"the $in operator must be followed by a list of strings or a list of numbers, got \"Yuexi Du\" instead","details":[]}


In [67]:
if extracted_authors:
    print(f"Extracted Author from query: {extracted_authors}")
    
    # List of authors from vector store metadata (already fetched)
    
    
    # Perform fuzzy matching to find the closest match
    matched_authors = fuzzy_match_author(extracted_authors, unique_authors)
    
    if matched_authors:
        # Extract matched author names (ignore the score)
        matched_author_names = [match[0] for match in matched_authors]
        
        print(f"Matched authors: {matched_author_names[0]}")
        
        # Perform metadata filtering in the vector store using the matched author(s)
        # Assuming you have a `Pinecone` index connected already
        retrieve_from_pinecone(query, matched_author_names)
    else:
        print("No matching authors found.")
else:
    print("No author found in the query.")

Extracted Author from query: Yuexi Du
Matched authors: Yuexi Du


TypeError: '>' not supported between instances of 'str' and 'int'

In [69]:
def retrieve_from_pinecone(query_text: str, top_k=5, author_name: list = None):  # Expect a list for author_name
    # Initialize Pinecone
    pc = Pinecone(api_key=api_key)  # Update with your API key and environment
    index_name = 'embeddings5'

    # Connect to the existing index
    index = pc.Index(index_name)

    # Embed the query text using the same model used for upserts
    query_embedding = model.encode(query_text).tolist()  # Encode query into embedding

    # Build the metadata filter if an author name is provided
    metadata_filter = None
    if author_name and isinstance(author_name, list):  # Ensure it's a list
        metadata_filter = {
            "Author": {"$in": author_name}  # Filter using the $in operator for the author name(s)
        }

    # Perform similarity search in Pinecone with metadata filtering
    results = index.query(
        vector=query_embedding,
        top_k=top_k,
        include_values=True,
        include_metadata=True,
        filter=metadata_filter  # Apply the metadata filter if it exists
    )

    # Display the results
    print(f"Top {top_k} results for the query '{query_text}' with author filtering:")
    for i, match in enumerate(results['matches']):
        print(f"\nResult {i+1}:")
        print(f"  - ID: {match['id']}")
        print(f"  - Score: {match['score']}")
        print(f"  - Metadata: {match.get('metadata', {})}")

    return results


# Example workflow after extracting authors
if extracted_authors:
    print(f"Extracted Author from query: {extracted_authors}")
    
    # List of authors from vector store metadata (already fetched)
    # unique_authors is assumed to be defined

    # Perform fuzzy matching to find the closest match
    matched_authors = fuzzy_match_author(extracted_authors, unique_authors)
    
    if matched_authors:
        # Extract matched author names (ignore the score)
        matched_author_names = [match[0] for match in matched_authors]
        
        print(f"Matched authors: {matched_author_names}")
        
        # Perform metadata filtering in the vector store using the matched author(s)
        # Pass the matched_author_names as a list
        retrieve_from_pinecone(query, matched_author_names)
    else:
        print("No matching authors found.")
else:
    print("No author found in the query.")

Extracted Author from query: Yuexi Du
Matched authors: ['Yuexi Du']


TypeError: '>' not supported between instances of 'str' and 'int'

In [72]:
import json

In [84]:
def retrieve_from_pinecone(query_text: str, top_k=5, author_name: list = None):  
    # Initialize Pinecone
    pc = Pinecone(api_key=api_key)  # Update with your API key and environment
    index_name = 'embeddings5'

    # Connect to the existing index
    index = pc.Index(index_name)

    # Embed the query text using the same model used for upserts
    query_embedding = model.encode(query_text).tolist()  # Encode query into embedding

    # Build the metadata filter if an author name is provided
    metadata_filter = None
    if author_name and isinstance(author_name, list):  # Ensure it's a list
        metadata_filter = {
            "Author": {"$in": author_name}  # Filter using the $in operator for the author name(s)
        }

    # Perform similarity search in Pinecone with metadata filtering
    try:
        results = index.query(
            vector=query_embedding,
            top_k=top_k,
            include_values=True,
            include_metadata=True,
            filter=metadata_filter  # Apply the metadata filter if it exists
        )
    except Exception as e:
        print(f"Error during Pinecone query: {e}")
        return

    # Display the results
    print(f"Top {top_k} results for the query '{query_text}' with author filtering:")
    for i, match in enumerate(results['matches']):
        print(f"\nResult {i+1}:")
        print(f"  - ID: {match['id']}")
        print(f"  - Score: {match['score']}")
        print(f"  - Metadata: {match.get('metadata', {})}")

    return results

# Example workflow after extracting authors
if extracted_authors:
    print(f"Extracted Author from query: {extracted_authors}")
    
    # List of authors from vector store metadata (already fetched)
    # unique_authors is assumed to be defined

    # Perform fuzzy matching to find the closest match
    matched_authors = fuzzy_match_author(extracted_authors, unique_authors)

    print(f"Matched authors output: {matched_authors}")  # Debugging print statement
    print(f"Type of matched_authors: {type(matched_authors)}")

    if matched_authors:
        # Ensure only author names (strings) are extracted
        matched_author_names = [match[0] for match in matched_authors if isinstance(match[0], str)]

        print(f"Matched authors (only names): {json.dumps(matched_author_names)}")
        
        # Perform metadata filtering in the vector store using the matched author(s)
        results =retrieve_from_pinecone(query, author_name=matched_author_names)
    else:
        print("No matching authors found.")
else:
    print("No author found in the query.")

Extracted Author from query: Yuexi Du
Matched authors output: [('Yuexi Du', 100)]
Type of matched_authors: <class 'list'>
Matched authors (only names): ["Yuexi Du"]
Top 5 results for the query 'give the research paper by published on september' with author filtering:

Result 1:
  - ID: C:\QpiAi\paper_2409.18119.pdf:8:0
  - Score: 0.131281778
  - Metadata: {'Author': ['Yuexi Du', 'John Onofrey', 'Nicha C. Dvornek'], 'publication_date': '2024-09', 'text': 'Table 4: Ablation of Model Design We ablate different model designs on the EMBED [ 21] BI-RADS\nprediction task and report balanced accuracy (bACC) and AUC score. The best and second best\nresults are highlighted in bold and underlined, respectively. Our full method is shaded in gray.\nMethods EMBED BI-RADS [21]\nSLA Symm. LV T LV V PEFT-LLMZero-shot Linear Classification Full Fine-tune\nbACC (%) AUC (%) bACC (%) AUC (%) bACC (%) AUC (%)\n✓ ✓ ✓ 29.28 71.16 38.71 77.50 30.55 70.69\n✓ ✓ ✓ 31.03 72.79 39.57 77.39 39.47 76.23\n✓ ✓ ✓ 27.32 

In [99]:
[Author['id'] for Author in results['matches']]

['C:\\QpiAi\\paper_2409.18119.pdf:8:0',
 'C:\\QpiAi\\paper_2409.18119.pdf:0:2',
 'C:\\QpiAi\\paper_2409.18119.pdf:20:0',
 'C:\\QpiAi\\paper_2409.18119.pdf:11:1',
 'C:\\QpiAi\\paper_2409.18119.pdf:20:1']

In [100]:
context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])

PineconeApiAttributeError: QueryResponse has no attribute '0' at ['['received_data']']