In [1]:
import os
import time
import logging
import arxiv
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_qdrant import Qdrant
from langchain.embeddings import CohereEmbeddings
from typing import List, Optional
from dotenv import load_dotenv

In [2]:
load_dotenv()
os.environ["COHERE_API_KEY"] = os.getenv("COHERE_API_KEY")

In [3]:
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [21]:
def get_papers(query: str) -> str:
    # """ 
    # Downloads and processes papers from arXiv based on the query.

    # Args:
    #     query (str): The search query to fetch papers.

    # Returns:
    #     str: The concatenated content of all papers.
  
    # Replace spaces with underscores in the query to create a valid directory name
        # Set up the directory path relative to the current working directory
    base_dir = os.getcwd()
    dirpath = os.path.join(base_dir, f"arxiv_papers_for_{query.replace(' ', '_')}")

    if not os.path.exists(dirpath):
        os.makedirs(dirpath)


    # Initialize arxiv client and search for papers
    client = arxiv.Client()
    search = arxiv.Search(
        query=query,
        max_results=2,
        sort_order=arxiv.SortOrder.Descending
    )

   # Download and save the papers
    for result in client.results(search):
        while True:
            try:
                paper_id = result.get_short_id()
                # Truncate and sanitize title to avoid overly long filenames
                title = result.title.replace(' ', '_').replace('/', '_').replace(':', '').replace('?', '')[:30]
                filepath = os.path.join(dirpath, f"{paper_id}_{title}.pdf")
                result.download_pdf(dirpath=dirpath, filename=f"{paper_id}_{title}.pdf")
                logging.info(f"-> Paper id {paper_id} with title '{result.title}' is downloaded.")
                break
            except (FileNotFoundError, ConnectionResetError) as e:
                logging.error(f"Error occurred: {e}")
                time.sleep(5)
            except Exception as e:
                logging.error(f"An unexpected error occurred: {e}")
                break
    return dirpath

In [22]:
 paers = get_papers("AI In defence")

2024-07-06 21:27:14,797 - INFO - Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=AI+In+defence&id_list=&sortBy=relevance&sortOrder=descending&start=0&max_results=100
2024-07-06 21:27:21,659 - INFO - Got first page: 100 of 2346457 total results
2024-07-06 21:27:26,526 - INFO - -> Paper id 2112.01252v2 with title 'Australia's Approach to AI Governance in Security and Defence' is downloaded.
2024-07-06 21:27:29,897 - INFO - -> Paper id 1809.11089v1 with title 'A Systems Approach to Achieving the Benefits of Artificial Intelligence in UK Defence' is downloaded.


'c:\\github\\openai\\ResearchRover\\expriments\\arxiv_papers_for_AI_In_defence'

In [24]:
def load_papers(dirpath):
    papers = []
    loader = DirectoryLoader(dirpath, glob="./*.pdf", loader_cls=PyPDFLoader)
    try:
        papers = loader.load()
    except Exception as e:
        logging.error(f"Error loading files: {e}")

    logging.info(f"Total number of pages loaded: {len(papers)}")

    # Concatenate all pages' content into a single string
    full_text = ''.join(paper.page_content for paper in papers)

    # Remove empty lines and join lines into a single string
    full_text = " ".join(line for line in full_text.splitlines() if line)

    return full_text
        

In [25]:
text = load_papers(paers)

2024-07-06 21:28:47,834 - INFO - Total number of pages loaded: 66


In [26]:
text 



In [57]:
def get_papers(query: str) -> str:
    # """ 
    # Downloads and processes papers from arXiv based on the query.

    # Args:
    #     query (str): The search query to fetch papers.

    # Returns:
    #     str: The concatenated content of all papers.
  
    # Replace spaces with underscores in the query to create a valid directory name
        # Set up the directory path relative to the current working directory
    base_dir = os.getcwd()
    dirpath = os.path.join(base_dir, f"arxiv_papers_for_{query.replace(' ', '_')}")

    if not os.path.exists(dirpath):
        os.makedirs(dirpath)


    # Initialize arxiv client and search for papers
    client = arxiv.Client()
    search = arxiv.Search(
        query=query,
        max_results=2,
        sort_order=arxiv.SortOrder.Descending
    )

   # Download and save the papers
    for result in client.results(search):
        while True:
            try:
                paper_id = result.get_short_id()
                # Truncate and sanitize title to avoid overly long filenames
                title = result.title.replace(' ', '_').replace('/', '_').replace(':', '').replace('?', '')[:30]
                filepath = os.path.join(dirpath, f"{paper_id}_{title}.pdf")
                result.download_pdf(dirpath=dirpath, filename=f"{paper_id}_{title}.pdf")
                logging.info(f"-> Paper id {paper_id} with title '{result.title}' is downloaded.")
                break
            except (FileNotFoundError, ConnectionResetError) as e:
                logging.error(f"Error occurred: {e}")
                time.sleep(5)
            except Exception as e:
                logging.error(f"An unexpected error occurred: {e}")
                break
    papers = []
    loader = DirectoryLoader(dirpath, glob="./*.pdf", loader_cls=PyPDFLoader)
    try:
        papers = loader.load()
    except Exception as e:
        logging.error(f"Error loading files: {e}")

    logging.info(f"Total number of pages loaded: {len(papers)}")

    # Concatenate all pages' content into a single string
    full_text = ''.join(paper.page_content for paper in papers)

    # Remove empty lines and join lines into a single string
    full_text = " ".join(line for line in full_text.splitlines() if line)

    return full_text


In [58]:
text = get_papers("AI in Marketing")

2024-07-06 21:57:35,852 - INFO - Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=AI+in+Marketing&id_list=&sortBy=relevance&sortOrder=descending&start=0&max_results=100
2024-07-06 21:57:40,898 - INFO - Got first page: 100 of 2347097 total results
2024-07-06 21:57:44,155 - INFO - -> Paper id 2303.03174v1 with title 'Both eyes open: Vigilant Incentives help Regulatory Markets improve AI Safety' is downloaded.
2024-07-06 21:57:45,662 - INFO - -> Paper id 2308.02033v1 with title 'AI and the EU Digital Markets Act: Addressing the Risks of Bigness in Generative AI' is downloaded.
2024-07-06 21:57:48,398 - INFO - Total number of pages loaded: 46


In [59]:
text



In [51]:

def get_embeddings(full_text: str) -> None:
    # """Splits the text into chunks and creates a Qdrant vector store.

    # Args:
    #     full_text (str): The full text content of the papers.
    # """
    try:
        # Split the text into chunks
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
        paper_chunks = text_splitter.create_documents([full_text])
    except Exception as e:
        logging.error(f"Error splitting text into chunks: {e}")
        return
    
    try:
        # Create Qdrant vector store
        qdrant = Qdrant.from_documents(
            documents=paper_chunks,
            embedding=CohereEmbeddings(model="embed-english-light-v3.0"),
            path="./db",
            collection_name="arxiv_papers",
        )
        return qdrant.as_retriever()
    except Exception as e:
        logging.error(f"Error creating Qdrant vector store: {e}")

In [52]:
retriver = get_embeddings(text)

2024-07-06 21:46:50,175 - INFO - HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
2024-07-06 21:46:57,854 - INFO - HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
2024-07-06 21:47:04,306 - INFO - HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
2024-07-06 21:47:07,908 - INFO - HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"


In [53]:
print(retriver)

tags=['Qdrant', 'CohereEmbeddings'] vectorstore=<langchain_qdrant.vectorstores.Qdrant object at 0x0000021CBE7657E0>


In [54]:
query = "What is defence futute with AI"

In [56]:
# query = "What did the president say about Ketanji Brown Jackson"
retriver.invoke(query)

2024-07-06 21:47:29,484 - INFO - HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"


[Document(metadata={'_id': 'c4a8534cdb544e868b99d81271324de2', '_collection_name': 'arxiv_papers'}, page_content='to expect it to be sustained indefinitely. UK  Defence must be able to quickly generate it when needed .  Additionally the system needs to be affordable , a by no  means insignificant factor in a time of strong fiscal head - winds for UK Defence.  Therefore, we need the ability to  evolve our AI system in an effective, timely and cost - effective manner.  These four elements are illustrated in  Figure 2.   Figure 2 : Four elements of the broad context of AI   2 AI immaturity: Defence Challenge to AI  Technical Capabilities   One challenge with the adoption of AI within Defence is  that many Defence tasks require AI capabilities which are  currently immature.  Examples inclu de [7, 8] : \uf0b7 Military decision -making within combat operations can  be characte rised as having a “high regret” if the “wrong”  decision is made; so requiring a high degree of trust in  any dec is

In [39]:
# def get_retriever() -> Optional[Qdrant]:
#     """Gets the retriever from the existing Qdrant collection.

#     Returns:
#         Optional[Qdrant]: The retriever if successful, otherwise None.
#     """
#     try:
#         qdrant = Qdrant.from_existing_collection(
#             embedding=CohereEmbeddings(model="embed-english-light-v3.0"),
#             collection_name="arxiv_papers"
#         )
#         return qdrant.as_retriever(k=5)
#     except Exception as e:
#         logging.error(f"Error getting retriever: {e}")
#         return None

In [41]:
retriver = get_retriever()

VectorStoreRetriever(tags=['Qdrant', 'CohereEmbeddings'], vectorstore=<langchain_qdrant.vectorstores.Qdrant object at 0x0000021CBE712F80>)

In [44]:
query = "What is defence futute with AI"
found_docs = retriver.similarity_search(query)

AttributeError: 'VectorStoreRetriever' object has no attribute 'similarity_search'