In [2]:
!pip install langchain langchain-community jq faiss-gpu sentence-transformers langchain-core

[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: pip install --upgrade pip


In [3]:
from langchain_community.chat_models import BedrockChat
from langchain.prompts import PromptTemplate
from langchain_community.document_loaders import JSONLoader
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.memory.buffer import ConversationBufferMemory
from langchain.chains.question_answering import load_qa_chain
import boto3
import os

In [4]:
# Bedrock identifier of the model to use (listed in the Bedrock console).
# Note: this is a chat model, hence the BedrockChat wrapper below.
modelID = 'anthropic.claude-3-haiku-20240307-v1:0'

# boto3 runtime client used to call Bedrock-hosted models through the API
bedrock_client = boto3.client(service_name="bedrock-runtime", region_name="us-east-1")

# LangChain chat-model wrapper around Claude 3 Haiku, bound to the client above
model = BedrockChat(model_id=modelID, client=bedrock_client)

In [5]:
# reads the JSON file and returns its entries as loaded documents
def load_data(json_path):
    """Load the JSON file at `json_path` with JSONLoader and return the documents.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        Whatever `JSONLoader.load()` yields for the jq schema ".[]"
        (i.e. the elements of the file's top-level value).
    """
    # text_content=False is forwarded to the loader unchanged
    # (presumably because entries are JSON objects rather than plain strings -- confirm against the data)
    return JSONLoader(
        file_path=json_path,
        jq_schema=".[]",
        text_content=False,
    ).load()

# this function embeds the documents with the all-mpnet-base-v2 model, loads them into a
# FAISS vector database, and returns the documents most similar to the user query
def ss_search(documents, user_query, k=15):
    """Return the `k` documents most similar to `user_query`.

    Args:
        documents: Documents to index (as produced by `load_data`).
        user_query: Free-text question used as the similarity-search query.
        k: Number of documents to retrieve. Defaults to 15, the value that was
            previously hard-coded.

    Returns:
        The list returned by `FAISS.similarity_search`, or an empty list when
        `documents` is empty.

    Note:
        The embedding model and the FAISS index are rebuilt on every call, so
        each query re-embeds the whole corpus.
    """
    # With zero documents there is nothing to size the index from, and
    # FAISS.from_documents is expected to fail -- return "no matches" instead.
    if not documents:
        return []

    embedding_model = SentenceTransformerEmbeddings(model_name="all-mpnet-base-v2")
    db = FAISS.from_documents(documents, embedding_model)
    return db.similarity_search(user_query, k=k)

In [6]:
# declare the path of the JSON corpus, then load it into documents
json_path = "./ARXIV/merged_data_with_references.json"
documents = load_data(json_path)

def algoscholar_chat(user_query, documents, memory=None):
    """Answer `user_query` as the AlgoScholar assistant and print the reply.

    The documents most similar to the query are retrieved with `ss_search`,
    stuffed into the prompt as context, and sent to the module-level `model`
    through a LangChain "stuff" QA chain.

    Args:
        user_query: The user's question.
        documents: Corpus to search (as produced by `load_data`).
        memory: Optional conversation memory. It must expose the history under
            the key "chat_history" and read the user input from "user_query".
            Pass the same object on successive calls to keep the conversation
            history; when omitted, a fresh (empty) ConversationBufferMemory is
            created, so every call starts without history.

    Returns:
        None. The model's answer is printed.
    """
    # complete similarity search between user query and vector space to identify which texts are most similar to the question
    context = ss_search(documents = documents, user_query = user_query)

    template = """
    You are an intelligent and helpful research assistant. Your name is AlgoScholar. You will work with the user to help them learn about any research topic or subject area they specify by using the context provided. 

    Here is a list of publications you should reference when answering the user: <arxiv> {context} </arxiv>

    Here are some important rules for the interaction:
    - Always stay in character as AlgoScholar, an AI Assistant from Fannie Mae 
    - If you are unsure how to respond, say 'Sorry, I didn't understand that. Could you repeat the question?'
    - If someone asks something irrelevant, say, 'Sorry, I am AlgoScholar and I help users identify academic publications for a certain topic. Do you have a research question today I can help you with?'

    Here is an example of how to respond in a standard interaction:
    <example> 
    User: Hi, how were you created and what do you do?
    AlgoScholar: Hello! My name is AlgoScholar, and I was created by a team of technical folks to aid in the identification of academic research papers that pertain to a certain subject. What can I help you with today?
    </example>
    
    Here is the conversation history (between the user and you) prior to the question. It could be empty if there is no history:
    <history> {chat_history} </history>
    
    Here is the user's question: <question> {user_query} </question>

    How do you respond to the user's question? You will analyze the context provided, find the top three most relevant papers per the subject matter provided by the user, and generate a summary of the publication based on the abstract with the url included.

    Think about your answer before you respond. Put your response in <response></response> tags. 
    """

    # put template into PromptTemplate format to work with other langchain functions -- filling in the template with the dynamic variables (e.g., history, query, and context)
    prompt = PromptTemplate(
        input_variables=["chat_history", "user_query", "context"], 
        template=template
    )

    # create memory object using key identifiers in prompt, unless the caller
    # supplied one to carry the conversation history across calls
    if memory is None:
        memory = ConversationBufferMemory(
            memory_key="chat_history",
            input_key="user_query"
        )

    # create langchain chain
    chain = load_qa_chain(
        model, chain_type="stuff", memory=memory, prompt=prompt
    )

    # invoke() is the supported entry point; calling the chain object directly is deprecated
    response = chain.invoke(
        {"input_documents": context, "user_query": user_query},
        return_only_outputs=True,
    )
    print(response['output_text'])

In [7]:
# example research question to put to the assistant
user_query = "Help me understand mechanisms to measure the amount of competition that exists"

# retrieve the relevant publications and print the assistant's answer
algoscholar_chat(user_query=user_query, documents=documents)

  warn_deprecated(


<response>
To help understand mechanisms to measure the amount of competition that exists, I've identified the following three relevant research papers:

1. "International Trade Network: Country Centrality and COVID-19 Pandemic" (https://arxiv.org/abs/2107.14554)
This paper studies how the topology of the global trade network can explain the rate of COVID-19 diffusion and mortality across countries. The authors compute countries' centrality measures and apply community detection based on communicability distance. They find that the number of infections and fatalities are larger in countries with higher centrality in the global trade network.

2. "Strategic Energy Flows in Input-Output Relations: A Temporal Multilayer Approach" (https://arxiv.org/abs/2212.11585) 
This paper proposes a methodological approach for analyzing the reliability and resilience of energy systems by considering different types of embodied energy sources and the time evolution of sectors' and countries' interactio