In [1]:
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
import bs4
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.messages import HumanMessage
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory

from openai import OpenAI

from langchain_chroma import Chroma

from langchain_openai import ChatOpenAI
from dotenv import load_dotenv

In [2]:
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
PINECONE_API_ENV = os.getenv('PINECONE_API_ENV')

In [3]:
MODEL = "text-embedding-3-small"
llm = ChatOpenAI(model="gpt-3.5-turbo")
client = OpenAI( api_key=OPENAI_API_KEY)

In [4]:
loader = PyPDFLoader("/Users/luisbarajas/Documents/AGI/Papers/imagenet-classification-with-deep-convolutional-neural-networks.pdf")
data = loader.load()
print (f'You have {len(data)} document(s) in your data')
print (f'There are {len(data[0].page_content)} characters in your document')

You have 9 document(s) in your data
There are 3461 characters in your document


In [8]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(data)
vectorstore = Chroma.from_documents(documents=texts, embedding=OpenAIEmbeddings(model=MODEL))
retriever = vectorstore.as_retriever()

print(len(texts))
for(text) in texts:
    print(text.page_content)


47
ImageNet Classiﬁcation with Deep Convolutional
Neural Networks
Alex Krizhevsky
University of Toronto
kriz@cs.utoronto.caIlya Sutskever
University of Toronto
ilya@cs.utoronto.caGeoffrey E. Hinton
University of Toronto
hinton@cs.utoronto.ca
Abstract
We trained a large, deep convolutional neural network to classify the 1.2 million
high-resolution images in the ImageNet LSVRC-2010 contest into the 1000 dif-
ferent classes. On the test data, we achieved top-1 and top-5 error rates of 37.5%
and 17.0% which is considerably better than the previous state-of-the-art. The
neural network, which has 60 million parameters and 650,000 neurons, consists
of ﬁve convolutional layers, some of which are followed by max-pooling layers,
and three fully-connected layers with a ﬁnal 1000-way softmax. To make train-
ing faster, we used non-saturating neurons and a very efﬁcient GPU implemen-
tation of the convolution operation. To reduce overﬁtting in the fully-connected
ing faster, we used non-saturating 

In [9]:
### Contextualize question ###
contextualize_q_system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""

contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
history_aware_retriever = create_history_aware_retriever(
    llm, retriever, contextualize_q_prompt
)

In [11]:
# Retrieve and generate using the relevant snippets of the blog.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

### Answer question ###
qa_system_prompt = """You are an assistant for question-answering tasks related to scientific papers, articles and investigations. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say that you don't know. \
Use three sentences maximum and keep the answer concise.\

{context}"""
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
print(question_answer_chain)
print(qa_prompt)
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)


bound=RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableLambda(format_docs)
}), config={'run_name': 'format_inputs'})
| ChatPromptTemplate(input_variables=['chat_history', 'context', 'input'], input_types={'chat_history': typing.List[typing.Union[langchain_core.messages.ai.AIMessage, langchain_core.messages.human.HumanMessage, langchain_core.messages.chat.ChatMessage, langchain_core.messages.system.SystemMessage, langchain_core.messages.function.FunctionMessage, langchain_core.messages.tool.ToolMessage]]}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], template="You are an assistant for question-answering tasks related to scientific papers, articles and investigations. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\n{context}")), MessagesPlaceholder(variable_name='chat_history'), HumanMessage

In [17]:
### Statefully manage chat history ###
store = {}


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    if session_id not in store:
        print(session_id)
        print(store)
        store[session_id] = ChatMessageHistory()
    return store[session_id]


conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)
# print(conversational_rag_chain)


In [14]:
conversational_rag_chain.invoke(
    {"input": "what are the most relevant and important words in the paper"},
    config={
        "configurable": {"session_id": "abc124"}
    },  # constructs a key "abc123" in `store`.
)["answer"]

'The most relevant and important words in the paper include "image annotation," "preprocessing," "neural network," "architecture," "convolutional," "fully-connected," "RGB pixel values," "down-sampling," "convolutional neural networks," and "large-scale image classification."'

"Section 2 of the paper discusses the preprocessing steps for images, including down-sampling to 256x256 resolution, resizing shorter sides, cropping central patches, and subtracting mean activity from pixels during training. The images were not pre-processed in any other way, and the network was trained on raw RGB pixel values.

In Section 3, the architecture of the network is outlined, featuring eight learned layers - five convolutional and three fully-connected. The section highlights novel aspects of the architecture, with subsections sorted by importance according to the authors' estimation.

The paper provides detailed information on image preprocessing and the architecture of the neural network used for image annotation tasks."
