In [23]:
import os
from dotenv import load_dotenv
# Load variables from a local .env file (if one exists) into the process environment.
load_dotenv()

# Fail fast with a readable message when the key is missing: assigning None into
# os.environ raises an opaque "TypeError: str expected, not NoneType" instead.
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise EnvironmentError(
        "OPENAI_API_KEY is not set; add it to your .env file or export it "
        "before running this notebook."
    )
os.environ['OPENAI_API_KEY'] = openai_api_key

In [24]:

from langchain_community.document_loaders import PyPDFLoader

# Folder holding the source papers. The original hard-coded location remains the
# default; set PDF_FOLDER_PATH (e.g. in .env) to run the notebook on another machine.
pdf_folder_path = os.environ.get(
    'PDF_FOLDER_PATH', '/Users/satish/Desktop/Llamaindex/Good_papers_CNN_NLP'
)

# List all PDF files in the directory.
# - sorted(): os.listdir() returns entries in arbitrary order, and later cells embed
#   only documents[:20], so the file order decides which chunks end up in the index.
# - lower(): match the extension case-insensitively so files named "*.PDF" are kept.
pdf_files = sorted(f for f in os.listdir(pdf_folder_path) if f.lower().endswith('.pdf'))

# Load each PDF file and accumulate everything the loaders return into one flat list.
docs = []
for pdf_file in pdf_files:
    loader = PyPDFLoader(os.path.join(pdf_folder_path, pdf_file))
    docs.extend(loader.load())

In [25]:
# Inspect the loaded Document objects (the whole list is rendered as the cell output).
docs

[Document(metadata={'source': '/Users/satish/Desktop/Llamaindex/Good_papers_CNN_NLP/Tutorial_paper.pdf', 'page': 0}, page_content='LORA: L OW-RANK ADAPTATION OF LARGE LAN-\nGUAGE MODELS\nEdward Hu∗Yelong Shen∗Phillip Wallis Zeyuan Allen-Zhu\nYuanzhi Li Shean Wang Lu Wang Weizhu Chen\nMicrosoft Corporation\n{edwardhu, yeshe, phwallis, zeyuana,\nyuanzhil, swang, luw, wzchen }@microsoft.com\nyuanzhil@andrew.cmu.edu\n(Version 2)\nABSTRACT\nAn important paradigm of natural language processing consists of large-scale pre-\ntraining on general domain data and adaptation to particular tasks or domains. As\nwe pre-train larger models, full ﬁne-tuning, which retrains all model parameters,\nbecomes less feasible. Using GPT-3 175B as an example – deploying indepen-\ndent instances of ﬁne-tuned models, each with 175B parameters, is prohibitively\nexpensive. We propose Low-RankAdaptation, or LoRA, which freezes the pre-\ntrained model weights and injects trainable rank decomposition matrices into ea

In [26]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Cut the loaded pages into overlapping chunks (size 1000, overlap 200) so that
# each piece is small enough to embed and retrieve individually.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
documents = text_splitter.split_documents(docs)

In [27]:
## Vector embedding and vector store using OpenAI
# OpenAIEmbeddings is taken from langchain_openai (the package this notebook already
# uses for ChatOpenAI); the langchain_community copy of the class is deprecated.
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# Only the first 20 chunks are embedded here.
# NOTE: `db` is reassigned by the FAISS cell further down, so this Chroma store is
# replaced when the notebook is run top to bottom.
db = Chroma.from_documents(documents[:20], OpenAIEmbeddings())

## Use the free LLM model for this

In [28]:
## Vector embedding and vector store using Ollama
#from langchain_community.embeddings import OllamaEmbeddings
#from langchain_community.vectorstores import Chroma
#db = Chroma.from_documents(documents[:20],OllamaEmbeddings())

In [29]:
## Using FAISS as the vector store, with Ollama embeddings

from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS

# Index only the first 20 chunks; `db` is the store the retriever below is built from.
db = FAISS.from_documents(
    documents=documents[:20],
    embedding=OllamaEmbeddings(),
)

In [30]:
# Quick check of the vector store: run a similarity search for a sample
# question and display the text of the first returned chunk.
query = "Give me the paragraph headings"
result = db.similarity_search(query=query)
result[0].page_content

'critical deployment challenge for GPT-3 (Brown et al., 2020) with\n175 billion trainable parameters.1\nMany sought to mitigate this by adapting only some parameters or\nlearning external modules for new tasks. This way, we only need\nto store and load a small number of task-speciﬁc parameters in ad-\ndition to the pre-trained model for each task, greatly boosting the\noperational efﬁciency when deployed. However, existing techniques\n∗Equal contribution.\n0Compared to V1, this draft includes better baselines, experiments on GLUE, and more on adapter latency.\n1While GPT-3 175B achieves non-trivial performance with few-shot learning, ﬁne-tuning boosts its perfor-\nmance signiﬁcantly as shown in Appendix A.\n1arXiv:2106.09685v2  [cs.CL]  16 Oct 2021'

In [31]:
from langchain_openai import ChatOpenAI
# Chat model used to generate the final answers.
llm = ChatOpenAI(
    model="gpt-4o-mini",
)
#from langchain_community.llms import Ollama
#llm = Ollama(model = "llama2")
#llm

In [41]:
# Design the chat prompt template.
# The template exposes two placeholders: {context} (filled with the retrieved
# documents by the document chain) and {input} (the user's question).

from langchain_core.prompts import ChatPromptTemplate

prompt_template_text = """
        First go through all the research papers and integrate with the training data of chatgpt to give the
        precise answer. 
    <context>
    {context}
    </context>
    Question: {input}"""
prompt = ChatPromptTemplate.from_template(prompt_template_text)

In [42]:
# Chain implementation: combine the LLM and the prompt into a chain that takes
# a list of documents and places them into the prompt.

from langchain.chains.combine_documents import create_stuff_documents_chain

document_chain = create_stuff_documents_chain(
    llm=llm,
    prompt=prompt,
)

In [43]:
# Retriever: expose the vector store `db` through the retriever interface
# expected by the retrieval chain.
retriever = db.as_retriever()

In [44]:
# Retrieval chain: wire the retriever in front of the document chain.
from langchain.chains import create_retrieval_chain

retrieval_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=document_chain,
)

In [45]:
# Ask one question through the full retrieval chain; the result is kept in `response`.
question = "Suppose I have an output data of Sentiment Analysis and I want to integrate the sentiment inference with trend data. How can I do that"
response = retrieval_chain.invoke({"input": question})

In [46]:
# Display the generated answer as a list with one element per line of text.
answer_text = response["answer"]
answer_text.split("\n")

['Integrating sentiment analysis output with trend data can be achieved through a multi-step approach. Here’s a structured way to approach this integration:',
 '',
 '1. **Data Preparation**:',
 '   - **Sentiment Analysis Output**: Ensure that your sentiment analysis data is structured, typically in a format that includes the text (e.g., reviews, comments), sentiment score (positive, negative, neutral), and any relevant metadata (timestamp, user ID, etc.).',
 '   - **Trend Data**: Gather trend data relevant to your analysis. This could include time-series data related to social media trends, sales figures, stock prices, or any other relevant metrics that fluctuate over time.',
 '',
 '2. **Time Alignment**:',
 '   - **Timestamps**: If your sentiment data has timestamps, align it with your trend data. This could involve aggregating sentiment scores over specific time intervals (e.g., daily, weekly) to match the frequency of your trend data.',
 '   - **Resampling**: If the sentiment data i