In [None]:
!pip install langchain
!pip install pypdf
!pip install unstructred
!pip install pinecone-client
!pip install llama-cpp-python
!pip install huggingface_hub
!pip install sentence_transformers

In [None]:
from langchain.document_loaders import PyPDFLoader,OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
from sentence_transformers import SentenceTransformer
from langchain.chains.question_answering import load_qa_chain
import pinecone
import os


In [None]:
## Load the data
# Location of the source PDF, kept in one place so it is easy to change.
pdf_path = "/content/datascience.pdf"
loader = PyPDFLoader(pdf_path)


In [None]:
# Read the PDF through the loader; the result is what gets split into chunks below.
data = loader.load()

In [None]:
# Split the loaded documents into chunks of at most 500 characters, with no
# overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=0,
)
docs = text_splitter.split_documents(data)

In [None]:
# Number of chunks produced by the splitter.
len(docs)

383

In [None]:
# Set up credentials.
# Real keys are read from the environment when present, so they never have to be
# saved inside the notebook; the placeholder strings are only fallbacks.
# The PINCONE_* variable names keep their original spelling because later cells
# reference them.
os.environ.setdefault("HUGGINGFACEHUB_API_TOKEN", "hf_xxxxxx")  # Your Hugging Face key
PINCONE_API_KEY = os.environ.get("PINECONE_API_KEY", "05xxxxxxxxx")  # Your Pinecone API key
PINCONE_API_ENV = os.environ.get("PINECONE_API_ENV", "xxxxx")  # Your Pinecone environment name

In [None]:
# Build the embedding function; the model files are downloaded when this cell
# runs (see the progress output below).
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
# Connect the Pinecone client using the key and environment from the setup cell.
pinecone.init(api_key=PINCONE_API_KEY, environment=PINCONE_API_ENV)

# Display the server and client versions as this cell's output.
pinecone.info.version()



VersionResponse(server='2.0.11', client='2.2.2')

In [None]:
# List the names of the indexes visible to the initialized client.
pinecone.list_indexes()

['langchainpinecone']

In [None]:
index_name = "langxxxxxxxx" # Pinecone index name

In [None]:
# Create the index only when it does not exist yet, so re-running this cell is harmless.
existing_indexes = pinecone.list_indexes()
if index_name not in existing_indexes:
    pinecone.create_index(
        index_name,
        dimension=384,  # expected to match the size of the vectors produced by `embeddings`
        metric='cosine',
        pods=1,
        pod_type='p1.x2',
    )

In [None]:
# Open a handle to the index and display its current statistics.
index = pinecone.Index(index_name)
index.describe_index_stats()


{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [None]:
# Embed every chunk and store it in the Pinecone index.
# The Document objects are passed whole (rather than only their page_content) so
# that whatever metadata the loader attached to each chunk is stored with its
# vector, instead of search hits coming back with metadata={}.
docserch = Pinecone.from_documents(docs, embeddings, index_name=index_name)

In [None]:
# If the index is already populated, skip the upload cell and connect to it instead:
#docserch = Pinecone.from_existing_index(index_name,embeddings)

# Retrieve the stored chunks most similar to the query.
# NOTE(review): this rebinds `docs`, which until now held the full list of chunks
# from the text splitter; re-running the upload cell after this one would index
# only these search results. Consider a separate name such as `matched_docs`.
query = "YOLOv7 outperforms which models"
docs = docserch.similarity_search(query)

In [None]:
# Inspect the documents returned by the similarity search.
docs

[Document(page_content='Online models have both advantages and disadvantages. /T_hey \ndynamically evolve over time, meaning they only require a single \ndeployment into a production setting. /T_he fact that these models do \nnot have the entire dataset available when being trained, however, \nis a challenge. /T_hey must make assumptions about the data based \nTRAINING STYLE\nOfﬂine Online Unsupervised SupervisedLEARNING STYLE\nAnalytic Learning Models\n51 Take off the Training Wheels', metadata={}),
 Document(page_content='logistic regression models.Applied in cases where the objective \nfunction is not completely differentiable \nwhen using sub-gradients.Witten, Ian H., Eibe Frank, \nand Mark A. Hall. Data Mining: \nPractical Machine Learning Tools \nand Techniques. Massachusetts: \nMorgan Kaufmann, 2011. Print.\nSupport Vector \nMachinesProjection of feature vectors \nusing a kernel function into \na space where classes are \nmore separable.Try multiple kernels and use k-fold cross'

In [None]:
!pip install openai


Collecting openai
  Downloading openai-0.27.9-py3-none-any.whl (75 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/75.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m71.7/75.5 kB[0m [31m2.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.5/75.5 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
Successfully installed openai-0.27.9


In [None]:
# These imports stay in this cell because `openai` is only installed by the cell above.
import openai
from langchain.llms import OpenAI

# Prefer a key supplied through the OPENAI_API_KEY environment variable so the
# real secret is never saved in the notebook; the placeholder is only a fallback.
openai.api_key = os.environ.get("OPENAI_API_KEY", "sk-xxxxxxxxxxxxx")  # Your OpenAI API Key

# Completion-style LLM wrapper; max_tokens caps the length of each completion.
# NOTE(review): text-davinci-003 appears on OpenAI's deprecations list — confirm
# it is still served before relying on this cell, and switch models if not.
llm = OpenAI(openai_api_key=openai.api_key, model_name='text-davinci-003', max_tokens=512)
print(llm)

[1mOpenAI[0m
Params: {'model_name': 'text-davinci-003', 'temperature': 0.7, 'max_tokens': 512, 'top_p': 1, 'frequency_penalty': 0, 'presence_penalty': 0, 'n': 1, 'request_timeout': None, 'logit_bias': {}}


In [None]:
# Smoke-test the LLM with a standalone prompt (no retrieved context is passed in).
output = llm('explain quantum mechanics in one sentence')

In [None]:
# Show the completion text returned by the model.
output


'\n\nQuantum mechanics is a physical theory describing the behavior of matter and energy on the atomic and subatomic level.'