In [None]:
!pip install openai transformers pdfplumber numpy pinecone-client

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openai
  Downloading openai-0.27.5-py3-none-any.whl (71 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.6/71.6 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m74.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfplumber
  Downloading pdfplumber-0.9.0-py3-none-any.whl (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.1/46.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting pinecone-client
  Downloading pinecone_client-2.2.1-py3-none-any.whl (177 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.2/177.2 kB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp
  Downloading aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manyli

In [None]:
import os
from getpass import getpass

import openai

# Read credentials from the environment so real secrets are never saved in the
# notebook; fall back to an interactive prompt when the API key is not set.
# The organization is optional, so it is left unset (None) when not provided.
openai.organization = os.environ.get("OPENAI_ORGANIZATION") or None
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

In [None]:
import pdfplumber

pdf_path = "essayonessaywriting.pdf"

with pdfplumber.open(pdf_path) as pdf:
    # Join pages with a newline so the last word of one page is not fused with the
    # first word of the next (the chunker splits on whitespace). The `or ""` guards
    # against a page for which extract_text() yields None instead of a string.
    text = "\n".join(page.extract_text() or "" for page in pdf.pages)

def split_text_into_chunks(text, chunk_size=100, overlap=0):
    """Split ``text`` into chunks of at most ``chunk_size`` whitespace-separated words.

    Args:
        text: The text to split. Words are obtained with ``str.split()``, so any
            run of whitespace (including newlines) acts as a separator.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words each chunk shares with the previous one. The
            default of 0 produces disjoint chunks. Must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of strings, each the words of one chunk joined by single spaces.
        An empty or whitespace-only ``text`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    text_chunks = []
    for start in range(0, len(words), step):
        text_chunks.append(' '.join(words[start:start + chunk_size]))
        # Once a chunk reaches the end of the text, any further window would only
        # repeat words already covered by this chunk, so stop here.
        if start + chunk_size >= len(words):
            break
    return text_chunks

# Break the extracted document text into word chunks using the default chunk size.
text_chunks = split_text_into_chunks(text=text)


In [None]:
import numpy as np

def create_embedding(chunk, model="text-embedding-ada-002"):
    """Request an embedding for a single piece of text from the OpenAI API.

    Args:
        chunk: The text to embed.
        model: Name of the embedding model to use. Vectors produced by different
            models are generally not comparable, so use the same model for the
            documents and for the queries searched against them.

    Returns:
        The ``embedding`` value of the first item in the API response.
    """
    response = openai.Embedding.create(input=chunk, model=model)
    return response["data"][0]["embedding"]

def create_embeddings(text_chunks, model="text-embedding-ada-002", batch_size=100):
    """Embed many text chunks, sending them to the OpenAI API in batches.

    Args:
        text_chunks: Iterable of strings to embed.
        model: Name of the embedding model to use.
        batch_size: Maximum number of chunks sent in one API request. Must be positive.

    Returns:
        A NumPy array with one embedding per chunk, in the same order as
        ``text_chunks``. An empty input yields an empty array.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # Materialise the input so generators and other non-sliceable iterables work too.
    text_chunks = list(text_chunks)
    embeddings = []

    for start in range(0, len(text_chunks), batch_size):
        batch = text_chunks[start:start + batch_size]
        response = openai.Embedding.create(input=batch, model=model)
        # Each returned item carries the position of its input within the batch;
        # sort on it so results line up with the inputs regardless of response order.
        ordered = sorted(response["data"], key=lambda item: item["index"])
        embeddings.extend(item["embedding"] for item in ordered)

    return np.array(embeddings)

# Embed every chunk up front; the result holds one embedding per entry of text_chunks.
embeddings = create_embeddings(text_chunks=text_chunks)

In [None]:
import os
from getpass import getpass

import pinecone

# Read the Pinecone credentials from the environment (prompting for the key as a
# fallback) so that no secret is stored in the notebook itself.
pinecone.init(
    api_key=os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: "),
    environment=os.environ.get("PINECONE_ENVIRONMENT", "us-east4-gcp"),
)
index_name = "semanticsearch"

# Only create the index when it does not exist yet, so re-running this cell
# (e.g. after a kernel restart) does not fail on an already-existing index.
# dimension=1536 matches the length of text-embedding-ada-002 vectors.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(name=index_name, metric="cosine", dimension=1536)


In [None]:
import uuid
pinecone_index = pinecone.Index(index_name=index_name)

# Number of vectors sent per upsert request; small batches keep each request
# comfortably sized while avoiding one network round-trip per chunk.
UPSERT_BATCH_SIZE = 50

# Reuse the embeddings computed earlier instead of calling the embeddings API a
# second time for every chunk.
assert len(text_chunks) == len(embeddings), "expected one embedding per text chunk"

# IDs are derived from the chunk's position and content, so re-running this cell
# overwrites the same vectors instead of adding duplicates under fresh random IDs.
vectors = [
    (
        uuid.uuid5(uuid.NAMESPACE_OID, f"{position}:{chunk}").hex,
        embedding.tolist(),  # convert the NumPy row to a plain list of floats
        {"text": chunk},
    )
    for position, (chunk, embedding) in enumerate(zip(text_chunks, embeddings))
]

for start in range(0, len(vectors), UPSERT_BATCH_SIZE):
    pinecone_index.upsert(vectors[start:start + UPSERT_BATCH_SIZE])

In [None]:
query = "how to relate ideas?"
# Embed the query through the same helper used for the document chunks, so the
# query and the indexed vectors are guaranteed to come from the same model.
query_embedding = create_embedding(query)

In [None]:
num_results = 5
# Use the index name configured earlier rather than repeating the literal.
index = pinecone.Index(index_name)
# `top_k` controls how many matches are returned; drive it from num_results
# instead of a second hard-coded 5, and drop the stray `k` argument.
results = index.query(vector=query_embedding, top_k=num_results, include_metadata=True)
print("Search results for the query:", query)
for match in results["matches"]:
    print(f"Text chunk: {match['metadata']['text']}")
    print(f"Similarity score: {match['score']}")
    print("\n")


Search results for the query: how to relate ideas?
Text chunk: other topic and then coming back to the first. If you use such words as ‘therefore’, make sure that what you say next really does follow from what you have just said. Think about your audience: who are you writing for? It is often better to think of your reader not as your lecturer or tutor, but as someone who may not know very much about the question. If you have your tutor in mind as the reader, it is all too easy to fall into the trap of thinking, ‘If it’s not quite clear, Dr so-and-so will understand what
Similarity score: 0.819455445


Text chunk: I’m trying to say and fill in the gaps.’ If, however, you think of yourself as having to explain a topic to another student on the course who perhaps hasn’t done the same reading as you, it forces you to think more carefully about how to introduce and explain the topic. At various strategic points in the essay, remind your reader where you are in your overall structure by usi