In [None]:
!pip install llama_index openai

In [None]:
!pip install pypdf

In [None]:
!pip install torch sentence-transformers

In [4]:
import openai
from google.colab import userdata

# Pull the key from the Colab secret named OPENAI_API_KEY instead of
# hard-coding it in the notebook.
openai.api_key = userdata.get("OPENAI_API_KEY")

# Load all the pages of the PDF using LlamaIndex's SimpleDirectoryReader

In [5]:
from llama_index import SimpleDirectoryReader

# Restrict the reader to this single PDF; load_data() returns a list of
# Document objects (inspected in the next cell).
pdf_files = ["/content/gradschooltalk.pdf"]
reader = SimpleDirectoryReader(input_files=pdf_files)
documents = reader.load_data()

In [7]:
# Sanity-check the load: how many Documents came back, what type they are,
# and the raw text of the first one.
print(len(documents))
first_doc = documents[0]
print(type(first_doc))
print(first_doc.text)

22
<class 'llama_index.schema.Document'>
Applying to Ph.D. Programsin Computer Science
MorHarchol-Balter
ComputerScience Department
CarnegieMellonUniversity
Lastupdated2014
1 Introduction
This document is intended for people applying to Ph.D. progr ams in computer
scienceorrelatedareas. Thedocumentisinformalinnature andismeanttoexpress
only the opinions of the author. The author is a professor of c omputer science
at CMU, and has been involved in the Ph.D. admissions process at CMU, U.C.
Berkeley, and MIT.
Please direct any further questions you have after reading t his document
to our Admissions Coordinator (applyweb@cs.cmu.edu). Do n ot send email
to theauthorof this document.
Contents
1 Introduction 1
2 DoI really wanta Ph.D.? What doesaPh.D.entail? 2
2.1 What is aPh.D.? . . . . . . . . . . . . . . . . . . . . . . . . . . 2
2.2 Lackof emphasis oncourses . . . . . . . . . . . . . . . . . . . . 2
2.3 Theresearch process and advisor/advisee relationship s . . . . . . 3
2.4 Frustrations

# Stitch all the pages of the loaded document into a single Document

In [8]:
from llama_index import Document

# Concatenate the text of every loaded Document, separated by a blank line,
# and wrap the result in one Document.
page_texts = [page.text for page in documents]
document = Document(text="\n\n".join(page_texts))

# Create an instance of the Llama Index Sentence Window Node parser
This node parser is responsible for splitting the document into fine-grained nodes, i.e. individual sentences. Each node also stores its surrounding sentences (the "window") in its metadata.

In [9]:
from llama_index.node_parser import SentenceWindowNodeParser

# Each node's metadata carries two entries: "window" (the sentence together with
# its neighbours) and "original_text" (the sentence alone).
# NOTE(review): window_size presumably counts sentences on each side -- confirm
# against the library docs.
sentence_node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=5,
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)

### A small demonstration of the sentence window node parser

In [10]:
# Four short sentences wrapped in a throwaway Document, to show how the
# parser splits text into nodes.
text = "Hello! This is Sinngam. I am from Imphal. Are you happy?"
demo_document = Document(text=text)
nodes = sentence_node_parser.get_nodes_from_documents([demo_document])

In [None]:
# List the text of each node produced by the parser.
print([node.text for node in nodes])

['Hello! ', 'This is Sinngam. ', 'I am from Imphal. ', 'Are you happy?']


In [11]:
# Show the full repr of the second node: its metadata plus the links to the
# source document and to the previous/next nodes.
nodes[1]

TextNode(id_='d55c8bcb-8406-43a8-9ce4-62bd705f2d04', embedding=None, metadata={'window': 'Hello!  This is Sinngam.  I am from Imphal.  Are you happy?', 'original_text': 'This is Sinngam. '}, excluded_embed_metadata_keys=['window', 'original_text'], excluded_llm_metadata_keys=['window', 'original_text'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='d160184c-9b07-4237-8953-4d4758fef2b9', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='c1b8fe021572543e108a048e09189d25028bb9e55aabecf1dd6a5268f8dbbc35'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='16b62b1b-d952-47ac-9ad4-eb17a9c2cc4c', node_type=<ObjectType.TEXT: '1'>, metadata={'window': 'Hello!  This is Sinngam.  I am from Imphal.  Are you happy?', 'original_text': 'Hello! '}, hash='c54e3de2e52d6a38e229ab559b84cc9b145710df02d7f9c3f3cdc267251ab1b3'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='5d30b630-1b09-44b5-89fc-fd5b7ff1d703', node_type=<ObjectType.TEXT: '1'>, metadata={'wi

In [12]:
# Metadata of the second node: the "window" and "original_text" entries
# written by the sentence window parser.
nodes[1].metadata

{'window': 'Hello!  This is Sinngam.  I am from Imphal.  Are you happy?',
 'original_text': 'This is Sinngam. '}

In [None]:
# The sentence window stored for the second node (the sentence plus its neighbours).
nodes[1].metadata["window"]

'Hello!  This is Sinngam.  I am from Imphal.  Are you happy?'

# Create an instance of the language model used

In [13]:
from llama_index.llms import OpenAI

# OpenAI chat model handed to the service context below; a low temperature is
# used to reduce randomness in its output.
llm = OpenAI(
    model="gpt-3.5-turbo",
    temperature=0.1,
)

# Create an instance of the llama index ServiceContext.
This wraps up all the components required for indexing, such as the LLM, the embedding model, and the node parser.

In [14]:
from llama_index import ServiceContext

# Bundle the three components used for indexing and querying: the LLM, the
# embedding model (the "local:" prefix selects a locally run model, downloaded
# on first use), and the sentence window node parser.
sentence_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model="local:BAAI/bge-small-en-v1.5",
    node_parser=sentence_node_parser,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

# Create a vector index. This allows us to perform vector similarity search.

In [15]:
from llama_index import VectorStoreIndex

# Build the index from the single stitched document, using the node parser and
# embedding model bundled in sentence_context.
sentence_index = VectorStoreIndex.from_documents(
    [document],
    service_context=sentence_context,
)

In [16]:
# Write the index's storage to disk under ./sentence_index.
index_storage = sentence_index.storage_context
index_storage.persist(persist_dir="./sentence_index")

# Create an instance of the MetadataReplacementPostProcessor.
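This postprocessor is designed for sentence-window retrieval: it replaces each retrieved node's text with the value stored under the given metadata key (here, the surrounding sentence window).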

In [17]:
from llama_index.indices.postprocessor import MetadataReplacementPostProcessor

# "window" is the metadata key the sentence window parser was configured to
# write each node's surrounding sentences to.
postproc = MetadataReplacementPostProcessor(target_metadata_key="window")

# Create an instance of the SentenceTransformerRerank.
This postprocessor assigns new relevance scores to the retrieved chunks.<br>
e.g. Retrieve the 6 most relevant chunks using vector similarity search<br> -> Assign new relevance scores using the reranker <br> -> Keep the 2 chunks with the highest reranker scores among the 6.

In [18]:
from llama_index.indices.postprocessor import SentenceTransformerRerank

# Reranker model: BAAI/bge-reranker-base
# Model card: https://huggingface.co/BAAI/bge-reranker-base
# top_n=2 keeps only the two best-scoring nodes after reranking.
rerank = SentenceTransformerRerank(
    model="BAAI/bge-reranker-base",
    top_n=2,
)

config.json:   0%|          | 0.00/799 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

# Create a query engine from the VectorStoreIndex and pass in the node postprocessors

In [19]:
# Fetch the 6 most similar nodes, then run the postprocessors in list order:
# metadata replacement first, reranking second.
sentence_window_engine = sentence_index.as_query_engine(
    similarity_top_k=6,
    node_postprocessors=[postproc, rerank],
)

In [20]:
# Ask a question against the indexed document through the sentence window engine.
question = "How do I decide if I should even get a PHD?"
window_response = sentence_window_engine.query(question)

In [None]:
from llama_index.response.notebook_utils import display_response

# Render the query engine's response in the notebook output.
display_response(window_response)

**`Final Response:`** Deciding whether or not to pursue a Ph.D. requires careful consideration. It is important to understand that a Ph.D. is not for everyone and requires a particular type of personality. You need to be someone who is obsessed with solving problems, have tremendous perseverance, and be willing to put in hard work. Additionally, you need to have a clear vision and ideas, as well as the ability to express yourself. If you are unsure about pursuing a Ph.D., working in a research or industrial lab that involves research for a few years can help you make a more informed decision.