In [2]:
import os
import shutil
import sys

import numpy as np
import openai
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

In [3]:
from dotenv import find_dotenv, load_dotenv

# Load environment variables from a .env file (located via find_dotenv).
_ = load_dotenv(find_dotenv())

# Indexing os.environ raises KeyError if OPENAI_API_KEY is not set.
openai.api_key = os.environ["OPENAI_API_KEY"]


In [4]:
# Alternative input: the vulnerability guidelines PDF (currently disabled).
#loader = PyPDFLoader("/Users/stevenslater/Desktop/FinalProject/VulnGuidelines.pdf")

# Active input: my CV.
# NOTE(review): absolute local path - only resolves on the author's machine.
loader = PyPDFLoader("/Users/stevenslater/Desktop/CV.pdf")


In [5]:
# Read the PDF; the result is indexed per page further down (pages[0]).
pages = loader.load()

Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 16 0 (offset 0)
Ignoring wrong pointing object 18 0 (offset 0)
Ignoring wrong pointing object 21 0 (offset 0)
Ignoring wrong pointing object 23 0 (offset 0)
Ignoring wrong pointing object 26 0 (offset 0)


In [6]:
# Number of page documents returned by the loader.
len(pages)


1

In [7]:
# Keep a handle on the first page for inspection.
page = pages[0]

In [8]:
# Preview a slice of the extracted text (characters 500 up to, not including, 1000).
print(page.page_content[500:1000])

ity classification, using Chain-of Thought Prompting and a newly proposed Prompt Optimisation framework. University of British Colombia – Exchange Year       Vancouver    2022-2023 • Relevant Courses: Artificial Intelligence, Functional Programming, Applied Machine Learning. • Received 2nd Highest Mark in the year for Artificial Intelligence. Hutchesons Grammar School            Glasgow        2007-2019 • Advanced Higher: Computing Science (A), Business Management (A), Mathematics (A)  • Higher:


**Chunking Text**


In [9]:
# Recursive splitter (chunk settings are provisional and will be revisited).
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=20,
    # Raw string: "\. " in a normal literal is an invalid escape sequence
    # (DeprecationWarning, SyntaxWarning on Python 3.12+). The runtime value
    # is unchanged: backslash, dot, space.
    # NOTE(review): whether this entry is applied as a regex or as a literal
    # depends on the installed LangChain version (`is_separator_regex`) - confirm.
    separators=["\n\n", "\n", r"\. ", " ", ""]
)

In [10]:
# Split the loaded pages into chunks with the splitter configured above.
splits = r_splitter.split_documents(pages)

In [11]:
# Number of chunks produced by the splitter.
print(len(splits))

13


**Embeddings**

In [12]:
embedding = OpenAIEmbeddings()

# Take the text of two consecutive chunks and show them.
test1, test2 = (splits[i].page_content for i in (1, 2))
print(test1, test2, sep="\n")

# Embed each chunk separately.
embedding1 = embedding.embed_query(test1)
embedding2 = embedding.embed_query(test2)

# Check the similarity between two splits from the PDF via the dot product
# of their embeddings; this will be used in retrieval later.
np.dot(embedding1, embedding2)

Graduation 2024 – Projected 1st • Relevant courses: Algorithms and Data Structures, Data Science, Relational Databases, Object-Oriented Programming. • Final Year project: Leveraging Large Language Models for vulnerability classification, using Chain-of Thought Prompting and a newly proposed
a newly proposed Prompt Optimisation framework. University of British Colombia – Exchange Year       Vancouver    2022-2023 • Relevant Courses: Artificial Intelligence, Functional Programming, Applied Machine Learning. • Received 2nd Highest Mark in the year for Artificial Intelligence. Hutchesons


0.8247618940780516

**Vector Store**

In [13]:
from langchain.vectorstores import Chroma
# Directory handed to Chroma as `persist_directory` when the store is built.
persist_directory = 'docs/chroma/'

In [14]:
# Remove any previous store so the collection is rebuilt from scratch.
# Uses persist_directory (instead of a second hard-coded path in a shell
# command) so the two cannot drift apart, and works on non-Unix platforms.
# ignore_errors=True mirrors `rm -rf`: a missing directory is not an error.
shutil.rmtree(persist_directory, ignore_errors=True)

In [15]:
# Build the Chroma store from the chunks, using the OpenAI embedding object
# and the directory configured in persist_directory.
vectordb = Chroma.from_documents(
    persist_directory=persist_directory,
    embedding=embedding,
    documents=splits,
)

In [16]:
# Sanity check: entry count of the underlying collection (read through the
# private `_collection` attribute); compare with len(splits).
print(vectordb._collection.count())

13


In [17]:
# Persist the vector store (presumably to `persist_directory` - confirm).
vectordb.persist()

**Retrieval**

In [None]:
#Retrieval
#MMR - MAXIMAL MARGINAL RELEVANCE
# You may not always want the most similar documents; MMR can also help filter out duplicated data.
#Algorithm:
#Query the vector store
#Choose the fetch_k most similar responses
#Within these, choose the k most diverse



In [36]:
# Similarity search: return the k=2 chunks closest to the query.
# `question` is otherwise first assigned in the Chat section further down, so
# on a fresh kernel (Restart & Run All) this cell raised NameError. Define the
# query here so the cell is self-contained.
# NOTE(review): the query behind the saved output is not recorded - confirm
# this is the intended one.
question = "What did Steven do at Siemens?"
vectordb.similarity_search(question, k=2)


[Document(page_content="– Technology Summer Intern     Glasgow        Summer 2023 • Developed and deployed a full-stack web application using Python’s Flask framework and JavaScript, leveraging cloud computing to stream real-time diagnostic data from Siemens' Internet of Things (IoT) rail network to all employees on any", metadata={'page': 0, 'source': '/Users/stevenslater/Desktop/CV.pdf'}),
 Document(page_content='Hutchesons Grammar School            Glasgow        2007-2019 • Advanced Higher: Computing Science (A), Business Management (A), Mathematics (A)  • Higher: Computing Science (A), Mathematics (A), Physics (A), Business Management (A) English (B)  PROFESSIONAL EXPERIENCE          Siemens – Technology', metadata={'page': 0, 'source': '/Users/stevenslater/Desktop/CV.pdf'})]

In [51]:
#MMR (maximal marginal relevance): fetch the fetch_k=3 most similar chunks, then keep the k=2 most diverse
# NOTE(review): `question` must already be defined in the kernel when this cell runs.
mmr=vectordb.max_marginal_relevance_search(question,fetch_k=3, k=2)

In [None]:
#Self Query (LLM-aided retrieval)
#Uses the LLM to split the question into a filter and a search term.
#Good when you want the filter to be on METADATA - use when making the documents in the pipeline bigger or when using a database for RAG.

In [None]:
#Compression techniques - come back to this when building the pipeline for larger documents

**Answering**

In [18]:
from langchain.chat_models import ChatOpenAI

# Chat model used by the QA chains below; temperature is fixed at 0.
llm_name = "gpt-3.5-turbo"
llm = ChatOpenAI(temperature=0, model_name=llm_name)

In [19]:
# Question answering with a RetrievalQA chain over the vector store.
from langchain.chains import RetrievalQA

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    return_source_documents=True,
    retriever=vectordb.as_retriever(),
)

# TODO: to build my own retriever, implement a custom retriever class.

In [36]:
# Query for the RetrievalQA chain.
newquestion="what company did I work for in 2023"

In [37]:
# Run the chain; the input is passed under the "query" key.
result = qa_chain({"query": newquestion})

In [38]:
# The generated answer is stored under the "result" key.
result["result"]

"In 2023, you worked for Siemens, where you developed and deployed a full-stack web application using Python's Flask framework and JavaScript to stream real-time diagnostic data from Siemens' Internet of Things (IoT) rail network to all employees."

**Chat**

In [39]:
# Add conversation memory for the chat chain.
from langchain.memory import ConversationBufferMemory

# The history is exposed under the "chat_history" key, as message objects.
memory = ConversationBufferMemory(
    return_messages=True,
    memory_key="chat_history",
)

In [46]:
# Conversational QA chain: same vector-store retriever, plus the chat memory.
from langchain.chains import ConversationalRetrievalChain

retriever = vectordb.as_retriever()
qa = ConversationalRetrievalChain.from_llm(
    llm=llm,
    memory=memory,
    retriever=retriever,
)

In [49]:
# First chat turn; the conversational chain takes its input under "question".
question = "What did Steven do at Siemens?"
result = qa({"question": question})

In [50]:
# The conversational chain stores its reply under the "answer" key.
result["answer"]

"At Siemens, Steven developed and deployed a full-stack web application using Python's Flask framework and JavaScript. He leveraged cloud computing to stream real-time diagnostic data from Siemens' Internet of Things (IoT) rail network to all employees."

In [51]:
# Follow-up turn: "there" is not spelled out, so answering relies on the
# earlier turn (the chain was built with `memory`).
question = "Did I use Amazon Web services when I was there?"
result = qa({"question": question})

In [52]:
# Reply to the follow-up question.
result["answer"]

"Yes, the individual mentioned that they developed and deployed a full-stack web application at Siemens, leveraging cloud computing to stream real-time diagnostic data from Siemens' Internet of Things (IoT) rail network to all employees on any device. The app was powered by Amazon Web Services, utilizing AWS Relational Database Service and EC2 Instances for the backend database and web server."