# Repodialog notebook

In [36]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
import streamlit as st
import pickle
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains.question_answering import load_qa_chain
from langchain import HuggingFaceHub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

## Load environment variables
Load the environment variables from the `.env` configuration file. The cloned repository contains a `.env.example` file; copy it first to create your own `.env` configuration file:

```
cp .env.example .env
```

Adapt the variables in the `.env` file to your setup (for example `HUGGINGFACEHUB_API_TOKEN`).

In [29]:
# Warn (without stopping the notebook) when no .env configuration file
# exists in the current working directory.
if not os.path.exists(".env"):
    print("Missing configuration file")

In [25]:
from dotenv import load_dotenv

# Load the variables from the .env configuration file, then look up the
# Hugging Face Hub token; the lookup yields None when the variable is unset.
load_dotenv()
HUGGINGFACEHUB_API_TOKEN = os.environ.get('HUGGINGFACEHUB_API_TOKEN')

## HuggingFace pretrained SentenceTransformer embedding

In [27]:
# Create the embedding model with the library defaults; in the recorded run
# below this loaded sentence-transformers/all-mpnet-base-v2 on the CPU.
embeddings = HuggingFaceEmbeddings()

2023-09-06 00:52:36.478 INFO    sentence_transformers.SentenceTransformer: Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
2023-09-06 00:52:38.249 INFO    sentence_transformers.SentenceTransformer: Use pytorch device: cpu


## Load PDF document

The PDF file is loaded with the `PyPDF2` reader.

In [37]:
# Path of the PDF to index, relative to the current working directory.
pdf_path = "Gravitational_wave-1.pdf"
# Bare file name (directory part removed); recorded in each chunk's metadata.
pdf_file_name = os.path.basename(pdf_path)
pdf_reader = PdfReader(pdf_path)
# Page objects of the document; their text is extracted in the chunking cell.
pages = pdf_reader.pages

In [14]:
# Number of pages in the loaded PDF.
len(pages)

28

## Split PDF pages into text chunks

The text is extracted from PDF pages and split into smaller chunks. The page and filename information is preserved.

In [12]:
# Splitter configured for a chunk size of 500 and an overlap of 60 between
# neighbouring chunks; both are measured with len(), i.e. in characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=60,
    length_function=len
)

# Build one Document per chunk. The metadata keeps the 1-based page number
# and the source file name so every chunk can be traced back to the PDF.
list_of_documents = []
for pagenum, page in enumerate(pages, start=1):
    list_of_documents.extend(
        Document(
            page_content=chunk,
            metadata={"page": pagenum, "filename": pdf_file_name},
        )
        for chunk in text_splitter.split_text(text=page.extract_text())
    )

In [13]:
# Inspect the first chunk together with its page/filename metadata.
list_of_documents[0]

Document(page_content='Simulation of the collision of two black holes. In\naddition to forming deep gravity wells and\ncoalescing into a single larger black hole,\ngravitational waves will propagate outwards as the\nblack holes spin past each other.Gravitational wave\nGravitational waves are waves of the intensity of\ngravity that are generated by the accelerated masses of\nan orbital binary system, and propagate as waves\noutward from their source at the speed of light. They', metadata={'page': 1, 'filename': 'Gravitational_wave-1.pdf'})

In [16]:
# Inspect the second chunk; its opening text repeats the end of the first
# chunk because of the configured chunk overlap.
list_of_documents[1]

Document(page_content='outward from their source at the speed of light. They\nwere first proposed by Oliver Heaviside in 1893 and\nthen later by Henri Poincaré in 1905 as waves similar\nto electromagnetic waves but the gravitational\nequivalent.[1]\nGravitational waves were later predicted in 1916[2][3]\nby Albert Einstein on the basis of his general theory of\nrelativity as ripples in spacetime.[4][5] Later he refused\nto accept gravitational waves.[6] Gravitational waves', metadata={'page': 1, 'filename': 'Gravitational_wave-1.pdf'})

## Vector store

Vector stores overview:

https://python.langchain.com/docs/modules/data_connection/vectorstores/

In [17]:
# Embed every chunk and build a FAISS vector store from the results.
# In the recorded run below this step took about five and a half minutes.
VectorStore = FAISS.from_documents(embedding=embeddings, documents=list_of_documents)

Batches: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [05:32<00:00, 47.50s/it]
2023-09-06 00:43:34.449 INFO    faiss.loader: Loading faiss with AVX2 support.
2023-09-06 00:43:34.470 INFO    faiss.loader: Successfully loaded faiss with AVX2 support.


In [18]:
# Base name of the persisted index: the PDF file name without its extension.
# os.path.splitext strips whatever extension is present (or nothing when
# there is none) instead of blindly cutting off the last four characters.
index_file = os.path.splitext(pdf_file_name)[0]

In [19]:
# Persist the vector store to "<index_file>.pkl" in the working directory so
# that it can be loaded again from disk.
with open(f"{index_file}.pkl", "wb") as f:
    pickle.dump(VectorStore, f)

In [None]:
# Restore the vector store from "<index_file>.pkl" when that file exists;
# otherwise just report that it is missing.
# NOTE(review): unpickling can execute arbitrary code - only load index
# files that were written by this notebook.
if not os.path.exists(f"{index_file}.pkl"):
    print('Index file does not exist')
else:
    with open(f"{index_file}.pkl", "rb") as f:
        VectorStore = pickle.load(f)
    print('Index file loaded from disk')

## Large language model (LLM)
Flan-T5-XXL: FLAN-T5 was released in the paper "Scaling Instruction-Finetuned Language Models":

https://huggingface.co/google/flan-t5-xxl

In [26]:
# LLM wrapper for the google/flan-t5-xxl repository on the Hugging Face Hub,
# using the API token read from the environment.
llm = HuggingFaceHub(
    repo_id="google/flan-t5-xxl",
    # Generation settings handed to the model.
    model_kwargs={"temperature": 0.5},
    huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN
)

## Question answering
Use the vector store to search for the documents most similar to the query, then run the question-answering chain on them.

In [31]:
# Question to ask about the PDF.
query = "Which famous scientist discovered the gravitational waves?"

In [32]:
# Build a question-answering chain around the LLM, using the "stuff" chain type.
chain = load_qa_chain(llm, chain_type="stuff")
# Retrieve the chunks most similar to the query (library-default result count).
docs = VectorStore.similarity_search(query)
# Run the chain with the retrieved chunks as input documents.
answer = chain.run(input_documents=docs, question=query)

Batches: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.70it/s]


In [33]:
# Show the answer returned by the chain.
answer

'Einstein'

## Relevant documents

The relevant documents are listed below; each one keeps the page number it was extracted from.

In [34]:
# Show the retrieved chunks that were passed to the chain.
docs

[Document(page_content='waves". JETP. 43: 605–607.\n33. Cho, Adrian (Oct. 3, 2017). "Ripples in space: U.S. trio wins physics Nobel for discovery of\ngravitational waves (https://www.science.org/content/article/ripples-space-us-trio-wins-physi\ncs-nobel-discovery-gravitational-waves)," Science. Retrieved 20 May 2019.\n34. Cervantes-Cota, Jorge L., Galindo-Uribarri, Salvador, and Smoot, George F. (2016). "A Brief\nHistory of Gravitational Waves (https://arxiv.org/abs/1609.09400)," Universe, 2, no. 3, 22.', metadata={'page': 20, 'filename': 'Gravitational_wave-1.pdf'}),
 Document(page_content='in the cosmic microwave background . However, they were later forced to retract this result.[19][20][41][42]\nIn 2017, the Nobel Prize in Physics was awarded to Rainer Weiss, Kip Thorne and Barry Barish for their\nrole in the detection of  gravitational waves.[43][44][45]', metadata={'page': 4, 'filename': 'Gravitational_wave-1.pdf'}),
 Document(page_content='gravitational waves.[24][25] When Einst