In [2]:
!pip install langchain
!pip install langchain-community
!pip install langchain-experimental
!pip install langchain-huggingface
!pip install huggingface_hub
!pip install pypdf
!pip install neo4j
!pip install openai
!pip install google-colab
!pip install json-repair



In [3]:
import os
import re

from huggingface_hub import HfApi # to get hugging face access
# from langchain.document_loaders import TextLoader
from langchain_community.document_loaders import TextLoader #to load Text
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
# from langchain.memory import ChatMessageHistory
from langchain_community.vectorstores import Neo4jVector
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_huggingface import HuggingFaceEndpoint
# from langchain.text_splitter import CharacterTextSplitter
from langchain_text_splitters import CharacterTextSplitter
 # to create chunks

In [4]:
# Read the Hugging Face access token from the HF_TOKEN environment variable.
HF_key = os.environ.get("HF_TOKEN")
if not HF_key:
    # Fail here with a clear message instead of deep inside an API client.
    raise EnvironmentError(
        "HF_TOKEN is not set; export your Hugging Face API token before running this notebook."
    )

# HfApi's first positional parameter is the endpoint URL, so the token has to be
# passed by keyword or it is silently treated as the endpoint.
HFapi = HfApi(token=HF_key)

#Embedding model
embeddings = HuggingFaceInferenceAPIEmbeddings(
    api_key=HF_key, model_name="sentence-transformers/all-MiniLM-l6-v2"
)

In [5]:
# text generation model or chat model
# The token read from HF_TOKEN is passed explicitly so authentication does not
# depend on which environment-variable name the installed library version reads.
llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
    task="text-generation",
    temperature=0.1,
    huggingfacehub_api_token=HF_key,
)

#transforms data into a graph structure
llm_transformer = LLMGraphTransformer(llm=llm)

In [6]:
#loader = TextLoader("text.txt") #for a text file

# PyPDFLoader is already imported from langchain_community at the top of the
# notebook; the extra import from the deprecated `langchain.document_loaders`
# path only re-bound the same name and has been dropped.
loader = PyPDFLoader("the_shortcut.pdf")
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
texts = text_splitter.split_documents(documents)

#create embeddings
# embed_documents takes a list of strings, so send every page text in a single
# call instead of calling it once per page with a bare string.
x = [doc.page_content for doc in documents]
document_embeddings = embeddings.embed_documents(x)

# NOTE(review): the inference-API wrapper appears to hand back the decoded JSON
# body unchanged, so a failed request would arrive as a dict rather than a list
# of vectors -- confirm against the installed langchain_community version.
if not isinstance(document_embeddings, list):
    raise RuntimeError(f"Embedding request failed: {document_embeddings!r}")

In [7]:
#print document embeddings
#document_embeddings

In [None]:
# Display the first element of `texts` as a quick sanity check.
texts[0]

In [None]:
# The following functions are used to decode the encoded pdf files.
def cidToChar(cidx):
    """Convert a '/g<N>' glyph marker into the character with code point N + 29.

    Args:
        cidx: Text containing at least one '/g<digits>' marker; the first one is used.

    Returns:
        The single decoded character.

    Raises:
        IndexError: If `cidx` contains no '/g<digits>' marker.
    """
    return chr(int(re.findall(r'\/g(\d+)', cidx)[0]) + 29)


def decode(sentence):
    """Replace '/g<N>' glyph markers in extracted PDF text with real characters.

    Empty lines and lines consisting solely of '/g3' are dropped. The remaining
    lines are decoded, joined with a space and have runs of whitespace collapsed.

    Args:
        sentence: Raw page text, possibly spanning several lines.

    Returns:
        The decoded text as a single line.
    """
    decoded_lines = []
    for line in sentence.split('\n'):
        if line == '' or line == '/g3':         # merely to compact the output
            continue
        # Substitute all markers in one regex pass. Replacing them one at a time
        # with str.replace corrupted longer markers that share a prefix with a
        # shorter one (replacing '/g4' also rewrote the start of '/g43').
        line = re.sub(r'\/g\d+', lambda match: cidToChar(match.group(0)), line)
        # repr() keeps control characters visible as escapes. Slice off its
        # delimiters instead of strip("'"), which left stray double quotes
        # whenever the text contained an apostrophe.
        decoded_lines.append(repr(line)[1:-1])

    # Join with a space so the last word of one line does not fuse with the
    # first word of the next.
    return re.sub(r'\s+', ' ', ' '.join(decoded_lines))

def get_text_from_pdf(file, max_pages=30, cid_threshold=3, verbose=True):
    """Load a PDF and return its leading pages, decoding glyph-encoded text.

    Args:
        file: Path of the PDF passed to PyPDFLoader.
        max_pages: How many of the leading split documents to keep (default 30).
        cid_threshold: A page is decoded only when it contains more than this
            many '/g' markers (default 3).
        verbose: When True (the default), print each page that was decoded.

    Returns:
        The first `max_pages` documents produced by `load_and_split()`; pages
        over the marker threshold have `page_content` replaced by the decoded text.
    """
    loader = PyPDFLoader(file)
    # Slice once and reuse the same list for processing and for the result.
    pages = loader.load_and_split()[:max_pages]
    for page in pages:
        if page.page_content.count('/g') > cid_threshold:
            page.page_content = decode(page.page_content)
            if verbose:
                print(page.page_content)
    return pages


texts = get_text_from_pdf('the_shortcut.pdf')

#texts

In [8]:
#to check if neo4j database has been connected
from neo4j import GraphDatabase

# Connection settings come from the environment so no credentials live in the notebook.
uri=os.environ.get("neo_url")
username=os.environ.get("neo_username")
password=os.environ.get("neo_pwd")

# Fail early with a readable message rather than an opaque driver error when a
# setting is missing.
_missing_neo4j_vars = [
    var_name
    for var_name, var_value in (
        ("neo_url", uri),
        ("neo_username", username),
        ("neo_pwd", password),
    )
    if not var_value
]
if _missing_neo4j_vars:
    raise EnvironmentError(
        "Missing Neo4j environment variable(s): " + ", ".join(_missing_neo4j_vars)
    )

# Create a driver instance
driver = GraphDatabase.driver(uri, auth=(username, password))

# Function to test the connection
def test_connection(driver):
    try:
        with driver.session() as session:
            result = session.run("RETURN 1")
            for record in result:
                print(record)
        print("Connection to Neo4j established successfully.")
    except Exception as e:
        print("Error connecting to Neo4j:", e)

test_connection(driver)
# This driver is used only for the connectivity check (the vector store below is
# built from the URL and credentials), so close it instead of leaking it.
driver.close()

<Record 1=1>
Connection to Neo4j established successfully.


In [13]:
# declare the variables to store required info to connect with neo4j db
uri=os.environ.get("neo_url")
username=os.environ.get("neo_username")
password=os.environ.get("neo_pwd")

index_name = "vector"  # default index name

#Create and update vector db using embedding_llm
#in this case, no relationships are created; only chunks are. To create a relationship, use cypher query.
# The chunks are inserted exactly once here. The previous version followed this
# call with from_existing_index(...) + add_documents(texts), which stored every
# chunk a second time.
neo4j_vector = Neo4jVector.from_documents(
    texts,
    embedding=embeddings,
    url=uri,
    username=username,
    password=password,
    index_name=index_name,
)

# To reconnect to the populated index later without inserting anything, use:
#   Neo4jVector.from_existing_index(
#       embeddings, url=uri, username=username, password=password, index_name=index_name
#   )
# NOTE: re-running this cell inserts the chunks again.

KeyError: 0

In [None]:
# Run a Cypher query that lists the distinct `source` values stored on Chunk
# nodes, i.e. which documents have been uploaded.
source_rows = neo4j_vector.query("""match (n:Chunk) return distinct n.source as source""")
x = [row['source'] for row in source_rows]
x

In [None]:
# Check what data the chunks store
filtered_source = "the_shortcut.pdf"
# Pass the source as a Cypher parameter rather than formatting it into the query text.
filtered_chunks = neo4j_vector.query(
    "MATCH (n:Chunk) WHERE n.source = $source RETURN n",
    params={"source": filtered_source},
)
# Inspect the first returned row directly; the loop variable `chunk` is only
# bound by a later cell, so it cannot be relied on here.
if filtered_chunks:
    print(f"Chunks: {filtered_chunks[0]['n'].keys()}")
else:
    print(f"No chunks found for source {filtered_source!r}")

In [None]:
# Print the 'text' property of the node in every returned row
for chunk in filtered_chunks:
    chunk_text = chunk['n'].get('text')
    print(f"Chunk Content: {chunk_text}")

In [None]:
# Check what data the chunks store
filtered_source = "the_shortcut.pdf"
# Pass the source as a Cypher parameter rather than formatting it into the query text.
filtered_chunks = neo4j_vector.query(
    "MATCH (n:Chunk) WHERE n.source = $source RETURN n",
    params={"source": filtered_source},
)
# Inspect the first returned row directly instead of depending on a `chunk`
# variable left behind by another cell's loop.
if filtered_chunks:
    print(f"Chunks: {filtered_chunks[0]['n'].keys()}")
else:
    print(f"No chunks found for source {filtered_source!r}")

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
#similarity search over the created vector db
from langchain.chains import RetrievalQAWithSourcesChain

greetings = ["hi", "hello", "hey", "greetings"]

query = "What is AI?"

# Build the retriever and chain before branching: later cells call `chain` and
# read `docs_with_score` unconditionally, so both must exist even when the
# query turns out to be a greeting.
retriever = neo4j_vector.as_retriever()

#Create the Langchain Chain
chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm, chain_type="stuff", retriever=retriever
)

docs_with_score = []  # stays empty when the query is only a greeting

if query.strip().lower() in greetings:
    print("Hi! How can I help you today?")
else:
    docs_with_score = neo4j_vector.similarity_search(query)

    answer = chain.invoke(
        {"question": query},
        return_only_outputs=True,
    )

    print(answer['answer'])

In [None]:
# Display the documents returned by the similarity search for `query`.
docs_with_score

In [None]:
# Print the text of the first document returned for `query`.
print(docs_with_score[0].page_content)

In [None]:
# Keep only what follows the last backslash, i.e. the file name of a Windows path.
x = 'C:\\Users\\mm0697\\AppData\\Local\\Temp\\tmpw7xisrbe\\the_shortcut.pdf'
x.rpartition('\\')[2]

In [None]:
# Second question: run a similarity search for it against the vector store.
query2 = "Who is samiya?"
dws = neo4j_vector.similarity_search(query2)

In [None]:
# Print the first hit for `query2`. The previous version printed
# docs_with_score[0], which belongs to the first query, not the `dws` result
# computed in the cell above.
print(dws[0].page_content)

In [None]:
# Ask the QA chain the second question, keeping only the chain's outputs.
question_payload = {"question": query2}
ans2 = chain.invoke(question_payload, return_only_outputs=True)

In [None]:
# Display the 'answer' entry of the chain's result for `query2`.
ans2['answer']