In [2]:
!pip install langchain
!pip install langchain-community
!pip install langchain-experimental
!pip install langchain-huggingface
!pip install huggingface_hub
!pip install pypdf
!pip install neo4j
!pip install openai
!pip install google-colab
!pip install json-repair



In [19]:
# from langchain.document_loaders import TextLoader
from langchain_community.document_loaders import TextLoader #to load Text

# from langchain.text_splitter import CharacterTextSplitter
from langchain_text_splitters import CharacterTextSplitter
 # to create chunks
from langchain_experimental.graph_transformers import LLMGraphTransformer
from huggingface_hub import HfApi # to get hugging face access
import os
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_huggingface import HuggingFaceEndpoint

# from langchain.memory import ChatMessageHistory
from langchain_community.vectorstores import Neo4jVector
from langchain_community.document_loaders import PyPDFLoader

In [20]:
HF_key = os.environ.get("HF_TOKEN") #set environment variable named HF_TOKEN with hugging face api key
HFapi = HfApi(HF_key)

#Embedding model
embeddings = HuggingFaceInferenceAPIEmbeddings(
    api_key=HF_key, model_name="sentence-transformers/all-MiniLM-l6-v2"
)

In [23]:
# text generation model or chat model
llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
    task="text-generation",
    temperature = 0.1,
)

#transforms data into a graph structure
llm_transformer = LLMGraphTransformer(llm=llm) 

In [25]:
#loader = TextLoader("text.txt") #for a text file

from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader("the_shortcut.pdf")
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
texts = text_splitter.split_documents(documents)

#create embeddings
x = [doc.page_content for doc in documents]
document_embeddings = [embeddings.embed_documents(chunk) for chunk in x]

In [26]:
document_embeddings

[[-0.1188383623957634,
  0.048298683017492294,
  -0.002548057120293379,
  -0.011011195369064808,
  0.05195076763629913,
  0.010291754268109798,
  0.11543324589729309,
  0.0007008201209828258,
  -0.08592535555362701,
  -0.0706540048122406,
  0.0013317547272890806,
  -0.03547237440943718,
  0.018434111028909683,
  -0.006737223360687494,
  0.02440294250845909,
  -0.029503200203180313,
  -0.05813848227262497,
  -0.05043953284621239,
  -0.020765485242009163,
  0.029036033898591995,
  -0.06367593258619308,
  0.02402994967997074,
  0.02624327689409256,
  -0.006037368904799223,
  -0.011076496914029121,
  -0.0014007249847054482,
  -0.018619835376739502,
  0.03277008607983589,
  0.002885987050831318,
  -0.05694394186139107,
  -0.043941680341959,
  0.025414111092686653,
  0.08790943026542664,
  -0.024991201236844063,
  -0.03668316453695297,
  0.006241306196898222,
  -0.06646799296140671,
  -0.06714445352554321,
  0.020564226433634758,
  0.04238881543278694,
  0.021880239248275757,
  -0.0428824648

In [27]:
texts[0]

Document(page_content='The Shortcut \nAn infuential scientist in the feld of artifcial intelligence (AI) explains \nits fundamental concepts and how it is changing culture and society. \nA particular form of AI is now embedded in our tech, our infrastructure, \nand our lives. How did it get there? Where and why should we be concerned? And what should we do now? The Shortcut: Why Intelligent Machines Do Not Think Like Us provides an accessible yet \nprobing exposure of AI in its prevalent form today, proposing a new narrative to connect and make sense of events that have happened in the recent tumultuous past, and enabling us to think soberly about the road ahead. \nThis book is divided into ten carefully crafted and easily digestible \nchapters. Each chapter grapples with an important question for AI. Ranging from the scientifc concepts that underpin the technology to wider implications for society, it develops a unifed description using tools from diferent disciplines and avoiding unn

In [28]:
# The following functions are used to decode the encoded pdf files. you can ignore this part and use the above functions to create chunks with text file
def cidToChar(cidx):
    return chr(int(re.findall(r'\/g(\d+)',cidx)[0]) + 29)


def decode(sentence):
  sen = ''
  for x in sentence.split('\n'):
    if x != '' and x != '/g3':         # merely to compact the output
      abc = re.findall(r'\/g\d+',x)
      if len(abc) > 0:
          for cid in abc: x=x.replace(cid, cidToChar(cid))
      sen += repr(x).strip("'")

  return re.sub(r'\s+', ' ', sen)

def get_text_from_pdf(file):
    loader = PyPDFLoader(file)
    pages = loader.load_and_split()
    for page in pages[0:30]:
        if page.page_content.count('/g') > 3:
            page.page_content = decode(page.page_content)
            print(page.page_content)
    return pages[0:30]

texts = get_text_from_pdf(f'the_shortcut.pdf')

#texts

In [29]:
from neo4j import GraphDatabase

uri=os.environ.get("neo_url")
username=os.environ.get("neo_username")
password=os.environ.get("neo_pwd")

# Create a driver instance
driver = GraphDatabase.driver(uri, auth=(username, password))

# Function to test the connection
def test_connection(driver):
    try:
        with driver.session() as session:
            result = session.run("RETURN 1")
            for record in result:
                print(record)
        print("Connection to Neo4j established successfully.")
    except Exception as e:
        print("Error connecting to Neo4j:", e)

test_connection(driver)

<Record 1=1>
Connection to Neo4j established successfully.


In [30]:
#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

In [31]:
# declare the variables to store required info to connect with neo4j db
uri=os.environ.get("neo_url")
username=os.environ.get("neo_username")
password=os.environ.get("neo_pwd")

neo4j_vector = Neo4jVector.from_documents(
    texts, embedding=embeddings, url=uri, username=username, password=password
)


#Create and update vector db using embedding_llm
#in this case there is no relationship were created, only chunks were created. to create a relationship we need to use cypher query

index_name = "vector"  # default index name

neo4j_vector = Neo4jVector.from_existing_index(
    embeddings,
    url=uri,
    username=username,
    password=password,
    index_name=index_name,
)

neo4j_vector.add_documents(texts) 



['5b2dff94ea7cb383473ac7f17c2ae9b2',
 '4943d855fbddf595c933be2d3f7eabfe',
 '978f2f68f4451d16b2cf9a60ae32339c',
 'eed71b1923b56005a05ab0b2758fd10d',
 'b06bf020f2003c2ccc9a906dbe136d60',
 'f615eb555243ea2763aa82bc02b390fa',
 '97f773614a47a07a75bcb66c2e13d806',
 '02260e85d803a8652476392b0588368a',
 '2db450c44a2695055704862cb065429c',
 'acd94c6c2f346dce62ff245e93f71200',
 'cda4ed2d8d02f07be449b771973eb8a4',
 '14d8307130b32b22bb81b08ce0a8bfba',
 'a2ea536e68a158ffaebf22c70d9be318',
 '0cb2fc3459e4c5c00e38ca930ffba2ea',
 '4c82c145735fedc0897febffc0ffc382',
 '9cf7f4ed081b3586742c98967949a35f',
 '4fdaf8c66fddd4b925f99580bafa8cb5',
 'cc6e5b2af2af36b21b6ce377ae683785',
 '2796036630b1e4eb8e40a0afa860579e',
 '27cba862e4d5eacc1e8ac29a1e87e19a',
 'e5f9c49c3742b85b737d9345fa5b0df0',
 'db2af24dbb4823f7bca03d3c5bc72c37',
 '0b8efca571a9ba7df78c3380679c7a0e',
 '32a83dc36b01c456a5905673710bca6b',
 'a1dca56ae7ac18b93df9cae1b1c7b698',
 '6ea352131d355b0d5b22b513a036d929',
 '0714b9c7e617306ca1b10a4f72bb7fcc',
 

In [32]:
#use the below lines of code to execute cypher code (using this, you can create relationships between chunks node)

#display the documents uploaded
x = neo4j_vector.query("""match (n:Chunk) return distinct n.source as source""")
x = [i['source'] for i in x]
x

['BW310.pdf', 'richard.pdf', 'the_shortcut.pdf']

In [40]:
# Check what data the chunks store
filtered_source = "the_shortcut.pdf" 
filtered_chunks = neo4j_vector.query(f"""MATCH (n:Chunk) WHERE n.source = '{filtered_source}' RETURN n""")
print(f"Chunks: {chunk['n'].keys()}")

Chunks: dict_keys(['id', 'embedding', 'text', 'source', 'page'])


In [38]:
# Access the 'text' field
for chunk in filtered_chunks:
    print(f"Chunk Content: {chunk['n'].get('text')}")

Chunk Content: The Shortcut 
An infuential scientist in the feld of artifcial intelligence (AI) explains 
its fundamental concepts and how it is changing culture and society. 
A particular form of AI is now embedded in our tech, our infrastructure, 
and our lives. How did it get there? Where and why should we be concerned? And what should we do now? The Shortcut: Why Intelligent Machines Do Not Think Like Us provides an accessible yet 
probing exposure of AI in its prevalent form today, proposing a new narrative to connect and make sense of events that have happened in the recent tumultuous past, and enabling us to think soberly about the road ahead. 
This book is divided into ten carefully crafted and easily digestible 
chapters. Each chapter grapples with an important question for AI. Ranging from the scientifc concepts that underpin the technology to wider implications for society, it develops a unifed description using tools from diferent disciplines and avoiding unnecessary abstra

In [42]:
# Check what data the chunks store
filtered_source = "the_shortcut.pdf" 
filtered_chunks = neo4j_vector.query(f"""MATCH (n:Chunk) WHERE n.source = '{filtered_source}' RETURN n""")
print(f"Chunks: {chunk['n'].keys()}")

Chunks: dict_keys(['id', 'embedding', 'text', 'source', 'page'])


In [44]:
#from google.colab import drive
#drive.mount('/content/drive')

In [46]:
#similarity search over the created vector db
from langchain.chains import RetrievalQAWithSourcesChain

greetings = ["hi", "hello", "hey", "greetings"]

query = "Who is Nello?"

if query.strip().lower() in greetings:
    print("Hi! How can I help you today?")
else:
    docs_with_score = neo4j_vector.similarity_search(query)
    retriever = neo4j_vector.as_retriever()

    #Create the Langchain Chain
    chain = RetrievalQAWithSourcesChain.from_chain_type(
        llm, chain_type="stuff", retriever=retriever
    )

    answer = chain.invoke(
        {"question": query},
        return_only_outputs=True,
    )
    
    print(answer['answer'])

 Nello is a person who was born on January 29th 1960 in El Paso Texas. He was abused by his father as a child, became fascinated with his cousin who was a green beret, and later committed petty crimes, became interested in satanism, cocaine addiction, and convictions for burglary. He moved to Los Angeles and went on to commit multiple murders.



In [47]:
docs_with_score

[Document(page_content='Early Life \nHe was born on January 29th 1960 in El Paso Texas, as a young boy he was constantly \nbeing beaten by his abusive father. He later became fascinated with his cousin mike who \nwas a green beret who’d recently returned from the Vietnam war, they would get high \ntogether and Mike would tell him the atrocities he’d committed to vietnamese women. \nLater on Mike would go on to shoot his wife in the face right in front of Richard.', metadata={'source': 'richard.pdf', 'page': 2}),
 Document(page_content='First crimes \nBy the time he was 17, he’d gone and committed some petty crimes which landed him in \njuvenile detention. He later moved to Los Angeles and picked up an interest for satanism, \ncocaine addiction and convictions for burglary.', metadata={'source': 'richard.pdf', 'page': 3}),
 Document(page_content='6 \nMICROTARGETING AND MASS \nPERSUASION \nAs we interact with online agents, we both leak and absorb a \ngreat deal of information, in a two-

In [48]:
print(docs_with_score[0].page_content)

Early Life 
He was born on January 29th 1960 in El Paso Texas, as a young boy he was constantly 
being beaten by his abusive father. He later became fascinated with his cousin mike who 
was a green beret who’d recently returned from the Vietnam war, they would get high 
together and Mike would tell him the atrocities he’d committed to vietnamese women. 
Later on Mike would go on to shoot his wife in the face right in front of Richard.


In [49]:
x = 'C:\\Users\\mm0697\\AppData\\Local\\Temp\\tmpw7xisrbe\\the_shortcut.pdf'
x.split('\\')[-1]

'the_shortcut.pdf'

In [50]:
query2 = "Who is samiya?"
dws = neo4j_vector.similarity_search(query2)

In [51]:
print(docs_with_score[0].page_content)

Early Life 
He was born on January 29th 1960 in El Paso Texas, as a young boy he was constantly 
being beaten by his abusive father. He later became fascinated with his cousin mike who 
was a green beret who’d recently returned from the Vietnam war, they would get high 
together and Mike would tell him the atrocities he’d committed to vietnamese women. 
Later on Mike would go on to shoot his wife in the face right in front of Richard.


In [56]:
ans2 = chain.invoke(
    {"question": query2},
    return_only_outputs=True,
)

In [None]:
ans2['answer']